nfs_socket.c

Version: - FREEBSD - FREEBSD-13-STABLE - FREEBSD-13-0 - FREEBSD-12-STABLE - FREEBSD-12-0 - FREEBSD-11-STABLE - FREEBSD-11-0 - FREEBSD-10-STABLE - FREEBSD-10-0 - FREEBSD-9-STABLE - FREEBSD-9-0 - FREEBSD-8-STABLE - FREEBSD-8-0 - FREEBSD-7-STABLE - FREEBSD-7-0 - FREEBSD-6-STABLE - FREEBSD-6-0 - FREEBSD-5-STABLE - FREEBSD-5-0 - FREEBSD-4-STABLE - FREEBSD-3-STABLE - FREEBSD22 - l41 - OPENBSD - linux-2.6 - MK84 - PLAN9 - xnu-8792
SearchContext: - none - 3 - 10
    1 /*
    2  * Copyright (c) 1989, 1991, 1993, 1995
    3  *      The Regents of the University of California.  All rights reserved.
    4  *
    5  * This code is derived from software contributed to Berkeley by
    6  * Rick Macklem at The University of Guelph.
    7  *
    8  * Redistribution and use in source and binary forms, with or without
    9  * modification, are permitted provided that the following conditions
   10  * are met:
   11  * 1. Redistributions of source code must retain the above copyright
   12  *    notice, this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in the
   15  *    documentation and/or other materials provided with the distribution.
   16  * 4. Neither the name of the University nor the names of its contributors
   17  *    may be used to endorse or promote products derived from this software
   18  *    without specific prior written permission.
   19  *
   20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   30  * SUCH DAMAGE.
   31  *
   32  *      @(#)nfs_socket.c        8.5 (Berkeley) 3/30/95
   33  */
   34 
   35 #include <sys/cdefs.h>
   36 __FBSDID("$FreeBSD: releng/5.3/sys/nfsclient/nfs_socket.c 134536 2004-08-30 21:58:31Z rwatson $");
   37 
   38 /*
   39  * Socket operations for use by nfs
   40  */
   41 
   42 #include "opt_inet6.h"
   43 
   44 #include <sys/param.h>
   45 #include <sys/systm.h>
   46 #include <sys/kernel.h>
   47 #include <sys/lock.h>
   48 #include <sys/malloc.h>
   49 #include <sys/mbuf.h>
   50 #include <sys/mount.h>
   51 #include <sys/mutex.h>
   52 #include <sys/proc.h>
   53 #include <sys/protosw.h>
   54 #include <sys/signalvar.h>
   55 #include <sys/socket.h>
   56 #include <sys/socketvar.h>
   57 #include <sys/sysctl.h>
   58 #include <sys/syslog.h>
   59 #include <sys/vnode.h>
   60 
   61 #include <netinet/in.h>
   62 #include <netinet/tcp.h>
   63 
   64 #include <rpc/rpcclnt.h>
   65 
   66 #include <nfs/rpcv2.h>
   67 #include <nfs/nfsproto.h>
   68 #include <nfsclient/nfs.h>
   69 #include <nfs/xdr_subs.h>
   70 #include <nfsclient/nfsm_subs.h>
   71 #include <nfsclient/nfsmount.h>
   72 #include <nfsclient/nfsnode.h>
   73 
   74 #include <nfs4client/nfs4.h>
   75 
   76 #define TRUE    1
   77 #define FALSE   0
   78 
   79 /*
   80  * Estimate rto for an nfs rpc sent via. an unreliable datagram.
   81  * Use the mean and mean deviation of rtt for the appropriate type of rpc
   82  * for the frequent rpcs and a default for the others.
   83  * The justification for doing "other" this way is that these rpcs
   84  * happen so infrequently that timer est. would probably be stale.
   85  * Also, since many of these rpcs are
   86  * non-idempotent, a conservative timeout is desired.
   87  * getattr, lookup - A+2D
   88  * read, write     - A+4D
   89  * other           - nm_timeo
   90  */
   91 #define NFS_RTO(n, t) \
   92         ((t) == 0 ? (n)->nm_timeo : \
   93          ((t) < 3 ? \
   94           (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
   95           ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
   96 #define NFS_SRTT(r)     (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
   97 #define NFS_SDRTT(r)    (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
   98 
   99 /*
  100  * Defines which timer to use for the procnum.
  101  * 0 - default
  102  * 1 - getattr
  103  * 2 - lookup
  104  * 3 - read
  105  * 4 - write
  106  */
  107 static int proct[NFS_NPROCS] = {
  108         0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0,
  109 };
  110 
  111 static int      nfs_realign_test;
  112 static int      nfs_realign_count;
  113 static int      nfs_bufpackets = 4;
  114 static int      nfs_reconnects;
  115 
  116 SYSCTL_DECL(_vfs_nfs);
  117 
  118 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RW, &nfs_realign_test, 0, "");
  119 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RW, &nfs_realign_count, 0, "");
  120 SYSCTL_INT(_vfs_nfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0, "");
  121 SYSCTL_INT(_vfs_nfs, OID_AUTO, reconnects, CTLFLAG_RD, &nfs_reconnects, 0,
  122     "number of times the nfs client has had to reconnect");
  123 
  124 
  125 /*
  126  * There is a congestion window for outstanding rpcs maintained per mount
  127  * point. The cwnd size is adjusted in roughly the way that:
  128  * Van Jacobson, Congestion avoidance and Control, In "Proceedings of
  129  * SIGCOMM '88". ACM, August 1988.
  130  * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
  131  * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
  132  * of rpcs is in progress.
  133  * (The sent count and cwnd are scaled for integer arith.)
  134  * Variants of "slow start" were tried and were found to be too much of a
  135  * performance hit (ave. rtt 3 times larger),
  136  * I suspect due to the large rtt that nfs rpcs have.
  137  */
  138 #define NFS_CWNDSCALE   256
  139 #define NFS_MAXCWND     (NFS_CWNDSCALE * 32)
  140 #define NFS_NBACKOFF    8
  141 static int nfs_backoff[NFS_NBACKOFF] = { 2, 4, 8, 16, 32, 64, 128, 256, };
  142 struct callout  nfs_callout;
  143 
  144 static int      nfs_msg(struct thread *, const char *, const char *, int);
  145 static int      nfs_rcvlock(struct nfsreq *);
  146 static void     nfs_rcvunlock(struct nfsreq *);
  147 static void     nfs_realign(struct mbuf **pm, int hsiz);
  148 static int      nfs_receive(struct nfsreq *rep, struct sockaddr **aname,
  149                     struct mbuf **mp);
  150 static int      nfs_reply(struct nfsreq *);
  151 static void     nfs_softterm(struct nfsreq *rep);
  152 static int      nfs_reconnect(struct nfsreq *rep);
  153 
  154 /*
  155  * Initialize sockets and congestion for a new NFS connection.
  156  * We do not free the sockaddr if error.
  157  */
  158 int
  159 nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
  160 {
  161         struct socket *so;
  162         int error, rcvreserve, sndreserve;
  163         int pktscale;
  164         struct sockaddr *saddr;
  165         struct thread *td = &thread0; /* only used for socreate and sobind */
  166 
  167         NET_ASSERT_GIANT();
  168 
  169         nmp->nm_so = NULL;
  170         saddr = nmp->nm_nam;
  171         error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype,
  172                 nmp->nm_soproto, nmp->nm_mountp->mnt_cred, td);
  173         if (error)
  174                 goto bad;
  175         so = nmp->nm_so;
  176         nmp->nm_soflags = so->so_proto->pr_flags;
  177 
  178         /*
  179          * Some servers require that the client port be a reserved port number.
  180          */
  181         if (nmp->nm_flag & NFSMNT_RESVPORT) {
  182                 struct sockopt sopt;
  183                 int ip, ip2, len;
  184                 struct sockaddr_in6 ssin;
  185                 struct sockaddr *sa;
  186 
  187                 bzero(&sopt, sizeof sopt);
  188                 switch(saddr->sa_family) {
  189                 case AF_INET:
  190                         sopt.sopt_level = IPPROTO_IP;
  191                         sopt.sopt_name = IP_PORTRANGE;
  192                         ip = IP_PORTRANGE_LOW;
  193                         ip2 = IP_PORTRANGE_DEFAULT;
  194                         len = sizeof (struct sockaddr_in);
  195                         break;
  196 #ifdef INET6
  197                 case AF_INET6:
  198                         sopt.sopt_level = IPPROTO_IPV6;
  199                         sopt.sopt_name = IPV6_PORTRANGE;
  200                         ip = IPV6_PORTRANGE_LOW;
  201                         ip2 = IPV6_PORTRANGE_DEFAULT;
  202                         len = sizeof (struct sockaddr_in6);
  203                         break;
  204 #endif
  205                 default:
  206                         goto noresvport;
  207                 }
  208                 sa = (struct sockaddr *)&ssin;
  209                 bzero(sa, len);
  210                 sa->sa_len = len;
  211                 sa->sa_family = saddr->sa_family;
  212                 sopt.sopt_dir = SOPT_SET;
  213                 sopt.sopt_val = (void *)&ip;
  214                 sopt.sopt_valsize = sizeof(ip);
  215                 error = sosetopt(so, &sopt);
  216                 if (error)
  217                         goto bad;
  218                 error = sobind(so, sa, td);
  219                 if (error)
  220                         goto bad;
  221                 ip = ip2;
  222                 error = sosetopt(so, &sopt);
  223                 if (error)
  224                         goto bad;
  225         noresvport: ;
  226         }
  227 
  228         /*
  229          * Protocols that do not require connections may be optionally left
  230          * unconnected for servers that reply from a port other than NFS_PORT.
  231          */
  232         if (nmp->nm_flag & NFSMNT_NOCONN) {
  233                 if (nmp->nm_soflags & PR_CONNREQUIRED) {
  234                         error = ENOTCONN;
  235                         goto bad;
  236                 }
  237         } else {
  238                 error = soconnect(so, nmp->nm_nam, td);
  239                 if (error)
  240                         goto bad;
  241 
  242                 /*
  243                  * Wait for the connection to complete. Cribbed from the
  244                  * connect system call but with the wait timing out so
  245                  * that interruptible mounts don't hang here for a long time.
  246                  */
  247                 SOCK_LOCK(so);
  248                 while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
  249                         (void) msleep(&so->so_timeo, SOCK_MTX(so),
  250                             PSOCK, "nfscon", 2 * hz);
  251                         if ((so->so_state & SS_ISCONNECTING) &&
  252                             so->so_error == 0 && rep &&
  253                             (error = nfs_sigintr(nmp, rep, rep->r_td)) != 0) {
  254                                 so->so_state &= ~SS_ISCONNECTING;
  255                                 SOCK_UNLOCK(so);
  256                                 goto bad;
  257                         }
  258                 }
  259                 if (so->so_error) {
  260                         error = so->so_error;
  261                         so->so_error = 0;
  262                         SOCK_UNLOCK(so);
  263                         goto bad;
  264                 }
  265                 SOCK_UNLOCK(so);
  266         }
  267         so->so_rcv.sb_timeo = 12 * hz;
  268         so->so_snd.sb_timeo = 5 * hz;
  269 
  270         /*
  271          * Get buffer reservation size from sysctl, but impose reasonable
  272          * limits.
  273          */
  274         pktscale = nfs_bufpackets;
  275         if (pktscale < 2)
  276                 pktscale = 2;
  277         if (pktscale > 64)
  278                 pktscale = 64;
  279 
  280         if (nmp->nm_sotype == SOCK_DGRAM) {
  281                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
  282                 rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
  283                     NFS_MAXPKTHDR) * pktscale;
  284         } else if (nmp->nm_sotype == SOCK_SEQPACKET) {
  285                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
  286                 rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
  287                     NFS_MAXPKTHDR) * pktscale;
  288         } else {
  289                 if (nmp->nm_sotype != SOCK_STREAM)
  290                         panic("nfscon sotype");
  291                 if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
  292                         struct sockopt sopt;
  293                         int val;
  294 
  295                         bzero(&sopt, sizeof sopt);
  296                         sopt.sopt_dir = SOPT_SET;
  297                         sopt.sopt_level = SOL_SOCKET;
  298                         sopt.sopt_name = SO_KEEPALIVE;
  299                         sopt.sopt_val = &val;
  300                         sopt.sopt_valsize = sizeof val;
  301                         val = 1;
  302                         sosetopt(so, &sopt);
  303                 }
  304                 if (so->so_proto->pr_protocol == IPPROTO_TCP) {
  305                         struct sockopt sopt;
  306                         int val;
  307 
  308                         bzero(&sopt, sizeof sopt);
  309                         sopt.sopt_dir = SOPT_SET;
  310                         sopt.sopt_level = IPPROTO_TCP;
  311                         sopt.sopt_name = TCP_NODELAY;
  312                         sopt.sopt_val = &val;
  313                         sopt.sopt_valsize = sizeof val;
  314                         val = 1;
  315                         sosetopt(so, &sopt);
  316                 }
  317                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
  318                     sizeof (u_int32_t)) * pktscale;
  319                 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
  320                     sizeof (u_int32_t)) * pktscale;
  321         }
  322         error = soreserve(so, sndreserve, rcvreserve);
  323         if (error)
  324                 goto bad;
  325         SOCKBUF_LOCK(&so->so_rcv);
  326         so->so_rcv.sb_flags |= SB_NOINTR;
  327         SOCKBUF_UNLOCK(&so->so_rcv);
  328         SOCKBUF_LOCK(&so->so_snd);
  329         so->so_snd.sb_flags |= SB_NOINTR;
  330         SOCKBUF_UNLOCK(&so->so_snd);
  331 
  332         /* Initialize other non-zero congestion variables */
  333         nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
  334                 nmp->nm_srtt[3] = (NFS_TIMEO << 3);
  335         nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
  336                 nmp->nm_sdrtt[3] = 0;
  337         nmp->nm_cwnd = NFS_MAXCWND / 2;     /* Initial send window */
  338         nmp->nm_sent = 0;
  339         nmp->nm_timeouts = 0;
  340         return (0);
  341 
  342 bad:
  343         nfs_disconnect(nmp);
  344         return (error);
  345 }
  346 
  347 /*
  348  * Reconnect routine:
  349  * Called when a connection is broken on a reliable protocol.
  350  * - clean up the old socket
  351  * - nfs_connect() again
  352  * - set R_MUSTRESEND for all outstanding requests on mount point
  353  * If this fails the mount point is DEAD!
  354  * nb: Must be called with the nfs_sndlock() set on the mount point.
  355  */
  356 static int
  357 nfs_reconnect(struct nfsreq *rep)
  358 {
  359         struct nfsreq *rp;
  360         struct nfsmount *nmp = rep->r_nmp;
  361         int error;
  362 
  363         nfs_reconnects++;
  364         nfs_disconnect(nmp);
  365         while ((error = nfs_connect(nmp, rep)) != 0) {
  366                 if (error == ERESTART)
  367                         error = EINTR;
  368                 if (error == EIO || error == EINTR)
  369                         return (error);
  370                 (void) tsleep(&lbolt, PSOCK, "nfscon", 0);
  371         }
  372 
  373         /*
  374          * Loop through outstanding request list and fix up all requests
  375          * on old socket.
  376          */
  377         TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
  378                 if (rp->r_nmp == nmp)
  379                         rp->r_flags |= R_MUSTRESEND;
  380         }
  381         return (0);
  382 }
  383 
  384 /*
  385  * NFS disconnect. Clean up and unlink.
  386  */
  387 void
  388 nfs_disconnect(struct nfsmount *nmp)
  389 {
  390         struct socket *so;
  391 
  392         NET_ASSERT_GIANT();
  393 
  394         if (nmp->nm_so) {
  395                 so = nmp->nm_so;
  396                 nmp->nm_so = NULL;
  397                 soshutdown(so, SHUT_RDWR);
  398                 soclose(so);
  399         }
  400 }
  401 
  402 void
  403 nfs_safedisconnect(struct nfsmount *nmp)
  404 {
  405         struct nfsreq dummyreq;
  406 
  407         bzero(&dummyreq, sizeof(dummyreq));
  408         dummyreq.r_nmp = nmp;
  409         nfs_rcvlock(&dummyreq);
  410         nfs_disconnect(nmp);
  411         nfs_rcvunlock(&dummyreq);
  412 }
  413 
  414 /*
  415  * This is the nfs send routine. For connection based socket types, it
  416  * must be called with an nfs_sndlock() on the socket.
  417  * - return EINTR if the RPC is terminated, 0 otherwise
  418  * - set R_MUSTRESEND if the send fails for any reason
  419  * - do any cleanup required by recoverable socket errors (?)
  420  */
  421 int
  422 nfs_send(struct socket *so, struct sockaddr *nam, struct mbuf *top,
  423     struct nfsreq *rep)
  424 {
  425         struct sockaddr *sendnam;
  426         int error, error2, soflags, flags;
  427 
  428         NET_ASSERT_GIANT();
  429 
  430         KASSERT(rep, ("nfs_send: called with rep == NULL"));
  431 
  432         error = nfs_sigintr(rep->r_nmp, rep, rep->r_td);
  433         if (error) {
  434                 m_freem(top);
  435                 return (error);
  436         }
  437         if ((so = rep->r_nmp->nm_so) == NULL) {
  438                 rep->r_flags |= R_MUSTRESEND;
  439                 m_freem(top);
  440                 return (0);
  441         }
  442         rep->r_flags &= ~R_MUSTRESEND;
  443         soflags = rep->r_nmp->nm_soflags;
  444 
  445         if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
  446                 sendnam = NULL;
  447         else
  448                 sendnam = nam;
  449         if (so->so_type == SOCK_SEQPACKET)
  450                 flags = MSG_EOR;
  451         else
  452                 flags = 0;
  453 
  454         error = so->so_proto->pr_usrreqs->pru_sosend(so, sendnam, 0, top, 0,
  455                                                      flags, curthread /*XXX*/);
  456         if (error == ENOBUFS && so->so_type == SOCK_DGRAM) {
  457                 error = 0;
  458                 rep->r_flags |= R_MUSTRESEND;
  459         }
  460 
  461         if (error) {
  462                 /*
  463                  * Don't report EPIPE errors on nfs sockets.
  464                  * These can be due to idle tcp mounts which will be closed by
  465                  * netapp, solaris, etc. if left idle too long.
  466                  */
  467                 if (error != EPIPE) {
  468                         log(LOG_INFO, "nfs send error %d for server %s\n",
  469                             error,
  470                             rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
  471                 }
  472                 /*
  473                  * Deal with errors for the client side.
  474                  */
  475                 error2 = NFS_SIGREP(rep);
  476                 if (error2)
  477                         error = error2;
  478                 else
  479                         rep->r_flags |= R_MUSTRESEND;
  480 
  481                 /*
  482                  * Handle any recoverable (soft) socket errors here. (?)
  483                  */
  484                 if (error != EINTR && error != ERESTART && error != EIO &&
  485                         error != EWOULDBLOCK && error != EPIPE)
  486                         error = 0;
  487         }
  488         return (error);
  489 }
  490 
  491 /*
  492  * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
  493  * done by soreceive(), but for SOCK_STREAM we must deal with the Record
  494  * Mark and consolidate the data into a new mbuf list.
  495  * nb: Sometimes TCP passes the data up to soreceive() in long lists of
  496  *     small mbufs.
  497  * For SOCK_STREAM we must be very careful to read an entire record once
  498  * we have read any of it, even if the system call has been interrupted.
  499  */
  500 static int
  501 nfs_receive(struct nfsreq *rep, struct sockaddr **aname, struct mbuf **mp)
  502 {
  503         struct socket *so;
  504         struct uio auio;
  505         struct iovec aio;
  506         struct mbuf *m;
  507         struct mbuf *control;
  508         u_int32_t len;
  509         struct sockaddr **getnam;
  510         int error, error2, sotype, rcvflg;
  511         struct thread *td = curthread;  /* XXX */
  512 
  513         NET_ASSERT_GIANT();
  514 
  515         /*
  516          * Set up arguments for soreceive()
  517          */
  518         *mp = NULL;
  519         *aname = NULL;
  520         sotype = rep->r_nmp->nm_sotype;
  521 
  522         /*
  523          * For reliable protocols, lock against other senders/receivers
  524          * in case a reconnect is necessary.
  525          * For SOCK_STREAM, first get the Record Mark to find out how much
  526          * more there is to get.
  527          * We must lock the socket against other receivers
  528          * until we have an entire rpc request/reply.
  529          */
  530         if (sotype != SOCK_DGRAM) {
  531                 error = nfs_sndlock(rep);
  532                 if (error)
  533                         return (error);
  534 tryagain:
  535                 /*
  536                  * Check for fatal errors and resending request.
  537                  */
  538                 /*
  539                  * Ugh: If a reconnect attempt just happened, nm_so
  540                  * would have changed. NULL indicates a failed
  541                  * attempt that has essentially shut down this
  542                  * mount point.
  543                  */
  544                 if (rep->r_mrep || (error = NFS_SIGREP(rep)) != 0) {
  545                         nfs_sndunlock(rep);
  546                         return (error == 0 ? EINTR : error);
  547                 }
  548                 so = rep->r_nmp->nm_so;
  549                 if (!so) {
  550                         error = nfs_reconnect(rep);
  551                         if (error) {
  552                                 nfs_sndunlock(rep);
  553                                 return (error);
  554                         }
  555                         goto tryagain;
  556                 }
  557                 while (rep->r_flags & R_MUSTRESEND) {
  558                         m = m_copym(rep->r_mreq, 0, M_COPYALL, M_TRYWAIT);
  559                         nfsstats.rpcretries++;
  560                         error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
  561                         if (error) {
  562                                 if (error == EINTR || error == ERESTART ||
  563                                     error == EIO ||
  564                                     (error = nfs_reconnect(rep)) != 0) {
  565                                         nfs_sndunlock(rep);
  566                                         return (error);
  567                                 }
  568                                 goto tryagain;
  569                         }
  570                 }
  571                 nfs_sndunlock(rep);
  572                 if (sotype == SOCK_STREAM) {
  573                         aio.iov_base = (caddr_t) &len;
  574                         aio.iov_len = sizeof(u_int32_t);
  575                         auio.uio_iov = &aio;
  576                         auio.uio_iovcnt = 1;
  577                         auio.uio_segflg = UIO_SYSSPACE;
  578                         auio.uio_rw = UIO_READ;
  579                         auio.uio_offset = 0;
  580                         auio.uio_resid = sizeof(u_int32_t);
  581                         auio.uio_td = td;
  582                         do {
  583                            rcvflg = MSG_WAITALL;
  584                            error = so->so_proto->pr_usrreqs->pru_soreceive
  585                                    (so, NULL, &auio, NULL, NULL, &rcvflg);
  586                            if (error == EWOULDBLOCK) {
  587                                    error2 = NFS_SIGREP(rep);
  588                                    if (error2)
  589                                            return (error2);
  590                            }
  591                         } while (0);
  592                         if (!error && auio.uio_resid > 0) {
  593                             /*
  594                              * Don't log a 0 byte receive; it means
  595                              * that the socket has been closed, and
  596                              * can happen during normal operation
  597                              * (forcible unmount or Solaris server).
  598                              */
  599                             if (auio.uio_resid != sizeof (u_int32_t))
  600                             log(LOG_INFO,
  601                                  "short receive (%d/%d) from nfs server %s\n",
  602                                  (int)(sizeof(u_int32_t) - auio.uio_resid),
  603                                  (int)sizeof(u_int32_t),
  604                                  rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
  605                             error = EPIPE;
  606                         }
  607                         if (error)
  608                                 goto errout;
  609                         len = ntohl(len) & ~0x80000000;
  610                         /*
  611                          * This is SERIOUS! We are out of sync with the sender
  612                          * and forcing a disconnect/reconnect is all I can do.
  613                          */
  614                         if (len > NFS_MAXPACKET) {
  615                             log(LOG_ERR, "%s (%d) from nfs server %s\n",
  616                                 "impossible packet length",
  617                                 len,
  618                                 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
  619                             error = EFBIG;
  620                             goto errout;
  621                         }
  622                         auio.uio_resid = len;
  623                         do {
  624                             rcvflg = MSG_WAITALL;
  625                             error =  so->so_proto->pr_usrreqs->pru_soreceive
  626                                     (so, NULL,
  627                                      &auio, mp, NULL, &rcvflg);
  628                         } while (0);
  629                         if (!error && auio.uio_resid > 0) {
  630                             if (len != auio.uio_resid)
  631                             log(LOG_INFO,
  632                                 "short receive (%d/%d) from nfs server %s\n",
  633                                 len - auio.uio_resid, len,
  634                                 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
  635                             error = EPIPE;
  636                         }
  637                 } else {
  638                         /*
  639                          * NB: Since uio_resid is big, MSG_WAITALL is ignored
  640                          * and soreceive() will return when it has either a
  641                          * control msg or a data msg.
  642                          * We have no use for control msg., but must grab them
  643                          * and then throw them away so we know what is going
  644                          * on.
  645                          */
  646                         auio.uio_resid = len = 100000000; /* Anything Big */
  647                         auio.uio_td = td;
  648                         do {
  649                             rcvflg = 0;
  650                             error =  so->so_proto->pr_usrreqs->pru_soreceive
  651                                     (so, NULL,
  652                                 &auio, mp, &control, &rcvflg);
  653                             if (control)
  654                                 m_freem(control);
  655                             if (error == EWOULDBLOCK && rep) {
  656                                    error2 = NFS_SIGREP(rep);
  657                                    if (error2)
  658                                            return (error2);
  659                             }
  660                         } while (!error && *mp == NULL && control);
  661                         if ((rcvflg & MSG_EOR) == 0)
  662                                 printf("Egad!!\n");
  663                         if (!error && *mp == NULL)
  664                                 error = EPIPE;
  665                         len -= auio.uio_resid;
  666                 }
  667 errout:
  668                 if (error && error != EINTR && error != EIO &&
  669                     error != ERESTART) {
  670                         m_freem(*mp);
  671                         *mp = NULL;
  672                         if (error != EPIPE && error != EWOULDBLOCK)
  673                                 log(LOG_INFO,
  674                                     "receive error %d from nfs server %s\n",
  675                                     error,
  676                                  rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
  677                         error = nfs_sndlock(rep);
  678                         if (!error) {
  679                                 error = nfs_reconnect(rep);
  680                                 if (!error)
  681                                         goto tryagain;
  682                                 else
  683                                         nfs_sndunlock(rep);
  684                         }
  685                 }
  686         } else {
  687                 /*
  688                  * We may have failed while rebinding the datagram socket
  689                  * so attempt a rebind here.
  690                  */
  691                 if ((so = rep->r_nmp->nm_so) == NULL) {
  692                         error = nfs_sndlock(rep);
  693                         if (!error) {
  694                                 error = nfs_reconnect(rep);
  695                                 nfs_sndunlock(rep);
  696                         }
  697                         if (error)
  698                                 return (error);
  699                         so = rep->r_nmp->nm_so;
  700                 }
  701                 if (so->so_state & SS_ISCONNECTED)
  702                         getnam = NULL;
  703                 else
  704                         getnam = aname;
  705                 auio.uio_resid = len = 1000000;
  706                 auio.uio_td = td;
  707                 do {
  708                         rcvflg = 0;
  709                         error =  so->so_proto->pr_usrreqs->pru_soreceive
  710                                 (so, getnam, &auio, mp,
  711                                 NULL, &rcvflg);
  712                         if (error) {
  713                                 error2 = NFS_SIGREP(rep);
  714                                 if (error2) {
  715                                         error = error2;
  716                                         goto dgramout;
  717                                 }
  718                         }
  719                         if (error) {
  720                                 error2 = nfs_sndlock(rep);
  721                                 if (!error2) {
  722                                         error2 = nfs_reconnect(rep);
  723                                         if (error2)
  724                                                 error = error2;
  725                                         else
  726                                                 so = rep->r_nmp->nm_so;
  727                                         nfs_sndunlock(rep);
  728                                 } else {
  729                                         error = error2;
  730                                 }
  731                         }
  732                 } while (error == EWOULDBLOCK);
  733 dgramout:
  734                 len -= auio.uio_resid;
  735         }
  736         if (error) {
  737                 m_freem(*mp);
  738                 *mp = NULL;
  739         }
  740         /*
  741          * Search for any mbufs that are not a multiple of 4 bytes long
  742          * or with m_data not longword aligned.
  743          * These could cause pointer alignment problems, so copy them to
  744          * well aligned mbufs.
  745          */
  746         nfs_realign(mp, 5 * NFSX_UNSIGNED);
  747         return (error);
  748 }
  749 
  750 /*
  751  * Implement receipt of reply on a socket.
  752  * We must search through the list of received datagrams matching them
  753  * with outstanding requests using the xid, until ours is found.
       *
       * myrep is the request whose reply the caller is waiting for.
       *
       * Returns 0 when the reply for myrep has been attached to it (either by
       * this thread, or by another receiver as reported by nfs_rcvlock()
       * returning EALREADY).  When R_GETONEREP is set in myrep->r_flags the
       * function also returns 0 after handling a single reply or a single
       * ignorable socket error, whether or not that reply was ours.
       * Any other error from nfs_rcvlock() or nfs_receive() is returned as is.
       *
       * A reply that matches some other outstanding request is attached to
       * that request (r_mrep/r_md/r_dpos) and the mount's congestion window
       * and round trip estimators are updated on its behalf.
  754  */
  755 /* ARGSUSED */
  756 static int
  757 nfs_reply(struct nfsreq *myrep)
  758 {
  759         struct nfsreq *rep;
  760         struct nfsmount *nmp = myrep->r_nmp;
  761         int32_t t1;
  762         struct mbuf *mrep, *md;
  763         struct sockaddr *nam;
  764         u_int32_t rxid, *tl;
  765         caddr_t dpos;
  766         int error;
  767 
  768         /*
  769          * Loop around until we get our own reply
  770          */
  771         for (;;) {
  772                 /*
  773                  * Lock against other receivers so that I don't get stuck in
  774                  * sbwait() after someone else has received my reply for me.
  775                  * Also necessary for connection based protocols to avoid
  776                  * race conditions during a reconnect.
  777                  * If nfs_rcvlock() returns EALREADY, that means that
  778                  * the reply has already been received by another
  779                  * process and we can return immediately.  In this
  780                  * case, the lock is not taken to avoid races with
  781                  * other processes.
  782                  */
  783                 error = nfs_rcvlock(myrep);
  784                 if (error == EALREADY)
  785                         return (0);
  786                 if (error)
  787                         return (error);
  788                 /*
  789                  * Get the next Rpc reply off the socket
  790                  */
  791                 error = nfs_receive(myrep, &nam, &mrep);
                      /* The receive lock is only held across the receive itself. */
  792                 nfs_rcvunlock(myrep);
  793                 if (error) {
  794 
  795                         /*
  796                          * Ignore routing errors on connectionless protocols??
  797                          */
  798                         if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) {
  799                                 nmp->nm_so->so_error = 0;
  800                                 if (myrep->r_flags & R_GETONEREP)
  801                                         return (0);
  802                                 continue;
  803                         }
  804                         return (error);
  805                 }
                      /*
                       * The sender's address is not used below; release it if
                       * nfs_receive() handed one back.
                       */
  806                 if (nam)
  807                         FREE(nam, M_SONAME);
  808 
  809                 /*
  810                  * Get the xid and check that it is an rpc reply
  811                  */
  812                 md = mrep;
  813                 dpos = mtod(md, caddr_t);
  814                 tl = nfsm_dissect(u_int32_t *, 2 * NFSX_UNSIGNED);
  815                 rxid = *tl++;
  816                 if (*tl != rpc_reply) {
  817                         nfsstats.rpcinvalid++;
  818                         m_freem(mrep);
                              /*
                               * NOTE(review): no explicit "goto nfsmout" appears
                               * in this function; presumably the nfsm_dissect()
                               * macro jumps here when the reply is too short.
                               * Confirm against the macro definition.
                               */
  819 nfsmout:
  820                         if (myrep->r_flags & R_GETONEREP)
  821                                 return (0);
  822                         continue;
  823                 }
  824 
  825                 /*
  826                  * Loop through the request list to match up the reply
  827                  * Iff no match, just drop the datagram
  828                  */
  829                 TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
                              /*
                               * Only requests still lacking a reply are
                               * candidates, so a duplicate reply for an already
                               * answered xid falls through unmatched.
                               */
  830                         if (rep->r_mrep == NULL && rxid == rep->r_xid) {
  831                                 /* Found it.. */
  832                                 rep->r_mrep = mrep;
  833                                 rep->r_md = md;
  834                                 rep->r_dpos = dpos;
  835                                 /*
  836                                  * Update congestion window.
  837                                  * Do the additive increase of
  838                                  * one rpc/rtt.
  839                                  */
  840                                 if (nmp->nm_cwnd <= nmp->nm_sent) {
  841                                         nmp->nm_cwnd +=
  842                                            (NFS_CWNDSCALE * NFS_CWNDSCALE +
  843                                            (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
  844                                         if (nmp->nm_cwnd > NFS_MAXCWND)
  845                                                 nmp->nm_cwnd = NFS_MAXCWND;
  846                                 }
                                      /*
                                       * The answered request no longer counts
                                       * against the mount's outstanding total.
                                       */
  847                                 if (rep->r_flags & R_SENT) {
  848                                         rep->r_flags &= ~R_SENT;
  849                                         nmp->nm_sent -= NFS_CWNDSCALE;
  850                                 }
  851                                 /*
  852                                  * Update rtt using a gain of 0.125 on the mean
  853                                  * and a gain of 0.25 on the deviation.
  854                                  */
  855                                 if (rep->r_flags & R_TIMING) {
  856                                         /*
  857                                          * Since the timer resolution of
  858                                          * NFS_HZ is so coarse, it can often
  859                                          * result in r_rtt == 0. Since
  860                                          * r_rtt == N means that the actual
  861                                          * rtt is between N+dt and N+2-dt ticks,
  862                                          * add 1.
  863                                          */
  864                                         t1 = rep->r_rtt + 1;
  865                                         t1 -= (NFS_SRTT(rep) >> 3);
  866                                         NFS_SRTT(rep) += t1;
  867                                         if (t1 < 0)
  868                                                 t1 = -t1;
  869                                         t1 -= (NFS_SDRTT(rep) >> 2);
  870                                         NFS_SDRTT(rep) += t1;
  871                                 }
                                      /* A reply arrived, so reset the backoff count. */
  872                                 nmp->nm_timeouts = 0;
  873                                 break;
  874                         }
  875                 }
  876                 /*
  877                  * If not matched to a request, drop it.
  878                  * If it's mine, get out.
                       * (rep is NULL here when TAILQ_FOREACH ran off the end of
                       * the list without hitting the break above.)
  879                  */
  880                 if (rep == 0) {
  881                         nfsstats.rpcunexpected++;
  882                         m_freem(mrep);
  883                 } else if (rep == myrep) {
  884                         if (rep->r_mrep == NULL)
  885                                 panic("nfsreply nil");
  886                         return (0);
  887                 }
                      /*
                       * The reply belonged to someone else (or to nobody); a
                       * R_GETONEREP caller stops after this single reply.
                       */
  888                 if (myrep->r_flags & R_GETONEREP)
  889                         return (0);
  890         }
  891 }
  892 
  893 /*
  894  * nfs_request - goes something like this
  895  *      - fill in request struct
  896  *      - links it into list
  897  *      - calls nfs_send() for first transmit
  898  *      - calls nfs_receive() to get reply
  899  *      - break down rpc header and return with nfs reply pointed to
  900  *        by mrep or error
  901  * nb: always frees up mreq mbuf list
       *
       * Arguments:
       *      vp      vnode the call is made on; its mount supplies the nfsmount
       *      mrest   mbuf chain holding the procedure arguments
       *      procnum procedure number (also indexes proct[] for timing)
       *      td      thread making the call, recorded in the request
       *      cred    credentials used to build the RPC header
       *      mrp, mdp, dposp
       *              on return of 0, and on an NFSv3 server error, receive
       *              the reply chain, the current mbuf and the current
       *              position within it
       *
       * Returns 0 on success.  A non-zero status from an NFSv3 server is
       * returned with NFSERR_RETERR or'ed in and the reply is still handed
       * back through mrp/mdp/dposp; every other error path frees the reply.
       * Requests on NFSMNT_NFSV4 mounts are passed straight to nfs4_request().
  902  */
  903 /* XXX overloaded before */
  904 #define NQ_TRYLATERDEL  15      /* Initial try later delay (sec) */
  905 
  906 int
  907 nfs_request(struct vnode *vp, struct mbuf *mrest, int procnum,
  908     struct thread *td, struct ucred *cred, struct mbuf **mrp,
  909     struct mbuf **mdp, caddr_t *dposp)
  910 {
  911         struct mbuf *mrep, *m2;
  912         struct nfsreq *rep;
  913         u_int32_t *tl;
  914         int i;
  915         struct nfsmount *nmp;
  916         struct mbuf *m, *md, *mheadend;
  917         time_t waituntil;
  918         caddr_t dpos;
  919         int s, error = 0, mrest_len, auth_len, auth_type;
  920         int trylater_delay = NQ_TRYLATERDEL, trylater_cnt = 0;
  921         struct timeval now;
  922         u_int32_t xid;
  923 
  924         /* Reject requests while attempting a forced unmount. */
  925         if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) {
  926                 m_freem(mrest);
  927                 return (ESTALE);
  928         }
  929         nmp = VFSTONFS(vp->v_mount);
  930         if ((nmp->nm_flag & NFSMNT_NFSV4) != 0)
  931                 return nfs4_request(vp, mrest, procnum, td, cred, mrp, mdp, dposp);
  932         MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK);
  933         rep->r_nmp = nmp;
  934         rep->r_vp = vp;
  935         rep->r_td = td;
  936         rep->r_procnum = procnum;
  937 
              /*
               * Backdate r_lastmsg so that the "not responding" test in
               * nfs_timer() (r_lastmsg + nm_tprintf_delay < now) can first
               * succeed nm_tprintf_initial_delay seconds from now.
               */
  938         getmicrouptime(&now);
  939         rep->r_lastmsg = now.tv_sec -
  940             ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
  941         mrest_len = m_length(mrest, NULL);
  942 
  943         /*
  944          * Get the RPC header with authorization.
               * auth_len is 4 bytes for each group beyond the first, capped
               * at nm_numgrps groups, plus five more XDR words.
  945          */
  946         auth_type = RPCAUTH_UNIX;
  947         if (cred->cr_ngroups < 1)
  948                 panic("nfsreq nogrps");
  949         auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
  950                 nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
  951                 5 * NFSX_UNSIGNED;
  952         m = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len,
  953              mrest, mrest_len, &mheadend, &xid);
  954 
  955         /*
  956          * For stream protocols, insert a Sun RPC Record Mark.
               * The mark is one word: the top bit set, or'ed with the length
               * of the message that follows it.
  957          */
  958         if (nmp->nm_sotype == SOCK_STREAM) {
  959                 M_PREPEND(m, NFSX_UNSIGNED, M_TRYWAIT);
  960                 *mtod(m, u_int32_t *) = htonl(0x80000000 |
  961                          (m->m_pkthdr.len - NFSX_UNSIGNED));
  962         }
  963         rep->r_mreq = m;
  964         rep->r_xid = xid;
              /*
               * Re-entered after an NFSERR_TRYLATER reply: the same request
               * mbufs and xid are reused, only the per-attempt state is reset.
               */
  965 tryagain:
  966         if (nmp->nm_flag & NFSMNT_SOFT)
  967                 rep->r_retry = nmp->nm_retry;
  968         else
  969                 rep->r_retry = NFS_MAXREXMIT + 1;       /* past clip limit */
  970         rep->r_rtt = rep->r_rexmit = 0;
              /* Only procedures with a non-zero proct[] class are timed. */
  971         if (proct[procnum] > 0)
  972                 rep->r_flags = R_TIMING;
  973         else
  974                 rep->r_flags = 0;
  975         rep->r_mrep = NULL;
  976 
  977         /*
  978          * Do the client side RPC.
  979          */
  980         nfsstats.rpcrequests++;
  981         /*
  982          * Chain request into list of outstanding requests. Be sure
  983          * to put it LAST so timer finds oldest requests first.
               * The timer callout is started when the list was empty.
  984          */
  985         s = splsoftclock();
  986         if (TAILQ_EMPTY(&nfs_reqq))
  987                 callout_reset(&nfs_callout, nfs_ticks, nfs_timer, NULL);
  988         TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);
  989 
  990         /*
  991          * If backing off another request or avoiding congestion, don't
  992          * send this one now but let timer do it. If not timing a request,
  993          * do it now.
  994          */
  995         if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
  996                 (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
  997                 nmp->nm_sent < nmp->nm_cwnd)) {
  998                 splx(s);
  999                 error = nfs_sndlock(rep);
 1000                 if (!error) {
                              /* Send a copy; r_mreq is kept for retransmission. */
 1001                         m2 = m_copym(m, 0, M_COPYALL, M_TRYWAIT);
 1002                         error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
 1003                         nfs_sndunlock(rep);
 1004                 }
                      /* Charge the request against the congestion window. */
 1005                 if (!error && (rep->r_flags & R_MUSTRESEND) == 0) {
 1006                         nmp->nm_sent += NFS_CWNDSCALE;
 1007                         rep->r_flags |= R_SENT;
 1008                 }
 1009         } else {
 1010                 splx(s);
                      /*
                       * A negative r_rtt makes nfs_timer() skip its timeout
                       * accounting and go straight to the transmit attempt.
                       */
 1011                 rep->r_rtt = -1;
 1012         }
 1013 
 1014         /*
 1015          * Wait for the reply from our send or the timer's.
               * NOTE(review): EPIPE from the send is not treated as fatal
               * here; presumably the receive path reconnects and resends.
               * Confirm against nfs_receive().
 1016          */
 1017         if (!error || error == EPIPE)
 1018                 error = nfs_reply(rep);
 1019 
 1020         /*
 1021          * RPC done, unlink the request.
               * The timer callout is stopped once the list drains.
 1022          */
 1023         s = splsoftclock();
 1024         TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
 1025         if (TAILQ_EMPTY(&nfs_reqq))
 1026                 callout_stop(&nfs_callout);
 1027         splx(s);
 1028 
 1029         /*
 1030          * Decrement the outstanding request count.
 1031          */
 1032         if (rep->r_flags & R_SENT) {
 1033                 rep->r_flags &= ~R_SENT;        /* paranoia */
 1034                 nmp->nm_sent -= NFS_CWNDSCALE;
 1035         }
 1036 
 1037         /*
 1038          * If there was a successful reply and a tprintf msg.
 1039          * tprintf a response.
 1040          */
 1041         if (!error)
 1042                 nfs_up(rep, nmp, rep->r_td, "is alive again", NFSSTA_TIMEO);
              /*
               * Pull the reply state into the locals named mrep, md and dpos.
               * NOTE(review): the nfsm_dissect()/nfsm_adv() macros used below
               * take no explicit cursor arguments, so they presumably operate
               * on these locals and jump to nfsmout on a short reply.
               * Confirm against the macro definitions.
               */
 1043         mrep = rep->r_mrep;
 1044         md = rep->r_md;
 1045         dpos = rep->r_dpos;
 1046         if (error) {
 1047                 m_freem(rep->r_mreq);
 1048                 free((caddr_t)rep, M_NFSREQ);
 1049                 return (error);
 1050         }
 1051 
 1052         /*
 1053          * break down the rpc header and check if ok
               * A denied call maps to EOPNOTSUPP for an RPC version mismatch
               * and to EACCES for any other rejection.
 1054          */
 1055         tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED);
 1056         if (*tl++ == rpc_msgdenied) {
 1057                 if (*tl == rpc_mismatch)
 1058                         error = EOPNOTSUPP;
 1059                 else
 1060                         error = EACCES;
 1061                 m_freem(mrep);
 1062                 m_freem(rep->r_mreq);
 1063                 free((caddr_t)rep, M_NFSREQ);
 1064                 return (error);
 1065         }
 1066 
 1067         /*
 1068          * Just throw away any verifier (ie: kerberos etc).
               * The verifier type is read and immediately overwritten; only
               * the length is used, to skip the verifier body.
 1069          */
 1070         i = fxdr_unsigned(int, *tl++);          /* verf type */
 1071         i = fxdr_unsigned(int32_t, *tl);        /* len */
 1072         if (i > 0)
 1073                 nfsm_adv(nfsm_rndup(i));
 1074         tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
 1075         /* 0 == ok */
 1076         if (*tl == 0) {
                      /* The next word is the NFS status of the call itself. */
 1077                 tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
 1078                 if (*tl != 0) {
 1079                         error = fxdr_unsigned(int, *tl);
                              /*
                               * NFSv3 "try later": drop the reply, sleep on
                               * lbolt until trylater_delay seconds have
                               * passed, scale the delay by the next
                               * nfs_backoff[] entry and reissue the request.
                               */
 1080                         if ((nmp->nm_flag & NFSMNT_NFSV3) &&
 1081                                 error == NFSERR_TRYLATER) {
 1082                                 m_freem(mrep);
 1083                                 error = 0;
 1084                                 waituntil = time_second + trylater_delay;
 1085                                 while (time_second < waituntil)
 1086                                         (void) tsleep(&lbolt,
 1087                                                 PSOCK, "nqnfstry", 0);
 1088                                 trylater_delay *= nfs_backoff[trylater_cnt];
 1089                                 if (trylater_cnt < NFS_NBACKOFF - 1)
 1090                                         trylater_cnt++;
 1091                                 goto tryagain;
 1092                         }
 1093 
 1094                         /*
 1095                          * If the File Handle was stale, invalidate the
 1096                          * lookup cache, just in case.
 1097                          */
 1098                         if (error == ESTALE)
 1099                                 cache_purge(vp);
                              /*
                               * For NFSv3 the reply is still handed back to the
                               * caller and the error is tagged with
                               * NFSERR_RETERR; otherwise the reply is freed.
                               */
 1100                         if (nmp->nm_flag & NFSMNT_NFSV3) {
 1101                                 *mrp = mrep;
 1102                                 *mdp = md;
 1103                                 *dposp = dpos;
 1104                                 error |= NFSERR_RETERR;
 1105                         } else
 1106                                 m_freem(mrep);
 1107                         m_freem(rep->r_mreq);
 1108                         free((caddr_t)rep, M_NFSREQ);
 1109                         return (error);
 1110                 }
 1111 
                      /* Success: hand the reply and parse position to the caller. */
 1112                 *mrp = mrep;
 1113                 *mdp = md;
 1114                 *dposp = dpos;
 1115                 m_freem(rep->r_mreq);
 1116                 FREE((caddr_t)rep, M_NFSREQ);
 1117                 return (0);
 1118         }
              /* Call accepted but with a non-zero accept status. */
 1119         m_freem(mrep);
 1120         error = EPROTONOSUPPORT;
 1121 nfsmout:
 1122         m_freem(rep->r_mreq);
 1123         free((caddr_t)rep, M_NFSREQ);
 1124         return (error);
 1125 }
 1126 
 1127 /*
 1128  * Nfs timer routine
 1129  * Scan the nfsreq list and retranmit any requests that have timed out
 1130  * To avoid retransmission attempts on STREAM sockets (in the future) make
 1131  * sure to set the r_retry field to 0 (implies nm_retry == 0).
 1132  */
 1133 void
 1134 nfs_timer(void *arg)
 1135 {
 1136         struct nfsreq *rep;
 1137         struct mbuf *m;
 1138         struct socket *so;
 1139         struct nfsmount *nmp;
 1140         int timeo;
 1141         int s, error;
 1142         struct timeval now;
 1143 
 1144         getmicrouptime(&now);
 1145         s = splnet();
 1146         TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
 1147                 nmp = rep->r_nmp;
 1148                 if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
 1149                         continue;
 1150                 if (nfs_sigintr(nmp, rep, rep->r_td))
 1151                         continue;
 1152                 if (nmp->nm_tprintf_initial_delay != 0 &&
 1153                     (rep->r_rexmit > 2 || (rep->r_flags & R_RESENDERR)) &&
 1154                     rep->r_lastmsg + nmp->nm_tprintf_delay < now.tv_sec) {
 1155                         rep->r_lastmsg = now.tv_sec;
 1156                         nfs_down(rep, nmp, rep->r_td, "not responding",
 1157                             0, NFSSTA_TIMEO);
 1158 #if 0
 1159                         if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
 1160                                 /* we're not yet completely mounted and */
 1161                                 /* we can't complete an RPC, so we fail */
 1162                                 nfsstats.rpctimeouts++;
 1163                                 nfs_softterm(rep);
 1164                                 continue;
 1165                         }
 1166 #endif
 1167                 }
 1168                 if (rep->r_rtt >= 0) {
 1169                         rep->r_rtt++;
 1170                         if (nmp->nm_flag & NFSMNT_DUMBTIMR)
 1171                                 timeo = nmp->nm_timeo;
 1172                         else
 1173                                 timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
 1174                         if (nmp->nm_timeouts > 0)
 1175                                 timeo *= nfs_backoff[nmp->nm_timeouts - 1];
 1176                         if (rep->r_rtt <= timeo)
 1177                                 continue;
 1178                         if (nmp->nm_timeouts < NFS_NBACKOFF)
 1179                                 nmp->nm_timeouts++;
 1180                 }
 1181                 if (rep->r_rexmit >= rep->r_retry) {    /* too many */
 1182                         nfsstats.rpctimeouts++;
 1183                         nfs_softterm(rep);
 1184                         continue;
 1185                 }
 1186                 if (nmp->nm_sotype != SOCK_DGRAM) {
 1187                         if (++rep->r_rexmit > NFS_MAXREXMIT)
 1188                                 rep->r_rexmit = NFS_MAXREXMIT;
 1189                         continue;
 1190                 }
 1191                 if ((so = nmp->nm_so) == NULL)
 1192                         continue;
 1193 
 1194                 /*
 1195                  * If there is enough space and the window allows..
 1196                  *      Resend it
 1197                  * Set r_rtt to -1 in case we fail to send it now.
 1198                  */
 1199                 rep->r_rtt = -1;
 1200                 if (sbspace(&so->so_snd) >= rep->r_mreq->m_pkthdr.len &&
 1201                    ((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
 1202                     (rep->r_flags & R_SENT) ||
 1203                     nmp->nm_sent < nmp->nm_cwnd) &&
 1204                    (m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))){
 1205                         if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
 1206                             error = (*so->so_proto->pr_usrreqs->pru_send)
 1207                                     (so, 0, m, NULL, NULL, curthread);
 1208                         else
 1209                             error = (*so->so_proto->pr_usrreqs->pru_send)
 1210                                     (so, 0, m, nmp->nm_nam, NULL, curthread);
 1211                         if (error) {
 1212                                 if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
 1213                                         so->so_error = 0;
 1214                                 rep->r_flags |= R_RESENDERR;
 1215                         } else {
 1216                                 /*
 1217                                  * Iff first send, start timing
 1218                                  * else turn timing off, backoff timer
 1219                                  * and divide congestion window by 2.
 1220                                  */
 1221                                 rep->r_flags &= ~R_RESENDERR;
 1222                                 if (rep->r_flags & R_SENT) {
 1223                                         rep->r_flags &= ~R_TIMING;
 1224                                         if (++rep->r_rexmit > NFS_MAXREXMIT)
 1225                                                 rep->r_rexmit = NFS_MAXREXMIT;
 1226                                         nmp->nm_cwnd >>= 1;
 1227                                         if (nmp->nm_cwnd < NFS_CWNDSCALE)
 1228                                                 nmp->nm_cwnd = NFS_CWNDSCALE;
 1229                                         nfsstats.rpcretries++;
 1230                                 } else {
 1231                                         rep->r_flags |= R_SENT;
 1232                                         nmp->nm_sent += NFS_CWNDSCALE;
 1233                                 }
 1234                                 rep->r_rtt = 0;
 1235                         }
 1236                 }
 1237         }
 1238         splx(s);
 1239         callout_reset(&nfs_callout, nfs_ticks, nfs_timer, NULL);
 1240 }
 1241 
 1242 /*
 1243  * Mark all of an nfs mount's outstanding requests with R_SOFTTERM and
 1244  * wait for all requests to complete. This is used by forced unmounts
 1245  * to terminate any outstanding RPCs.
 1246  */
 1247 int
 1248 nfs_nmcancelreqs(nmp)
 1249         struct nfsmount *nmp;
 1250 {
 1251         struct nfsreq *req;
 1252         int i, s;
 1253 
 1254         s = splnet();
 1255         TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
 1256                 if (nmp != req->r_nmp || req->r_mrep != NULL ||
 1257                     (req->r_flags & R_SOFTTERM))
 1258                         continue;
 1259                 nfs_softterm(req);
 1260         }
 1261         splx(s);
 1262 
 1263         for (i = 0; i < 30; i++) {
 1264                 s = splnet();
 1265                 TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
 1266                         if (nmp == req->r_nmp)
 1267                                 break;
 1268                 }
 1269                 splx(s);
 1270                 if (req == NULL)
 1271                         return (0);
 1272                 tsleep(&lbolt, PSOCK, "nfscancel", 0);
 1273         }
 1274         return (EBUSY);
 1275 }
 1276 
 1277 /*
 1278  * Flag a request as being about to terminate (due to NFSMNT_INT/NFSMNT_SOFT).
 1279  * The nm_send count is decremented now to avoid deadlocks when the process in
 1280  * soreceive() hasn't yet managed to send its own request.
 1281  */
 1282 
 1283 static void
 1284 nfs_softterm(struct nfsreq *rep)
 1285 {
 1286 
 1287         rep->r_flags |= R_SOFTTERM;
 1288         if (rep->r_flags & R_SENT) {
 1289                 rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
 1290                 rep->r_flags &= ~R_SENT;
 1291         }
 1292 }
 1293 
 1294 /*
 1295  * Test for a termination condition pending on the process.
 1296  * This is used for NFSMNT_INT mounts.
 1297  */
 1298 int
 1299 nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct thread *td)
 1300 {
 1301         struct proc *p;
 1302         sigset_t tmpset;
 1303 
 1304         if ((nmp->nm_flag & NFSMNT_NFSV4) != 0)
 1305                 return nfs4_sigintr(nmp, rep, td);
 1306         if (rep && (rep->r_flags & R_SOFTTERM))
 1307                 return (EIO);
 1308         /* Terminate all requests while attempting a forced unmount. */
 1309         if (nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF)
 1310                 return (EIO);
 1311         if (!(nmp->nm_flag & NFSMNT_INT))
 1312                 return (0);
 1313         if (td == NULL)
 1314                 return (0);
 1315 
 1316         p = td->td_proc;
 1317         PROC_LOCK(p);
 1318         tmpset = p->p_siglist;
 1319         SIGSETNAND(tmpset, td->td_sigmask);
 1320         mtx_lock(&p->p_sigacts->ps_mtx);
 1321         SIGSETNAND(tmpset, p->p_sigacts->ps_sigignore);
 1322         mtx_unlock(&p->p_sigacts->ps_mtx);
 1323         if (SIGNOTEMPTY(p->p_siglist) && NFSINT_SIGMASK(tmpset)) {
 1324                 PROC_UNLOCK(p);
 1325                 return (EINTR);
 1326         }
 1327         PROC_UNLOCK(p);
 1328 
 1329         return (0);
 1330 }
 1331 
 1332 /*
 1333  * Lock a socket against others.
 1334  * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
 1335  * and also to avoid race conditions between the processes with nfs requests
 1336  * in progress when a reconnect is necessary.
 1337  */
 1338 int
 1339 nfs_sndlock(struct nfsreq *rep)
 1340 {
 1341         int *statep = &rep->r_nmp->nm_state;
 1342         struct thread *td;
 1343         int error, slpflag = 0, slptimeo = 0;
 1344 
 1345         td = rep->r_td;
 1346         if (rep->r_nmp->nm_flag & NFSMNT_INT)
 1347                 slpflag = PCATCH;
 1348         while (*statep & NFSSTA_SNDLOCK) {
 1349                 error = nfs_sigintr(rep->r_nmp, rep, td);
 1350                 if (error)
 1351                         return (error);
 1352                 *statep |= NFSSTA_WANTSND;
 1353                 (void) tsleep(statep, slpflag | (PZERO - 1),
 1354                         "nfsndlck", slptimeo);
 1355                 if (slpflag == PCATCH) {
 1356                         slpflag = 0;
 1357                         slptimeo = 2 * hz;
 1358                 }
 1359         }
 1360         *statep |= NFSSTA_SNDLOCK;
 1361         return (0);
 1362 }
 1363 
 1364 /*
 1365  * Unlock the stream socket for others.
 1366  */
 1367 void
 1368 nfs_sndunlock(struct nfsreq *rep)
 1369 {
 1370         int *statep = &rep->r_nmp->nm_state;
 1371 
 1372         if ((*statep & NFSSTA_SNDLOCK) == 0)
 1373                 panic("nfs sndunlock");
 1374         *statep &= ~NFSSTA_SNDLOCK;
 1375         if (*statep & NFSSTA_WANTSND) {
 1376                 *statep &= ~NFSSTA_WANTSND;
 1377                 wakeup(statep);
 1378         }
 1379 }
 1380 
 1381 static int
 1382 nfs_rcvlock(struct nfsreq *rep)
 1383 {
 1384         int *statep = &rep->r_nmp->nm_state;
 1385         int error, slpflag, slptimeo = 0;
 1386 
 1387         if (rep->r_nmp->nm_flag & NFSMNT_INT)
 1388                 slpflag = PCATCH;
 1389         else
 1390                 slpflag = 0;
 1391         while (*statep & NFSSTA_RCVLOCK) {
 1392                 error = nfs_sigintr(rep->r_nmp, rep, rep->r_td);
 1393                 if (error)
 1394                         return (error);
 1395                 *statep |= NFSSTA_WANTRCV;
 1396                 (void) tsleep(statep, slpflag | (PZERO - 1), "nfsrcvlk",
 1397                         slptimeo);
 1398                 /*
 1399                  * If our reply was recieved while we were sleeping,
 1400                  * then just return without taking the lock to avoid a
 1401                  * situation where a single iod could 'capture' the
 1402                  * recieve lock.
 1403                  */
 1404                 if (rep->r_mrep != NULL)
 1405                         return (EALREADY);
 1406                 if (slpflag == PCATCH) {
 1407                         slpflag = 0;
 1408                         slptimeo = 2 * hz;
 1409                 }
 1410         }
 1411         /* Always fail if our request has been cancelled. */
 1412         if (rep != NULL && (error = NFS_SIGREP(rep)) != 0)
 1413                 return (error);
 1414         *statep |= NFSSTA_RCVLOCK;
 1415         return (0);
 1416 }
 1417 
 1418 /*
 1419  * Unlock the stream socket for others.
 1420  */
 1421 static void
 1422 nfs_rcvunlock(struct nfsreq *rep)
 1423 {
 1424         int *statep = &rep->r_nmp->nm_state;
 1425 
 1426         if ((*statep & NFSSTA_RCVLOCK) == 0)
 1427                 panic("nfs rcvunlock");
 1428         *statep &= ~NFSSTA_RCVLOCK;
 1429         if (*statep & NFSSTA_WANTRCV) {
 1430                 *statep &= ~NFSSTA_WANTRCV;
 1431                 wakeup(statep);
 1432         }
 1433 }
 1434 
 1435 /*
 1436  *      nfs_realign:
 1437  *
 1438  *      Check for badly aligned mbuf data and realign by copying the unaligned
 1439  *      portion of the data into a new mbuf chain and freeing the portions
 1440  *      of the old chain that were replaced.
 1441  *
 1442  *      We cannot simply realign the data within the existing mbuf chain
 1443  *      because the underlying buffers may contain other rpc commands and
 1444  *      we cannot afford to overwrite them.
 1445  *
 1446  *      We would prefer to avoid this situation entirely.  The situation does
 1447  *      not occur with NFS/UDP and is supposed to only occassionally occur
 1448  *      with TCP.  Use vfs.nfs.realign_count and realign_test to check this.
 1449  */
 1450 static void
 1451 nfs_realign(struct mbuf **pm, int hsiz)
 1452 {
 1453         struct mbuf *m;
 1454         struct mbuf *n = NULL;
 1455         int off = 0;
 1456 
 1457         ++nfs_realign_test;
 1458         while ((m = *pm) != NULL) {
 1459                 if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
 1460                         MGET(n, M_TRYWAIT, MT_DATA);
 1461                         if (m->m_len >= MINCLSIZE) {
 1462                                 MCLGET(n, M_TRYWAIT);
 1463                         }
 1464                         n->m_len = 0;
 1465                         break;
 1466                 }
 1467                 pm = &m->m_next;
 1468         }
 1469         /*
 1470          * If n is non-NULL, loop on m copying data, then replace the
 1471          * portion of the chain that had to be realigned.
 1472          */
 1473         if (n != NULL) {
 1474                 ++nfs_realign_count;
 1475                 while (m) {
 1476                         m_copyback(n, off, m->m_len, mtod(m, caddr_t));
 1477                         off += m->m_len;
 1478                         m = m->m_next;
 1479                 }
 1480                 m_freem(*pm);
 1481                 *pm = n;
 1482         }
 1483 }
 1484 
 1485 
 1486 static int
 1487 nfs_msg(struct thread *td, const char *server, const char *msg, int error)
 1488 {
 1489         struct proc *p;
 1490 
 1491         p = td ? td->td_proc : NULL;
 1492         if (error) {
 1493                 tprintf(p, LOG_INFO, "nfs server %s: %s, error %d\n", server,
 1494                     msg, error);
 1495         } else {
 1496                 tprintf(p, LOG_INFO, "nfs server %s: %s\n", server, msg);
 1497         }
 1498         return (0);
 1499 }
 1500 
 1501 void
 1502 nfs_down(rep, nmp, td, msg, error, flags)
 1503         struct nfsreq *rep;
 1504         struct nfsmount *nmp;
 1505         struct thread *td;
 1506         const char *msg;
 1507         int error, flags;
 1508 {
 1509 
 1510         if (nmp == NULL)
 1511                 return;
 1512         if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) {
 1513                 vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
 1514                     VQ_NOTRESP, 0);
 1515                 nmp->nm_state |= NFSSTA_TIMEO;
 1516         }
 1517 #ifdef NFSSTA_LOCKTIMEO
 1518         if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) {
 1519                 vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
 1520                     VQ_NOTRESPLOCK, 0);
 1521                 nmp->nm_state |= NFSSTA_LOCKTIMEO;
 1522         }
 1523 #endif
 1524         if (rep)
 1525                 rep->r_flags |= R_TPRINTFMSG;
 1526         nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, error);
 1527 }
 1528 
 1529 void
 1530 nfs_up(rep, nmp, td, msg, flags)
 1531         struct nfsreq *rep;
 1532         struct nfsmount *nmp;
 1533         struct thread *td;
 1534         const char *msg;
 1535         int flags;
 1536 {
 1537         if (nmp == NULL)
 1538                 return;
 1539         if ((rep == NULL) || (rep->r_flags & R_TPRINTFMSG) != 0)
 1540                 nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, 0);
 1541         if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) {
 1542                 nmp->nm_state &= ~NFSSTA_TIMEO;
 1543                 vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
 1544                     VQ_NOTRESP, 1);
 1545         }
 1546 #ifdef NFSSTA_LOCKTIMEO
 1547         if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) {
 1548                 nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
 1549                 vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
 1550                     VQ_NOTRESPLOCK, 1);
 1551         }
 1552 #endif
 1553 }
 1554
Cache object: 20a725e137d05151bf8f4b3654f34fb6
[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]
This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.
FreeBSD/Linux Kernel Cross Reference sys/nfsclient/nfs_socket.c

FreeBSD/Linux Kernel Cross Reference
sys/nfsclient/nfs_socket.c