nfs_socket.c

Version: - FREEBSD - FREEBSD-13-STABLE - FREEBSD-13-0 - FREEBSD-12-STABLE - FREEBSD-12-0 - FREEBSD-11-STABLE - FREEBSD-11-0 - FREEBSD-10-STABLE - FREEBSD-10-0 - FREEBSD-9-STABLE - FREEBSD-9-0 - FREEBSD-8-STABLE - FREEBSD-8-0 - FREEBSD-7-STABLE - FREEBSD-7-0 - FREEBSD-6-STABLE - FREEBSD-6-0 - FREEBSD-5-STABLE - FREEBSD-5-0 - FREEBSD-4-STABLE - FREEBSD-3-STABLE - FREEBSD22 - l41 - OPENBSD - linux-2.6 - MK84 - PLAN9 - xnu-8792
SearchContext: - none - 3 - 10
    1 /*-
    2  * Copyright (c) 1989, 1991, 1993, 1995
    3  *      The Regents of the University of California.  All rights reserved.
    4  *
    5  * This code is derived from software contributed to Berkeley by
    6  * Rick Macklem at The University of Guelph.
    7  *
    8  * Redistribution and use in source and binary forms, with or without
    9  * modification, are permitted provided that the following conditions
   10  * are met:
   11  * 1. Redistributions of source code must retain the above copyright
   12  *    notice, this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in the
   15  *    documentation and/or other materials provided with the distribution.
   16  * 4. Neither the name of the University nor the names of its contributors
   17  *    may be used to endorse or promote products derived from this software
   18  *    without specific prior written permission.
   19  *
   20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   30  * SUCH DAMAGE.
   31  *
   32  *      @(#)nfs_socket.c        8.5 (Berkeley) 3/30/95
   33  */
   34 
   35 #include <sys/cdefs.h>
   36 __FBSDID("$FreeBSD: src/sys/nfsclient/nfs_socket.c,v 1.114.2.3 2006/04/02 04:15:34 cel Exp $");
   37 
   38 /*
   39  * Socket operations for use by nfs
   40  */
   41 
   42 #include "opt_inet6.h"
   43 
   44 #include <sys/param.h>
   45 #include <sys/systm.h>
   46 #include <sys/kernel.h>
   47 #include <sys/lock.h>
   48 #include <sys/malloc.h>
   49 #include <sys/mbuf.h>
   50 #include <sys/mount.h>
   51 #include <sys/mutex.h>
   52 #include <sys/proc.h>
   53 #include <sys/protosw.h>
   54 #include <sys/signalvar.h>
   55 #include <sys/socket.h>
   56 #include <sys/socketvar.h>
   57 #include <sys/sysctl.h>
   58 #include <sys/syslog.h>
   59 #include <sys/vnode.h>
   60 
   61 #include <netinet/in.h>
   62 #include <netinet/tcp.h>
   63 
   64 #include <rpc/rpcclnt.h>
   65 
   66 #include <nfs/rpcv2.h>
   67 #include <nfs/nfsproto.h>
   68 #include <nfsclient/nfs.h>
   69 #include <nfs/xdr_subs.h>
   70 #include <nfsclient/nfsm_subs.h>
   71 #include <nfsclient/nfsmount.h>
   72 #include <nfsclient/nfsnode.h>
   73 
   74 #include <nfs4client/nfs4.h>
   75 
   76 #define TRUE    1
   77 #define FALSE   0
   78 
   79 /*
   80  * Estimate rto for an nfs rpc sent via. an unreliable datagram.
   81  * Use the mean and mean deviation of rtt for the appropriate type of rpc
   82  * for the frequent rpcs and a default for the others.
   83  * The justification for doing "other" this way is that these rpcs
   84  * happen so infrequently that timer est. would probably be stale.
   85  * Also, since many of these rpcs are
   86  * non-idempotent, a conservative timeout is desired.
   87  * getattr, lookup - A+2D
   88  * read, write     - A+4D
   89  * other           - nm_timeo
   90  */
   91 #define NFS_RTO(n, t) \
   92         ((t) == 0 ? (n)->nm_timeo : \
   93          ((t) < 3 ? \
   94           (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
   95           ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
   96 #define NFS_SRTT(r)     (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
   97 #define NFS_SDRTT(r)    (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
   98 
   99 /*
  100  * Defines which timer to use for the procnum.
  101  * 0 - default
  102  * 1 - getattr
  103  * 2 - lookup
  104  * 3 - read
  105  * 4 - write
  106  */
  107 static int proct[NFS_NPROCS] = {
  108         0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0,
  109 };
  110 
  111 static int      nfs_realign_test;
  112 static int      nfs_realign_count;
  113 static int      nfs_bufpackets = 4;
  114 static int      nfs_reconnects;
  115 static int      nfs3_jukebox_delay = 10;
  116 
  117 SYSCTL_DECL(_vfs_nfs);
  118 
  119 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RW, &nfs_realign_test, 0, "");
  120 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RW, &nfs_realign_count, 0, "");
  121 SYSCTL_INT(_vfs_nfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0, "");
  122 SYSCTL_INT(_vfs_nfs, OID_AUTO, reconnects, CTLFLAG_RD, &nfs_reconnects, 0,
  123     "number of times the nfs client has had to reconnect");
  124 SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs3_jukebox_delay, CTLFLAG_RW, &nfs3_jukebox_delay, 0,
  125     "number of seconds to delay a retry after receiving EJUKEBOX");
  126 
  127 
  128 /*
  129  * There is a congestion window for outstanding rpcs maintained per mount
  130  * point. The cwnd size is adjusted in roughly the way that:
  131  * Van Jacobson, Congestion avoidance and Control, In "Proceedings of
  132  * SIGCOMM '88". ACM, August 1988.
  133  * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
  134  * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
  135  * of rpcs is in progress.
  136  * (The sent count and cwnd are scaled for integer arith.)
  137  * Variants of "slow start" were tried and were found to be too much of a
  138  * performance hit (ave. rtt 3 times larger),
  139  * I suspect due to the large rtt that nfs rpcs have.
  140  */
  141 #define NFS_CWNDSCALE   256
  142 #define NFS_MAXCWND     (NFS_CWNDSCALE * 32)
  143 #define NFS_NBACKOFF    8
  144 static int nfs_backoff[NFS_NBACKOFF] = { 2, 4, 8, 16, 32, 64, 128, 256, };
  145 struct callout  nfs_callout;
  146 
  147 static int      nfs_msg(struct thread *, const char *, const char *, int);
  148 static int      nfs_rcvlock(struct nfsreq *);
  149 static void     nfs_rcvunlock(struct nfsreq *);
  150 static void     nfs_realign(struct mbuf **pm, int hsiz);
  151 static int      nfs_receive(struct nfsreq *rep, struct sockaddr **aname,
  152                     struct mbuf **mp);
  153 static int      nfs_reply(struct nfsreq *);
  154 static void     nfs_softterm(struct nfsreq *rep);
  155 static int      nfs_reconnect(struct nfsreq *rep);
  156 
  157 /*
  158  * Initialize sockets and congestion for a new NFS connection.
  159  * We do not free the sockaddr if error.
  160  */
  161 int
  162 nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
  163 {
  164         struct socket *so;
  165         int error, rcvreserve, sndreserve;
  166         int pktscale;
  167         struct sockaddr *saddr;
  168         struct thread *td = &thread0; /* only used for socreate and sobind */
  169 
  170         NET_ASSERT_GIANT();
  171 
  172         nmp->nm_so = NULL;
  173         saddr = nmp->nm_nam;
  174         error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype,
  175                 nmp->nm_soproto, nmp->nm_mountp->mnt_cred, td);
  176         if (error)
  177                 goto bad;
  178         so = nmp->nm_so;
  179         nmp->nm_soflags = so->so_proto->pr_flags;
  180 
  181         /*
  182          * Some servers require that the client port be a reserved port number.
  183          */
  184         if (nmp->nm_flag & NFSMNT_RESVPORT) {
  185                 struct sockopt sopt;
  186                 int ip, ip2, len;
  187                 struct sockaddr_in6 ssin;
  188                 struct sockaddr *sa;
  189 
  190                 bzero(&sopt, sizeof sopt);
  191                 switch(saddr->sa_family) {
  192                 case AF_INET:
  193                         sopt.sopt_level = IPPROTO_IP;
  194                         sopt.sopt_name = IP_PORTRANGE;
  195                         ip = IP_PORTRANGE_LOW;
  196                         ip2 = IP_PORTRANGE_DEFAULT;
  197                         len = sizeof (struct sockaddr_in);
  198                         break;
  199 #ifdef INET6
  200                 case AF_INET6:
  201                         sopt.sopt_level = IPPROTO_IPV6;
  202                         sopt.sopt_name = IPV6_PORTRANGE;
  203                         ip = IPV6_PORTRANGE_LOW;
  204                         ip2 = IPV6_PORTRANGE_DEFAULT;
  205                         len = sizeof (struct sockaddr_in6);
  206                         break;
  207 #endif
  208                 default:
  209                         goto noresvport;
  210                 }
  211                 sa = (struct sockaddr *)&ssin;
  212                 bzero(sa, len);
  213                 sa->sa_len = len;
  214                 sa->sa_family = saddr->sa_family;
  215                 sopt.sopt_dir = SOPT_SET;
  216                 sopt.sopt_val = (void *)&ip;
  217                 sopt.sopt_valsize = sizeof(ip);
  218                 error = sosetopt(so, &sopt);
  219                 if (error)
  220                         goto bad;
  221                 error = sobind(so, sa, td);
  222                 if (error)
  223                         goto bad;
  224                 ip = ip2;
  225                 error = sosetopt(so, &sopt);
  226                 if (error)
  227                         goto bad;
  228         noresvport: ;
  229         }
  230 
  231         /*
  232          * Protocols that do not require connections may be optionally left
  233          * unconnected for servers that reply from a port other than NFS_PORT.
  234          */
  235         if (nmp->nm_flag & NFSMNT_NOCONN) {
  236                 if (nmp->nm_soflags & PR_CONNREQUIRED) {
  237                         error = ENOTCONN;
  238                         goto bad;
  239                 }
  240         } else {
  241                 error = soconnect(so, nmp->nm_nam, td);
  242                 if (error)
  243                         goto bad;
  244 
  245                 /*
  246                  * Wait for the connection to complete. Cribbed from the
  247                  * connect system call but with the wait timing out so
  248                  * that interruptible mounts don't hang here for a long time.
  249                  */
  250                 SOCK_LOCK(so);
  251                 while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
  252                         (void) msleep(&so->so_timeo, SOCK_MTX(so),
  253                             PSOCK, "nfscon", 2 * hz);
  254                         if ((so->so_state & SS_ISCONNECTING) &&
  255                             so->so_error == 0 && rep &&
  256                             (error = nfs_sigintr(nmp, rep, rep->r_td)) != 0) {
  257                                 so->so_state &= ~SS_ISCONNECTING;
  258                                 SOCK_UNLOCK(so);
  259                                 goto bad;
  260                         }
  261                 }
  262                 if (so->so_error) {
  263                         error = so->so_error;
  264                         so->so_error = 0;
  265                         SOCK_UNLOCK(so);
  266                         goto bad;
  267                 }
  268                 SOCK_UNLOCK(so);
  269         }
  270         so->so_rcv.sb_timeo = 12 * hz;
  271         so->so_snd.sb_timeo = 5 * hz;
  272 
  273         /*
  274          * Get buffer reservation size from sysctl, but impose reasonable
  275          * limits.
  276          */
  277         pktscale = nfs_bufpackets;
  278         if (pktscale < 2)
  279                 pktscale = 2;
  280         if (pktscale > 64)
  281                 pktscale = 64;
  282 
  283         if (nmp->nm_sotype == SOCK_DGRAM) {
  284                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
  285                 rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
  286                     NFS_MAXPKTHDR) * pktscale;
  287         } else if (nmp->nm_sotype == SOCK_SEQPACKET) {
  288                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
  289                 rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
  290                     NFS_MAXPKTHDR) * pktscale;
  291         } else {
  292                 if (nmp->nm_sotype != SOCK_STREAM)
  293                         panic("nfscon sotype");
  294                 if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
  295                         struct sockopt sopt;
  296                         int val;
  297 
  298                         bzero(&sopt, sizeof sopt);
  299                         sopt.sopt_dir = SOPT_SET;
  300                         sopt.sopt_level = SOL_SOCKET;
  301                         sopt.sopt_name = SO_KEEPALIVE;
  302                         sopt.sopt_val = &val;
  303                         sopt.sopt_valsize = sizeof val;
  304                         val = 1;
  305                         sosetopt(so, &sopt);
  306                 }
  307                 if (so->so_proto->pr_protocol == IPPROTO_TCP) {
  308                         struct sockopt sopt;
  309                         int val;
  310 
  311                         bzero(&sopt, sizeof sopt);
  312                         sopt.sopt_dir = SOPT_SET;
  313                         sopt.sopt_level = IPPROTO_TCP;
  314                         sopt.sopt_name = TCP_NODELAY;
  315                         sopt.sopt_val = &val;
  316                         sopt.sopt_valsize = sizeof val;
  317                         val = 1;
  318                         sosetopt(so, &sopt);
  319                 }
  320                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
  321                     sizeof (u_int32_t)) * pktscale;
  322                 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
  323                     sizeof (u_int32_t)) * pktscale;
  324         }
  325         error = soreserve(so, sndreserve, rcvreserve);
  326         if (error)
  327                 goto bad;
  328         SOCKBUF_LOCK(&so->so_rcv);
  329         so->so_rcv.sb_flags |= SB_NOINTR;
  330         SOCKBUF_UNLOCK(&so->so_rcv);
  331         SOCKBUF_LOCK(&so->so_snd);
  332         so->so_snd.sb_flags |= SB_NOINTR;
  333         SOCKBUF_UNLOCK(&so->so_snd);
  334 
  335         /* Initialize other non-zero congestion variables */
  336         nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
  337                 nmp->nm_srtt[3] = (NFS_TIMEO << 3);
  338         nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
  339                 nmp->nm_sdrtt[3] = 0;
  340         nmp->nm_cwnd = NFS_MAXCWND / 2;     /* Initial send window */
  341         nmp->nm_sent = 0;
  342         nmp->nm_timeouts = 0;
  343         return (0);
  344 
  345 bad:
  346         nfs_disconnect(nmp);
  347         return (error);
  348 }
  349 
  350 /*
  351  * Reconnect routine:
  352  * Called when a connection is broken on a reliable protocol.
  353  * - clean up the old socket
  354  * - nfs_connect() again
  355  * - set R_MUSTRESEND for all outstanding requests on mount point
  356  * If this fails the mount point is DEAD!
  357  * nb: Must be called with the nfs_sndlock() set on the mount point.
  358  */
  359 static int
  360 nfs_reconnect(struct nfsreq *rep)
  361 {
  362         struct nfsreq *rp;
  363         struct nfsmount *nmp = rep->r_nmp;
  364         int error;
  365 
  366         nfs_reconnects++;
  367         nfs_disconnect(nmp);
  368         while ((error = nfs_connect(nmp, rep)) != 0) {
  369                 if (error == ERESTART)
  370                         error = EINTR;
  371                 if (error == EIO || error == EINTR)
  372                         return (error);
  373                 (void) tsleep(&lbolt, PSOCK, "nfscon", 0);
  374         }
  375 
  376         /*
  377          * Loop through outstanding request list and fix up all requests
  378          * on old socket.
  379          */
  380         TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
  381                 if (rp->r_nmp == nmp)
  382                         rp->r_flags |= R_MUSTRESEND;
  383         }
  384         return (0);
  385 }
  386 
  387 /*
  388  * NFS disconnect. Clean up and unlink.
  389  */
  390 void
  391 nfs_disconnect(struct nfsmount *nmp)
  392 {
  393         struct socket *so;
  394 
  395         NET_ASSERT_GIANT();
  396 
  397         if (nmp->nm_so) {
  398                 so = nmp->nm_so;
  399                 nmp->nm_so = NULL;
  400                 soshutdown(so, SHUT_RDWR);
  401                 soclose(so);
  402         }
  403 }
  404 
  405 void
  406 nfs_safedisconnect(struct nfsmount *nmp)
  407 {
  408         struct nfsreq dummyreq;
  409 
  410         bzero(&dummyreq, sizeof(dummyreq));
  411         dummyreq.r_nmp = nmp;
  412         nfs_rcvlock(&dummyreq);
  413         nfs_disconnect(nmp);
  414         nfs_rcvunlock(&dummyreq);
  415 }
  416 
  417 /*
  418  * This is the nfs send routine. For connection based socket types, it
  419  * must be called with an nfs_sndlock() on the socket.
  420  * - return EINTR if the RPC is terminated, 0 otherwise
  421  * - set R_MUSTRESEND if the send fails for any reason
  422  * - do any cleanup required by recoverable socket errors (?)
  423  */
  424 int
  425 nfs_send(struct socket *so, struct sockaddr *nam, struct mbuf *top,
  426     struct nfsreq *rep)
  427 {
  428         struct sockaddr *sendnam;
  429         int error, error2, soflags, flags;
  430 
  431         NET_ASSERT_GIANT();
  432 
  433         KASSERT(rep, ("nfs_send: called with rep == NULL"));
  434 
  435         error = nfs_sigintr(rep->r_nmp, rep, rep->r_td);
  436         if (error) {
  437                 m_freem(top);
  438                 return (error);
  439         }
  440         if ((so = rep->r_nmp->nm_so) == NULL) {
  441                 rep->r_flags |= R_MUSTRESEND;
  442                 m_freem(top);
  443                 return (0);
  444         }
  445         rep->r_flags &= ~R_MUSTRESEND;
  446         soflags = rep->r_nmp->nm_soflags;
  447 
  448         if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
  449                 sendnam = NULL;
  450         else
  451                 sendnam = nam;
  452         if (so->so_type == SOCK_SEQPACKET)
  453                 flags = MSG_EOR;
  454         else
  455                 flags = 0;
  456 
  457         error = so->so_proto->pr_usrreqs->pru_sosend(so, sendnam, 0, top, 0,
  458                                                      flags, curthread /*XXX*/);
  459         if (error == ENOBUFS && so->so_type == SOCK_DGRAM) {
  460                 error = 0;
  461                 rep->r_flags |= R_MUSTRESEND;
  462         }
  463 
  464         if (error) {
  465                 /*
  466                  * Don't report EPIPE errors on nfs sockets.
  467                  * These can be due to idle tcp mounts which will be closed by
  468                  * netapp, solaris, etc. if left idle too long.
  469                  */
  470                 if (error != EPIPE) {
  471                         log(LOG_INFO, "nfs send error %d for server %s\n",
  472                             error,
  473                             rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
  474                 }
  475                 /*
  476                  * Deal with errors for the client side.
  477                  */
  478                 error2 = NFS_SIGREP(rep);
  479                 if (error2)
  480                         error = error2;
  481                 else
  482                         rep->r_flags |= R_MUSTRESEND;
  483 
  484                 /*
  485                  * Handle any recoverable (soft) socket errors here. (?)
  486                  */
  487                 if (error != EINTR && error != ERESTART && error != EIO &&
  488                         error != EWOULDBLOCK && error != EPIPE)
  489                         error = 0;
  490         }
  491         return (error);
  492 }
  493 
  494 /*
  495  * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
  496  * done by soreceive(), but for SOCK_STREAM we must deal with the Record
  497  * Mark and consolidate the data into a new mbuf list.
  498  * nb: Sometimes TCP passes the data up to soreceive() in long lists of
  499  *     small mbufs.
  500  * For SOCK_STREAM we must be very careful to read an entire record once
  501  * we have read any of it, even if the system call has been interrupted.
  502  */
  503 static int
  504 nfs_receive(struct nfsreq *rep, struct sockaddr **aname, struct mbuf **mp)
  505 {
  506         struct socket *so;
  507         struct uio auio;
  508         struct iovec aio;
  509         struct mbuf *m;
  510         struct mbuf *control;
  511         u_int32_t len;
  512         struct sockaddr **getnam;
  513         int error, error2, sotype, rcvflg;
  514         struct thread *td = curthread;  /* XXX */
  515 
  516         NET_ASSERT_GIANT();
  517 
  518         /*
  519          * Set up arguments for soreceive()
  520          */
  521         *mp = NULL;
  522         *aname = NULL;
  523         sotype = rep->r_nmp->nm_sotype;
  524 
  525         /*
  526          * For reliable protocols, lock against other senders/receivers
  527          * in case a reconnect is necessary.
  528          * For SOCK_STREAM, first get the Record Mark to find out how much
  529          * more there is to get.
  530          * We must lock the socket against other receivers
  531          * until we have an entire rpc request/reply.
  532          */
  533         if (sotype != SOCK_DGRAM) {
  534                 error = nfs_sndlock(rep);
  535                 if (error)
  536                         return (error);
  537 tryagain:
  538                 /*
  539                  * Check for fatal errors and resending request.
  540                  */
  541                 /*
  542                  * Ugh: If a reconnect attempt just happened, nm_so
  543                  * would have changed. NULL indicates a failed
  544                  * attempt that has essentially shut down this
  545                  * mount point.
  546                  */
  547                 if (rep->r_mrep || (error = NFS_SIGREP(rep)) != 0) {
  548                         nfs_sndunlock(rep);
  549                         return (error == 0 ? EINTR : error);
  550                 }
  551                 so = rep->r_nmp->nm_so;
  552                 if (!so) {
  553                         error = nfs_reconnect(rep);
  554                         if (error) {
  555                                 nfs_sndunlock(rep);
  556                                 return (error);
  557                         }
  558                         goto tryagain;
  559                 }
  560                 while (rep->r_flags & R_MUSTRESEND) {
  561                         m = m_copym(rep->r_mreq, 0, M_COPYALL, M_TRYWAIT);
  562                         nfsstats.rpcretries++;
  563                         error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
  564                         if (error) {
  565                                 if (error == EINTR || error == ERESTART ||
  566                                     error == EIO ||
  567                                     (error = nfs_reconnect(rep)) != 0) {
  568                                         nfs_sndunlock(rep);
  569                                         return (error);
  570                                 }
  571                                 goto tryagain;
  572                         }
  573                 }
  574                 nfs_sndunlock(rep);
  575                 if (sotype == SOCK_STREAM) {
  576                         aio.iov_base = (caddr_t) &len;
  577                         aio.iov_len = sizeof(u_int32_t);
  578                         auio.uio_iov = &aio;
  579                         auio.uio_iovcnt = 1;
  580                         auio.uio_segflg = UIO_SYSSPACE;
  581                         auio.uio_rw = UIO_READ;
  582                         auio.uio_offset = 0;
  583                         auio.uio_resid = sizeof(u_int32_t);
  584                         auio.uio_td = td;
  585                         do {
  586                            rcvflg = MSG_WAITALL;
  587                            error = so->so_proto->pr_usrreqs->pru_soreceive
  588                                    (so, NULL, &auio, NULL, NULL, &rcvflg);
  589                            if (error == EWOULDBLOCK) {
  590                                    error2 = NFS_SIGREP(rep);
  591                                    if (error2)
  592                                            return (error2);
  593                            }
  594                         } while (0);
  595                         if (!error && auio.uio_resid > 0) {
  596                             /*
  597                              * Don't log a 0 byte receive; it means
  598                              * that the socket has been closed, and
  599                              * can happen during normal operation
  600                              * (forcible unmount or Solaris server).
  601                              */
  602                             if (auio.uio_resid != sizeof (u_int32_t))
  603                             log(LOG_INFO,
  604                                  "short receive (%d/%d) from nfs server %s\n",
  605                                  (int)(sizeof(u_int32_t) - auio.uio_resid),
  606                                  (int)sizeof(u_int32_t),
  607                                  rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
  608                             error = EPIPE;
  609                         }
  610                         if (error)
  611                                 goto errout;
  612                         len = ntohl(len) & ~0x80000000;
  613                         /*
  614                          * This is SERIOUS! We are out of sync with the sender
  615                          * and forcing a disconnect/reconnect is all I can do.
  616                          */
  617                         if (len > NFS_MAXPACKET) {
  618                             log(LOG_ERR, "%s (%d) from nfs server %s\n",
  619                                 "impossible packet length",
  620                                 len,
  621                                 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
  622                             error = EFBIG;
  623                             goto errout;
  624                         }
  625                         auio.uio_resid = len;
  626                         do {
  627                             rcvflg = MSG_WAITALL;
  628                             error =  so->so_proto->pr_usrreqs->pru_soreceive
  629                                     (so, NULL,
  630                                      &auio, mp, NULL, &rcvflg);
  631                         } while (0);
  632                         if (!error && auio.uio_resid > 0) {
  633                             if (len != auio.uio_resid)
  634                             log(LOG_INFO,
  635                                 "short receive (%d/%d) from nfs server %s\n",
  636                                 len - auio.uio_resid, len,
  637                                 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
  638                             error = EPIPE;
  639                         }
  640                 } else {
  641                         /*
  642                          * NB: Since uio_resid is big, MSG_WAITALL is ignored
  643                          * and soreceive() will return when it has either a
  644                          * control msg or a data msg.
  645                          * We have no use for control msg., but must grab them
  646                          * and then throw them away so we know what is going
  647                          * on.
  648                          */
  649                         auio.uio_resid = len = 100000000; /* Anything Big */
  650                         auio.uio_td = td;
  651                         do {
  652                             rcvflg = 0;
  653                             error =  so->so_proto->pr_usrreqs->pru_soreceive
  654                                     (so, NULL,
  655                                 &auio, mp, &control, &rcvflg);
  656                             if (control)
  657                                 m_freem(control);
  658                             if (error == EWOULDBLOCK && rep) {
  659                                    error2 = NFS_SIGREP(rep);
  660                                    if (error2)
  661                                            return (error2);
  662                             }
  663                         } while (!error && *mp == NULL && control);
  664                         if ((rcvflg & MSG_EOR) == 0)
  665                                 printf("Egad!!\n");
  666                         if (!error && *mp == NULL)
  667                                 error = EPIPE;
  668                         len -= auio.uio_resid;
  669                 }
  670 errout:
  671                 if (error && error != EINTR && error != EIO &&
  672                     error != ERESTART) {
  673                         m_freem(*mp);
  674                         *mp = NULL;
  675                         if (error != EPIPE && error != EWOULDBLOCK)
  676                                 log(LOG_INFO,
  677                                     "receive error %d from nfs server %s\n",
  678                                     error,
  679                                  rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
  680                         error = nfs_sndlock(rep);
  681                         if (!error) {
  682                                 error = nfs_reconnect(rep);
  683                                 if (!error)
  684                                         goto tryagain;
  685                                 else
  686                                         nfs_sndunlock(rep);
  687                         }
  688                 }
  689         } else {
  690                 /*
  691                  * We may have failed while rebinding the datagram socket
  692                  * so attempt a rebind here.
  693                  */
  694                 if ((so = rep->r_nmp->nm_so) == NULL) {
  695                         error = nfs_sndlock(rep);
  696                         if (!error) {
  697                                 error = nfs_reconnect(rep);
  698                                 nfs_sndunlock(rep);
  699                         }
  700                         if (error)
  701                                 return (error);
  702                         so = rep->r_nmp->nm_so;
  703                 }
  704                 if (so->so_state & SS_ISCONNECTED)
  705                         getnam = NULL;
  706                 else
  707                         getnam = aname;
  708                 auio.uio_resid = len = 1000000;
  709                 auio.uio_td = td;
  710                 do {
  711                         rcvflg = 0;
  712                         error =  so->so_proto->pr_usrreqs->pru_soreceive
  713                                 (so, getnam, &auio, mp,
  714                                 NULL, &rcvflg);
  715                         if (error) {
  716                                 error2 = NFS_SIGREP(rep);
  717                                 if (error2) {
  718                                         error = error2;
  719                                         goto dgramout;
  720                                 }
  721                         }
  722                         if (error) {
  723                                 error2 = nfs_sndlock(rep);
  724                                 if (!error2) {
  725                                         error2 = nfs_reconnect(rep);
  726                                         if (error2)
  727                                                 error = error2;
  728                                         else
  729                                                 so = rep->r_nmp->nm_so;
  730                                         nfs_sndunlock(rep);
  731                                 } else {
  732                                         error = error2;
  733                                 }
  734                         }
  735                 } while (error == EWOULDBLOCK);
  736 dgramout:
  737                 len -= auio.uio_resid;
  738         }
  739         if (error) {
  740                 m_freem(*mp);
  741                 *mp = NULL;
  742         }
  743         /*
  744          * Search for any mbufs that are not a multiple of 4 bytes long
  745          * or with m_data not longword aligned.
  746          * These could cause pointer alignment problems, so copy them to
  747          * well aligned mbufs.
  748          */
  749         nfs_realign(mp, 5 * NFSX_UNSIGNED);
  750         return (error);
  751 }
  752 
  753 /*
  754  * Implement receipt of reply on a socket.
  755  * We must search through the list of received datagrams matching them
  756  * with outstanding requests using the xid, until ours is found.
  757  */
  758 /* ARGSUSED */
  759 static int
  760 nfs_reply(struct nfsreq *myrep)
  761 {
  762         struct nfsreq *rep;
  763         struct nfsmount *nmp = myrep->r_nmp;
  764         int32_t t1;
  765         struct mbuf *mrep, *md;
  766         struct sockaddr *nam;
  767         u_int32_t rxid, *tl;
  768         caddr_t dpos;
  769         int error;
  770 
  771         /*
  772          * Loop around until we get our own reply
  773          */
  774         for (;;) {
  775                 /*
  776                  * Lock against other receivers so that I don't get stuck in
  777                  * sbwait() after someone else has received my reply for me.
  778                  * Also necessary for connection based protocols to avoid
  779                  * race conditions during a reconnect.
  780                  * If nfs_rcvlock() returns EALREADY, that means that
  781                  * the reply has already been recieved by another
  782                  * process and we can return immediately.  In this
  783                  * case, the lock is not taken to avoid races with
  784                  * other processes.
  785                  */
  786                 error = nfs_rcvlock(myrep);
  787                 if (error == EALREADY)
  788                         return (0);
  789                 if (error)
  790                         return (error);
  791                 /*
  792                  * Get the next Rpc reply off the socket
  793                  */
  794                 error = nfs_receive(myrep, &nam, &mrep);
  795                 nfs_rcvunlock(myrep);
  796                 if (error) {
  797 
  798                         /*
  799                          * Ignore routing errors on connectionless protocols??
  800                          */
  801                         if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) {
  802                                 nmp->nm_so->so_error = 0;
  803                                 if (myrep->r_flags & R_GETONEREP)
  804                                         return (0);
  805                                 continue;
  806                         }
  807                         return (error);
  808                 }
  809                 if (nam)
  810                         FREE(nam, M_SONAME);
  811 
  812                 /*
  813                  * Get the xid and check that it is an rpc reply
  814                  */
  815                 md = mrep;
  816                 dpos = mtod(md, caddr_t);
  817                 tl = nfsm_dissect(u_int32_t *, 2 * NFSX_UNSIGNED);
  818                 rxid = *tl++;
  819                 if (*tl != rpc_reply) {
  820                         nfsstats.rpcinvalid++;
  821                         m_freem(mrep);
  822 nfsmout:
  823                         if (myrep->r_flags & R_GETONEREP)
  824                                 return (0);
  825                         continue;
  826                 }
  827 
  828                 /*
  829                  * Loop through the request list to match up the reply
  830                  * Iff no match, just drop the datagram
  831                  */
  832                 TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
  833                         if (rep->r_mrep == NULL && rxid == rep->r_xid) {
  834                                 /* Found it.. */
  835                                 rep->r_mrep = mrep;
  836                                 rep->r_md = md;
  837                                 rep->r_dpos = dpos;
  838                                 /*
  839                                  * Update congestion window.
  840                                  * Do the additive increase of
  841                                  * one rpc/rtt.
  842                                  */
  843                                 if (nmp->nm_cwnd <= nmp->nm_sent) {
  844                                         nmp->nm_cwnd +=
  845                                            (NFS_CWNDSCALE * NFS_CWNDSCALE +
  846                                            (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
  847                                         if (nmp->nm_cwnd > NFS_MAXCWND)
  848                                                 nmp->nm_cwnd = NFS_MAXCWND;
  849                                 }
  850                                 if (rep->r_flags & R_SENT) {
  851                                         rep->r_flags &= ~R_SENT;
  852                                         nmp->nm_sent -= NFS_CWNDSCALE;
  853                                 }
  854                                 /*
  855                                  * Update rtt using a gain of 0.125 on the mean
  856                                  * and a gain of 0.25 on the deviation.
  857                                  */
  858                                 if (rep->r_flags & R_TIMING) {
  859                                         /*
  860                                          * Since the timer resolution of
  861                                          * NFS_HZ is so course, it can often
  862                                          * result in r_rtt == 0. Since
  863                                          * r_rtt == N means that the actual
  864                                          * rtt is between N+dt and N+2-dt ticks,
  865                                          * add 1.
  866                                          */
  867                                         t1 = rep->r_rtt + 1;
  868                                         t1 -= (NFS_SRTT(rep) >> 3);
  869                                         NFS_SRTT(rep) += t1;
  870                                         if (t1 < 0)
  871                                                 t1 = -t1;
  872                                         t1 -= (NFS_SDRTT(rep) >> 2);
  873                                         NFS_SDRTT(rep) += t1;
  874                                 }
  875                                 nmp->nm_timeouts = 0;
  876                                 break;
  877                         }
  878                 }
  879                 /*
  880                  * If not matched to a request, drop it.
  881                  * If it's mine, get out.
  882                  */
  883                 if (rep == 0) {
  884                         nfsstats.rpcunexpected++;
  885                         m_freem(mrep);
  886                 } else if (rep == myrep) {
  887                         if (rep->r_mrep == NULL)
  888                                 panic("nfsreply nil");
  889                         return (0);
  890                 }
  891                 if (myrep->r_flags & R_GETONEREP)
  892                         return (0);
  893         }
  894 }
  895 
  896 /*
  897  * nfs_request - goes something like this
  898  *      - fill in request struct
  899  *      - links it into list
  900  *      - calls nfs_send() for first transmit
  901  *      - calls nfs_receive() to get reply
  902  *      - break down rpc header and return with nfs reply pointed to
  903  *        by mrep or error
  904  * nb: always frees up mreq mbuf list
  905  */
  906 int
  907 nfs_request(struct vnode *vp, struct mbuf *mrest, int procnum,
  908     struct thread *td, struct ucred *cred, struct mbuf **mrp,
  909     struct mbuf **mdp, caddr_t *dposp)
  910 {
  911         struct mbuf *mrep, *m2;
  912         struct nfsreq *rep;
  913         u_int32_t *tl;
  914         int i;
  915         struct nfsmount *nmp;
  916         struct mbuf *m, *md, *mheadend;
  917         time_t waituntil;
  918         caddr_t dpos;
  919         int s, error = 0, mrest_len, auth_len, auth_type;
  920         struct timeval now;
  921         u_int32_t xid;
  922 
  923         /* Reject requests while attempting a forced unmount. */
  924         if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) {
  925                 m_freem(mrest);
  926                 return (ESTALE);
  927         }
  928         nmp = VFSTONFS(vp->v_mount);
  929         if ((nmp->nm_flag & NFSMNT_NFSV4) != 0)
  930                 return nfs4_request(vp, mrest, procnum, td, cred, mrp, mdp, dposp);
  931         MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK);
  932         rep->r_nmp = nmp;
  933         rep->r_vp = vp;
  934         rep->r_td = td;
  935         rep->r_procnum = procnum;
  936 
  937         getmicrouptime(&now);
  938         rep->r_lastmsg = now.tv_sec -
  939             ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
  940         mrest_len = m_length(mrest, NULL);
  941 
  942         /*
  943          * Get the RPC header with authorization.
  944          */
  945         auth_type = RPCAUTH_UNIX;
  946         if (cred->cr_ngroups < 1)
  947                 panic("nfsreq nogrps");
  948         auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
  949                 nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
  950                 5 * NFSX_UNSIGNED;
  951         m = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len,
  952              mrest, mrest_len, &mheadend, &xid);
  953 
  954         /*
  955          * For stream protocols, insert a Sun RPC Record Mark.
  956          */
  957         if (nmp->nm_sotype == SOCK_STREAM) {
  958                 M_PREPEND(m, NFSX_UNSIGNED, M_TRYWAIT);
  959                 *mtod(m, u_int32_t *) = htonl(0x80000000 |
  960                          (m->m_pkthdr.len - NFSX_UNSIGNED));
  961         }
  962         rep->r_mreq = m;
  963         rep->r_xid = xid;
  964 tryagain:
  965         if (nmp->nm_flag & NFSMNT_SOFT)
  966                 rep->r_retry = nmp->nm_retry;
  967         else
  968                 rep->r_retry = NFS_MAXREXMIT + 1;       /* past clip limit */
  969         rep->r_rtt = rep->r_rexmit = 0;
  970         if (proct[procnum] > 0)
  971                 rep->r_flags = R_TIMING;
  972         else
  973                 rep->r_flags = 0;
  974         rep->r_mrep = NULL;
  975 
  976         /*
  977          * Do the client side RPC.
  978          */
  979         nfsstats.rpcrequests++;
  980         /*
  981          * Chain request into list of outstanding requests. Be sure
  982          * to put it LAST so timer finds oldest requests first.
  983          */
  984         s = splsoftclock();
  985         if (TAILQ_EMPTY(&nfs_reqq))
  986                 callout_reset(&nfs_callout, nfs_ticks, nfs_timer, NULL);
  987         TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);
  988 
  989         /*
  990          * If backing off another request or avoiding congestion, don't
  991          * send this one now but let timer do it. If not timing a request,
  992          * do it now.
  993          */
  994         if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
  995                 (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
  996                 nmp->nm_sent < nmp->nm_cwnd)) {
  997                 splx(s);
  998                 error = nfs_sndlock(rep);
  999                 if (!error) {
 1000                         m2 = m_copym(m, 0, M_COPYALL, M_TRYWAIT);
 1001                         error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
 1002                         nfs_sndunlock(rep);
 1003                 }
 1004                 if (!error && (rep->r_flags & R_MUSTRESEND) == 0) {
 1005                         nmp->nm_sent += NFS_CWNDSCALE;
 1006                         rep->r_flags |= R_SENT;
 1007                 }
 1008         } else {
 1009                 splx(s);
 1010                 rep->r_rtt = -1;
 1011         }
 1012 
 1013         /*
 1014          * Wait for the reply from our send or the timer's.
 1015          */
 1016         if (!error || error == EPIPE)
 1017                 error = nfs_reply(rep);
 1018 
 1019         /*
 1020          * RPC done, unlink the request.
 1021          */
 1022         s = splsoftclock();
 1023         TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
 1024         if (TAILQ_EMPTY(&nfs_reqq))
 1025                 callout_stop(&nfs_callout);
 1026         splx(s);
 1027 
 1028         /*
 1029          * Decrement the outstanding request count.
 1030          */
 1031         if (rep->r_flags & R_SENT) {
 1032                 rep->r_flags &= ~R_SENT;        /* paranoia */
 1033                 nmp->nm_sent -= NFS_CWNDSCALE;
 1034         }
 1035 
 1036         /*
 1037          * If there was a successful reply and a tprintf msg.
 1038          * tprintf a response.
 1039          */
 1040         if (!error)
 1041                 nfs_up(rep, nmp, rep->r_td, "is alive again", NFSSTA_TIMEO);
 1042         mrep = rep->r_mrep;
 1043         md = rep->r_md;
 1044         dpos = rep->r_dpos;
 1045         if (error) {
 1046                 m_freem(rep->r_mreq);
 1047                 free((caddr_t)rep, M_NFSREQ);
 1048                 return (error);
 1049         }
 1050 
 1051         /*
 1052          * break down the rpc header and check if ok
 1053          */
 1054         tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED);
 1055         if (*tl++ == rpc_msgdenied) {
 1056                 if (*tl == rpc_mismatch)
 1057                         error = EOPNOTSUPP;
 1058                 else
 1059                         error = EACCES;
 1060                 m_freem(mrep);
 1061                 m_freem(rep->r_mreq);
 1062                 free((caddr_t)rep, M_NFSREQ);
 1063                 return (error);
 1064         }
 1065 
 1066         /*
 1067          * Just throw away any verifyer (ie: kerberos etc).
 1068          */
 1069         i = fxdr_unsigned(int, *tl++);          /* verf type */
 1070         i = fxdr_unsigned(int32_t, *tl);        /* len */
 1071         if (i > 0)
 1072                 nfsm_adv(nfsm_rndup(i));
 1073         tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
 1074         /* 0 == ok */
 1075         if (*tl == 0) {
 1076                 tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
 1077                 if (*tl != 0) {
 1078                         error = fxdr_unsigned(int, *tl);
 1079                         if ((nmp->nm_flag & NFSMNT_NFSV3) &&
 1080                                 error == NFSERR_TRYLATER) {
 1081                                 m_freem(mrep);
 1082                                 error = 0;
 1083                                 waituntil = time_second + nfs3_jukebox_delay;
 1084                                 while (time_second < waituntil)
 1085                                         (void) tsleep(&lbolt,
 1086                                                 PSOCK, "nqnfstry", 0);
 1087                                 goto tryagain;
 1088                         }
 1089 
 1090                         /*
 1091                          * If the File Handle was stale, invalidate the
 1092                          * lookup cache, just in case.
 1093                          */
 1094                         if (error == ESTALE)
 1095                                 cache_purge(vp);
 1096                         if (nmp->nm_flag & NFSMNT_NFSV3) {
 1097                                 *mrp = mrep;
 1098                                 *mdp = md;
 1099                                 *dposp = dpos;
 1100                                 error |= NFSERR_RETERR;
 1101                         } else
 1102                                 m_freem(mrep);
 1103                         m_freem(rep->r_mreq);
 1104                         free((caddr_t)rep, M_NFSREQ);
 1105                         return (error);
 1106                 }
 1107 
 1108                 *mrp = mrep;
 1109                 *mdp = md;
 1110                 *dposp = dpos;
 1111                 m_freem(rep->r_mreq);
 1112                 FREE((caddr_t)rep, M_NFSREQ);
 1113                 return (0);
 1114         }
 1115         m_freem(mrep);
 1116         error = EPROTONOSUPPORT;
 1117 nfsmout:
 1118         m_freem(rep->r_mreq);
 1119         free((caddr_t)rep, M_NFSREQ);
 1120         return (error);
 1121 }
 1122 
 1123 /*
 1124  * Nfs timer routine
 1125  * Scan the nfsreq list and retranmit any requests that have timed out
 1126  * To avoid retransmission attempts on STREAM sockets (in the future) make
 1127  * sure to set the r_retry field to 0 (implies nm_retry == 0).
 1128  */
 1129 void
 1130 nfs_timer(void *arg)
 1131 {
 1132         struct nfsreq *rep;
 1133         struct mbuf *m;
 1134         struct socket *so;
 1135         struct nfsmount *nmp;
 1136         int timeo;
 1137         int s, error;
 1138         struct timeval now;
 1139 
 1140         getmicrouptime(&now);
 1141         s = splnet();
 1142         TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
 1143                 nmp = rep->r_nmp;
 1144                 if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
 1145                         continue;
 1146                 if (nfs_sigintr(nmp, rep, rep->r_td))
 1147                         continue;
 1148                 if (nmp->nm_tprintf_initial_delay != 0 &&
 1149                     (rep->r_rexmit > 2 || (rep->r_flags & R_RESENDERR)) &&
 1150                     rep->r_lastmsg + nmp->nm_tprintf_delay < now.tv_sec) {
 1151                         rep->r_lastmsg = now.tv_sec;
 1152                         nfs_down(rep, nmp, rep->r_td, "not responding",
 1153                             0, NFSSTA_TIMEO);
 1154 #if 0
 1155                         if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
 1156                                 /* we're not yet completely mounted and */
 1157                                 /* we can't complete an RPC, so we fail */
 1158                                 nfsstats.rpctimeouts++;
 1159                                 nfs_softterm(rep);
 1160                                 continue;
 1161                         }
 1162 #endif
 1163                 }
 1164                 if (rep->r_rtt >= 0) {
 1165                         rep->r_rtt++;
 1166                         if (nmp->nm_flag & NFSMNT_DUMBTIMR)
 1167                                 timeo = nmp->nm_timeo;
 1168                         else
 1169                                 timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
 1170                         if (nmp->nm_timeouts > 0)
 1171                                 timeo *= nfs_backoff[nmp->nm_timeouts - 1];
 1172                         if (rep->r_rtt <= timeo)
 1173                                 continue;
 1174                         if (nmp->nm_timeouts < NFS_NBACKOFF)
 1175                                 nmp->nm_timeouts++;
 1176                 }
 1177                 if (rep->r_rexmit >= rep->r_retry) {    /* too many */
 1178                         nfsstats.rpctimeouts++;
 1179                         nfs_softterm(rep);
 1180                         continue;
 1181                 }
 1182                 if (nmp->nm_sotype != SOCK_DGRAM) {
 1183                         if (++rep->r_rexmit > NFS_MAXREXMIT)
 1184                                 rep->r_rexmit = NFS_MAXREXMIT;
 1185                         continue;
 1186                 }
 1187                 if ((so = nmp->nm_so) == NULL)
 1188                         continue;
 1189 
 1190                 /*
 1191                  * If there is enough space and the window allows..
 1192                  *      Resend it
 1193                  * Set r_rtt to -1 in case we fail to send it now.
 1194                  */
 1195                 rep->r_rtt = -1;
 1196                 if (sbspace(&so->so_snd) >= rep->r_mreq->m_pkthdr.len &&
 1197                    ((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
 1198                     (rep->r_flags & R_SENT) ||
 1199                     nmp->nm_sent < nmp->nm_cwnd) &&
 1200                    (m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))){
 1201                         if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
 1202                             error = (*so->so_proto->pr_usrreqs->pru_send)
 1203                                     (so, 0, m, NULL, NULL, curthread);
 1204                         else
 1205                             error = (*so->so_proto->pr_usrreqs->pru_send)
 1206                                     (so, 0, m, nmp->nm_nam, NULL, curthread);
 1207                         if (error) {
 1208                                 if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
 1209                                         so->so_error = 0;
 1210                                 rep->r_flags |= R_RESENDERR;
 1211                         } else {
 1212                                 /*
 1213                                  * Iff first send, start timing
 1214                                  * else turn timing off, backoff timer
 1215                                  * and divide congestion window by 2.
 1216                                  */
 1217                                 rep->r_flags &= ~R_RESENDERR;
 1218                                 if (rep->r_flags & R_SENT) {
 1219                                         rep->r_flags &= ~R_TIMING;
 1220                                         if (++rep->r_rexmit > NFS_MAXREXMIT)
 1221                                                 rep->r_rexmit = NFS_MAXREXMIT;
 1222                                         nmp->nm_cwnd >>= 1;
 1223                                         if (nmp->nm_cwnd < NFS_CWNDSCALE)
 1224                                                 nmp->nm_cwnd = NFS_CWNDSCALE;
 1225                                         nfsstats.rpcretries++;
 1226                                 } else {
 1227                                         rep->r_flags |= R_SENT;
 1228                                         nmp->nm_sent += NFS_CWNDSCALE;
 1229                                 }
 1230                                 rep->r_rtt = 0;
 1231                         }
 1232                 }
 1233         }
 1234         splx(s);
 1235         callout_reset(&nfs_callout, nfs_ticks, nfs_timer, NULL);
 1236 }
 1237 
 1238 /*
 1239  * Mark all of an nfs mount's outstanding requests with R_SOFTTERM and
 1240  * wait for all requests to complete. This is used by forced unmounts
 1241  * to terminate any outstanding RPCs.
 1242  */
 1243 int
 1244 nfs_nmcancelreqs(nmp)
 1245         struct nfsmount *nmp;
 1246 {
 1247         struct nfsreq *req;
 1248         int i, s;
 1249 
 1250         s = splnet();
 1251         TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
 1252                 if (nmp != req->r_nmp || req->r_mrep != NULL ||
 1253                     (req->r_flags & R_SOFTTERM))
 1254                         continue;
 1255                 nfs_softterm(req);
 1256         }
 1257         splx(s);
 1258 
 1259         for (i = 0; i < 30; i++) {
 1260                 s = splnet();
 1261                 TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
 1262                         if (nmp == req->r_nmp)
 1263                                 break;
 1264                 }
 1265                 splx(s);
 1266                 if (req == NULL)
 1267                         return (0);
 1268                 tsleep(&lbolt, PSOCK, "nfscancel", 0);
 1269         }
 1270         return (EBUSY);
 1271 }
 1272 
 1273 /*
 1274  * Flag a request as being about to terminate (due to NFSMNT_INT/NFSMNT_SOFT).
 1275  * The nm_send count is decremented now to avoid deadlocks when the process in
 1276  * soreceive() hasn't yet managed to send its own request.
 1277  */
 1278 
 1279 static void
 1280 nfs_softterm(struct nfsreq *rep)
 1281 {
 1282 
 1283         rep->r_flags |= R_SOFTTERM;
 1284         if (rep->r_flags & R_SENT) {
 1285                 rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
 1286                 rep->r_flags &= ~R_SENT;
 1287         }
 1288 }
 1289 
 1290 /*
 1291  * Test for a termination condition pending on the process.
 1292  * This is used for NFSMNT_INT mounts.
 1293  */
 1294 int
 1295 nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct thread *td)
 1296 {
 1297         struct proc *p;
 1298         sigset_t tmpset;
 1299 
 1300         if ((nmp->nm_flag & NFSMNT_NFSV4) != 0)
 1301                 return nfs4_sigintr(nmp, rep, td);
 1302         if (rep && (rep->r_flags & R_SOFTTERM))
 1303                 return (EIO);
 1304         /* Terminate all requests while attempting a forced unmount. */
 1305         if (nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF)
 1306                 return (EIO);
 1307         if (!(nmp->nm_flag & NFSMNT_INT))
 1308                 return (0);
 1309         if (td == NULL)
 1310                 return (0);
 1311 
 1312         p = td->td_proc;
 1313         PROC_LOCK(p);
 1314         tmpset = p->p_siglist;
 1315         SIGSETNAND(tmpset, td->td_sigmask);
 1316         mtx_lock(&p->p_sigacts->ps_mtx);
 1317         SIGSETNAND(tmpset, p->p_sigacts->ps_sigignore);
 1318         mtx_unlock(&p->p_sigacts->ps_mtx);
 1319         if (SIGNOTEMPTY(p->p_siglist) && NFSINT_SIGMASK(tmpset)) {
 1320                 PROC_UNLOCK(p);
 1321                 return (EINTR);
 1322         }
 1323         PROC_UNLOCK(p);
 1324 
 1325         return (0);
 1326 }
 1327 
 1328 /*
 1329  * Lock a socket against others.
 1330  * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
 1331  * and also to avoid race conditions between the processes with nfs requests
 1332  * in progress when a reconnect is necessary.
 1333  */
 1334 int
 1335 nfs_sndlock(struct nfsreq *rep)
 1336 {
 1337         int *statep = &rep->r_nmp->nm_state;
 1338         struct thread *td;
 1339         int error, slpflag = 0, slptimeo = 0;
 1340 
 1341         td = rep->r_td;
 1342         if (rep->r_nmp->nm_flag & NFSMNT_INT)
 1343                 slpflag = PCATCH;
 1344         while (*statep & NFSSTA_SNDLOCK) {
 1345                 error = nfs_sigintr(rep->r_nmp, rep, td);
 1346                 if (error)
 1347                         return (error);
 1348                 *statep |= NFSSTA_WANTSND;
 1349                 (void) tsleep(statep, slpflag | (PZERO - 1),
 1350                         "nfsndlck", slptimeo);
 1351                 if (slpflag == PCATCH) {
 1352                         slpflag = 0;
 1353                         slptimeo = 2 * hz;
 1354                 }
 1355         }
 1356         *statep |= NFSSTA_SNDLOCK;
 1357         return (0);
 1358 }
 1359 
 1360 /*
 1361  * Unlock the stream socket for others.
 1362  */
 1363 void
 1364 nfs_sndunlock(struct nfsreq *rep)
 1365 {
 1366         int *statep = &rep->r_nmp->nm_state;
 1367 
 1368         if ((*statep & NFSSTA_SNDLOCK) == 0)
 1369                 panic("nfs sndunlock");
 1370         *statep &= ~NFSSTA_SNDLOCK;
 1371         if (*statep & NFSSTA_WANTSND) {
 1372                 *statep &= ~NFSSTA_WANTSND;
 1373                 wakeup(statep);
 1374         }
 1375 }
 1376 
 1377 static int
 1378 nfs_rcvlock(struct nfsreq *rep)
 1379 {
 1380         int *statep = &rep->r_nmp->nm_state;
 1381         int error, slpflag, slptimeo = 0;
 1382 
 1383         if (rep->r_nmp->nm_flag & NFSMNT_INT)
 1384                 slpflag = PCATCH;
 1385         else
 1386                 slpflag = 0;
 1387         while (*statep & NFSSTA_RCVLOCK) {
 1388                 error = nfs_sigintr(rep->r_nmp, rep, rep->r_td);
 1389                 if (error)
 1390                         return (error);
 1391                 *statep |= NFSSTA_WANTRCV;
 1392                 (void) tsleep(statep, slpflag | (PZERO - 1), "nfsrcvlk",
 1393                         slptimeo);
 1394                 /*
 1395                  * If our reply was recieved while we were sleeping,
 1396                  * then just return without taking the lock to avoid a
 1397                  * situation where a single iod could 'capture' the
 1398                  * recieve lock.
 1399                  */
 1400                 if (rep->r_mrep != NULL)
 1401                         return (EALREADY);
 1402                 if (slpflag == PCATCH) {
 1403                         slpflag = 0;
 1404                         slptimeo = 2 * hz;
 1405                 }
 1406         }
 1407         /* Always fail if our request has been cancelled. */
 1408         if (rep != NULL && (error = NFS_SIGREP(rep)) != 0)
 1409                 return (error);
 1410         *statep |= NFSSTA_RCVLOCK;
 1411         return (0);
 1412 }
 1413 
 1414 /*
 1415  * Unlock the stream socket for others.
 1416  */
 1417 static void
 1418 nfs_rcvunlock(struct nfsreq *rep)
 1419 {
 1420         int *statep = &rep->r_nmp->nm_state;
 1421 
 1422         if ((*statep & NFSSTA_RCVLOCK) == 0)
 1423                 panic("nfs rcvunlock");
 1424         *statep &= ~NFSSTA_RCVLOCK;
 1425         if (*statep & NFSSTA_WANTRCV) {
 1426                 *statep &= ~NFSSTA_WANTRCV;
 1427                 wakeup(statep);
 1428         }
 1429 }
 1430 
 1431 /*
 1432  *      nfs_realign:
 1433  *
 1434  *      Check for badly aligned mbuf data and realign by copying the unaligned
 1435  *      portion of the data into a new mbuf chain and freeing the portions
 1436  *      of the old chain that were replaced.
 1437  *
 1438  *      We cannot simply realign the data within the existing mbuf chain
 1439  *      because the underlying buffers may contain other rpc commands and
 1440  *      we cannot afford to overwrite them.
 1441  *
 1442  *      We would prefer to avoid this situation entirely.  The situation does
 1443  *      not occur with NFS/UDP and is supposed to only occassionally occur
 1444  *      with TCP.  Use vfs.nfs.realign_count and realign_test to check this.
 1445  */
 1446 static void
 1447 nfs_realign(struct mbuf **pm, int hsiz)
 1448 {
 1449         struct mbuf *m;
 1450         struct mbuf *n = NULL;
 1451         int off = 0;
 1452 
 1453         ++nfs_realign_test;
 1454         while ((m = *pm) != NULL) {
 1455                 if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
 1456                         MGET(n, M_TRYWAIT, MT_DATA);
 1457                         if (m->m_len >= MINCLSIZE) {
 1458                                 MCLGET(n, M_TRYWAIT);
 1459                         }
 1460                         n->m_len = 0;
 1461                         break;
 1462                 }
 1463                 pm = &m->m_next;
 1464         }
 1465         /*
 1466          * If n is non-NULL, loop on m copying data, then replace the
 1467          * portion of the chain that had to be realigned.
 1468          */
 1469         if (n != NULL) {
 1470                 ++nfs_realign_count;
 1471                 while (m) {
 1472                         m_copyback(n, off, m->m_len, mtod(m, caddr_t));
 1473                         off += m->m_len;
 1474                         m = m->m_next;
 1475                 }
 1476                 m_freem(*pm);
 1477                 *pm = n;
 1478         }
 1479 }
 1480 
 1481 
 1482 static int
 1483 nfs_msg(struct thread *td, const char *server, const char *msg, int error)
 1484 {
 1485         struct proc *p;
 1486 
 1487         p = td ? td->td_proc : NULL;
 1488         if (error) {
 1489                 tprintf(p, LOG_INFO, "nfs server %s: %s, error %d\n", server,
 1490                     msg, error);
 1491         } else {
 1492                 tprintf(p, LOG_INFO, "nfs server %s: %s\n", server, msg);
 1493         }
 1494         return (0);
 1495 }
 1496 
 1497 void
 1498 nfs_down(rep, nmp, td, msg, error, flags)
 1499         struct nfsreq *rep;
 1500         struct nfsmount *nmp;
 1501         struct thread *td;
 1502         const char *msg;
 1503         int error, flags;
 1504 {
 1505 
 1506         if (nmp == NULL)
 1507                 return;
 1508         if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) {
 1509                 vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
 1510                     VQ_NOTRESP, 0);
 1511                 nmp->nm_state |= NFSSTA_TIMEO;
 1512         }
 1513 #ifdef NFSSTA_LOCKTIMEO
 1514         if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) {
 1515                 vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
 1516                     VQ_NOTRESPLOCK, 0);
 1517                 nmp->nm_state |= NFSSTA_LOCKTIMEO;
 1518         }
 1519 #endif
 1520         if (rep)
 1521                 rep->r_flags |= R_TPRINTFMSG;
 1522         nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, error);
 1523 }
 1524 
 1525 void
 1526 nfs_up(rep, nmp, td, msg, flags)
 1527         struct nfsreq *rep;
 1528         struct nfsmount *nmp;
 1529         struct thread *td;
 1530         const char *msg;
 1531         int flags;
 1532 {
 1533         if (nmp == NULL)
 1534                 return;
 1535         if ((rep == NULL) || (rep->r_flags & R_TPRINTFMSG) != 0)
 1536                 nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, 0);
 1537         if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) {
 1538                 nmp->nm_state &= ~NFSSTA_TIMEO;
 1539                 vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
 1540                     VQ_NOTRESP, 1);
 1541         }
 1542 #ifdef NFSSTA_LOCKTIMEO
 1543         if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) {
 1544                 nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
 1545                 vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
 1546                     VQ_NOTRESPLOCK, 1);
 1547         }
 1548 #endif
 1549 }
 1550
Cache object: 9035f994d6a6a0267dd0ef7203124dd4
[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]
This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.
FreeBSD/Linux Kernel Cross Reference sys/nfsclient/nfs_socket.c

FreeBSD/Linux Kernel Cross Reference
sys/nfsclient/nfs_socket.c