The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/nfsclient/nfs_socket.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * Copyright (c) 1989, 1991, 1993, 1995
    3  *      The Regents of the University of California.  All rights reserved.
    4  *
    5  * This code is derived from software contributed to Berkeley by
    6  * Rick Macklem at The University of Guelph.
    7  *
    8  * Redistribution and use in source and binary forms, with or without
    9  * modification, are permitted provided that the following conditions
   10  * are met:
   11  * 1. Redistributions of source code must retain the above copyright
   12  *    notice, this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in the
   15  *    documentation and/or other materials provided with the distribution.
   16  * 3. All advertising materials mentioning features or use of this software
   17  *    must display the following acknowledgement:
   18  *      This product includes software developed by the University of
   19  *      California, Berkeley and its contributors.
   20  * 4. Neither the name of the University nor the names of its contributors
   21  *    may be used to endorse or promote products derived from this software
   22  *    without specific prior written permission.
   23  *
   24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   34  * SUCH DAMAGE.
   35  *
   36  *      @(#)nfs_socket.c        8.5 (Berkeley) 3/30/95
   37  */
   38 
   39 #include <sys/cdefs.h>
   40 __FBSDID("$FreeBSD: releng/5.2/sys/nfsclient/nfs_socket.c 122698 2003-11-14 20:54:10Z alfred $");
   41 
   42 /*
   43  * Socket operations for use by nfs
   44  */
   45 
   46 #include "opt_inet6.h"
   47 
   48 #include <sys/param.h>
   49 #include <sys/systm.h>
   50 #include <sys/kernel.h>
   51 #include <sys/lock.h>
   52 #include <sys/malloc.h>
   53 #include <sys/mbuf.h>
   54 #include <sys/mount.h>
   55 #include <sys/mutex.h>
   56 #include <sys/proc.h>
   57 #include <sys/protosw.h>
   58 #include <sys/signalvar.h>
   59 #include <sys/socket.h>
   60 #include <sys/socketvar.h>
   61 #include <sys/sysctl.h>
   62 #include <sys/syslog.h>
   63 #include <sys/vnode.h>
   64 
   65 #include <netinet/in.h>
   66 #include <netinet/tcp.h>
   67 
   68 #include <rpc/rpcclnt.h>
   69 
   70 #include <nfs/rpcv2.h>
   71 #include <nfs/nfsproto.h>
   72 #include <nfsclient/nfs.h>
   73 #include <nfs/xdr_subs.h>
   74 #include <nfsclient/nfsm_subs.h>
   75 #include <nfsclient/nfsmount.h>
   76 #include <nfsclient/nfsnode.h>
   77 
   78 #include <nfs4client/nfs4.h>
   79 
   80 #define TRUE    1
   81 #define FALSE   0
   82 
   83 /*
    84  * Estimate rto for an nfs rpc sent via an unreliable datagram.
   85  * Use the mean and mean deviation of rtt for the appropriate type of rpc
   86  * for the frequent rpcs and a default for the others.
   87  * The justification for doing "other" this way is that these rpcs
   88  * happen so infrequently that timer est. would probably be stale.
   89  * Also, since many of these rpcs are
   90  * non-idempotent, a conservative timeout is desired.
   91  * getattr, lookup - A+2D
   92  * read, write     - A+4D
   93  * other           - nm_timeo
   94  */
   95 #define NFS_RTO(n, t) \
   96         ((t) == 0 ? (n)->nm_timeo : \
   97          ((t) < 3 ? \
   98           (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
   99           ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
  100 #define NFS_SRTT(r)     (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
  101 #define NFS_SDRTT(r)    (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
  102 
  103 /*
  104  * Defines which timer to use for the procnum.
  105  * 0 - default
  106  * 1 - getattr
  107  * 2 - lookup
  108  * 3 - read
  109  * 4 - write
  110  */
  111 static int proct[NFS_NPROCS] = {
  112         0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0,
  113 };
  114 
  115 static int      nfs_realign_test;
  116 static int      nfs_realign_count;
  117 static int      nfs_bufpackets = 4;
  118 
  119 SYSCTL_DECL(_vfs_nfs);
  120 
  121 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RW, &nfs_realign_test, 0, "");
  122 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RW, &nfs_realign_count, 0, "");
  123 SYSCTL_INT(_vfs_nfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0, "");
  124 
  125 
  126 /*
  127  * There is a congestion window for outstanding rpcs maintained per mount
  128  * point. The cwnd size is adjusted in roughly the way that:
  129  * Van Jacobson, Congestion avoidance and Control, In "Proceedings of
  130  * SIGCOMM '88". ACM, August 1988.
  131  * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
  132  * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
  133  * of rpcs is in progress.
  134  * (The sent count and cwnd are scaled for integer arith.)
  135  * Variants of "slow start" were tried and were found to be too much of a
  136  * performance hit (ave. rtt 3 times larger),
  137  * I suspect due to the large rtt that nfs rpcs have.
  138  */
  139 #define NFS_CWNDSCALE   256
  140 #define NFS_MAXCWND     (NFS_CWNDSCALE * 32)
  141 #define NFS_NBACKOFF    8
  142 static int nfs_backoff[NFS_NBACKOFF] = { 2, 4, 8, 16, 32, 64, 128, 256, };
  143 struct callout_handle   nfs_timer_handle;
  144 
  145 static int      nfs_msg(struct thread *, char *, char *);
  146 static int      nfs_rcvlock(struct nfsreq *);
  147 static void     nfs_rcvunlock(struct nfsreq *);
  148 static void     nfs_realign(struct mbuf **pm, int hsiz);
  149 static int      nfs_receive(struct nfsreq *rep, struct sockaddr **aname,
  150                     struct mbuf **mp);
  151 static int      nfs_reply(struct nfsreq *);
  152 static void     nfs_softterm(struct nfsreq *rep);
  153 static int      nfs_reconnect(struct nfsreq *rep);
  154 
  155 /*
  156  * Initialize sockets and congestion for a new NFS connection.
  157  * We do not free the sockaddr if error.
  158  */
  159 int
  160 nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
  161 {
  162         struct socket *so;
  163         int s, error, rcvreserve, sndreserve;
  164         int pktscale;
  165         struct sockaddr *saddr;
  166         struct thread *td = &thread0; /* only used for socreate and sobind */
  167 
  168         GIANT_REQUIRED;         /* XXX until socket locking done */
  169 
  170         nmp->nm_so = NULL;
  171         saddr = nmp->nm_nam;
  172         error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype,
  173                 nmp->nm_soproto, nmp->nm_mountp->mnt_cred, td);
  174         if (error)
  175                 goto bad;
  176         so = nmp->nm_so;
  177         nmp->nm_soflags = so->so_proto->pr_flags;
  178 
  179         /*
  180          * Some servers require that the client port be a reserved port number.
  181          */
  182         if (nmp->nm_flag & NFSMNT_RESVPORT) {
  183                 struct sockopt sopt;
  184                 int ip, ip2, len;
  185                 struct sockaddr_in6 ssin;       /* big enough for both AF_INET and AF_INET6 */
  186                 struct sockaddr *sa;
  187 
  188                 bzero(&sopt, sizeof sopt);
  189                 switch(saddr->sa_family) {
  190                 case AF_INET:
  191                         sopt.sopt_level = IPPROTO_IP;
  192                         sopt.sopt_name = IP_PORTRANGE;
  193                         ip = IP_PORTRANGE_LOW;
  194                         ip2 = IP_PORTRANGE_DEFAULT;
  195                         len = sizeof (struct sockaddr_in);
  196                         break;
  197 #ifdef INET6
  198                 case AF_INET6:
  199                         sopt.sopt_level = IPPROTO_IPV6;
  200                         sopt.sopt_name = IPV6_PORTRANGE;
  201                         ip = IPV6_PORTRANGE_LOW;
  202                         ip2 = IPV6_PORTRANGE_DEFAULT;
  203                         len = sizeof (struct sockaddr_in6);
  204                         break;
  205 #endif
  206                 default:
  207                         goto noresvport;        /* unknown family: skip reserved-port binding */
  208                 }
  209                 sa = (struct sockaddr *)&ssin;
  210                 bzero(sa, len);
  211                 sa->sa_len = len;
  212                 sa->sa_family = saddr->sa_family;
  213                 sopt.sopt_dir = SOPT_SET;
  214                 sopt.sopt_val = (void *)&ip;
  215                 sopt.sopt_valsize = sizeof(ip);
  216                 error = sosetopt(so, &sopt);    /* select the low (reserved) port range */
  217                 if (error)
  218                         goto bad;
  219                 error = sobind(so, sa, td);     /* wildcard bind picks a port from that range */
  220                 if (error)
  221                         goto bad;
  222                 ip = ip2;
  223                 error = sosetopt(so, &sopt);    /* restore the default port range afterwards */
  224                 if (error)
  225                         goto bad;
  226         noresvport: ;
  227         }
  228 
  229         /*
  230          * Protocols that do not require connections may be optionally left
  231          * unconnected for servers that reply from a port other than NFS_PORT.
  232          */
  233         if (nmp->nm_flag & NFSMNT_NOCONN) {
  234                 if (nmp->nm_soflags & PR_CONNREQUIRED) {
  235                         error = ENOTCONN;
  236                         goto bad;
  237                 }
  238         } else {
  239                 error = soconnect(so, nmp->nm_nam, td);
  240                 if (error)
  241                         goto bad;
  242 
  243                 /*
  244                  * Wait for the connection to complete. Cribbed from the
  245                  * connect system call but with the wait timing out so
  246                  * that interruptible mounts don't hang here for a long time.
  247                  */
  248                 s = splnet();
  249                 while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
  250                         (void) tsleep(&so->so_timeo,
  251                             PSOCK, "nfscon", 2 * hz);   /* wake every 2s to poll for signals */
  252                         if ((so->so_state & SS_ISCONNECTING) &&
  253                             so->so_error == 0 && rep &&
  254                             (error = nfs_sigintr(nmp, rep, rep->r_td)) != 0) {
  255                                 so->so_state &= ~SS_ISCONNECTING;
  256                                 splx(s);
  257                                 goto bad;
  258                         }
  259                 }
  260                 if (so->so_error) {
  261                         error = so->so_error;
  262                         so->so_error = 0;
  263                         splx(s);
  264                         goto bad;
  265                 }
  266                 splx(s);
  267         }
  268         so->so_rcv.sb_timeo = 5 * hz;
  269         so->so_snd.sb_timeo = 5 * hz;
  270 
  271         /*
  272          * Get buffer reservation size from sysctl, but impose reasonable
  273          * limits.
  274          */
  275         pktscale = nfs_bufpackets;
  276         if (pktscale < 2)
  277                 pktscale = 2;
  278         if (pktscale > 64)
  279                 pktscale = 64;
  280 
  281         if (nmp->nm_sotype == SOCK_DGRAM) {
  282                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
  283                 rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
  284                     NFS_MAXPKTHDR) * pktscale;
  285         } else if (nmp->nm_sotype == SOCK_SEQPACKET) {
  286                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
  287                 rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
  288                     NFS_MAXPKTHDR) * pktscale;
  289         } else {
  290                 if (nmp->nm_sotype != SOCK_STREAM)
  291                         panic("nfscon sotype");
  292                 if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
  293                         struct sockopt sopt;
  294                         int val;
  295 
  296                         bzero(&sopt, sizeof sopt);
  297                         sopt.sopt_dir = SOPT_SET;
  298                         sopt.sopt_level = SOL_SOCKET;
  299                         sopt.sopt_name = SO_KEEPALIVE;
  300                         sopt.sopt_val = &val;
  301                         sopt.sopt_valsize = sizeof val;
  302                         val = 1;
  303                         sosetopt(so, &sopt);    /* best effort; failure is not fatal here */
  304                 }
  305                 if (so->so_proto->pr_protocol == IPPROTO_TCP) {
  306                         struct sockopt sopt;
  307                         int val;
  308 
  309                         bzero(&sopt, sizeof sopt);
  310                         sopt.sopt_dir = SOPT_SET;
  311                         sopt.sopt_level = IPPROTO_TCP;
  312                         sopt.sopt_name = TCP_NODELAY;
  313                         sopt.sopt_val = &val;
  314                         sopt.sopt_valsize = sizeof val;
  315                         val = 1;
  316                         sosetopt(so, &sopt);    /* disable Nagle; rpc traffic is latency sensitive */
  317                 }
  318                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
  319                     sizeof (u_int32_t)) * pktscale;     /* u_int32_t accounts for the RPC record mark */
  320                 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
  321                     sizeof (u_int32_t)) * pktscale;
  322         }
  323         error = soreserve(so, sndreserve, rcvreserve);
  324         if (error)
  325                 goto bad;
  326         so->so_rcv.sb_flags |= SB_NOINTR;
  327         so->so_snd.sb_flags |= SB_NOINTR;
  328 
  329         /* Initialize other non-zero congestion variables */
  330         nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
  331                 nmp->nm_srtt[3] = (NFS_TIMEO << 3);     /* srtt is kept scaled; see NFS_RTO above */
  332         nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
  333                 nmp->nm_sdrtt[3] = 0;
  334         nmp->nm_cwnd = NFS_MAXCWND / 2;     /* Initial send window */
  335         nmp->nm_sent = 0;
  336         nmp->nm_timeouts = 0;
  337         return (0);
  338 
  339 bad:
  340         nfs_disconnect(nmp);    /* caller keeps ownership of the sockaddr; see header comment */
  341         return (error);
  342 }
  343 
  344 /*
  345  * Reconnect routine:
  346  * Called when a connection is broken on a reliable protocol.
  347  * - clean up the old socket
  348  * - nfs_connect() again
  349  * - set R_MUSTRESEND for all outstanding requests on mount point
  350  * If this fails the mount point is DEAD!
  351  * nb: Must be called with the nfs_sndlock() set on the mount point.
  352  */
  353 static int
  354 nfs_reconnect(struct nfsreq *rep)
  355 {
  356         struct nfsreq *rp;
  357         struct nfsmount *nmp = rep->r_nmp;
  358         int error;
  359 
  360         nfs_disconnect(nmp);
  361         while ((error = nfs_connect(nmp, rep)) != 0) {
  362                 if (error == EINTR || error == ERESTART)
  363                         return (EINTR);         /* give up only when interrupted */
  364                 (void) tsleep(&lbolt, PSOCK, "nfscon", 0);      /* pause briefly, then retry forever */
  365         }
  366 
  367         /*
  368          * Loop through outstanding request list and fix up all requests
  369          * on old socket.
  370          */
  371         TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
  372                 if (rp->r_nmp == nmp)
  373                         rp->r_flags |= R_MUSTRESEND;    /* force resend on the new socket */
  374         }
  375         return (0);
  376 }
  377 
  378 /*
  379  * NFS disconnect. Clean up and unlink.
  380  */
  381 void
  382 nfs_disconnect(struct nfsmount *nmp)
  383 {
  384         struct socket *so;
  385 
  386         GIANT_REQUIRED;         /* XXX until socket locking done */
  387 
  388         if (nmp->nm_so) {
  389                 so = nmp->nm_so;
  390                 nmp->nm_so = NULL;
  391                 soshutdown(so, 2);
  392                 soclose(so);
  393         }
  394 }
  395 
  396 void
  397 nfs_safedisconnect(struct nfsmount *nmp)
  398 {
  399         struct nfsreq dummyreq;
  400 
  401         bzero(&dummyreq, sizeof(dummyreq));
  402         dummyreq.r_nmp = nmp;
  403         nfs_rcvlock(&dummyreq);
  404         nfs_disconnect(nmp);
  405         nfs_rcvunlock(&dummyreq);
  406 }
  407 
  408 /*
  409  * This is the nfs send routine. For connection based socket types, it
  410  * must be called with an nfs_sndlock() on the socket.
  411  * - return EINTR if the RPC is terminated, 0 otherwise
  412  * - set R_MUSTRESEND if the send fails for any reason
  413  * - do any cleanup required by recoverable socket errors (?)
  414  */
  415 int
  416 nfs_send(struct socket *so, struct sockaddr *nam, struct mbuf *top,
  417     struct nfsreq *rep)
  418 {
  419         struct sockaddr *sendnam;
  420         int error, soflags, flags;
  421 
  422         GIANT_REQUIRED;         /* XXX until socket locking done */
  423 
  424         KASSERT(rep, ("nfs_send: called with rep == NULL"));
  425 
  426         if (rep->r_flags & R_SOFTTERM) {        /* request already terminated: drop the mbufs */
  427                 m_freem(top);
  428                 return (EINTR);
  429         }
  430         if ((so = rep->r_nmp->nm_so) == NULL) { /* no socket right now; mark for later resend */
  431                 rep->r_flags |= R_MUSTRESEND;
  432                 m_freem(top);
  433                 return (0);
  434         }
  435         rep->r_flags &= ~R_MUSTRESEND;
  436         soflags = rep->r_nmp->nm_soflags;
  437 
  438         if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
  439                 sendnam = NULL;         /* connected socket: no explicit destination */
  440         else
  441                 sendnam = nam;
  442         if (so->so_type == SOCK_SEQPACKET)
  443                 flags = MSG_EOR;        /* mark the end of the record */
  444         else
  445                 flags = 0;
  446 
  447         error = so->so_proto->pr_usrreqs->pru_sosend(so, sendnam, 0, top, 0,
  448                                                      flags, curthread /*XXX*/);
  449         if (error == ENOBUFS && so->so_type == SOCK_DGRAM) {
  450                 error = 0;              /* datagram buffer shortage: just resend later */
  451                 rep->r_flags |= R_MUSTRESEND;
  452         }
  453 
  454         if (error) {
  455                 log(LOG_INFO, "nfs send error %d for server %s\n", error,
  456                     rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
  457                 /*
  458                  * Deal with errors for the client side.
  459                  */
  460                 if (rep->r_flags & R_SOFTTERM)
  461                         error = EINTR;
  462                 else
  463                         rep->r_flags |= R_MUSTRESEND;
  464 
  465                 /*
  466                  * Handle any recoverable (soft) socket errors here. (?)
  467                  */
  468                 if (error != EINTR && error != ERESTART &&
  469                         error != EWOULDBLOCK && error != EPIPE)
  470                         error = 0;      /* swallow other errors; R_MUSTRESEND covers them */
  471         }
  472         return (error);
  473 }
  474 
  475 /*
  476  * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
  477  * done by soreceive(), but for SOCK_STREAM we must deal with the Record
  478  * Mark and consolidate the data into a new mbuf list.
  479  * nb: Sometimes TCP passes the data up to soreceive() in long lists of
  480  *     small mbufs.
  481  * For SOCK_STREAM we must be very careful to read an entire record once
  482  * we have read any of it, even if the system call has been interrupted.
  483  */
  484 static int
  485 nfs_receive(struct nfsreq *rep, struct sockaddr **aname, struct mbuf **mp)
  486 {
  487         struct socket *so;
  488         struct uio auio;
  489         struct iovec aio;
  490         struct mbuf *m;
  491         struct mbuf *control;
  492         u_int32_t len;
  493         struct sockaddr **getnam;
  494         int error, sotype, rcvflg;
  495         struct thread *td = curthread;  /* XXX */
  496 
  497         GIANT_REQUIRED;         /* XXX until socket locking done */
  498 
  499         /*
  500          * Set up arguments for soreceive()
  501          */
  502         *mp = NULL;
  503         *aname = NULL;
  504         sotype = rep->r_nmp->nm_sotype;
  505 
  506         /*
  507          * For reliable protocols, lock against other senders/receivers
  508          * in case a reconnect is necessary.
  509          * For SOCK_STREAM, first get the Record Mark to find out how much
  510          * more there is to get.
  511          * We must lock the socket against other receivers
  512          * until we have an entire rpc request/reply.
  513          */
  514         if (sotype != SOCK_DGRAM) {
  515                 error = nfs_sndlock(rep);
  516                 if (error)
  517                         return (error);
  518 tryagain:
  519                 /*
  520                  * Check for fatal errors and resending request.
  521                  */
  522                 /*
  523                  * Ugh: If a reconnect attempt just happened, nm_so
  524                  * would have changed. NULL indicates a failed
  525                  * attempt that has essentially shut down this
  526                  * mount point.
  527                  */
  528                 if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) {
  529                         nfs_sndunlock(rep);
  530                         return (EINTR);
  531                 }
  532                 so = rep->r_nmp->nm_so;
  533                 if (!so) {
  534                         error = nfs_reconnect(rep);
  535                         if (error) {
  536                                 nfs_sndunlock(rep);
  537                                 return (error);
  538                         }
  539                         goto tryagain;
  540                 }
  541                 while (rep->r_flags & R_MUSTRESEND) {
  542                         m = m_copym(rep->r_mreq, 0, M_COPYALL, M_TRYWAIT);      /* send a copy; keep r_mreq for future retries */
  543                         nfsstats.rpcretries++;
  544                         error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
  545                         if (error) {
  546                                 if (error == EINTR || error == ERESTART ||
  547                                     (error = nfs_reconnect(rep)) != 0) {
  548                                         nfs_sndunlock(rep);
  549                                         return (error);
  550                                 }
  551                                 goto tryagain;
  552                         }
  553                 }
  554                 nfs_sndunlock(rep);
  555                 if (sotype == SOCK_STREAM) {
  556                         aio.iov_base = (caddr_t) &len;  /* step 1: read the 4-byte RPC record mark */
  557                         aio.iov_len = sizeof(u_int32_t);
  558                         auio.uio_iov = &aio;
  559                         auio.uio_iovcnt = 1;
  560                         auio.uio_segflg = UIO_SYSSPACE;
  561                         auio.uio_rw = UIO_READ;
  562                         auio.uio_offset = 0;
  563                         auio.uio_resid = sizeof(u_int32_t);
  564                         auio.uio_td = td;
  565                         do {
  566                            rcvflg = MSG_WAITALL;
  567                            error = so->so_proto->pr_usrreqs->pru_soreceive
  568                                    (so, NULL, &auio, NULL, NULL, &rcvflg);
  569                            if (error == EWOULDBLOCK && rep) {
  570                                 if (rep->r_flags & R_SOFTTERM)
  571                                         return (EINTR);
  572                            }
  573                         } while (error == EWOULDBLOCK);
  574                         if (!error && auio.uio_resid > 0) {
  575                             /*
  576                              * Don't log a 0 byte receive; it means
  577                              * that the socket has been closed, and
  578                              * can happen during normal operation
  579                              * (forcible unmount or Solaris server).
  580                              */
  581                             if (auio.uio_resid != sizeof (u_int32_t))
  582                             log(LOG_INFO,
  583                                  "short receive (%d/%d) from nfs server %s\n",
  584                                  (int)(sizeof(u_int32_t) - auio.uio_resid),
  585                                  (int)sizeof(u_int32_t),
  586                                  rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
  587                             error = EPIPE;      /* set even for the unlogged 0-byte case */
  588                         }
  589                         if (error)
  590                                 goto errout;
  591                         len = ntohl(len) & ~0x80000000;     /* clear the record mark's last-fragment bit */
  592                         /*
  593                          * This is SERIOUS! We are out of sync with the sender
  594                          * and forcing a disconnect/reconnect is all I can do.
  595                          */
  596                         if (len > NFS_MAXPACKET) {
  597                             log(LOG_ERR, "%s (%d) from nfs server %s\n",
  598                                 "impossible packet length",
  599                                 len,
  600                                 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
  601                             error = EFBIG;
  602                             goto errout;
  603                         }
  604                         auio.uio_resid = len;   /* step 2: read the record body itself */
  605                         do {
  606                             rcvflg = MSG_WAITALL;
  607                             error =  so->so_proto->pr_usrreqs->pru_soreceive
  608                                     (so, NULL,
  609                                      &auio, mp, NULL, &rcvflg);
  610                         } while (error == EWOULDBLOCK || error == EINTR ||
  611                                  error == ERESTART);
  612                         if (!error && auio.uio_resid > 0) {
  613                             if (len != auio.uio_resid)
  614                             log(LOG_INFO,
  615                                 "short receive (%d/%d) from nfs server %s\n",
  616                                 len - auio.uio_resid, len,
  617                                 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
  618                             error = EPIPE;
  619                         }
  620                 } else {
  621                         /*
  622                          * NB: Since uio_resid is big, MSG_WAITALL is ignored
  623                          * and soreceive() will return when it has either a
  624                          * control msg or a data msg.
  625                          * We have no use for control msg., but must grab them
  626                          * and then throw them away so we know what is going
  627                          * on.
  628                          */
  629                         auio.uio_resid = len = 100000000; /* Anything Big */
  630                         auio.uio_td = td;
  631                         do {
  632                             rcvflg = 0;
  633                             error =  so->so_proto->pr_usrreqs->pru_soreceive
  634                                     (so, NULL,
  635                                 &auio, mp, &control, &rcvflg);
  636                             if (control)
  637                                 m_freem(control);
  638                             if (error == EWOULDBLOCK && rep) {
  639                                 if (rep->r_flags & R_SOFTTERM)
  640                                         return (EINTR);
  641                             }
  642                         } while (error == EWOULDBLOCK ||
  643                                  (!error && *mp == NULL && control));
  644                         if ((rcvflg & MSG_EOR) == 0)
  645                                 printf("Egad!!\n");    /* record boundary missing: should not happen */
  646                         if (!error && *mp == NULL)
  647                                 error = EPIPE;
  648                         len -= auio.uio_resid;
  649                 }
  650 errout:
  651                 if (error && error != EINTR && error != ERESTART) {
  652                         m_freem(*mp);
  653                         *mp = NULL;
  654                         if (error != EPIPE)
  655                                 log(LOG_INFO,
  656                                     "receive error %d from nfs server %s\n",
  657                                     error,
  658                                  rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
  659                         error = nfs_sndlock(rep);
  660                         if (!error) {
  661                                 error = nfs_reconnect(rep);     /* resync by reconnecting */
  662                                 if (!error)
  663                                         goto tryagain;
  664                                 else
  665                                         nfs_sndunlock(rep);
  666                         }
  667                 }
  668         } else {
  669                 if ((so = rep->r_nmp->nm_so) == NULL)
  670                         return (EACCES);
  671                 if (so->so_state & SS_ISCONNECTED)
  672                         getnam = NULL;          /* connected datagram socket: peer is implicit */
  673                 else
  674                         getnam = aname;         /* hand the sender's address back to the caller */
  675                 auio.uio_resid = len = 1000000;
  676                 auio.uio_td = td;
  677                 do {
  678                         rcvflg = 0;
  679                         error =  so->so_proto->pr_usrreqs->pru_soreceive
  680                                 (so, getnam, &auio, mp,
  681                                 NULL, &rcvflg);
  682                         if (error == EWOULDBLOCK &&
  683                             (rep->r_flags & R_SOFTTERM))
  684                                 return (EINTR);
  685                 } while (error == EWOULDBLOCK);
  686                 len -= auio.uio_resid;
  687         }
  688         if (error) {
  689                 m_freem(*mp);
  690                 *mp = NULL;
  691         }
  692         /*
  693          * Search for any mbufs that are not a multiple of 4 bytes long
  694          * or with m_data not longword aligned.
  695          * These could cause pointer alignment problems, so copy them to
  696          * well aligned mbufs.
  697          */
  698         nfs_realign(mp, 5 * NFSX_UNSIGNED);
  699         return (error);
  700 }
  701 
  702 /*
  703  * Implement receipt of reply on a socket.
  704  * We must search through the list of received datagrams matching them
  705  * with outstanding requests using the xid, until ours is found.
  706  */
/*
 * Wait for and consume the RPC reply for 'myrep'.  While holding the
 * receive lock this thread also matches and delivers replies belonging
 * to other outstanding requests on the nfs_reqq list.
 * Returns 0 once myrep->r_mrep has been filled in (or the datagram was
 * consumed elsewhere), or an errno on failure.
 */
/* ARGSUSED */
static int
nfs_reply(struct nfsreq *myrep)
{
	struct nfsreq *rep;
	struct nfsmount *nmp = myrep->r_nmp;
	int32_t t1;
	struct mbuf *mrep, *md;
	struct sockaddr *nam;
	u_int32_t rxid, *tl;
	caddr_t dpos;
	int error;

	/*
	 * Loop around until we get our own reply
	 */
	for (;;) {
		/*
		 * Lock against other receivers so that I don't get stuck in
		 * sbwait() after someone else has received my reply for me.
		 * Also necessary for connection based protocols to avoid
		 * race conditions during a reconnect.
		 * If nfs_rcvlock() returns EALREADY, that means that
		 * the reply has already been received by another
		 * process and we can return immediately.  In this
		 * case, the lock is not taken to avoid races with
		 * other processes.
		 */
		error = nfs_rcvlock(myrep);
		if (error == EALREADY)
			return (0);
		if (error)
			return (error);
		/*
		 * Get the next Rpc reply off the socket
		 */
		error = nfs_receive(myrep, &nam, &mrep);
		nfs_rcvunlock(myrep);
		if (error) {

			/*
			 * Ignore routing errors on connectionless protocols??
			 */
			if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) {
				nmp->nm_so->so_error = 0;
				if (myrep->r_flags & R_GETONEREP)
					return (0);
				continue;
			}
			return (error);
		}
		if (nam)
			FREE(nam, M_SONAME);

		/*
		 * Get the xid and check that it is an rpc reply
		 */
		md = mrep;
		dpos = mtod(md, caddr_t);
		/* NB: nfsm_dissect() jumps to nfsmout on a short mbuf chain. */
		tl = nfsm_dissect(u_int32_t *, 2 * NFSX_UNSIGNED);
		rxid = *tl++;
		if (*tl != rpc_reply) {
			nfsstats.rpcinvalid++;
			m_freem(mrep);
nfsmout:
			if (myrep->r_flags & R_GETONEREP)
				return (0);
			continue;
		}

		/*
		 * Loop through the request list to match up the reply
		 * If no match, just drop the datagram
		 */
		TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
			if (rep->r_mrep == NULL && rxid == rep->r_xid) {
				/* Found it.. */
				rep->r_mrep = mrep;
				rep->r_md = md;
				rep->r_dpos = dpos;
				/*
				 * Update congestion window.
				 * Do the additive increase of
				 * one rpc/rtt.
				 */
				if (nmp->nm_cwnd <= nmp->nm_sent) {
					nmp->nm_cwnd +=
					   (NFS_CWNDSCALE * NFS_CWNDSCALE +
					   (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
					if (nmp->nm_cwnd > NFS_MAXCWND)
						nmp->nm_cwnd = NFS_MAXCWND;
				}
				/* Reply received: release its cwnd slot. */
				if (rep->r_flags & R_SENT) {
					rep->r_flags &= ~R_SENT;
					nmp->nm_sent -= NFS_CWNDSCALE;
				}
				/*
				 * Update rtt using a gain of 0.125 on the mean
				 * and a gain of 0.25 on the deviation.
				 */
				if (rep->r_flags & R_TIMING) {
					/*
					 * Since the timer resolution of
					 * NFS_HZ is so coarse, it can often
					 * result in r_rtt == 0. Since
					 * r_rtt == N means that the actual
					 * rtt is between N+dt and N+2-dt ticks,
					 * add 1.
					 */
					t1 = rep->r_rtt + 1;
					t1 -= (NFS_SRTT(rep) >> 3);
					NFS_SRTT(rep) += t1;
					if (t1 < 0)
						t1 = -t1;
					t1 -= (NFS_SDRTT(rep) >> 2);
					NFS_SDRTT(rep) += t1;
				}
				nmp->nm_timeouts = 0;
				break;
			}
		}
		/*
		 * If not matched to a request, drop it.
		 * If it's mine, get out.
		 */
		if (rep == 0) {
			nfsstats.rpcunexpected++;
			m_freem(mrep);
		} else if (rep == myrep) {
			if (rep->r_mrep == NULL)
				panic("nfsreply nil");
			return (0);
		}
		if (myrep->r_flags & R_GETONEREP)
			return (0);
	}
}
  844 
  845 /*
  846  * nfs_request - goes something like this
  847  *      - fill in request struct
  848  *      - links it into list
  849  *      - calls nfs_send() for first transmit
  850  *      - calls nfs_receive() to get reply
  851  *      - break down rpc header and return with nfs reply pointed to
  852  *        by mrep or error
  853  * nb: always frees up mreq mbuf list
  854  */
/* XXX overloaded before; now used only for the TRYLATER backoff in nfs_request() */
#define NQ_TRYLATERDEL  15      /* Initial try later delay (sec) */
  857 
int
nfs_request(struct vnode *vp, struct mbuf *mrest, int procnum,
    struct thread *td, struct ucred *cred, struct mbuf **mrp,
    struct mbuf **mdp, caddr_t *dposp)
{
	struct mbuf *mrep, *m2;
	struct nfsreq *rep;
	u_int32_t *tl;
	int i;
	struct nfsmount *nmp;
	struct mbuf *m, *md, *mheadend;
	time_t waituntil;
	caddr_t dpos;
	int s, error = 0, mrest_len, auth_len, auth_type;
	int trylater_delay = NQ_TRYLATERDEL, trylater_cnt = 0;
	u_int32_t xid;

	/* Reject requests while attempting a forced unmount. */
	if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) {
		m_freem(mrest);
		return (ESTALE);
	}
	nmp = VFSTONFS(vp->v_mount);
	/* NFSv4 mounts are handled by a separate RPC engine. */
	if ((nmp->nm_flag & NFSMNT_NFSV4) != 0)
		return nfs4_request(vp, mrest, procnum, td, cred, mrp, mdp, dposp);
	MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK);
	rep->r_nmp = nmp;
	rep->r_vp = vp;
	rep->r_td = td;
	rep->r_procnum = procnum;
	mrest_len = m_length(mrest, NULL);

	/*
	 * Get the RPC header with authorization.
	 */
	auth_type = RPCAUTH_UNIX;
	if (cred->cr_ngroups < 1)
		panic("nfsreq nogrps");
	/* AUTH_UNIX body: 5 fixed words plus at most nm_numgrps gids. */
	auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
		nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
		5 * NFSX_UNSIGNED;
	m = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len,
	     mrest, mrest_len, &mheadend, &xid);

	/*
	 * For stream protocols, insert a Sun RPC Record Mark.
	 */
	if (nmp->nm_sotype == SOCK_STREAM) {
		M_PREPEND(m, NFSX_UNSIGNED, M_TRYWAIT);
		/* High bit marks this as the final fragment of the record. */
		*mtod(m, u_int32_t *) = htonl(0x80000000 |
			 (m->m_pkthdr.len - NFSX_UNSIGNED));
	}
	rep->r_mreq = m;
	rep->r_xid = xid;
tryagain:
	if (nmp->nm_flag & NFSMNT_SOFT)
		rep->r_retry = nmp->nm_retry;
	else
		rep->r_retry = NFS_MAXREXMIT + 1;	/* past clip limit */
	rep->r_rtt = rep->r_rexmit = 0;
	if (proct[procnum] > 0)
		rep->r_flags = R_TIMING;
	else
		rep->r_flags = 0;
	rep->r_mrep = NULL;

	/*
	 * Do the client side RPC.
	 */
	nfsstats.rpcrequests++;
	/*
	 * Chain request into list of outstanding requests. Be sure
	 * to put it LAST so timer finds oldest requests first.
	 */
	s = splsoftclock();
	TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);

	/*
	 * If backing off another request or avoiding congestion, don't
	 * send this one now but let timer do it. If not timing a request,
	 * do it now.
	 */
	if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
		(nmp->nm_flag & NFSMNT_DUMBTIMR) ||
		nmp->nm_sent < nmp->nm_cwnd)) {
		splx(s);
		if (nmp->nm_soflags & PR_CONNREQUIRED)
			error = nfs_sndlock(rep);
		if (!error) {
			/* Send a copy; r_mreq is kept for retransmission. */
			m2 = m_copym(m, 0, M_COPYALL, M_TRYWAIT);
			error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
			if (nmp->nm_soflags & PR_CONNREQUIRED)
				nfs_sndunlock(rep);
		}
		if (!error && (rep->r_flags & R_MUSTRESEND) == 0) {
			/* Charge this request against the congestion window. */
			nmp->nm_sent += NFS_CWNDSCALE;
			rep->r_flags |= R_SENT;
		}
	} else {
		splx(s);
		/* r_rtt == -1 leaves the initial send to nfs_timer(). */
		rep->r_rtt = -1;
	}

	/*
	 * Wait for the reply from our send or the timer's.
	 */
	if (!error || error == EPIPE)
		error = nfs_reply(rep);

	/*
	 * RPC done, unlink the request.
	 */
	s = splsoftclock();
	TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
	splx(s);

	/*
	 * Decrement the outstanding request count.
	 */
	if (rep->r_flags & R_SENT) {
		rep->r_flags &= ~R_SENT;	/* paranoia */
		nmp->nm_sent -= NFS_CWNDSCALE;
	}

	/*
	 * If there was a successful reply and a tprintf msg.
	 * tprintf a response.
	 */
	if (!error && (rep->r_flags & R_TPRINTFMSG))
		nfs_msg(rep->r_td, nmp->nm_mountp->mnt_stat.f_mntfromname,
		    "is alive again");
	mrep = rep->r_mrep;
	md = rep->r_md;
	dpos = rep->r_dpos;
	if (error) {
		m_freem(rep->r_mreq);
		free((caddr_t)rep, M_NFSREQ);
		return (error);
	}

	/*
	 * break down the rpc header and check if ok
	 * NB: nfsm_dissect()/nfsm_adv() jump to nfsmout on malformed data.
	 */
	tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED);
	if (*tl++ == rpc_msgdenied) {
		if (*tl == rpc_mismatch)
			error = EOPNOTSUPP;
		else
			error = EACCES;
		m_freem(mrep);
		m_freem(rep->r_mreq);
		free((caddr_t)rep, M_NFSREQ);
		return (error);
	}

	/*
	 * Just throw away any verifyer (ie: kerberos etc).
	 */
	i = fxdr_unsigned(int, *tl++);		/* verf type */
	i = fxdr_unsigned(int32_t, *tl);	/* len */
	if (i > 0)
		nfsm_adv(nfsm_rndup(i));
	tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
	/* 0 == ok */
	if (*tl == 0) {
		tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
		if (*tl != 0) {
			error = fxdr_unsigned(int, *tl);
			if ((nmp->nm_flag & NFSMNT_NFSV3) &&
				error == NFSERR_TRYLATER) {
				/*
				 * Server said try later: sleep, back off
				 * the delay exponentially, and resend.
				 */
				m_freem(mrep);
				error = 0;
				waituntil = time_second + trylater_delay;
				while (time_second < waituntil)
					(void) tsleep(&lbolt,
						PSOCK, "nqnfstry", 0);
				trylater_delay *= nfs_backoff[trylater_cnt];
				if (trylater_cnt < NFS_NBACKOFF - 1)
					trylater_cnt++;
				goto tryagain;
			}

			/*
			 * If the File Handle was stale, invalidate the
			 * lookup cache, just in case.
			 */
			if (error == ESTALE)
				cache_purge(vp);
			if (nmp->nm_flag & NFSMNT_NFSV3) {
				*mrp = mrep;
				*mdp = md;
				*dposp = dpos;
				error |= NFSERR_RETERR;
			} else
				m_freem(mrep);
			m_freem(rep->r_mreq);
			free((caddr_t)rep, M_NFSREQ);
			return (error);
		}

		/* Success: hand the reply mbuf chain back to the caller. */
		*mrp = mrep;
		*mdp = md;
		*dposp = dpos;
		m_freem(rep->r_mreq);
		FREE((caddr_t)rep, M_NFSREQ);
		return (0);
	}
	m_freem(mrep);
	error = EPROTONOSUPPORT;
nfsmout:
	m_freem(rep->r_mreq);
	free((caddr_t)rep, M_NFSREQ);
	return (error);
}
 1072 
 1073 /*
 1074  * Nfs timer routine
 * Scan the nfsreq list and retransmit any requests that have timed out
 1076  * To avoid retransmission attempts on STREAM sockets (in the future) make
 1077  * sure to set the r_retry field to 0 (implies nm_retry == 0).
 1078  */
void
nfs_timer(void *arg)
{
	struct nfsreq *rep;
	struct mbuf *m;
	struct socket *so;
	struct nfsmount *nmp;
	int timeo;
	int s, error;
	struct thread *td;

	td = &thread0; /* XXX for credentials, may break if sleep */
	s = splnet();
	TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
		nmp = rep->r_nmp;
		/* Skip requests already answered or soft-terminated. */
		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
			continue;
		if (nfs_sigintr(nmp, rep, rep->r_td)) {
			nfs_softterm(rep);
			continue;
		}
		if (rep->r_rtt >= 0) {
			/* Tick the request and check it against its RTO. */
			rep->r_rtt++;
			if (nmp->nm_flag & NFSMNT_DUMBTIMR)
				timeo = nmp->nm_timeo;
			else
				timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
			if (nmp->nm_timeouts > 0)
				timeo *= nfs_backoff[nmp->nm_timeouts - 1];
			if (rep->r_rtt <= timeo)
				continue;
			if (nmp->nm_timeouts < NFS_NBACKOFF)
				nmp->nm_timeouts++;
		}
		/*
		 * Check for server not responding
		 */
		if ((rep->r_flags & R_TPRINTFMSG) == 0 &&
		     rep->r_rexmit > nmp->nm_deadthresh) {
			nfs_msg(rep->r_td,
			    nmp->nm_mountp->mnt_stat.f_mntfromname,
			    "not responding");
			rep->r_flags |= R_TPRINTFMSG;
		}
		if (rep->r_rexmit >= rep->r_retry) {	/* too many */
			nfsstats.rpctimeouts++;
			nfs_softterm(rep);
			continue;
		}
		/* Non-datagram sockets only back off; no retransmit here. */
		if (nmp->nm_sotype != SOCK_DGRAM) {
			if (++rep->r_rexmit > NFS_MAXREXMIT)
				rep->r_rexmit = NFS_MAXREXMIT;
			continue;
		}
		if ((so = nmp->nm_so) == NULL)
			continue;

		/*
		 * If there is enough space and the window allows..
		 *	Resend it
		 * Set r_rtt to -1 in case we fail to send it now.
		 */
		rep->r_rtt = -1;
		if (sbspace(&so->so_snd) >= rep->r_mreq->m_pkthdr.len &&
		   ((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
		    (rep->r_flags & R_SENT) ||
		    nmp->nm_sent < nmp->nm_cwnd) &&
		   (m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))){
			if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
			    error = (*so->so_proto->pr_usrreqs->pru_send)
				    (so, 0, m, NULL, NULL, td);
			else
			    error = (*so->so_proto->pr_usrreqs->pru_send)
				    (so, 0, m, nmp->nm_nam, NULL, td);
			if (error) {
				if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
					so->so_error = 0;
			} else {
				/*
				 * Iff first send, start timing
				 * else turn timing off, backoff timer
				 * and divide congestion window by 2.
				 */
				if (rep->r_flags & R_SENT) {
					rep->r_flags &= ~R_TIMING;
					if (++rep->r_rexmit > NFS_MAXREXMIT)
						rep->r_rexmit = NFS_MAXREXMIT;
					nmp->nm_cwnd >>= 1;
					if (nmp->nm_cwnd < NFS_CWNDSCALE)
						nmp->nm_cwnd = NFS_CWNDSCALE;
					nfsstats.rpcretries++;
				} else {
					rep->r_flags |= R_SENT;
					nmp->nm_sent += NFS_CWNDSCALE;
				}
				rep->r_rtt = 0;
			}
		}
	}
	splx(s);
	/* Re-arm ourselves; the timer fires every nfs_ticks. */
	nfs_timer_handle = timeout(nfs_timer, NULL, nfs_ticks);
}
 1181 
 1182 /*
 1183  * Mark all of an nfs mount's outstanding requests with R_SOFTTERM and
 1184  * wait for all requests to complete. This is used by forced unmounts
 1185  * to terminate any outstanding RPCs.
 1186  */
 1187 int
 1188 nfs_nmcancelreqs(nmp)
 1189         struct nfsmount *nmp;
 1190 {
 1191         struct nfsreq *req;
 1192         int i, s;
 1193 
 1194         s = splnet();
 1195         TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
 1196                 if (nmp != req->r_nmp || req->r_mrep != NULL ||
 1197                     (req->r_flags & R_SOFTTERM))
 1198                         continue;
 1199                 nfs_softterm(req);
 1200         }
 1201         splx(s);
 1202 
 1203         for (i = 0; i < 30; i++) {
 1204                 s = splnet();
 1205                 TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
 1206                         if (nmp == req->r_nmp)
 1207                                 break;
 1208                 }
 1209                 splx(s);
 1210                 if (req == NULL)
 1211                         return (0);
 1212                 tsleep(&lbolt, PSOCK, "nfscancel", 0);
 1213         }
 1214         return (EBUSY);
 1215 }
 1216 
 1217 /*
 1218  * Flag a request as being about to terminate (due to NFSMNT_INT/NFSMNT_SOFT).
 1219  * The nm_send count is decremented now to avoid deadlocks when the process in
 1220  * soreceive() hasn't yet managed to send its own request.
 1221  */
 1222 
 1223 static void
 1224 nfs_softterm(struct nfsreq *rep)
 1225 {
 1226 
 1227         rep->r_flags |= R_SOFTTERM;
 1228         if (rep->r_flags & R_SENT) {
 1229                 rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
 1230                 rep->r_flags &= ~R_SENT;
 1231         }
 1232 }
 1233 
 1234 /*
 1235  * Test for a termination condition pending on the process.
 1236  * This is used for NFSMNT_INT mounts.
 1237  */
int
nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct thread *td)
{
	struct proc *p;
	sigset_t tmpset;

	/* NFSv4 mounts use their own interruption test. */
	if ((nmp->nm_flag & NFSMNT_NFSV4) != 0)
		return nfs4_sigintr(nmp, rep, td);
	if (rep && (rep->r_flags & R_SOFTTERM))
		return (EINTR);
	/* Terminate all requests while attempting a forced unmount. */
	if (nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF)
		return (EINTR);
	/* Only interruptible (intr) mounts honor pending signals. */
	if (!(nmp->nm_flag & NFSMNT_INT))
		return (0);
	if (td == NULL)
		return (0);

	/* Deliverable = pending, not masked by the thread, not ignored. */
	p = td->td_proc;
	PROC_LOCK(p);
	tmpset = p->p_siglist;
	SIGSETNAND(tmpset, td->td_sigmask);
	mtx_lock(&p->p_sigacts->ps_mtx);
	SIGSETNAND(tmpset, p->p_sigacts->ps_sigignore);
	mtx_unlock(&p->p_sigacts->ps_mtx);
	if (SIGNOTEMPTY(p->p_siglist) && NFSINT_SIGMASK(tmpset)) {
		PROC_UNLOCK(p);
		return (EINTR);
	}
	PROC_UNLOCK(p);

	return (0);
}
 1271 
 1272 /*
 1273  * Lock a socket against others.
 1274  * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
 1275  * and also to avoid race conditions between the processes with nfs requests
 1276  * in progress when a reconnect is necessary.
 1277  */
int
nfs_sndlock(struct nfsreq *rep)
{
	int *statep = &rep->r_nmp->nm_state;
	struct thread *td;
	int slpflag = 0, slptimeo = 0;

	td = rep->r_td;
	/* Interruptible mounts sleep with PCATCH so signals wake us. */
	if (rep->r_nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	while (*statep & NFSSTA_SNDLOCK) {
		if (nfs_sigintr(rep->r_nmp, rep, td))
			return (EINTR);
		*statep |= NFSSTA_WANTSND;
		(void) tsleep(statep, slpflag | (PZERO - 1),
			"nfsndlck", slptimeo);
		/*
		 * After the first catchable sleep, switch to a 2 second
		 * timed sleep so nfs_sigintr() is still polled regularly.
		 */
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = 2 * hz;
		}
	}
	*statep |= NFSSTA_SNDLOCK;
	return (0);
}
 1302 
 1303 /*
 1304  * Unlock the stream socket for others.
 1305  */
 1306 void
 1307 nfs_sndunlock(struct nfsreq *rep)
 1308 {
 1309         int *statep = &rep->r_nmp->nm_state;
 1310 
 1311         if ((*statep & NFSSTA_SNDLOCK) == 0)
 1312                 panic("nfs sndunlock");
 1313         *statep &= ~NFSSTA_SNDLOCK;
 1314         if (*statep & NFSSTA_WANTSND) {
 1315                 *statep &= ~NFSSTA_WANTSND;
 1316                 wakeup(statep);
 1317         }
 1318 }
 1319 
static int
nfs_rcvlock(struct nfsreq *rep)
{
	int *statep = &rep->r_nmp->nm_state;
	int slpflag, slptimeo = 0;

	/* Interruptible mounts sleep with PCATCH so signals wake us. */
	if (rep->r_nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	else
		slpflag = 0;
	while (*statep & NFSSTA_RCVLOCK) {
		if (nfs_sigintr(rep->r_nmp, rep, rep->r_td))
			return (EINTR);
		*statep |= NFSSTA_WANTRCV;
		(void) tsleep(statep, slpflag | (PZERO - 1), "nfsrcvlk",
			slptimeo);
		/*
		 * If our reply was received while we were sleeping,
		 * then just return without taking the lock to avoid a
		 * situation where a single iod could 'capture' the
		 * receive lock.
		 */
		if (rep->r_mrep != NULL)
			return (EALREADY);
		/*
		 * After the first catchable sleep, switch to a 2 second
		 * timed sleep so nfs_sigintr() is still polled regularly.
		 */
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = 2 * hz;
		}
	}
	/* Always fail if our request has been cancelled. */
	if (rep != NULL && (rep->r_flags & R_SOFTTERM))
		return (EINTR);
	*statep |= NFSSTA_RCVLOCK;
	return (0);
}
 1355 
 1356 /*
 1357  * Unlock the stream socket for others.
 1358  */
 1359 static void
 1360 nfs_rcvunlock(struct nfsreq *rep)
 1361 {
 1362         int *statep = &rep->r_nmp->nm_state;
 1363 
 1364         if ((*statep & NFSSTA_RCVLOCK) == 0)
 1365                 panic("nfs rcvunlock");
 1366         *statep &= ~NFSSTA_RCVLOCK;
 1367         if (*statep & NFSSTA_WANTRCV) {
 1368                 *statep &= ~NFSSTA_WANTRCV;
 1369                 wakeup(statep);
 1370         }
 1371 }
 1372 
 1373 /*
 1374  *      nfs_realign:
 1375  *
 1376  *      Check for badly aligned mbuf data and realign by copying the unaligned
 1377  *      portion of the data into a new mbuf chain and freeing the portions
 1378  *      of the old chain that were replaced.
 1379  *
 1380  *      We cannot simply realign the data within the existing mbuf chain
 1381  *      because the underlying buffers may contain other rpc commands and
 1382  *      we cannot afford to overwrite them.
 1383  *
 1384  *      We would prefer to avoid this situation entirely.  The situation does
 *      not occur with NFS/UDP and is supposed to only occasionally occur
 1386  *      with TCP.  Use vfs.nfs.realign_count and realign_test to check this.
 1387  */
static void
nfs_realign(struct mbuf **pm, int hsiz)
{
	struct mbuf *m;
	struct mbuf *n = NULL;
	int off = 0;

	/* NB: hsiz is currently unused by this implementation. */
	++nfs_realign_test;
	/* Find the first mbuf with an odd length or misaligned data. */
	while ((m = *pm) != NULL) {
		if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
			/* Allocate a replacement; cluster for large data. */
			MGET(n, M_TRYWAIT, MT_DATA);
			if (m->m_len >= MINCLSIZE) {
				MCLGET(n, M_TRYWAIT);
			}
			n->m_len = 0;
			break;
		}
		pm = &m->m_next;
	}
	/*
	 * If n is non-NULL, loop on m copying data, then replace the
	 * portion of the chain that had to be realigned.
	 */
	if (n != NULL) {
		++nfs_realign_count;
		while (m) {
			m_copyback(n, off, m->m_len, mtod(m, caddr_t));
			off += m->m_len;
			m = m->m_next;
		}
		m_freem(*pm);
		*pm = n;
	}
}
 1422 
 1423 
 1424 static int
 1425 nfs_msg(struct thread *td, char *server, char *msg)
 1426 {
 1427 
 1428         tprintf(td ? td->td_proc : NULL, LOG_INFO,
 1429             "nfs server %s: %s\n", server, msg);
 1430         return (0);
 1431 }

Cache object: 2b02fedabb5721b27f363f026f809ab8


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.