The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/nfsclient/nfs_socket.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * Copyright (c) 1989, 1991, 1993, 1995
    3  *      The Regents of the University of California.  All rights reserved.
    4  *
    5  * This code is derived from software contributed to Berkeley by
    6  * Rick Macklem at The University of Guelph.
    7  *
    8  * Redistribution and use in source and binary forms, with or without
    9  * modification, are permitted provided that the following conditions
   10  * are met:
   11  * 1. Redistributions of source code must retain the above copyright
   12  *    notice, this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in the
   15  *    documentation and/or other materials provided with the distribution.
   16  * 3. All advertising materials mentioning features or use of this software
   17  *    must display the following acknowledgement:
   18  *      This product includes software developed by the University of
   19  *      California, Berkeley and its contributors.
   20  * 4. Neither the name of the University nor the names of its contributors
   21  *    may be used to endorse or promote products derived from this software
   22  *    without specific prior written permission.
   23  *
   24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   34  * SUCH DAMAGE.
   35  *
   36  *      @(#)nfs_socket.c        8.5 (Berkeley) 3/30/95
   37  */
   38 
   39 #include <sys/cdefs.h>
   40 __FBSDID("$FreeBSD: releng/5.0/sys/nfsclient/nfs_socket.c 104306 2002-10-01 17:15:53Z jmallett $");
   41 
   42 /*
   43  * Socket operations for use by nfs
   44  */
   45 
   46 #include "opt_inet6.h"
   47 
   48 #include <sys/param.h>
   49 #include <sys/systm.h>
   50 #include <sys/kernel.h>
   51 #include <sys/lock.h>
   52 #include <sys/malloc.h>
   53 #include <sys/mbuf.h>
   54 #include <sys/mount.h>
   55 #include <sys/mutex.h>
   56 #include <sys/proc.h>
   57 #include <sys/protosw.h>
   58 #include <sys/signalvar.h>
   59 #include <sys/socket.h>
   60 #include <sys/socketvar.h>
   61 #include <sys/sysctl.h>
   62 #include <sys/syslog.h>
   63 #include <sys/vnode.h>
   64 
   65 #include <netinet/in.h>
   66 #include <netinet/tcp.h>
   67 
   68 #include <nfs/rpcv2.h>
   69 #include <nfs/nfsproto.h>
   70 #include <nfsclient/nfs.h>
   71 #include <nfs/xdr_subs.h>
   72 #include <nfsclient/nfsm_subs.h>
   73 #include <nfsclient/nfsmount.h>
   74 #include <nfsclient/nfsnode.h>
   75 
   76 #define TRUE    1
   77 #define FALSE   0
   78 
   79 /*
   80  * Estimate rto for an nfs rpc sent via. an unreliable datagram.
   81  * Use the mean and mean deviation of rtt for the appropriate type of rpc
   82  * for the frequent rpcs and a default for the others.
   83  * The justification for doing "other" this way is that these rpcs
   84  * happen so infrequently that timer est. would probably be stale.
   85  * Also, since many of these rpcs are
   86  * non-idempotent, a conservative timeout is desired.
   87  * getattr, lookup - A+2D
   88  * read, write     - A+4D
   89  * other           - nm_timeo
   90  */
   91 #define NFS_RTO(n, t) \
   92         ((t) == 0 ? (n)->nm_timeo : \
   93          ((t) < 3 ? \
   94           (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
   95           ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
   96 #define NFS_SRTT(r)     (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
   97 #define NFS_SDRTT(r)    (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
   98 
   99 /*
  100  * Defines which timer to use for the procnum.
  101  * 0 - default
  102  * 1 - getattr
  103  * 2 - lookup
  104  * 3 - read
  105  * 4 - write
  106  */
  107 static int proct[NFS_NPROCS] = {
  108         0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0,
  109 };
  110 
  111 static int      nfs_realign_test;
  112 static int      nfs_realign_count;
  113 static int      nfs_bufpackets = 4;
  114 
  115 SYSCTL_DECL(_vfs_nfs);
  116 
  117 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RW, &nfs_realign_test, 0, "");
  118 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RW, &nfs_realign_count, 0, "");
  119 SYSCTL_INT(_vfs_nfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0, "");
  120 
  121 
  122 /*
  123  * There is a congestion window for outstanding rpcs maintained per mount
  124  * point. The cwnd size is adjusted in roughly the way that:
  125  * Van Jacobson, Congestion avoidance and Control, In "Proceedings of
  126  * SIGCOMM '88". ACM, August 1988.
  127  * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
  128  * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
  129  * of rpcs is in progress.
  130  * (The sent count and cwnd are scaled for integer arith.)
  131  * Variants of "slow start" were tried and were found to be too much of a
  132  * performance hit (ave. rtt 3 times larger),
  133  * I suspect due to the large rtt that nfs rpcs have.
  134  */
  135 #define NFS_CWNDSCALE   256
  136 #define NFS_MAXCWND     (NFS_CWNDSCALE * 32)
  137 #define NFS_NBACKOFF    8
  138 static int nfs_backoff[NFS_NBACKOFF] = { 2, 4, 8, 16, 32, 64, 128, 256, };
  139 struct callout_handle   nfs_timer_handle;
  140 
  141 static int      nfs_msg(struct thread *, char *, char *);
  142 static int      nfs_rcvlock(struct nfsreq *);
  143 static void     nfs_rcvunlock(struct nfsreq *);
  144 static void     nfs_realign(struct mbuf **pm, int hsiz);
  145 static int      nfs_receive(struct nfsreq *rep, struct sockaddr **aname,
  146                     struct mbuf **mp);
  147 static int      nfs_reply(struct nfsreq *);
  148 static void     nfs_softterm(struct nfsreq *rep);
  149 static int      nfs_reconnect(struct nfsreq *rep);
  150 
  151 /*
  152  * Initialize sockets and congestion for a new NFS connection.
  153  * We do not free the sockaddr if error.
  154  */
  155 int
  156 nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
  157 {
  158         struct socket *so;
  159         int s, error, rcvreserve, sndreserve;
  160         int pktscale;
  161         struct sockaddr *saddr;
  162         struct thread *td = &thread0; /* only used for socreate and sobind */
  163 
  164         nmp->nm_so = NULL;
  165         saddr = nmp->nm_nam;
  166         error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype,
  167                 nmp->nm_soproto, nmp->nm_mountp->mnt_cred, td);
  168         if (error)
  169                 goto bad;
  170         so = nmp->nm_so;
  171         nmp->nm_soflags = so->so_proto->pr_flags;
  172 
  173         /*
  174          * Some servers require that the client port be a reserved port number.
  175          */
  176         if (nmp->nm_flag & NFSMNT_RESVPORT) {
  177                 struct sockopt sopt;
  178                 int ip, ip2, len;
  179                 struct sockaddr_in6 ssin;
  180                 struct sockaddr *sa;
  181 
  182                 bzero(&sopt, sizeof sopt);
  183                 switch(saddr->sa_family) {
  184                 case AF_INET:
  185                         sopt.sopt_level = IPPROTO_IP;
  186                         sopt.sopt_name = IP_PORTRANGE;
  187                         ip = IP_PORTRANGE_LOW;
  188                         ip2 = IP_PORTRANGE_DEFAULT;
  189                         len = sizeof (struct sockaddr_in);
  190                         break;
  191 #ifdef INET6
  192                 case AF_INET6:
  193                         sopt.sopt_level = IPPROTO_IPV6;
  194                         sopt.sopt_name = IPV6_PORTRANGE;
  195                         ip = IPV6_PORTRANGE_LOW;
  196                         ip2 = IPV6_PORTRANGE_DEFAULT;
  197                         len = sizeof (struct sockaddr_in6);
  198                         break;
  199 #endif
  200                 default:
  201                         goto noresvport;
  202                 }
  203                 sa = (struct sockaddr *)&ssin;
  204                 bzero(sa, len);
  205                 sa->sa_len = len;
  206                 sa->sa_family = saddr->sa_family;
  207                 sopt.sopt_dir = SOPT_SET;
  208                 sopt.sopt_val = (void *)&ip;
  209                 sopt.sopt_valsize = sizeof(ip);
  210                 error = sosetopt(so, &sopt);
  211                 if (error)
  212                         goto bad;
  213                 error = sobind(so, sa, td);
  214                 if (error)
  215                         goto bad;
  216                 ip = ip2;
  217                 error = sosetopt(so, &sopt);
  218                 if (error)
  219                         goto bad;
  220         noresvport: ;
  221         }
  222 
  223         /*
  224          * Protocols that do not require connections may be optionally left
  225          * unconnected for servers that reply from a port other than NFS_PORT.
  226          */
  227         if (nmp->nm_flag & NFSMNT_NOCONN) {
  228                 if (nmp->nm_soflags & PR_CONNREQUIRED) {
  229                         error = ENOTCONN;
  230                         goto bad;
  231                 }
  232         } else {
  233                 error = soconnect(so, nmp->nm_nam, td);
  234                 if (error)
  235                         goto bad;
  236 
  237                 /*
  238                  * Wait for the connection to complete. Cribbed from the
  239                  * connect system call but with the wait timing out so
  240                  * that interruptible mounts don't hang here for a long time.
  241                  */
  242                 s = splnet();
  243                 while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
  244                         (void) tsleep((caddr_t)&so->so_timeo,
  245                             PSOCK, "nfscon", 2 * hz);
  246                         if ((so->so_state & SS_ISCONNECTING) &&
  247                             so->so_error == 0 && rep &&
  248                             (error = nfs_sigintr(nmp, rep, rep->r_td)) != 0) {
  249                                 so->so_state &= ~SS_ISCONNECTING;
  250                                 splx(s);
  251                                 goto bad;
  252                         }
  253                 }
  254                 if (so->so_error) {
  255                         error = so->so_error;
  256                         so->so_error = 0;
  257                         splx(s);
  258                         goto bad;
  259                 }
  260                 splx(s);
  261         }
  262         so->so_rcv.sb_timeo = 5 * hz;
  263         so->so_snd.sb_timeo = 5 * hz;
  264 
  265         /*
  266          * Get buffer reservation size from sysctl, but impose reasonable
  267          * limits.
  268          */
  269         pktscale = nfs_bufpackets;
  270         if (pktscale < 2)
  271                 pktscale = 2;
  272         if (pktscale > 64)
  273                 pktscale = 64;
  274 
  275         if (nmp->nm_sotype == SOCK_DGRAM) {
  276                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
  277                 rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
  278                     NFS_MAXPKTHDR) * pktscale;
  279         } else if (nmp->nm_sotype == SOCK_SEQPACKET) {
  280                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
  281                 rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
  282                     NFS_MAXPKTHDR) * pktscale;
  283         } else {
  284                 if (nmp->nm_sotype != SOCK_STREAM)
  285                         panic("nfscon sotype");
  286                 if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
  287                         struct sockopt sopt;
  288                         int val;
  289 
  290                         bzero(&sopt, sizeof sopt);
  291                         sopt.sopt_level = SOL_SOCKET;
  292                         sopt.sopt_name = SO_KEEPALIVE;
  293                         sopt.sopt_val = &val;
  294                         sopt.sopt_valsize = sizeof val;
  295                         val = 1;
  296                         sosetopt(so, &sopt);
  297                 }
  298                 if (so->so_proto->pr_protocol == IPPROTO_TCP) {
  299                         struct sockopt sopt;
  300                         int val;
  301 
  302                         bzero(&sopt, sizeof sopt);
  303                         sopt.sopt_level = IPPROTO_TCP;
  304                         sopt.sopt_name = TCP_NODELAY;
  305                         sopt.sopt_val = &val;
  306                         sopt.sopt_valsize = sizeof val;
  307                         val = 1;
  308                         sosetopt(so, &sopt);
  309                 }
  310                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
  311                     sizeof (u_int32_t)) * pktscale;
  312                 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
  313                     sizeof (u_int32_t)) * pktscale;
  314         }
  315         error = soreserve(so, sndreserve, rcvreserve);
  316         if (error)
  317                 goto bad;
  318         so->so_rcv.sb_flags |= SB_NOINTR;
  319         so->so_snd.sb_flags |= SB_NOINTR;
  320 
  321         /* Initialize other non-zero congestion variables */
  322         nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
  323                 nmp->nm_srtt[3] = (NFS_TIMEO << 3);
  324         nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
  325                 nmp->nm_sdrtt[3] = 0;
  326         nmp->nm_cwnd = NFS_MAXCWND / 2;     /* Initial send window */
  327         nmp->nm_sent = 0;
  328         nmp->nm_timeouts = 0;
  329         return (0);
  330 
  331 bad:
  332         nfs_disconnect(nmp);
  333         return (error);
  334 }
  335 
  336 /*
  337  * Reconnect routine:
  338  * Called when a connection is broken on a reliable protocol.
  339  * - clean up the old socket
  340  * - nfs_connect() again
  341  * - set R_MUSTRESEND for all outstanding requests on mount point
  342  * If this fails the mount point is DEAD!
  343  * nb: Must be called with the nfs_sndlock() set on the mount point.
  344  */
  345 static int
  346 nfs_reconnect(struct nfsreq *rep)
  347 {
  348         struct nfsreq *rp;
  349         struct nfsmount *nmp = rep->r_nmp;
  350         int error;
  351 
  352         nfs_disconnect(nmp);
  353         while ((error = nfs_connect(nmp, rep)) != 0) {
  354                 if (error == EINTR || error == ERESTART)
  355                         return (EINTR);
  356                 (void) tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0);
  357         }
  358 
  359         /*
  360          * Loop through outstanding request list and fix up all requests
  361          * on old socket.
  362          */
  363         TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
  364                 if (rp->r_nmp == nmp)
  365                         rp->r_flags |= R_MUSTRESEND;
  366         }
  367         return (0);
  368 }
  369 
  370 /*
  371  * NFS disconnect. Clean up and unlink.
  372  */
  373 void
  374 nfs_disconnect(struct nfsmount *nmp)
  375 {
  376         struct socket *so;
  377 
  378         if (nmp->nm_so) {
  379                 so = nmp->nm_so;
  380                 nmp->nm_so = NULL;
  381                 soshutdown(so, 2);
  382                 soclose(so);
  383         }
  384 }
  385 
  386 void
  387 nfs_safedisconnect(struct nfsmount *nmp)
  388 {
  389         struct nfsreq dummyreq;
  390 
  391         bzero(&dummyreq, sizeof(dummyreq));
  392         dummyreq.r_nmp = nmp;
  393         nfs_rcvlock(&dummyreq);
  394         nfs_disconnect(nmp);
  395         nfs_rcvunlock(&dummyreq);
  396 }
  397 
  398 /*
  399  * This is the nfs send routine. For connection based socket types, it
  400  * must be called with an nfs_sndlock() on the socket.
  401  * - return EINTR if the RPC is terminated, 0 otherwise
  402  * - set R_MUSTRESEND if the send fails for any reason
  403  * - do any cleanup required by recoverable socket errors (?)
  404  */
  405 int
  406 nfs_send(struct socket *so, struct sockaddr *nam, struct mbuf *top,
  407     struct nfsreq *rep)
  408 {
  409         struct sockaddr *sendnam;
  410         int error, soflags, flags;
  411 
  412         KASSERT(rep, ("nfs_send: called with rep == NULL"));
  413 
  414         if (rep->r_flags & R_SOFTTERM) {
  415                 m_freem(top);
  416                 return (EINTR);
  417         }
  418         if ((so = rep->r_nmp->nm_so) == NULL) {
  419                 rep->r_flags |= R_MUSTRESEND;
  420                 m_freem(top);
  421                 return (0);
  422         }
  423         rep->r_flags &= ~R_MUSTRESEND;
  424         soflags = rep->r_nmp->nm_soflags;
  425 
  426         if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
  427                 sendnam = NULL;
  428         else
  429                 sendnam = nam;
  430         if (so->so_type == SOCK_SEQPACKET)
  431                 flags = MSG_EOR;
  432         else
  433                 flags = 0;
  434 
  435         error = so->so_proto->pr_usrreqs->pru_sosend(so, sendnam, 0, top, 0,
  436                                                      flags, curthread /*XXX*/);
  437         if (error == ENOBUFS && so->so_type == SOCK_DGRAM) {
  438                 error = 0;
  439                 rep->r_flags |= R_MUSTRESEND;
  440         }
  441 
  442         if (error) {
  443                 log(LOG_INFO, "nfs send error %d for server %s\n", error,
  444                     rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
  445                 /*
  446                  * Deal with errors for the client side.
  447                  */
  448                 if (rep->r_flags & R_SOFTTERM)
  449                         error = EINTR;
  450                 else
  451                         rep->r_flags |= R_MUSTRESEND;
  452 
  453                 /*
  454                  * Handle any recoverable (soft) socket errors here. (?)
  455                  */
  456                 if (error != EINTR && error != ERESTART &&
  457                         error != EWOULDBLOCK && error != EPIPE)
  458                         error = 0;
  459         }
  460         return (error);
  461 }
  462 
  463 /*
  464  * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
  465  * done by soreceive(), but for SOCK_STREAM we must deal with the Record
  466  * Mark and consolidate the data into a new mbuf list.
  467  * nb: Sometimes TCP passes the data up to soreceive() in long lists of
  468  *     small mbufs.
  469  * For SOCK_STREAM we must be very careful to read an entire record once
  470  * we have read any of it, even if the system call has been interrupted.
  471  */
  472 static int
  473 nfs_receive(struct nfsreq *rep, struct sockaddr **aname, struct mbuf **mp)
  474 {
  475         struct socket *so;
  476         struct uio auio;
  477         struct iovec aio;
  478         struct mbuf *m;
  479         struct mbuf *control;
  480         u_int32_t len;
  481         struct sockaddr **getnam;
  482         int error, sotype, rcvflg;
  483         struct thread *td = curthread;  /* XXX */
  484 
  485         /*
  486          * Set up arguments for soreceive()
  487          */
  488         *mp = NULL;
  489         *aname = NULL;
  490         sotype = rep->r_nmp->nm_sotype;
  491 
  492         /*
  493          * For reliable protocols, lock against other senders/receivers
  494          * in case a reconnect is necessary.
  495          * For SOCK_STREAM, first get the Record Mark to find out how much
  496          * more there is to get.
  497          * We must lock the socket against other receivers
  498          * until we have an entire rpc request/reply.
  499          */
  500         if (sotype != SOCK_DGRAM) {
  501                 error = nfs_sndlock(rep);
  502                 if (error)
  503                         return (error);
  504 tryagain:
  505                 /*
  506                  * Check for fatal errors and resending request.
  507                  */
  508                 /*
  509                  * Ugh: If a reconnect attempt just happened, nm_so
  510                  * would have changed. NULL indicates a failed
  511                  * attempt that has essentially shut down this
  512                  * mount point.
  513                  */
  514                 if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) {
  515                         nfs_sndunlock(rep);
  516                         return (EINTR);
  517                 }
  518                 so = rep->r_nmp->nm_so;
  519                 if (!so) {
  520                         error = nfs_reconnect(rep);
  521                         if (error) {
  522                                 nfs_sndunlock(rep);
  523                                 return (error);
  524                         }
  525                         goto tryagain;
  526                 }
  527                 while (rep->r_flags & R_MUSTRESEND) {
  528                         m = m_copym(rep->r_mreq, 0, M_COPYALL, M_TRYWAIT);
  529                         nfsstats.rpcretries++;
  530                         error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
  531                         if (error) {
  532                                 if (error == EINTR || error == ERESTART ||
  533                                     (error = nfs_reconnect(rep)) != 0) {
  534                                         nfs_sndunlock(rep);
  535                                         return (error);
  536                                 }
  537                                 goto tryagain;
  538                         }
  539                 }
  540                 nfs_sndunlock(rep);
  541                 if (sotype == SOCK_STREAM) {
  542                         aio.iov_base = (caddr_t) &len;
  543                         aio.iov_len = sizeof(u_int32_t);
  544                         auio.uio_iov = &aio;
  545                         auio.uio_iovcnt = 1;
  546                         auio.uio_segflg = UIO_SYSSPACE;
  547                         auio.uio_rw = UIO_READ;
  548                         auio.uio_offset = 0;
  549                         auio.uio_resid = sizeof(u_int32_t);
  550                         auio.uio_td = td;
  551                         do {
  552                            rcvflg = MSG_WAITALL;
  553                            error = so->so_proto->pr_usrreqs->pru_soreceive
  554                                    (so, NULL, &auio, NULL, NULL, &rcvflg);
  555                            if (error == EWOULDBLOCK && rep) {
  556                                 if (rep->r_flags & R_SOFTTERM)
  557                                         return (EINTR);
  558                            }
  559                         } while (error == EWOULDBLOCK);
  560                         if (!error && auio.uio_resid > 0) {
  561                             /*
  562                              * Don't log a 0 byte receive; it means
  563                              * that the socket has been closed, and
  564                              * can happen during normal operation
  565                              * (forcible unmount or Solaris server).
  566                              */
  567                             if (auio.uio_resid != sizeof (u_int32_t))
  568                             log(LOG_INFO,
  569                                  "short receive (%d/%d) from nfs server %s\n",
  570                                  (int)(sizeof(u_int32_t) - auio.uio_resid),
  571                                  (int)sizeof(u_int32_t),
  572                                  rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
  573                             error = EPIPE;
  574                         }
  575                         if (error)
  576                                 goto errout;
  577                         len = ntohl(len) & ~0x80000000;
  578                         /*
  579                          * This is SERIOUS! We are out of sync with the sender
  580                          * and forcing a disconnect/reconnect is all I can do.
  581                          */
  582                         if (len > NFS_MAXPACKET) {
  583                             log(LOG_ERR, "%s (%d) from nfs server %s\n",
  584                                 "impossible packet length",
  585                                 len,
  586                                 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
  587                             error = EFBIG;
  588                             goto errout;
  589                         }
  590                         auio.uio_resid = len;
  591                         do {
  592                             rcvflg = MSG_WAITALL;
  593                             error =  so->so_proto->pr_usrreqs->pru_soreceive
  594                                     (so, NULL,
  595                                      &auio, mp, NULL, &rcvflg);
  596                         } while (error == EWOULDBLOCK || error == EINTR ||
  597                                  error == ERESTART);
  598                         if (!error && auio.uio_resid > 0) {
  599                             if (len != auio.uio_resid)
  600                             log(LOG_INFO,
  601                                 "short receive (%d/%d) from nfs server %s\n",
  602                                 len - auio.uio_resid, len,
  603                                 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
  604                             error = EPIPE;
  605                         }
  606                 } else {
  607                         /*
  608                          * NB: Since uio_resid is big, MSG_WAITALL is ignored
  609                          * and soreceive() will return when it has either a
  610                          * control msg or a data msg.
  611                          * We have no use for control msg., but must grab them
  612                          * and then throw them away so we know what is going
  613                          * on.
  614                          */
  615                         auio.uio_resid = len = 100000000; /* Anything Big */
  616                         auio.uio_td = td;
  617                         do {
  618                             rcvflg = 0;
  619                             error =  so->so_proto->pr_usrreqs->pru_soreceive
  620                                     (so, NULL,
  621                                 &auio, mp, &control, &rcvflg);
  622                             if (control)
  623                                 m_freem(control);
  624                             if (error == EWOULDBLOCK && rep) {
  625                                 if (rep->r_flags & R_SOFTTERM)
  626                                         return (EINTR);
  627                             }
  628                         } while (error == EWOULDBLOCK ||
  629                                  (!error && *mp == NULL && control));
  630                         if ((rcvflg & MSG_EOR) == 0)
  631                                 printf("Egad!!\n");
  632                         if (!error && *mp == NULL)
  633                                 error = EPIPE;
  634                         len -= auio.uio_resid;
  635                 }
  636 errout:
  637                 if (error && error != EINTR && error != ERESTART) {
  638                         m_freem(*mp);
  639                         *mp = NULL;
  640                         if (error != EPIPE)
  641                                 log(LOG_INFO,
  642                                     "receive error %d from nfs server %s\n",
  643                                     error,
  644                                  rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
  645                         error = nfs_sndlock(rep);
  646                         if (!error) {
  647                                 error = nfs_reconnect(rep);
  648                                 if (!error)
  649                                         goto tryagain;
  650                                 else
  651                                         nfs_sndunlock(rep);
  652                         }
  653                 }
  654         } else {
  655                 if ((so = rep->r_nmp->nm_so) == NULL)
  656                         return (EACCES);
  657                 if (so->so_state & SS_ISCONNECTED)
  658                         getnam = NULL;
  659                 else
  660                         getnam = aname;
  661                 auio.uio_resid = len = 1000000;
  662                 auio.uio_td = td;
  663                 do {
  664                         rcvflg = 0;
  665                         error =  so->so_proto->pr_usrreqs->pru_soreceive
  666                                 (so, getnam, &auio, mp,
  667                                 NULL, &rcvflg);
  668                         if (error == EWOULDBLOCK &&
  669                             (rep->r_flags & R_SOFTTERM))
  670                                 return (EINTR);
  671                 } while (error == EWOULDBLOCK);
  672                 len -= auio.uio_resid;
  673         }
  674         if (error) {
  675                 m_freem(*mp);
  676                 *mp = NULL;
  677         }
  678         /*
  679          * Search for any mbufs that are not a multiple of 4 bytes long
  680          * or with m_data not longword aligned.
  681          * These could cause pointer alignment problems, so copy them to
  682          * well aligned mbufs.
  683          */
  684         nfs_realign(mp, 5 * NFSX_UNSIGNED);
  685         return (error);
  686 }
  687 
  688 /*
  689  * Implement receipt of reply on a socket.
  690  * We must search through the list of received datagrams matching them
  691  * with outstanding requests using the xid, until ours is found.
       *
       * myrep is the request whose reply the caller is waiting for.  Each
       * pass takes the receive lock, pulls one RPC message off the socket
       * with nfs_receive(), drops the lock again and then hands the message
       * to whichever entry on nfs_reqq has the same xid and no reply yet.
       *
       * Returns 0 once myrep has its reply attached (r_mrep, r_md, r_dpos),
       * when nfs_rcvlock() reports EALREADY, or after a single pass when
       * R_GETONEREP is set in myrep->r_flags.  Otherwise returns the error
       * produced by nfs_rcvlock() or nfs_receive().
  692  */
  693 /* ARGSUSED */
  694 static int
  695 nfs_reply(struct nfsreq *myrep)
  696 {
  697         struct nfsreq *rep;
  698         struct nfsmount *nmp = myrep->r_nmp;
  699         int32_t t1;
  700         struct mbuf *mrep, *md;
  701         struct sockaddr *nam;
  702         u_int32_t rxid, *tl;
  703         caddr_t dpos;
  704         int error;
  705 
  706         /*
  707          * Loop around until we get our own reply
  708          */
  709         for (;;) {
  710                 /*
  711                  * Lock against other receivers so that I don't get stuck in
  712                  * sbwait() after someone else has received my reply for me.
  713                  * Also necessary for connection based protocols to avoid
  714                  * race conditions during a reconnect.
  715                  * If nfs_rcvlock() returns EALREADY, that means that
  716                  * the reply has already been received by another
  717                  * process and we can return immediately.  In this
  718                  * case, the lock is not taken to avoid races with
  719                  * other processes.
  720                  */
  721                 error = nfs_rcvlock(myrep);
  722                 if (error == EALREADY)
  723                         return (0);
  724                 if (error)
  725                         return (error);
  726                 /*
  727                  * Get the next Rpc reply off the socket
  728                  */
  729                 error = nfs_receive(myrep, &nam, &mrep);
                      /* The lock only covers the receive itself, not the matching. */
  730                 nfs_rcvunlock(myrep);
  731                 if (error) {
  732
  733                         /*
  734                          * Ignore routing errors on connectionless protocols??
  735                          */
  736                         if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) {
  737                                 nmp->nm_so->so_error = 0;
  738                                 if (myrep->r_flags & R_GETONEREP)
  739                                         return (0);
  740                                 continue;
  741                         }
  742                         return (error);
  743                 }
                      /* The sender's address is not used here; just release it. */
  744                 if (nam)
  745                         FREE(nam, M_SONAME);
  746 
  747                 /*
  748                  * Get the xid and check that it is an rpc reply
  749                  */
  750                 md = mrep;
  751                 dpos = mtod(md, caddr_t);
                      /* Two 32-bit words: the xid followed by the message type. */
  752                 tl = nfsm_dissect(u_int32_t *, 2 * NFSX_UNSIGNED);
  753                 rxid = *tl++;
  754                 if (*tl != rpc_reply) {
  755                         nfsstats.rpcinvalid++;
  756                         m_freem(mrep);
                      /*
                       * NOTE(review): nothing in this function jumps to nfsmout
                       * explicitly; presumably it is the failure target of the
                       * nfsm_dissect() macro above -- confirm against the macro.
                       */
  757 nfsmout:
  758                         if (myrep->r_flags & R_GETONEREP)
  759                                 return (0);
  760                         continue;
  761                 }
  762 
  763                 /*
  764                  * Loop through the request list to match up the reply
  765                  * Iff no match, just drop the datagram
  766                  */
  767                 TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
  768                         if (rep->r_mrep == NULL && rxid == rep->r_xid) {
  769                                 /* Found it.. */
  770                                 rep->r_mrep = mrep;
  771                                 rep->r_md = md;
  772                                 rep->r_dpos = dpos;
  773                                 /*
  774                                  * Update congestion window.
  775                                  * Do the additive increase of
  776                                  * one rpc/rtt.
  777                                  */
  778                                 if (nmp->nm_cwnd <= nmp->nm_sent) {
  779                                         nmp->nm_cwnd +=
  780                                            (NFS_CWNDSCALE * NFS_CWNDSCALE +
  781                                            (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
  782                                         if (nmp->nm_cwnd > NFS_MAXCWND)
  783                                                 nmp->nm_cwnd = NFS_MAXCWND;
  784                                 }
                                      /* The request no longer counts against the window. */
  785                                 if (rep->r_flags & R_SENT) {
  786                                         rep->r_flags &= ~R_SENT;
  787                                         nmp->nm_sent -= NFS_CWNDSCALE;
  788                                 }
  789                                 /*
  790                                  * Update rtt using a gain of 0.125 on the mean
  791                                  * and a gain of 0.25 on the deviation.
  792                                  */
  793                                 if (rep->r_flags & R_TIMING) {
  794                                         /*
  795                                          * Since the timer resolution of
  796                                          * NFS_HZ is so coarse, it can often
  797                                          * result in r_rtt == 0. Since
  798                                          * r_rtt == N means that the actual
  799                                          * rtt is between N+dt and N+2-dt ticks,
  800                                          * add 1.
  801                                          */
  802                                         t1 = rep->r_rtt + 1;
  803                                         t1 -= (NFS_SRTT(rep) >> 3);
  804                                         NFS_SRTT(rep) += t1;
  805                                         if (t1 < 0)
  806                                                 t1 = -t1;
  807                                         t1 -= (NFS_SDRTT(rep) >> 2);
  808                                         NFS_SDRTT(rep) += t1;
  809                                 }
  810                                 nmp->nm_timeouts = 0;
  811                                 break;
  812                         }
  813                 }
  814                 /*
  815                  * If not matched to a request, drop it.
  816                  * If it's mine, get out.
                       * (rep == 0 here means the TAILQ_FOREACH above ran off the
                       * end of the list without hitting the break.)
  817                  */
  818                 if (rep == 0) {
  819                         nfsstats.rpcunexpected++;
  820                         m_freem(mrep);
  821                 } else if (rep == myrep) {
  822                         if (rep->r_mrep == NULL)
  823                                 panic("nfsreply nil");
  824                         return (0);
  825                 }
                      /* Callers that asked for just one reply stop after this pass. */
  826                 if (myrep->r_flags & R_GETONEREP)
  827                         return (0);
  828         }
  829 }
  830 
  831 /*
  832  * nfs_request - goes something like this
  833  *      - fill in request struct
  834  *      - links it into list
  835  *      - calls nfs_send() for first transmit
  836  *      - calls nfs_reply() to get reply
  837  *      - break down rpc header and return with nfs reply pointed to
  838  *        by mrep or error
  839  * nb: always frees up mreq mbuf list
       *
       * Parameters:
       *      vp      vnode the call is for; supplies the mount and is passed
       *              to cache_purge() when the server answers ESTALE
       *      mrest   mbuf chain holding the procedure arguments
       *      procnum procedure number, also used to index proct[]
       *      td      thread recorded in the request (r_td)
       *      cred    credentials used to build the RPC header
       *      mrp, mdp, dposp
       *              out: reply chain, current mbuf and parse position; set on
       *              success and, for NFSv3 mounts, also when the server
       *              returned a non-zero status
       *
       * Returns 0 on success.  Error returns visible here: ESTALE while a
       * forced unmount is in progress, the error from nfs_sndlock(),
       * nfs_send() or nfs_reply(), EOPNOTSUPP or EACCES when the RPC was
       * denied, EPROTONOSUPPORT when the accept status is non-zero, or the
       * status sent by the server (with NFSERR_RETERR or'ed in on NFSv3
       * mounts).
  840  */
  841 /* XXX overloaded before */
  842 #define NQ_TRYLATERDEL  15      /* Initial try later delay (sec) */
  843 
  844 int
  845 nfs_request(struct vnode *vp, struct mbuf *mrest, int procnum,
  846     struct thread *td, struct ucred *cred, struct mbuf **mrp,
  847     struct mbuf **mdp, caddr_t *dposp)
  848 {
  849         struct mbuf *mrep, *m2;
  850         struct nfsreq *rep;
  851         u_int32_t *tl;
  852         int i;
  853         struct nfsmount *nmp;
  854         struct mbuf *m, *md, *mheadend;
  855         time_t waituntil;
  856         caddr_t dpos;
  857         int s, error = 0, mrest_len, auth_len, auth_type;
  858         int trylater_delay = NQ_TRYLATERDEL, trylater_cnt = 0;
  859         u_int32_t xid;
  860 
  861         /* Reject requests while attempting a forced unmount. */
  862         if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) {
  863                 m_freem(mrest);
  864                 return (ESTALE);
  865         }
  866         nmp = VFSTONFS(vp->v_mount);
  867         MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK);
  868         rep->r_nmp = nmp;
  869         rep->r_vp = vp;
  870         rep->r_td = td;
  871         rep->r_procnum = procnum;
  872         mrest_len = m_length(mrest, NULL);
  873 
  874         /*
  875          * Get the RPC header with authorization.
               * auth_len is 4 bytes for each of min(cr_ngroups - 1, nm_numgrps)
               * groups plus five fixed 32-bit words.
  876          */
  877         auth_type = RPCAUTH_UNIX;
  878         if (cred->cr_ngroups < 1)
  879                 panic("nfsreq nogrps");
  880         auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
  881                 nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
  882                 5 * NFSX_UNSIGNED;
  883         m = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len,
  884              mrest, mrest_len, &mheadend, &xid);
  885 
  886         /*
  887          * For stream protocols, insert a Sun RPC Record Mark.
               * The mark is 0x80000000 or'ed with the length of what follows it.
  888          */
  889         if (nmp->nm_sotype == SOCK_STREAM) {
  890                 M_PREPEND(m, NFSX_UNSIGNED, M_TRYWAIT);
  891                 *mtod(m, u_int32_t *) = htonl(0x80000000 |
  892                          (m->m_pkthdr.len - NFSX_UNSIGNED));
  893         }
  894         rep->r_mreq = m;
  895         rep->r_xid = xid;
              /*
               * Re-entered from the NFSERR_TRYLATER path below: the same request
               * mbufs and xid are reused, only the per-attempt state is reset.
               */
  896 tryagain:
  897         if (nmp->nm_flag & NFSMNT_SOFT)
  898                 rep->r_retry = nmp->nm_retry;
  899         else
  900                 rep->r_retry = NFS_MAXREXMIT + 1;       /* past clip limit */
  901         rep->r_rtt = rep->r_rexmit = 0;
  902         if (proct[procnum] > 0)
  903                 rep->r_flags = R_TIMING;
  904         else
  905                 rep->r_flags = 0;
  906         rep->r_mrep = NULL;
  907 
  908         /*
  909          * Do the client side RPC.
  910          */
  911         nfsstats.rpcrequests++;
  912         /*
  913          * Chain request into list of outstanding requests. Be sure
  914          * to put it LAST so timer finds oldest requests first.
  915          */
  916         s = splsoftclock();
  917         TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);
  918 
  919         /*
  920          * If backing off another request or avoiding congestion, don't
  921          * send this one now but let timer do it. If not timing a request,
  922          * do it now.
  923          */
  924         if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
  925                 (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
  926                 nmp->nm_sent < nmp->nm_cwnd)) {
  927                 splx(s);
  928                 if (nmp->nm_soflags & PR_CONNREQUIRED)
  929                         error = nfs_sndlock(rep);
  930                 if (!error) {
                              /* Send a copy; r_mreq is kept in the request. */
  931                         m2 = m_copym(m, 0, M_COPYALL, M_TRYWAIT);
  932                         error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
  933                         if (nmp->nm_soflags & PR_CONNREQUIRED)
  934                                 nfs_sndunlock(rep);
  935                 }
  936                 if (!error && (rep->r_flags & R_MUSTRESEND) == 0) {
  937                         nmp->nm_sent += NFS_CWNDSCALE;
  938                         rep->r_flags |= R_SENT;
  939                 }
  940         } else {
  941                 splx(s);
                      /* Not sent here; nfs_timer() skips its timing step when r_rtt < 0. */
  942                 rep->r_rtt = -1;
  943         }
  944 
  945         /*
  946          * Wait for the reply from our send or the timer's.
  947          */
  948         if (!error || error == EPIPE)
  949                 error = nfs_reply(rep);
  950 
  951         /*
  952          * RPC done, unlink the request.
  953          */
  954         s = splsoftclock();
  955         TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
  956         splx(s);
  957 
  958         /*
  959          * Decrement the outstanding request count.
  960          */
  961         if (rep->r_flags & R_SENT) {
  962                 rep->r_flags &= ~R_SENT;        /* paranoia */
  963                 nmp->nm_sent -= NFS_CWNDSCALE;
  964         }
  965 
  966         /*
  967          * If there was a successful reply and a tprintf msg.
  968          * tprintf a response.
  969          */
  970         if (!error && (rep->r_flags & R_TPRINTFMSG))
  971                 nfs_msg(rep->r_td, nmp->nm_mountp->mnt_stat.f_mntfromname,
  972                     "is alive again");
  973         mrep = rep->r_mrep;
  974         md = rep->r_md;
  975         dpos = rep->r_dpos;
  976         if (error) {
  977                 m_freem(rep->r_mreq);
  978                 free((caddr_t)rep, M_NFSREQ);
  979                 return (error);
  980         }
  981 
  982         /*
  983          * break down the rpc header and check if ok
  984          */
  985         tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED);
  986         if (*tl++ == rpc_msgdenied) {
  987                 if (*tl == rpc_mismatch)
  988                         error = EOPNOTSUPP;
  989                 else
  990                         error = EACCES;
  991                 m_freem(mrep);
  992                 m_freem(rep->r_mreq);
  993                 free((caddr_t)rep, M_NFSREQ);
  994                 return (error);
  995         }
  996 
  997         /*
  998          * Just throw away any verifyer (ie: kerberos etc).
               * The verifier type read into i is overwritten by the length
               * on the next line; only the length is used.
  999          */
 1000         i = fxdr_unsigned(int, *tl++);          /* verf type */
 1001         i = fxdr_unsigned(int32_t, *tl);        /* len */
 1002         if (i > 0)
 1003                 nfsm_adv(nfsm_rndup(i));
 1004         tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
 1005         /* 0 == ok */
 1006         if (*tl == 0) {
                      /* Next word is the status of the NFS procedure itself. */
 1007                 tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
 1008                 if (*tl != 0) {
 1009                         error = fxdr_unsigned(int, *tl);
 1010                         if ((nmp->nm_flag & NFSMNT_NFSV3) &&
 1011                                 error == NFSERR_TRYLATER) {
                                      /*
                                       * Drop this reply, sleep for trylater_delay
                                       * seconds, scale the delay by the next
                                       * nfs_backoff[] entry and resubmit.
                                       */
 1012                                 m_freem(mrep);
 1013                                 error = 0;
 1014                                 waituntil = time_second + trylater_delay;
 1015                                 while (time_second < waituntil)
 1016                                         (void) tsleep((caddr_t)&lbolt,
 1017                                                 PSOCK, "nqnfstry", 0);
 1018                                 trylater_delay *= nfs_backoff[trylater_cnt];
 1019                                 if (trylater_cnt < NFS_NBACKOFF - 1)
 1020                                         trylater_cnt++;
 1021                                 goto tryagain;
 1022                         }
 1023 
 1024                         /*
 1025                          * If the File Handle was stale, invalidate the
 1026                          * lookup cache, just in case.
 1027                          */
 1028                         if (error == ESTALE)
 1029                                 cache_purge(vp);
                              /*
                               * NFSv3: the reply is still handed back to the caller
                               * and the error is tagged with NFSERR_RETERR.
                               * Otherwise the reply is discarded.
                               */
 1030                         if (nmp->nm_flag & NFSMNT_NFSV3) {
 1031                                 *mrp = mrep;
 1032                                 *mdp = md;
 1033                                 *dposp = dpos;
 1034                                 error |= NFSERR_RETERR;
 1035                         } else
 1036                                 m_freem(mrep);
 1037                         m_freem(rep->r_mreq);
 1038                         free((caddr_t)rep, M_NFSREQ);
 1039                         return (error);
 1040                 }
 1041 
                      /* Success: pass the reply and parse position to the caller. */
 1042                 *mrp = mrep;
 1043                 *mdp = md;
 1044                 *dposp = dpos;
 1045                 m_freem(rep->r_mreq);
 1046                 FREE((caddr_t)rep, M_NFSREQ);
 1047                 return (0);
 1048         }
              /* Non-zero accept status: give up on the reply. */
 1049         m_freem(mrep);
 1050         error = EPROTONOSUPPORT;
      /*
       * Also reached by fall-through from above.  NOTE(review): presumably
       * the nfsm_dissect()/nfsm_adv() macros jump here on a malformed reply
       * with error already set and mrep already released -- confirm against
       * the macro definitions.
       */
 1051 nfsmout:
 1052         m_freem(rep->r_mreq);
 1053         free((caddr_t)rep, M_NFSREQ);
 1054         return (error);
 1055 }
 1056 
 1057 /*
 1058  * Nfs timer routine
 1059  * Scan the nfsreq list and retranmit any requests that have timed out
 1060  * To avoid retransmission attempts on STREAM sockets (in the future) make
 1061  * sure to set the r_retry field to 0 (implies nm_retry == 0).
 1062  */
 1063 void
 1064 nfs_timer(void *arg)
 1065 {
 1066         struct nfsreq *rep;
 1067         struct mbuf *m;
 1068         struct socket *so;
 1069         struct nfsmount *nmp;
 1070         int timeo;
 1071         int s, error;
 1072         struct thread *td;
 1073 
 1074         td = &thread0; /* XXX for credentials, may break if sleep */
 1075         s = splnet();
 1076         TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
 1077                 nmp = rep->r_nmp;
 1078                 if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
 1079                         continue;
 1080                 if (nfs_sigintr(nmp, rep, rep->r_td)) {
 1081                         nfs_softterm(rep);
 1082                         continue;
 1083                 }
 1084                 if (rep->r_rtt >= 0) {
 1085                         rep->r_rtt++;
 1086                         if (nmp->nm_flag & NFSMNT_DUMBTIMR)
 1087                                 timeo = nmp->nm_timeo;
 1088                         else
 1089                                 timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
 1090                         if (nmp->nm_timeouts > 0)
 1091                                 timeo *= nfs_backoff[nmp->nm_timeouts - 1];
 1092                         if (rep->r_rtt <= timeo)
 1093                                 continue;
 1094                         if (nmp->nm_timeouts < NFS_NBACKOFF)
 1095                                 nmp->nm_timeouts++;
 1096                 }
 1097                 /*
 1098                  * Check for server not responding
 1099                  */
 1100                 if ((rep->r_flags & R_TPRINTFMSG) == 0 &&
 1101                      rep->r_rexmit > nmp->nm_deadthresh) {
 1102                         char buf[40];
 1103                         sprintf(buf, "not responding %d > %d",
 1104                         rep->r_rexmit, nmp->nm_deadthresh);
 1105                         nfs_msg(rep->r_td,
 1106                             nmp->nm_mountp->mnt_stat.f_mntfromname,
 1107                             buf /* "not responding" */);
 1108                         rep->r_flags |= R_TPRINTFMSG;
 1109                 }
 1110                 if (rep->r_rexmit >= rep->r_retry) {    /* too many */
 1111                         nfsstats.rpctimeouts++;
 1112                         nfs_softterm(rep);
 1113                         continue;
 1114                 }
 1115                 if (nmp->nm_sotype != SOCK_DGRAM) {
 1116                         if (++rep->r_rexmit > NFS_MAXREXMIT)
 1117                                 rep->r_rexmit = NFS_MAXREXMIT;
 1118                         continue;
 1119                 }
 1120                 if ((so = nmp->nm_so) == NULL)
 1121                         continue;
 1122 
 1123                 /*
 1124                  * If there is enough space and the window allows..
 1125                  *      Resend it
 1126                  * Set r_rtt to -1 in case we fail to send it now.
 1127                  */
 1128                 rep->r_rtt = -1;
 1129                 if (sbspace(&so->so_snd) >= rep->r_mreq->m_pkthdr.len &&
 1130                    ((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
 1131                     (rep->r_flags & R_SENT) ||
 1132                     nmp->nm_sent < nmp->nm_cwnd) &&
 1133                    (m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))){
 1134                         if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
 1135                             error = (*so->so_proto->pr_usrreqs->pru_send)
 1136                                     (so, 0, m, NULL, NULL, td);
 1137                         else
 1138                             error = (*so->so_proto->pr_usrreqs->pru_send)
 1139                                     (so, 0, m, nmp->nm_nam, NULL, td);
 1140                         if (error) {
 1141                                 if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
 1142                                         so->so_error = 0;
 1143                         } else {
 1144                                 /*
 1145                                  * Iff first send, start timing
 1146                                  * else turn timing off, backoff timer
 1147                                  * and divide congestion window by 2.
 1148                                  */
 1149                                 if (rep->r_flags & R_SENT) {
 1150                                         rep->r_flags &= ~R_TIMING;
 1151                                         if (++rep->r_rexmit > NFS_MAXREXMIT)
 1152                                                 rep->r_rexmit = NFS_MAXREXMIT;
 1153                                         nmp->nm_cwnd >>= 1;
 1154                                         if (nmp->nm_cwnd < NFS_CWNDSCALE)
 1155                                                 nmp->nm_cwnd = NFS_CWNDSCALE;
 1156                                         nfsstats.rpcretries++;
 1157                                 } else {
 1158                                         rep->r_flags |= R_SENT;
 1159                                         nmp->nm_sent += NFS_CWNDSCALE;
 1160                                 }
 1161                                 rep->r_rtt = 0;
 1162                         }
 1163                 }
 1164         }
 1165         splx(s);
 1166         nfs_timer_handle = timeout(nfs_timer, NULL, nfs_ticks);
 1167 }
 1168 
 1169 /*
 1170  * Mark all of an nfs mount's outstanding requests with R_SOFTTERM and
 1171  * wait for all requests to complete. This is used by forced unmounts
 1172  * to terminate any outstanding RPCs.
 1173  */
 1174 int
 1175 nfs_nmcancelreqs(nmp)
 1176         struct nfsmount *nmp;
 1177 {
 1178         struct nfsreq *req;
 1179         int i, s;
 1180 
 1181         s = splnet();
 1182         TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
 1183                 if (nmp != req->r_nmp || req->r_mrep != NULL ||
 1184                     (req->r_flags & R_SOFTTERM))
 1185                         continue;
 1186                 nfs_softterm(req);
 1187         }
 1188         splx(s);
 1189 
 1190         for (i = 0; i < 30; i++) {
 1191                 s = splnet();
 1192                 TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
 1193                         if (nmp == req->r_nmp)
 1194                                 break;
 1195                 }
 1196                 splx(s);
 1197                 if (req == NULL)
 1198                         return (0);
 1199                 tsleep(&lbolt, PSOCK, "nfscancel", 0);
 1200         }
 1201         return (EBUSY);
 1202 }
 1203 
 1204 /*
 1205  * Flag a request as being about to terminate (due to NFSMNT_INT/NFSMNT_SOFT).
 1206  * The nm_send count is decremented now to avoid deadlocks when the process in
 1207  * soreceive() hasn't yet managed to send its own request.
 1208  */
 1209 
 1210 static void
 1211 nfs_softterm(struct nfsreq *rep)
 1212 {
 1213 
 1214         rep->r_flags |= R_SOFTTERM;
 1215         if (rep->r_flags & R_SENT) {
 1216                 rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
 1217                 rep->r_flags &= ~R_SENT;
 1218         }
 1219 }
 1220 
 1221 /*
 1222  * Test for a termination condition pending on the process.
 1223  * This is used for NFSMNT_INT mounts.
 1224  */
 1225 int
 1226 nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct thread *td)
 1227 {
 1228         struct proc *p;
 1229         sigset_t tmpset;
 1230 
 1231         if (rep && (rep->r_flags & R_SOFTTERM))
 1232                 return (EINTR);
 1233         /* Terminate all requests while attempting a forced unmount. */
 1234         if (nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF)
 1235                 return (EINTR);
 1236         if (!(nmp->nm_flag & NFSMNT_INT))
 1237                 return (0);
 1238         if (td == NULL)
 1239                 return (0);
 1240 
 1241         p = td->td_proc;
 1242         tmpset = p->p_siglist;
 1243         SIGSETNAND(tmpset, p->p_sigmask);
 1244         SIGSETNAND(tmpset, p->p_sigignore);
 1245         if (SIGNOTEMPTY(p->p_siglist) && NFSINT_SIGMASK(tmpset))
 1246                 return (EINTR);
 1247 
 1248         return (0);
 1249 }
 1250 
 1251 /*
 1252  * Lock a socket against others.
 1253  * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
 1254  * and also to avoid race conditions between the processes with nfs requests
 1255  * in progress when a reconnect is necessary.
 1256  */
 1257 int
 1258 nfs_sndlock(struct nfsreq *rep)
 1259 {
 1260         int *statep = &rep->r_nmp->nm_state;
 1261         struct thread *td;
 1262         int slpflag = 0, slptimeo = 0;
 1263 
 1264         if (rep) {
 1265                 td = rep->r_td;
 1266                 if (rep->r_nmp->nm_flag & NFSMNT_INT)
 1267                         slpflag = PCATCH;
 1268         } else
 1269                 td = NULL;
 1270         while (*statep & NFSSTA_SNDLOCK) {
 1271                 if (nfs_sigintr(rep->r_nmp, rep, td))
 1272                         return (EINTR);
 1273                 *statep |= NFSSTA_WANTSND;
 1274                 (void) tsleep((caddr_t)statep, slpflag | (PZERO - 1),
 1275                         "nfsndlck", slptimeo);
 1276                 if (slpflag == PCATCH) {
 1277                         slpflag = 0;
 1278                         slptimeo = 2 * hz;
 1279                 }
 1280         }
 1281         *statep |= NFSSTA_SNDLOCK;
 1282         return (0);
 1283 }
 1284 
 1285 /*
 1286  * Unlock the stream socket for others.
 1287  */
 1288 void
 1289 nfs_sndunlock(struct nfsreq *rep)
 1290 {
 1291         int *statep = &rep->r_nmp->nm_state;
 1292 
 1293         if ((*statep & NFSSTA_SNDLOCK) == 0)
 1294                 panic("nfs sndunlock");
 1295         *statep &= ~NFSSTA_SNDLOCK;
 1296         if (*statep & NFSSTA_WANTSND) {
 1297                 *statep &= ~NFSSTA_WANTSND;
 1298                 wakeup((caddr_t)statep);
 1299         }
 1300 }
 1301 
 1302 static int
 1303 nfs_rcvlock(struct nfsreq *rep)
 1304 {
 1305         int *statep = &rep->r_nmp->nm_state;
 1306         int slpflag, slptimeo = 0;
 1307 
 1308         if (rep->r_nmp->nm_flag & NFSMNT_INT)
 1309                 slpflag = PCATCH;
 1310         else
 1311                 slpflag = 0;
 1312         while (*statep & NFSSTA_RCVLOCK) {
 1313                 if (nfs_sigintr(rep->r_nmp, rep, rep->r_td))
 1314                         return (EINTR);
 1315                 *statep |= NFSSTA_WANTRCV;
 1316                 (void) tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsrcvlk",
 1317                         slptimeo);
 1318                 /*
 1319                  * If our reply was recieved while we were sleeping,
 1320                  * then just return without taking the lock to avoid a
 1321                  * situation where a single iod could 'capture' the
 1322                  * recieve lock.
 1323                  */
 1324                 if (rep->r_mrep != NULL)
 1325                         return (EALREADY);
 1326                 if (slpflag == PCATCH) {
 1327                         slpflag = 0;
 1328                         slptimeo = 2 * hz;
 1329                 }
 1330         }
 1331         /* Always fail if our request has been cancelled. */
 1332         if (rep != NULL && (rep->r_flags & R_SOFTTERM))
 1333                 return (EINTR);
 1334         *statep |= NFSSTA_RCVLOCK;
 1335         return (0);
 1336 }
 1337 
 1338 /*
 1339  * Unlock the stream socket for others.
 1340  */
 1341 static void
 1342 nfs_rcvunlock(struct nfsreq *rep)
 1343 {
 1344         int *statep = &rep->r_nmp->nm_state;
 1345 
 1346         if ((*statep & NFSSTA_RCVLOCK) == 0)
 1347                 panic("nfs rcvunlock");
 1348         *statep &= ~NFSSTA_RCVLOCK;
 1349         if (*statep & NFSSTA_WANTRCV) {
 1350                 *statep &= ~NFSSTA_WANTRCV;
 1351                 wakeup((caddr_t)statep);
 1352         }
 1353 }
 1354 
 1355 /*
 1356  *      nfs_realign:
 1357  *
 1358  *      Check for badly aligned mbuf data and realign by copying the unaligned
 1359  *      portion of the data into a new mbuf chain and freeing the portions
 1360  *      of the old chain that were replaced.
 1361  *
 1362  *      We cannot simply realign the data within the existing mbuf chain
 1363  *      because the underlying buffers may contain other rpc commands and
 1364  *      we cannot afford to overwrite them.
 1365  *
 1366  *      We would prefer to avoid this situation entirely.  The situation does
 1367  *      not occur with NFS/UDP and is supposed to only occassionally occur
 1368  *      with TCP.  Use vfs.nfs.realign_count and realign_test to check this.
 1369  */
 1370 static void
 1371 nfs_realign(struct mbuf **pm, int hsiz)
 1372 {
 1373         struct mbuf *m;
 1374         struct mbuf *n = NULL;
 1375         int off = 0;
 1376 
 1377         ++nfs_realign_test;
 1378         while ((m = *pm) != NULL) {
 1379                 if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
 1380                         MGET(n, M_TRYWAIT, MT_DATA);
 1381                         if (m->m_len >= MINCLSIZE) {
 1382                                 MCLGET(n, M_TRYWAIT);
 1383                         }
 1384                         n->m_len = 0;
 1385                         break;
 1386                 }
 1387                 pm = &m->m_next;
 1388         }
 1389         /*
 1390          * If n is non-NULL, loop on m copying data, then replace the
 1391          * portion of the chain that had to be realigned.
 1392          */
 1393         if (n != NULL) {
 1394                 ++nfs_realign_count;
 1395                 while (m) {
 1396                         m_copyback(n, off, m->m_len, mtod(m, caddr_t));
 1397                         off += m->m_len;
 1398                         m = m->m_next;
 1399                 }
 1400                 m_freem(*pm);
 1401                 *pm = n;
 1402         }
 1403 }
 1404 
 1405 
 1406 static int
 1407 nfs_msg(struct thread *td, char *server, char *msg)
 1408 {
 1409 
 1410         tprintf(td ? td->td_proc : NULL, LOG_INFO,
 1411             "nfs server %s: %s\n", server, msg);
 1412         return (0);
 1413 }

Cache object: 8b3c755782aea60c5844c41f064ece95


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.