/*-
 * Copyright (c) 1989, 1991, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_socket.c	8.5 (Berkeley) 3/30/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * Socket operations for use by nfs
 */

#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/vnode.h>

#include <netinet/in.h>
#include <netinet/tcp.h>

#include <rpc/rpcclnt.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfsclient/nfs.h>
#include <nfs/xdr_subs.h>
#include <nfsclient/nfsm_subs.h>
#include <nfsclient/nfsmount.h>
#include <nfsclient/nfsnode.h>

#include <nfs4client/nfs4.h>

#define	TRUE	1
#define	FALSE	0

static int	nfs_realign_test;
static int	nfs_realign_count;
static int	nfs_bufpackets = 4;
static int	nfs_reconnects;
static int	nfs3_jukebox_delay = 10;
static int	nfs_skip_wcc_data_onerr = 1;

SYSCTL_DECL(_vfs_nfs);

SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RW, &nfs_realign_test, 0, "");
SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RW, &nfs_realign_count, 0, "");
SYSCTL_INT(_vfs_nfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0, "");
SYSCTL_INT(_vfs_nfs, OID_AUTO, reconnects, CTLFLAG_RD, &nfs_reconnects, 0,
    "number of times the nfs client has had to reconnect");
SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs3_jukebox_delay, CTLFLAG_RW, &nfs3_jukebox_delay, 0,
    "number of seconds to delay a retry after receiving EJUKEBOX");
SYSCTL_INT(_vfs_nfs, OID_AUTO, skip_wcc_data_onerr, CTLFLAG_RW, &nfs_skip_wcc_data_onerr, 0, "");

/*
 * There is a congestion window for outstanding rpcs maintained per mount
 * point.  The cwnd size is adjusted in roughly the way described for TCP in:
 *	Van Jacobson, "Congestion Avoidance and Control", In "Proceedings of
 *	SIGCOMM '88".  ACM, August 1988.
 * The cwnd size is chopped in half on a retransmit timeout and incremented
 * by 1/cwnd when each rpc reply is received and a full cwnd of rpcs is in
 * progress.
 * (The sent count and cwnd are scaled for integer arith.)
 * Variants of "slow start" were tried and were found to be too much of a
 * performance hit (ave. rtt 3 times larger), I suspect due to the large
 * rtt that nfs rpcs have.
 */
#define	NFS_CWNDSCALE	256
#define	NFS_MAXCWND	(NFS_CWNDSCALE * 32)
#define	NFS_NBACKOFF	8
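/*
 * Retransmit backoff table: after nm_timeouts consecutive timeouts on a
 * mount, nfs_timer() multiplies the estimated RTO by
 * nfs_backoff[nm_timeouts - 1], i.e. an exponential 2x..256x backoff.
 */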
static int nfs_backoff[NFS_NBACKOFF] = { 2, 4, 8, 16, 32, 64, 128, 256, };
struct callout	nfs_callout;

static int	nfs_msg(struct thread *, const char *, const char *, int);
static int	nfs_realign(struct mbuf **pm, int hsiz);
static int	nfs_reply(struct nfsreq *);
static void	nfs_softterm(struct nfsreq *rep);
static int	nfs_reconnect(struct nfsreq *rep);
static void	nfs_clnt_tcp_soupcall(struct socket *so, void *arg, int waitflag);
static void	nfs_clnt_udp_soupcall(struct socket *so, void *arg, int waitflag);

extern struct mtx nfs_reqq_mtx;

/*
 * RTT estimator
 */

static enum nfs_rto_timer_t nfs_proct[NFS_NPROCS] = {
	NFS_DEFAULT_TIMER,	/* NULL */
	NFS_GETATTR_TIMER,	/* GETATTR */
	NFS_DEFAULT_TIMER,	/* SETATTR */
	NFS_LOOKUP_TIMER,	/* LOOKUP */
	NFS_GETATTR_TIMER,	/* ACCESS */
	NFS_READ_TIMER,		/* READLINK */
	NFS_READ_TIMER,		/* READ */
	NFS_WRITE_TIMER,	/* WRITE */
	NFS_DEFAULT_TIMER,	/* CREATE */
	NFS_DEFAULT_TIMER,	/* MKDIR */
	NFS_DEFAULT_TIMER,	/* SYMLINK */
	NFS_DEFAULT_TIMER,	/* MKNOD */
	NFS_DEFAULT_TIMER,	/* REMOVE */
	NFS_DEFAULT_TIMER,	/* RMDIR */
	NFS_DEFAULT_TIMER,	/* RENAME */
	NFS_DEFAULT_TIMER,	/* LINK */
	NFS_READ_TIMER,		/* READDIR */
	NFS_READ_TIMER,		/* READDIRPLUS */
	NFS_DEFAULT_TIMER,	/* FSSTAT */
	NFS_DEFAULT_TIMER,	/* FSINFO */
	NFS_DEFAULT_TIMER,	/* PATHCONF */
	NFS_DEFAULT_TIMER,	/* COMMIT */
	NFS_DEFAULT_TIMER,	/* NOOP */
};

/*
 * Choose the correct RTT timer for this NFS procedure.
 */
static inline enum nfs_rto_timer_t
nfs_rto_timer(u_int32_t procnum)
{
	return (nfs_proct[procnum]);
}

/*
 * Initialize the RTT estimator state for a new mount point.
 */
static void
nfs_init_rtt(struct nfsmount *nmp)
{
	int i;

	for (i = 0; i < NFS_MAX_TIMER; i++)
		nmp->nm_srtt[i] = NFS_INITRTT;
	for (i = 0; i < NFS_MAX_TIMER; i++)
		nmp->nm_sdrtt[i] = 0;
}

/*
 * Update a mount point's RTT estimator state using data from the
 * passed-in request.
 *
 * Use a gain of 0.125 on the mean and a gain of 0.25 on the deviation.
 *
 * NB: Since the timer resolution of NFS_HZ is so coarse, it can often
 * result in r_rtt == 0.  Since r_rtt == N means that the actual RTT is
 * between N + dt and N + 2 - dt ticks, add 1 before calculating the
 * update values.
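 *
 * Worked example (a sketch; values in NFS_HZ ticks, C integer
 * arithmetic): nm_srtt[] stores 8x the smoothed mean, so with
 * *srtt == 32 (mean 4) and a new sample r_rtt == 7, t1 becomes
 * 8 - (32 >> 3) = 4 and *srtt becomes 36.  nm_sdrtt[] stores 4x the
 * smoothed deviation and moves a quarter of the way toward |t1| in
 * the same fashion.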
 */
static void
nfs_update_rtt(struct nfsreq *rep)
{
	int t1 = rep->r_rtt + 1;
	int index = nfs_rto_timer(rep->r_procnum) - 1;
	int *srtt = &rep->r_nmp->nm_srtt[index];
	int *sdrtt = &rep->r_nmp->nm_sdrtt[index];

	t1 -= *srtt >> 3;
	*srtt += t1;
	if (t1 < 0)
		t1 = -t1;
	t1 -= *sdrtt >> 2;
	*sdrtt += t1;
}

/*
 * Estimate RTO for an NFS RPC sent via an unreliable datagram.
 *
 * Use the mean and mean deviation of RTT for the appropriate type
 * of RPC for the frequent RPCs and a default for the others.
 * The justification for doing "other" this way is that these RPCs
 * happen so infrequently that the timer estimate would probably be
 * stale.  Also, since many of these RPCs are non-idempotent, a
 * conservative timeout is desired.
 *
 *	getattr, lookup - A+2D
 *	read, write - A+4D
 *	other - nm_timeo
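 *
 * Note on scaling: nfs_update_rtt() keeps nm_srtt[] at 8x the mean (A)
 * and nm_sdrtt[] at 4x the deviation (D), so the shifts below recover
 * small multiples of A and D rather than the raw accumulators.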
 */
static int
nfs_estimate_rto(struct nfsmount *nmp, u_int32_t procnum)
{
	enum nfs_rto_timer_t timer = nfs_rto_timer(procnum);
	int index = timer - 1;
	int rto;

	switch (timer) {
	case NFS_GETATTR_TIMER:
	case NFS_LOOKUP_TIMER:
		rto = ((nmp->nm_srtt[index] + 3) >> 2) +
		    ((nmp->nm_sdrtt[index] + 1) >> 1);
		break;
	case NFS_READ_TIMER:
	case NFS_WRITE_TIMER:
		rto = ((nmp->nm_srtt[index] + 7) >> 3) +
		    (nmp->nm_sdrtt[index] + 1);
		break;
	default:
		rto = nmp->nm_timeo;
		return (rto);
	}

	if (rto < NFS_MINRTO)
		rto = NFS_MINRTO;
	else if (rto > NFS_MAXRTO)
		rto = NFS_MAXRTO;

	return (rto);
}

/*
 * Initialize sockets and congestion for a new NFS connection.
 * We do not free the sockaddr if error.
 */
int
nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
{
	struct socket *so;
	int error, rcvreserve, sndreserve;
	int pktscale;
	struct sockaddr *saddr;
	struct ucred *origcred;
	struct thread *td = curthread;

	/*
	 * We need to establish the socket using the credentials of
	 * the mountpoint.  Some parts of this process (such as
	 * sobind() and soconnect()) will use the current thread's
	 * credential instead of the socket credential.  To work
	 * around this, temporarily change the current thread's
	 * credential to that of the mountpoint.
	 *
	 * XXX: It would be better to explicitly pass the correct
	 * credential to sobind() and soconnect().
	 */
	origcred = td->td_ucred;
	td->td_ucred = nmp->nm_mountp->mnt_cred;

	if (nmp->nm_sotype == SOCK_STREAM) {
		mtx_lock(&nmp->nm_mtx);
		nmp->nm_nfstcpstate.flags |= NFS_TCP_EXPECT_RPCMARKER;
		nmp->nm_nfstcpstate.rpcresid = 0;
		mtx_unlock(&nmp->nm_mtx);
	}
	nmp->nm_so = NULL;
	saddr = nmp->nm_nam;
	error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype,
	    nmp->nm_soproto, nmp->nm_mountp->mnt_cred, td);
	if (error)
		goto bad;
	so = nmp->nm_so;
	nmp->nm_soflags = so->so_proto->pr_flags;

	/*
	 * Some servers require that the client port be a reserved port
	 * number.
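	 * The dance below obtains one without hardcoding a port: set
	 * IP(V6)_PORTRANGE_LOW, bind to an anonymous port (which the
	 * kernel then allocates from the reserved range), and restore
	 * IP(V6)_PORTRANGE_DEFAULT afterwards.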
	 */
	if (nmp->nm_flag & NFSMNT_RESVPORT) {
		struct sockopt sopt;
		int ip, ip2, len;
		struct sockaddr_in6 ssin;
		struct sockaddr *sa;

		bzero(&sopt, sizeof sopt);
		switch (saddr->sa_family) {
		case AF_INET:
			sopt.sopt_level = IPPROTO_IP;
			sopt.sopt_name = IP_PORTRANGE;
			ip = IP_PORTRANGE_LOW;
			ip2 = IP_PORTRANGE_DEFAULT;
			len = sizeof (struct sockaddr_in);
			break;
#ifdef INET6
		case AF_INET6:
			sopt.sopt_level = IPPROTO_IPV6;
			sopt.sopt_name = IPV6_PORTRANGE;
			ip = IPV6_PORTRANGE_LOW;
			ip2 = IPV6_PORTRANGE_DEFAULT;
			len = sizeof (struct sockaddr_in6);
			break;
#endif
		default:
			goto noresvport;
		}
		sa = (struct sockaddr *)&ssin;
		bzero(sa, len);
		sa->sa_len = len;
		sa->sa_family = saddr->sa_family;
		sopt.sopt_dir = SOPT_SET;
		sopt.sopt_val = (void *)&ip;
		sopt.sopt_valsize = sizeof(ip);
		error = sosetopt(so, &sopt);
		if (error)
			goto bad;
		error = sobind(so, sa, td);
		if (error)
			goto bad;
		ip = ip2;
		error = sosetopt(so, &sopt);
		if (error)
			goto bad;
noresvport:	;
	}

	/*
	 * Protocols that do not require connections may be optionally left
	 * unconnected for servers that reply from a port other than NFS_PORT.
	 */
	mtx_lock(&nmp->nm_mtx);
	if (nmp->nm_flag & NFSMNT_NOCONN) {
		if (nmp->nm_soflags & PR_CONNREQUIRED) {
			error = ENOTCONN;
			mtx_unlock(&nmp->nm_mtx);
			goto bad;
		} else
			mtx_unlock(&nmp->nm_mtx);
	} else {
		mtx_unlock(&nmp->nm_mtx);
		error = soconnect(so, nmp->nm_nam, td);
		if (error)
			goto bad;

		/*
		 * Wait for the connection to complete.  Cribbed from the
		 * connect system call but with the wait timing out so
		 * that interruptible mounts don't hang here for a long time.
		 */
		SOCK_LOCK(so);
		while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
			(void) msleep(&so->so_timeo, SOCK_MTX(so),
			    PSOCK, "nfscon", 2 * hz);
			if ((so->so_state & SS_ISCONNECTING) &&
			    so->so_error == 0 && rep &&
			    (error = nfs_sigintr(nmp, rep, rep->r_td)) != 0) {
				so->so_state &= ~SS_ISCONNECTING;
				SOCK_UNLOCK(so);
				goto bad;
			}
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCK_UNLOCK(so);
			goto bad;
		}
		SOCK_UNLOCK(so);
	}
	so->so_rcv.sb_timeo = 12 * hz;
	if (nmp->nm_sotype == SOCK_STREAM)
		so->so_snd.sb_timeo = 1 * hz;	/* 1s snd timeout for NFS/TCP */
	else
		so->so_snd.sb_timeo = 5 * hz;

	/*
	 * Get buffer reservation size from sysctl, but impose reasonable
	 * limits.
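	 * As a rough sizing example (hypothetical values): with pktscale 4
	 * and an 8 KB nm_wsize, a TCP mount asks for about
	 * 4 * (8192 + NFS_MAXPKTHDR + 4) bytes of send space, where the
	 * extra 4 bytes cover the RPC record mark.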
	 */
	pktscale = nfs_bufpackets;
	if (pktscale < 2)
		pktscale = 2;
	if (pktscale > 64)
		pktscale = 64;
	mtx_lock(&nmp->nm_mtx);
	if (nmp->nm_sotype == SOCK_DGRAM) {
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
		rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
		    NFS_MAXPKTHDR) * pktscale;
	} else if (nmp->nm_sotype == SOCK_SEQPACKET) {
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
		rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
		    NFS_MAXPKTHDR) * pktscale;
	} else {
		if (nmp->nm_sotype != SOCK_STREAM)
			panic("nfscon sotype");
		if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
			struct sockopt sopt;
			int val;

			bzero(&sopt, sizeof sopt);
			sopt.sopt_dir = SOPT_SET;
			sopt.sopt_level = SOL_SOCKET;
			sopt.sopt_name = SO_KEEPALIVE;
			sopt.sopt_val = &val;
			sopt.sopt_valsize = sizeof val;
			val = 1;
			mtx_unlock(&nmp->nm_mtx);
			sosetopt(so, &sopt);
			mtx_lock(&nmp->nm_mtx);
		}
		if (so->so_proto->pr_protocol == IPPROTO_TCP) {
			struct sockopt sopt;
			int val;

			bzero(&sopt, sizeof sopt);
			sopt.sopt_dir = SOPT_SET;
			sopt.sopt_level = IPPROTO_TCP;
			sopt.sopt_name = TCP_NODELAY;
			sopt.sopt_val = &val;
			sopt.sopt_valsize = sizeof val;
			val = 1;
			mtx_unlock(&nmp->nm_mtx);
			sosetopt(so, &sopt);
			mtx_lock(&nmp->nm_mtx);
		}
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
		    sizeof (u_int32_t)) * pktscale;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
		    sizeof (u_int32_t)) * pktscale;
	}
	mtx_unlock(&nmp->nm_mtx);
	error = soreserve(so, sndreserve, rcvreserve);
	if (error)
		goto bad;
	SOCKBUF_LOCK(&so->so_rcv);
	so->so_rcv.sb_flags |= SB_NOINTR;
	so->so_upcallarg = (caddr_t)nmp;
	if (so->so_type == SOCK_STREAM)
		so->so_upcall = nfs_clnt_tcp_soupcall;
	else
		so->so_upcall = nfs_clnt_udp_soupcall;
	so->so_rcv.sb_flags |= SB_UPCALL;
	SOCKBUF_UNLOCK(&so->so_rcv);
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_flags |= SB_NOINTR;
	SOCKBUF_UNLOCK(&so->so_snd);

	/* Restore current thread's credentials. */
	td->td_ucred = origcred;

	mtx_lock(&nmp->nm_mtx);
	/* Initialize other non-zero congestion variables. */
	nfs_init_rtt(nmp);
	nmp->nm_cwnd = NFS_MAXCWND / 2;	/* Initial send window */
	nmp->nm_sent = 0;
	nmp->nm_timeouts = 0;
	mtx_unlock(&nmp->nm_mtx);
	return (0);

bad:
	/* Restore current thread's credentials. */
	td->td_ucred = origcred;

	nfs_disconnect(nmp);
	return (error);
}

static void
nfs_wakup_reconnectors(struct nfsmount *nmp)
{
	KASSERT(mtx_owned(&nmp->nm_mtx), ("NFS mnt lock not owned!"));
	if (--nmp->nm_nfstcpstate.sock_send_inprog == 0 &&
	    (nmp->nm_nfstcpstate.flags & NFS_TCP_WAIT_WRITE_DRAIN)) {
		nmp->nm_nfstcpstate.flags &= ~NFS_TCP_WAIT_WRITE_DRAIN;
		wakeup((caddr_t)&nmp->nm_nfstcpstate.sock_send_inprog);
	}
}

/*
 * Reconnect routine:
 * Called when a connection is broken on a reliable protocol.
 * - clean up the old socket
 * - nfs_connect() again
 * - set R_MUSTRESEND for all outstanding requests on mount point
 * If this fails the mount point is DEAD!
 * nb: Must be called with nm_mtx held; this routine acquires the
 *     nfs_connect_lock itself to serialize reconnect attempts.
 */
static int
nfs_reconnect(struct nfsreq *rep)
{
	struct nfsreq *rp;
	struct nfsmount *nmp = rep->r_nmp;
	int error;
	int slpflag = 0;

	KASSERT(mtx_owned(&nmp->nm_mtx), ("NFS mnt lock not owned!"));
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	/*
	 * Wait for any pending writes to this socket to drain (or timeout).
	 */
	while (nmp->nm_nfstcpstate.sock_send_inprog > 0) {
		nmp->nm_nfstcpstate.flags |= NFS_TCP_WAIT_WRITE_DRAIN;
		error = msleep((caddr_t)&nmp->nm_nfstcpstate.sock_send_inprog,
		    &nmp->nm_mtx, slpflag | (PZERO - 1), "nfscon", 0);
	}
	/*
	 * Grab the nfs_connect_lock to serialize connects.
	 * After grabbing the nfs_connect_lock, check if a reconnect is
	 * still necessary or if someone else beat us to the connect!
	 */
	error = nfs_connect_lock(rep);
	if (error)
		goto unlock_exit;
	if ((nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT) == 0)
		goto unlock_exit;
	else
		mtx_unlock(&nmp->nm_mtx);

	nfs_reconnects++;
	nfs_disconnect(nmp);
	while ((error = nfs_connect(nmp, rep)) != 0) {
		if (error == ERESTART)
			error = EINTR;
		if (error == EIO || error == EINTR) {
			mtx_lock(&nmp->nm_mtx);
			goto unlock_exit;
		}
		(void) tsleep(&lbolt, PSOCK, "nfscon", 0);
	}

	/*
	 * Clear the FORCE_RECONNECT flag only after the connect succeeds,
	 * to prevent races between multiple processes waiting on the
	 * mountpoint where the connection is being torn down.  The first
	 * one to acquire the connect lock will retry the connection.  The
	 * others block on the connect lock until the connection is
	 * established successfully, and then re-transmit the request.
	 */
	mtx_lock(&nmp->nm_mtx);
	nmp->nm_nfstcpstate.flags &= ~NFS_TCP_FORCE_RECONNECT;
	nmp->nm_nfstcpstate.rpcresid = 0;
	mtx_unlock(&nmp->nm_mtx);

	/*
	 * Loop through outstanding request list and fix up all requests
	 * on old socket.
	 */
	mtx_lock(&nfs_reqq_mtx);
	TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
		if (rp->r_nmp == nmp) {
			mtx_lock(&rp->r_mtx);
			rp->r_flags |= R_MUSTRESEND;
			mtx_unlock(&rp->r_mtx);
		}
	}
	mtx_unlock(&nfs_reqq_mtx);
	mtx_lock(&nmp->nm_mtx);
unlock_exit:
	nfs_connect_unlock(rep);
	mtx_unlock(&nmp->nm_mtx);
	return (error);
}

/*
 * NFS disconnect.  Clean up and unlink.
 */
void
nfs_disconnect(struct nfsmount *nmp)
{
	struct socket *so;

	mtx_lock(&nmp->nm_mtx);
	if (nmp->nm_so) {
		so = nmp->nm_so;
		nmp->nm_so = NULL;
		mtx_unlock(&nmp->nm_mtx);
		SOCKBUF_LOCK(&so->so_rcv);
		so->so_upcallarg = NULL;
		so->so_upcall = NULL;
		so->so_rcv.sb_flags &= ~SB_UPCALL;
		SOCKBUF_UNLOCK(&so->so_rcv);
		soshutdown(so, SHUT_WR);
		soclose(so);
	} else
		mtx_unlock(&nmp->nm_mtx);
}

void
nfs_safedisconnect(struct nfsmount *nmp)
{
	struct nfsreq dummyreq;

	bzero(&dummyreq, sizeof(dummyreq));
	dummyreq.r_nmp = nmp;
	nfs_disconnect(nmp);
}

/*
 * This is the nfs send routine.  For connection based socket types, it
 * must be called with the send lock (see nfs_connect_lock()) held on
 * the mount point.
 * - return EINTR if the RPC is terminated, 0 otherwise
 * - set R_MUSTRESEND if the send fails for any reason
 * - do any cleanup required by recoverable socket errors (?)
 */
int
nfs_send(struct socket *so, struct sockaddr *nam, struct mbuf *top,
    struct nfsreq *rep)
{
	struct sockaddr *sendnam;
	int error, error2, soflags, flags;

	KASSERT(rep, ("nfs_send: called with rep == NULL"));

	error = nfs_sigintr(rep->r_nmp, rep, rep->r_td);
	if (error) {
		m_freem(top);
		return (error);
	}
	mtx_lock(&rep->r_nmp->nm_mtx);
	mtx_lock(&rep->r_mtx);
	if ((so = rep->r_nmp->nm_so) == NULL) {
		rep->r_flags |= R_MUSTRESEND;
		mtx_unlock(&rep->r_mtx);
		mtx_unlock(&rep->r_nmp->nm_mtx);
		m_freem(top);
		return (EPIPE);
	}
	rep->r_flags &= ~R_MUSTRESEND;
	soflags = rep->r_nmp->nm_soflags;
	mtx_unlock(&rep->r_mtx);
	mtx_unlock(&rep->r_nmp->nm_mtx);

	if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
		sendnam = NULL;
	else
		sendnam = nam;
	if (so->so_type == SOCK_SEQPACKET)
		flags = MSG_EOR;
	else
		flags = 0;

	error = sosend(so, sendnam, 0, top, 0, flags, curthread /*XXX*/);
	if (error == ENOBUFS && so->so_type == SOCK_DGRAM) {
		error = 0;
		mtx_lock(&rep->r_mtx);
		rep->r_flags |= R_MUSTRESEND;
		mtx_unlock(&rep->r_mtx);
	}

	if (error) {
		/*
		 * Don't report EPIPE errors on nfs sockets.
		 * These can be due to idle TCP mounts, which many servers
		 * (NetApp, Solaris, etc.) close if left idle too long.
		 */
		if (error != EPIPE) {
			log(LOG_INFO, "nfs send error %d for server %s\n",
			    error,
			    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
		}
		/*
		 * Deal with errors for the client side.
		 */
		error2 = NFS_SIGREP(rep);
		if (error2)
			error = error2;
		else {
			mtx_lock(&rep->r_mtx);
			rep->r_flags |= R_MUSTRESEND;
			mtx_unlock(&rep->r_mtx);
		}

		/*
		 * Handle any recoverable (soft) socket errors here. (?)
		 * Make EWOULDBLOCK a recoverable error; we'll rexmit from
		 * nfs_timer().
		 */
		if (error != EINTR && error != ERESTART && error != EIO &&
		    error != EPIPE)
			error = 0;
	}
	return (error);
}

int
nfs_reply(struct nfsreq *rep)
{
	struct socket *so;
	struct mbuf *m;
	int error = 0, sotype, slpflag;
	struct nfsmount *nmp = rep->r_nmp;

	sotype = nmp->nm_sotype;
	/*
	 * For reliable protocols, lock against other senders/receivers
	 * in case a reconnect is necessary.
	 */
	if (sotype != SOCK_DGRAM) {
tryagain:
		mtx_lock(&nmp->nm_mtx);
		mtx_lock(&rep->r_mtx);
		if (rep->r_mrep) {
			mtx_unlock(&rep->r_mtx);
			mtx_unlock(&nmp->nm_mtx);
			return (0);
		}
		if (rep->r_flags & R_SOFTTERM) {
			mtx_unlock(&rep->r_mtx);
			mtx_unlock(&nmp->nm_mtx);
			return (EINTR);
		}
		so = nmp->nm_so;
		if (!so ||
		    (nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT)) {
			mtx_unlock(&rep->r_mtx);
			nmp->nm_nfstcpstate.flags |= NFS_TCP_FORCE_RECONNECT;
			error = nfs_reconnect(rep);
			if (error)
				return (error);
			goto tryagain;
		}
		while (rep->r_flags & R_MUSTRESEND) {
			mtx_unlock(&rep->r_mtx);
			nmp->nm_nfstcpstate.sock_send_inprog++;
			mtx_unlock(&nmp->nm_mtx);
			m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT);
			nfsstats.rpcretries++;
			error = nfs_send(so, nmp->nm_nam, m, rep);
			if (error) {
				mtx_lock(&nmp->nm_mtx);
				nfs_wakup_reconnectors(nmp);
				if (!(error == EINTR || error == ERESTART)) {
					nmp->nm_nfstcpstate.flags |=
					    NFS_TCP_FORCE_RECONNECT;
					error = nfs_reconnect(rep);
				} else
					mtx_unlock(&nmp->nm_mtx);
				if (error)
					return (error);
				goto tryagain;
			} else {
				mtx_lock(&nmp->nm_mtx);
				nfs_wakup_reconnectors(nmp);
				mtx_lock(&rep->r_mtx);
			}
		}
		mtx_unlock(&rep->r_mtx);
		mtx_unlock(&nmp->nm_mtx);
	}
	slpflag = 0;
	mtx_lock(&nmp->nm_mtx);
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	mtx_unlock(&nmp->nm_mtx);
	mtx_lock(&rep->r_mtx);
	while ((rep->r_mrep == NULL) && (error == 0) &&
	    ((rep->r_flags & R_SOFTTERM) == 0) &&
	    ((sotype == SOCK_DGRAM) || ((rep->r_flags & R_MUSTRESEND) == 0)))
		error = msleep((caddr_t)rep, &rep->r_mtx,
		    slpflag | (PZERO - 1), "nfsreq", 0);
	if (error == EINTR || error == ERESTART) {
		/* NFS operations aren't restartable.  Map ERESTART to EINTR */
		mtx_unlock(&rep->r_mtx);
		return (EINTR);
	}
	if (rep->r_flags & R_SOFTTERM) {
		/*
		 * Request was terminated because we exceeded the retries
		 * (soft mount).
		 */
		mtx_unlock(&rep->r_mtx);
		return (ETIMEDOUT);
	}
	mtx_unlock(&rep->r_mtx);
	if (sotype == SOCK_STREAM) {
		mtx_lock(&nmp->nm_mtx);
		mtx_lock(&rep->r_mtx);
		if (((nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT) ||
		    (rep->r_flags & R_MUSTRESEND))) {
			mtx_unlock(&rep->r_mtx);
			mtx_unlock(&nmp->nm_mtx);
			goto tryagain;
		} else {
			mtx_unlock(&rep->r_mtx);
			mtx_unlock(&nmp->nm_mtx);
		}
	}
	return (error);
}

/*
 * XXX TO DO
 * Make nfs_realign() non-blocking.  Also make nfsm_dissect() non-blocking.
 */
static void
nfs_clnt_match_xid(struct socket *so, struct nfsmount *nmp,
    struct mbuf *mrep)
{
	struct mbuf *md;
	caddr_t dpos;
	u_int32_t rxid, *tl;
	struct nfsreq *rep;
	int error;

	/*
	 * Search for any mbufs that are not a multiple of 4 bytes long
	 * or with m_data not longword aligned.
	 * These could cause pointer alignment problems, so copy them to
	 * well aligned mbufs.
	 */
	if (nfs_realign(&mrep, 5 * NFSX_UNSIGNED) == ENOMEM) {
		m_freem(mrep);
		nfsstats.rpcinvalid++;
		return;
	}

	/*
	 * Get the xid and check that it is an rpc reply.
	 */
	md = mrep;
	dpos = mtod(md, caddr_t);
	tl = nfsm_dissect_nonblock(u_int32_t *, 2 * NFSX_UNSIGNED);
	rxid = *tl++;
	if (*tl != rpc_reply) {
		m_freem(mrep);
nfsmout:
		nfsstats.rpcinvalid++;
		return;
	}

	mtx_lock(&nfs_reqq_mtx);
	/*
	 * Loop through the request list to match up the reply.
	 * If no match, just drop the datagram.
	 */
	TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
		mtx_lock(&nmp->nm_mtx);
		mtx_lock(&rep->r_mtx);
		if (rep->r_mrep == NULL && rxid == rep->r_xid) {
			/* Found it.. */
			rep->r_mrep = mrep;
			rep->r_md = md;
			rep->r_dpos = dpos;
			/*
			 * Update congestion window.
			 * Do the additive increase of
			 * one rpc/rtt.
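			 * The increment below works out to
			 * NFS_CWNDSCALE^2 / cwnd (rounded), so one full
			 * window of replies grows cwnd by about one
			 * scaled rpc.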
			 */
			if (nmp->nm_cwnd <= nmp->nm_sent) {
				nmp->nm_cwnd +=
				    (NFS_CWNDSCALE * NFS_CWNDSCALE +
				    (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
				if (nmp->nm_cwnd > NFS_MAXCWND)
					nmp->nm_cwnd = NFS_MAXCWND;
			}
			if (rep->r_flags & R_SENT) {
				rep->r_flags &= ~R_SENT;
				nmp->nm_sent -= NFS_CWNDSCALE;
			}
			if (rep->r_flags & R_TIMING)
				nfs_update_rtt(rep);
			nmp->nm_timeouts = 0;
			wakeup((caddr_t)rep);
			mtx_unlock(&rep->r_mtx);
			mtx_unlock(&nmp->nm_mtx);
			break;
		}
		mtx_unlock(&rep->r_mtx);
		mtx_unlock(&nmp->nm_mtx);
	}
	/*
	 * If the reply was not matched to a request, drop it.
	 * (A matched requestor was already woken up above.)
	 */
	if (rep == NULL) {
		nfsstats.rpcunexpected++;
		m_freem(mrep);
	}
	mtx_unlock(&nfs_reqq_mtx);
}

static void
nfs_mark_for_reconnect(struct nfsmount *nmp)
{
	struct nfsreq *rp;

	mtx_lock(&nmp->nm_mtx);
	nmp->nm_nfstcpstate.flags |= NFS_TCP_FORCE_RECONNECT;
	mtx_unlock(&nmp->nm_mtx);
	/*
	 * Wake up all processes that are waiting for replies
	 * on this mount point.  One of them does the reconnect.
	 */
	mtx_lock(&nfs_reqq_mtx);
	TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
		if (rp->r_nmp == nmp) {
			mtx_lock(&rp->r_mtx);
			rp->r_flags |= R_MUSTRESEND;
			wakeup((caddr_t)rp);
			mtx_unlock(&rp->r_mtx);
		}
	}
	mtx_unlock(&nfs_reqq_mtx);
}

static int
nfstcp_readable(struct socket *so, int bytes)
{
	int retval;

	SOCKBUF_LOCK(&so->so_rcv);
	retval = (so->so_rcv.sb_cc >= (bytes) ||
	    (so->so_rcv.sb_state & SBS_CANTRCVMORE) ||
	    so->so_error);
	SOCKBUF_UNLOCK(&so->so_rcv);
	return (retval);
}

#define nfstcp_marker_readable(so)	nfstcp_readable(so, sizeof(u_int32_t))

static int
nfs_copy_len(struct mbuf *mp, char *buf, int len)
{
	while (len > 0 && mp != NULL) {
		int copylen = min(len, mp->m_len);

		bcopy(mp->m_data, buf, copylen);
		buf += copylen;
		len -= copylen;
		mp = mp->m_next;
	}
	return (len);
}

static void
nfs_clnt_tcp_soupcall(struct socket *so, void *arg, int waitflag)
{
	struct nfsmount *nmp = (struct nfsmount *)arg;
	struct mbuf *mp = NULL;
	struct uio auio;
	int error;
	u_int32_t len;
	int rcvflg;

	/*
	 * Don't pull any more data from the socket if we've marked the
	 * mountpoint for reconnect.
	 */
	mtx_lock(&nmp->nm_mtx);
	if (nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT) {
		mtx_unlock(&nmp->nm_mtx);
		return;
	} else
		mtx_unlock(&nmp->nm_mtx);
	auio.uio_td = curthread;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_READ;
	for ( ; ; ) {
		mtx_lock(&nmp->nm_mtx);
		if (nmp->nm_nfstcpstate.flags & NFS_TCP_EXPECT_RPCMARKER) {
			int resid;

			mtx_unlock(&nmp->nm_mtx);
			if (!nfstcp_marker_readable(so)) {
				/* Marker is not readable */
				return;
			}
			auio.uio_resid = sizeof(u_int32_t);
			auio.uio_iov = NULL;
			auio.uio_iovcnt = 0;
			mp = NULL;
			rcvflg = (MSG_DONTWAIT | MSG_SOCALLBCK);
			error = soreceive(so, (struct sockaddr **)0, &auio,
			    &mp, (struct mbuf **)0, &rcvflg);
			/*
			 * We've already tested that the socket is readable,
			 * so there are two cases here: either we read 0
			 * bytes (the peer closed the connection), or we got
			 * some other error.  In both cases, we tear down
			 * the connection.
			 */
			if (error || auio.uio_resid > 0) {
				if (error && error != ECONNRESET) {
					log(LOG_ERR,
					    "nfs/tcp clnt: Error %d reading socket, tearing down TCP connection\n",
					    error);
				}
				goto mark_reconnect;
			}
			if (mp == NULL)
				panic("nfs_clnt_tcp_soupcall: Got empty mbuf chain from sorecv\n");
			/*
			 * Sigh.  We can't do the obvious thing here (which
			 * would be to have soreceive copy the length from
			 * mbufs for us).  Calling uiomove() from the context
			 * of a socket callback (even for kernel-kernel
			 * copies) leads to LORs (since we hold network locks
			 * at this point).
			 */
			if ((resid = nfs_copy_len(mp, (char *)&len,
			    sizeof(u_int32_t)))) {
				log(LOG_ERR, "%s (%d) from nfs server %s\n",
				    "Bad RPC HDR length",
				    (int)(sizeof(u_int32_t) - resid),
				    nmp->nm_mountp->mnt_stat.f_mntfromname);
				goto mark_reconnect;
			}
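			/*
			 * RFC 1831 record marking: the top bit of the
			 * 32-bit mark is the last-fragment flag and the
			 * low 31 bits are the fragment length.  The flag
			 * is masked off below; this client assumes each
			 * reply arrives as a single record fragment.
			 */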
			len = ntohl(len) & ~0x80000000;
			m_freem(mp);
			/*
			 * This is SERIOUS! We are out of sync with the
			 * sender and forcing a disconnect/reconnect is all
			 * I can do.
			 */
			if (len > NFS_MAXPACKET || len == 0) {
				log(LOG_ERR, "%s (%d) from nfs server %s\n",
				    "impossible packet length",
				    len,
				    nmp->nm_mountp->mnt_stat.f_mntfromname);
				goto mark_reconnect;
			}
			mtx_lock(&nmp->nm_mtx);
			nmp->nm_nfstcpstate.rpcresid = len;
			nmp->nm_nfstcpstate.flags &= ~(NFS_TCP_EXPECT_RPCMARKER);
			mtx_unlock(&nmp->nm_mtx);
		} else
			mtx_unlock(&nmp->nm_mtx);

		/*
		 * Processed RPC marker or no RPC marker to process.
		 * Pull in and process data.
		 */
		mtx_lock(&nmp->nm_mtx);
		if (nmp->nm_nfstcpstate.rpcresid > 0) {
			mtx_unlock(&nmp->nm_mtx);
			if (!nfstcp_readable(so, nmp->nm_nfstcpstate.rpcresid)) {
				/* All data not readable */
				return;
			}
			auio.uio_resid = nmp->nm_nfstcpstate.rpcresid;
			auio.uio_iov = NULL;
			auio.uio_iovcnt = 0;
			mp = NULL;
			rcvflg = (MSG_DONTWAIT | MSG_SOCALLBCK);
			error = soreceive(so, (struct sockaddr **)0, &auio,
			    &mp, (struct mbuf **)0, &rcvflg);
			if (error || auio.uio_resid > 0) {
				if (error && error != ECONNRESET) {
					log(LOG_ERR,
					    "nfs/tcp clnt: Error %d reading socket, tearing down TCP connection\n",
					    error);
				}
				goto mark_reconnect;
			}
			if (mp == NULL)
				panic("nfs_clnt_tcp_soupcall: Got empty mbuf chain from sorecv\n");
			mtx_lock(&nmp->nm_mtx);
			nmp->nm_nfstcpstate.rpcresid = 0;
			nmp->nm_nfstcpstate.flags |= NFS_TCP_EXPECT_RPCMARKER;
			mtx_unlock(&nmp->nm_mtx);
			/*
			 * We got the entire RPC reply.  Match XIDs and wake
			 * up the requestor.
			 */
			nfs_clnt_match_xid(so, nmp, mp);
		} else
			mtx_unlock(&nmp->nm_mtx);
	}

mark_reconnect:
	nfs_mark_for_reconnect(nmp);
}

static void
nfs_clnt_udp_soupcall(struct socket *so, void *arg, int waitflag)
{
	struct nfsmount *nmp = (struct nfsmount *)arg;
	struct uio auio;
	struct mbuf *mp = NULL;
	struct mbuf *control = NULL;
	int error, rcvflag;

	auio.uio_td = curthread;
	rcvflag = MSG_DONTWAIT;
	auio.uio_resid = 1000000000;
	do {
		mp = control = NULL;
		error = soreceive(so, NULL, &auio, &mp, &control, &rcvflag);
		if (control)
			m_freem(control);
		if (mp)
			nfs_clnt_match_xid(so, nmp, mp);
	} while (mp && !error);
}

/*
 * nfs_request - goes something like this
 *	- fill in request struct
 *	- links it into list
 *	- calls nfs_send() for first transmit
 *	- calls nfs_receive() to get reply
 *	- break down rpc header and return with nfs reply pointed to
 *	  by mrep or error
 * nb: always frees up mreq mbuf list
 */
int
nfs_request(struct vnode *vp, struct mbuf *mrest, int procnum,
    struct thread *td, struct ucred *cred, struct mbuf **mrp,
    struct mbuf **mdp, caddr_t *dposp)
{
	struct mbuf *mrep, *m2;
	struct nfsreq *rep;
	u_int32_t *tl;
	int i;
	struct nfsmount *nmp;
	struct mbuf *m, *md, *mheadend;
	time_t waituntil;
	caddr_t dpos;
	int error = 0, mrest_len, auth_len, auth_type;
	struct timeval now;
	u_int32_t *xidp;

	/* Reject requests while attempting a forced unmount. */
	if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) {
		m_freem(mrest);
		return (ESTALE);
	}
	nmp = VFSTONFS(vp->v_mount);
	if ((nmp->nm_flag & NFSMNT_NFSV4) != 0)
		return (nfs4_request(vp, mrest, procnum, td, cred, mrp, mdp,
		    dposp));
	MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ,
	    M_WAITOK);
	bzero(rep, sizeof(struct nfsreq));
	rep->r_nmp = nmp;
	rep->r_vp = vp;
	rep->r_td = td;
	rep->r_procnum = procnum;
	mtx_init(&rep->r_mtx, "NFSrep lock", NULL, MTX_DEF);

	getmicrouptime(&now);
	rep->r_lastmsg = now.tv_sec -
	    ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
	mrest_len = m_length(mrest, NULL);

	/*
	 * Get the RPC header with authorization.
	 */
	auth_type = RPCAUTH_UNIX;
	if (cred->cr_ngroups < 1)
		panic("nfsreq nogrps");
	auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
	    nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
	    5 * NFSX_UNSIGNED;
	m = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len,
	    mrest, mrest_len, &mheadend, &xidp);

	/*
	 * For stream protocols, insert a Sun RPC Record Mark.
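	 * The mark is one 32-bit word: the high bit is the last-fragment
	 * flag (always set here, since each request goes out as a single
	 * fragment) and the low 31 bits carry the fragment length
	 * (RFC 1831).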
	 */
	if (nmp->nm_sotype == SOCK_STREAM) {
		M_PREPEND(m, NFSX_UNSIGNED, M_TRYWAIT);
		*mtod(m, u_int32_t *) = htonl(0x80000000 |
		    (m->m_pkthdr.len - NFSX_UNSIGNED));
	}
	rep->r_mreq = m;
	rep->r_xid = *xidp;
tryagain:
	if (nmp->nm_flag & NFSMNT_SOFT)
		rep->r_retry = nmp->nm_retry;
	else
		rep->r_retry = NFS_MAXREXMIT + 1;	/* past clip limit */
	rep->r_rtt = rep->r_rexmit = 0;
	if (nfs_rto_timer(procnum) != NFS_DEFAULT_TIMER)
		rep->r_flags = R_TIMING;
	else
		rep->r_flags = 0;
	rep->r_mrep = NULL;

	/*
	 * Do the client side RPC.
	 */
	nfsstats.rpcrequests++;
	/*
	 * Chain request into list of outstanding requests.  Be sure
	 * to put it LAST so timer finds oldest requests first.
	 */
	mtx_lock(&nfs_reqq_mtx);
	if (TAILQ_EMPTY(&nfs_reqq))
		callout_reset(&nfs_callout, nfs_ticks, nfs_timer, NULL);
	TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);
	mtx_unlock(&nfs_reqq_mtx);

	/*
	 * If backing off another request or avoiding congestion, don't
	 * send this one now but let timer do it.  If not timing a request,
	 * do it now.
	 */
	mtx_lock(&nmp->nm_mtx);
	if (nmp->nm_so &&
	    (((nmp->nm_sotype == SOCK_STREAM) &&
	    !(nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT)) ||
	    (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
	    nmp->nm_sent < nmp->nm_cwnd)) {
		if (nmp->nm_sotype == SOCK_STREAM)
			nmp->nm_nfstcpstate.sock_send_inprog++;
		mtx_unlock(&nmp->nm_mtx);
		m2 = m_copym(m, 0, M_COPYALL, M_TRYWAIT);
		error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
		mtx_lock(&nmp->nm_mtx);
		mtx_lock(&rep->r_mtx);
		/*
		 * nfs_timer() could've re-transmitted the request if we
		 * ended up blocking on nfs_send() too long, so check for
		 * R_SENT here.
		 */
		if (!error && (rep->r_flags & (R_SENT | R_MUSTRESEND)) == 0) {
			nmp->nm_sent += NFS_CWNDSCALE;
			rep->r_flags |= R_SENT;
		}
		mtx_unlock(&rep->r_mtx);
		if (nmp->nm_sotype == SOCK_STREAM)
			nfs_wakup_reconnectors(rep->r_nmp);
		mtx_unlock(&nmp->nm_mtx);
	} else {
		mtx_unlock(&nmp->nm_mtx);
		rep->r_rtt = -1;
	}

	/*
	 * Wait for the reply from our send or the timer's.
	 */
	if (!error || error == EPIPE)
		error = nfs_reply(rep);

	/*
	 * nfs_timer() may be in the process of re-transmitting this request.
	 * nfs_timer() drops the nfs_reqq_mtx before the pru_send() (to
	 * avoid LORs).  Wait till nfs_timer() completes the re-transmission.
	 * When the reply comes back, it will be discarded (since the req
	 * struct for it no longer exists).
	 */
wait_for_pinned_req:
	mtx_lock(&rep->r_mtx);
	while (rep->r_flags & R_PIN_REQ) {
		msleep((caddr_t)&rep->r_flags, &rep->r_mtx,
		    (PZERO - 1), "nfsrxmt", 0);
	}
	mtx_unlock(&rep->r_mtx);

	mtx_lock(&nfs_reqq_mtx);
	/* Have to re-check for R_PIN_REQ after grabbing the reqq mutex. */
	mtx_lock(&rep->r_mtx);
	if (rep->r_flags & R_PIN_REQ) {
		mtx_unlock(&rep->r_mtx);
		mtx_unlock(&nfs_reqq_mtx);
		goto wait_for_pinned_req;
	} else
		mtx_unlock(&rep->r_mtx);
	/*
	 * RPC done (timer not active, request not pinned), unlink the
	 * request.
	 */
	TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
	if (TAILQ_EMPTY(&nfs_reqq))
		callout_stop(&nfs_callout);
	mtx_unlock(&nfs_reqq_mtx);

	/*
	 * Decrement the outstanding request count.
	 */
	mtx_lock(&rep->r_mtx);
	if (rep->r_flags & R_SENT) {
		rep->r_flags &= ~R_SENT;	/* paranoia */
		mtx_unlock(&rep->r_mtx);
		mtx_lock(&nmp->nm_mtx);
		nmp->nm_sent -= NFS_CWNDSCALE;
		mtx_unlock(&nmp->nm_mtx);
	} else
		mtx_unlock(&rep->r_mtx);

	/*
	 * If there was a successful reply and a tprintf msg was printed
	 * earlier, tprintf a response as well.
	 */
	if (!error) {
		nfs_up(rep, nmp, rep->r_td, "is alive again", NFSSTA_TIMEO);
	}
	mrep = rep->r_mrep;
	md = rep->r_md;
	dpos = rep->r_dpos;
	if (error) {
		/*
		 * If we got interrupted by a signal in nfs_reply(), there's
		 * a very small window where the reply could've come in before
		 * this process got scheduled in.  To handle that case, we
		 * need to free the reply if it was delivered.
		 */
		if (rep->r_mrep != NULL)
			m_freem(rep->r_mrep);
		m_freem(rep->r_mreq);
		mtx_destroy(&rep->r_mtx);
		free((caddr_t)rep, M_NFSREQ);
		return (error);
	}

	if (rep->r_mrep == NULL)
		panic("nfs_request: rep->r_mrep shouldn't be NULL if no error\n");

	/*
	 * break down the rpc header and check if ok
	 */
	tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED);
	if (*tl++ == rpc_msgdenied) {
		if (*tl == rpc_mismatch)
			error = EOPNOTSUPP;
		else
			error = EACCES;
		m_freem(mrep);
		m_freem(rep->r_mreq);
		mtx_destroy(&rep->r_mtx);
		free((caddr_t)rep, M_NFSREQ);
		return (error);
	}

	/*
	 * Just throw away any verifier (i.e., Kerberos, etc.).
	 */
	i = fxdr_unsigned(int, *tl++);		/* verf type */
	i = fxdr_unsigned(int32_t, *tl);	/* len */
	if (i > 0)
		nfsm_adv(nfsm_rndup(i));
	tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
	/* 0 == ok */
	if (*tl == 0) {
		tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
		if (*tl != 0) {
			error = fxdr_unsigned(int, *tl);
			if ((nmp->nm_flag & NFSMNT_NFSV3) &&
			    error == NFSERR_TRYLATER) {
				m_freem(mrep);
				error = 0;
				waituntil = time_second + nfs3_jukebox_delay;
				while (time_second < waituntil) {
					(void) tsleep(&lbolt, PSOCK,
					    "nqnfstry", 0);
				}
				rep->r_xid = *xidp =
				    txdr_unsigned(nfs_xid_gen());
				goto tryagain;
			}
			/*
			 * Make sure NFSERR_RETERR isn't bogusly set by a
			 * server such as amd.  (No actual NFS error has
			 * bit 31 set.)
			 */
			error &= ~NFSERR_RETERR;

			/*
			 * If the File Handle was stale, invalidate the
			 * lookup cache, just in case.
			 */
			if (error == ESTALE)
				cache_purge(vp);
			/*
			 * Skip wcc data on non-ENOENT NFS errors for
			 * now.  NetApp filers return corrupt postop
			 * attrs in the wcc data for NFS err EROFS.
			 * Not sure if they could return corrupt
			 * postop attrs for other errors.  Blocking
			 * ENOENT post-op attributes breaks negative
			 * name caching, so always allow it through.
			 */
			if ((nmp->nm_flag & NFSMNT_NFSV3) &&
			    (!nfs_skip_wcc_data_onerr || error == ENOENT)) {
				*mrp = mrep;
				*mdp = md;
				*dposp = dpos;
				error |= NFSERR_RETERR;
			} else
				m_freem(mrep);
			m_freem(rep->r_mreq);
			mtx_destroy(&rep->r_mtx);
			free((caddr_t)rep, M_NFSREQ);
			return (error);
		}

		*mrp = mrep;
		*mdp = md;
		*dposp = dpos;
		m_freem(rep->r_mreq);
		mtx_destroy(&rep->r_mtx);
		free((caddr_t)rep, M_NFSREQ);
		return (0);
	}
	m_freem(mrep);
	error = EPROTONOSUPPORT;
nfsmout:
	m_freem(rep->r_mreq);
	mtx_destroy(&rep->r_mtx);
	free((caddr_t)rep, M_NFSREQ);
	return (error);
}

/*
 * NFS timer routine.
 * Scan the nfsreq list and retransmit any requests that have timed out.
 * To avoid retransmission attempts on STREAM sockets (in the future) make
 * sure to set the r_retry field to 0 (implies nm_retry == 0).
 *
 * The nfs reqq lock cannot be held while we do the pru_send() because of a
 * lock ordering violation.  The NFS client socket callback acquires
 * inp_lock->nfsreq mutex and pru_send acquires inp_lock.  So we drop the
 * reqq mutex (and reacquire it after the pru_send()).  The req structure
 * (for the rexmit) is prevented from being removed by the R_PIN_REQ flag.
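 *
 * Only UDP mounts are actually retransmitted from here; for TCP the timer
 * just sets R_MUSTRESEND and wakes up the request's owner, which resends
 * (reconnecting if necessary) from nfs_reply().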
 */
void
nfs_timer(void *arg)
{
	struct nfsreq *rep;
	struct mbuf *m;
	struct socket *so;
	struct nfsmount *nmp;
	int timeo;
	int error;
	struct timeval now;

	getmicrouptime(&now);
	mtx_lock(&nfs_reqq_mtx);
	TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
		nmp = rep->r_nmp;
		mtx_lock(&rep->r_mtx);
		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) {
			mtx_unlock(&rep->r_mtx);
			continue;
		} else {
			/*
			 * Terminate request if force-unmount in progress.
			 * Note that NFS could have vfs_busy'ed the mount,
			 * causing the unmount to wait for the mnt_lock,
			 * making this bit of logic necessary.
			 */
			if (rep->r_nmp->nm_mountp->mnt_kern_flag &
			    MNTK_UNMOUNTF) {
				nfs_softterm(rep);
				mtx_unlock(&rep->r_mtx);
				continue;
			}
			mtx_unlock(&rep->r_mtx);
		}
		if (nfs_sigintr(nmp, rep, rep->r_td))
			continue;
		mtx_lock(&nmp->nm_mtx);
		mtx_lock(&rep->r_mtx);
		if (nmp->nm_tprintf_initial_delay != 0 &&
		    (rep->r_rexmit > 2 || (rep->r_flags & R_RESENDERR)) &&
		    rep->r_lastmsg + nmp->nm_tprintf_delay < now.tv_sec) {
			rep->r_lastmsg = now.tv_sec;
			/*
			 * Pin down the request and drop locks for the
			 * acquisition of Giant from tprintf() in nfs_down().
			 */
			rep->r_flags |= R_PIN_REQ;
			mtx_unlock(&rep->r_mtx);
			mtx_unlock(&nmp->nm_mtx);
			mtx_unlock(&nfs_reqq_mtx);
			nfs_down(rep, nmp, rep->r_td, "not responding",
			    0, NFSSTA_TIMEO);
			mtx_lock(&nfs_reqq_mtx);
			mtx_lock(&nmp->nm_mtx);
			mtx_lock(&rep->r_mtx);
			rep->r_flags &= ~R_PIN_REQ;
			wakeup((caddr_t)&rep->r_flags);
		}
		if (rep->r_rtt >= 0) {
			rep->r_rtt++;
			if (nmp->nm_flag & NFSMNT_DUMBTIMR)
				timeo = nmp->nm_timeo;
			else
				timeo = nfs_estimate_rto(nmp, rep->r_procnum);
			if (nmp->nm_timeouts > 0)
				timeo *= nfs_backoff[nmp->nm_timeouts - 1];
			if (rep->r_rtt <= timeo) {
				mtx_unlock(&rep->r_mtx);
				mtx_unlock(&nmp->nm_mtx);
				continue;
			}
			if (nmp->nm_timeouts < NFS_NBACKOFF)
				nmp->nm_timeouts++;
		}
		if (rep->r_rexmit >= rep->r_retry) {	/* too many */
			nfsstats.rpctimeouts++;
			nfs_softterm(rep);
			mtx_unlock(&rep->r_mtx);
			mtx_unlock(&nmp->nm_mtx);
			continue;
		}
		if (nmp->nm_sotype != SOCK_DGRAM) {
			if (++rep->r_rexmit > NFS_MAXREXMIT)
				rep->r_rexmit = NFS_MAXREXMIT;
			/*
			 * For NFS/TCP, setting R_MUSTRESEND and waking up
			 * the requester will cause the request to be
			 * retransmitted (in nfs_reply()), re-connecting
			 * if necessary.
			 */
			rep->r_flags |= R_MUSTRESEND;
			wakeup((caddr_t)rep);
			rep->r_rtt = 0;
			mtx_unlock(&rep->r_mtx);
			mtx_unlock(&nmp->nm_mtx);
			continue;
		}
		if ((so = nmp->nm_so) == NULL) {
			mtx_unlock(&rep->r_mtx);
			mtx_unlock(&nmp->nm_mtx);
			continue;
		}
		/*
		 * If there is enough space and the window allows, resend it.
		 * Set r_rtt to -1 in case we fail to send it now.
		 */
		rep->r_rtt = -1;
		if (sbspace(&so->so_snd) >= rep->r_mreq->m_pkthdr.len &&
		    ((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
		    (rep->r_flags & R_SENT) ||
		    nmp->nm_sent < nmp->nm_cwnd)) {
			mtx_unlock(&rep->r_mtx);
			mtx_unlock(&nmp->nm_mtx);
			if ((m = m_copym(rep->r_mreq, 0, M_COPYALL,
			    M_DONTWAIT))) {
				/*
				 * Mark the request to indicate that a
				 * transmit is in progress, to prevent the
				 * req structure being removed in
				 * nfs_request().
				 */
				mtx_lock(&rep->r_mtx);
				rep->r_flags |= R_PIN_REQ;
				mtx_unlock(&rep->r_mtx);
				mtx_unlock(&nfs_reqq_mtx);
				if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
					error = (*so->so_proto->pr_usrreqs->pru_send)
					    (so, 0, m, NULL, NULL, curthread);
				else
					error = (*so->so_proto->pr_usrreqs->pru_send)
					    (so, 0, m, nmp->nm_nam, NULL,
					    curthread);
				mtx_lock(&nfs_reqq_mtx);
				mtx_lock(&nmp->nm_mtx);
				mtx_lock(&rep->r_mtx);
				rep->r_flags &= ~R_PIN_REQ;
				wakeup((caddr_t)&rep->r_flags);
				if (error) {
					if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
						so->so_error = 0;
					rep->r_flags |= R_RESENDERR;
				} else {
					/*
					 * If this is the first send, start
					 * timing; otherwise (a retransmit)
					 * turn timing off, back off the
					 * timer, and halve the congestion
					 * window.
					 */
					rep->r_flags &= ~R_RESENDERR;
					if (rep->r_flags & R_SENT) {
						rep->r_flags &= ~R_TIMING;
						if (++rep->r_rexmit > NFS_MAXREXMIT)
							rep->r_rexmit = NFS_MAXREXMIT;
						nmp->nm_cwnd >>= 1;
						if (nmp->nm_cwnd < NFS_CWNDSCALE)
							nmp->nm_cwnd = NFS_CWNDSCALE;
						nfsstats.rpcretries++;
					} else {
						rep->r_flags |= R_SENT;
						nmp->nm_sent += NFS_CWNDSCALE;
					}
					rep->r_rtt = 0;
				}
				mtx_unlock(&rep->r_mtx);
				mtx_unlock(&nmp->nm_mtx);
			}
		} else {
			mtx_unlock(&rep->r_mtx);
			mtx_unlock(&nmp->nm_mtx);
		}
	}
	mtx_unlock(&nfs_reqq_mtx);
	callout_reset(&nfs_callout, nfs_ticks, nfs_timer, NULL);
}

/*
 * Mark all of an nfs mount's outstanding requests with R_SOFTTERM and
 * wait for all requests to complete.  This is used by forced unmounts
 * to terminate any outstanding RPCs.
 */
int
nfs_nmcancelreqs(struct nfsmount *nmp)
{
	struct nfsreq *req;
	int i;

	mtx_lock(&nfs_reqq_mtx);
	TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
		mtx_lock(&req->r_mtx);
		if (nmp != req->r_nmp || req->r_mrep != NULL ||
		    (req->r_flags & R_SOFTTERM)) {
			mtx_unlock(&req->r_mtx);
			continue;
		}
		nfs_softterm(req);
		mtx_unlock(&req->r_mtx);
	}
	mtx_unlock(&nfs_reqq_mtx);

	for (i = 0; i < 30; i++) {
		mtx_lock(&nfs_reqq_mtx);
		TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
			if (nmp == req->r_nmp)
				break;
		}
		mtx_unlock(&nfs_reqq_mtx);
		if (req == NULL)
			return (0);
		tsleep(&lbolt, PSOCK, "nfscancel", 0);
	}
	return (EBUSY);
}

/*
 * Flag a request as being about to terminate (due to NFSMNT_INT/NFSMNT_SOFT).
 * The nm_sent count is decremented now to avoid deadlocks when the process
 * in soreceive() hasn't yet managed to send its own request.
 */
static void
nfs_softterm(struct nfsreq *rep)
{
	KASSERT(mtx_owned(&rep->r_mtx), ("NFS req lock not owned!"));
	rep->r_flags |= R_SOFTTERM;
	if (rep->r_flags & R_SENT) {
		rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
		rep->r_flags &= ~R_SENT;
	}
	/*
	 * Request terminated: wake up the blocked process so that we
	 * can return EINTR to it.
	 */
	wakeup((caddr_t)rep);
}

/*
 * Any signal that can interrupt an NFS operation in an intr mount
 * should be added to this set.  SIGSTOP and SIGKILL cannot be masked.
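 * (SIGKILL and SIGSTOP still belong in the set even though they cannot
 * be blocked in nfs_set_sigmask(): nfs_sig_pending() checks membership
 * to decide that a pending signal should interrupt an in-progress RPC.)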
 */
int nfs_sig_set[] = {
	SIGINT,
	SIGTERM,
	SIGHUP,
	SIGKILL,
	SIGSTOP,
	SIGQUIT
};

/*
 * Check to see if one of the signals in our subset is pending on
 * the process (in an intr mount).
 */
static int
nfs_sig_pending(sigset_t set)
{
	int i;

	for (i = 0; i < sizeof(nfs_sig_set)/sizeof(int); i++)
		if (SIGISMEMBER(set, nfs_sig_set[i]))
			return (1);
	return (0);
}

/*
 * The set/restore sigmask functions are used to (temporarily) overwrite
 * the process p_sigmask during an RPC call (for example).  These are also
 * used in other places in the NFS client that might tsleep().
 */
void
nfs_set_sigmask(struct thread *td, sigset_t *oldset)
{
	sigset_t newset;
	int i;
	struct proc *p;

	SIGFILLSET(newset);
	if (td == NULL)
		td = curthread;	/* XXX */
	p = td->td_proc;
	/* Remove the NFS set of signals from newset */
	PROC_LOCK(p);
	mtx_lock(&p->p_sigacts->ps_mtx);
	for (i = 0; i < sizeof(nfs_sig_set)/sizeof(int); i++) {
		/*
		 * But make sure we leave the ones already masked
		 * by the process, i.e., remove the signal from the
		 * temporary signal mask only if it wasn't already
		 * in p_sigmask.
		 */
		if (!SIGISMEMBER(td->td_sigmask, nfs_sig_set[i]) &&
		    !SIGISMEMBER(p->p_sigacts->ps_sigignore, nfs_sig_set[i]))
			SIGDELSET(newset, nfs_sig_set[i]);
	}
	mtx_unlock(&p->p_sigacts->ps_mtx);
	PROC_UNLOCK(p);
	kern_sigprocmask(td, SIG_SETMASK, &newset, oldset, 0);
}

void
nfs_restore_sigmask(struct thread *td, sigset_t *set)
{
	if (td == NULL)
		td = curthread;	/* XXX */
	kern_sigprocmask(td, SIG_SETMASK, set, NULL, 0);
}

/*
 * NFS wrapper to msleep() that installs a new p_sigmask and restores the
 * old one after msleep() returns.
 */
int
nfs_msleep(struct thread *td, void *ident, struct mtx *mtx, int priority,
    char *wmesg, int timo)
{
	sigset_t oldset;
	int error;

	if ((priority & PCATCH) == 0)
		return (msleep(ident, mtx, priority, wmesg, timo));
	if (td == NULL)
		td = curthread;	/* XXX */
	nfs_set_sigmask(td, &oldset);
	error = msleep(ident, mtx, priority, wmesg, timo);
	nfs_restore_sigmask(td, &oldset);
	return (error);
}

/*
 * Test for a termination condition pending on the process.
 * This is used for NFSMNT_INT mounts.
 */
int
nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct thread *td)
{
	struct proc *p;
	sigset_t tmpset;

	if ((nmp->nm_flag & NFSMNT_NFSV4) != 0)
		return (nfs4_sigintr(nmp, rep, td));
	if (rep) {
		mtx_lock(&rep->r_mtx);
		if (rep->r_flags & R_SOFTTERM) {
			mtx_unlock(&rep->r_mtx);
			return (EIO);
		} else
			mtx_unlock(&rep->r_mtx);
	}
	/* Terminate all requests while attempting a forced unmount. */
	if (nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF)
		return (EIO);
	if (!(nmp->nm_flag & NFSMNT_INT))
		return (0);
	if (td == NULL)
		return (0);
	p = td->td_proc;
	PROC_LOCK(p);
	tmpset = p->p_siglist;
	SIGSETOR(tmpset, td->td_siglist);
	SIGSETNAND(tmpset, td->td_sigmask);
	mtx_lock(&p->p_sigacts->ps_mtx);
	SIGSETNAND(tmpset, p->p_sigacts->ps_sigignore);
	mtx_unlock(&p->p_sigacts->ps_mtx);
	if ((SIGNOTEMPTY(p->p_siglist) || SIGNOTEMPTY(td->td_siglist)) &&
	    nfs_sig_pending(tmpset)) {
		PROC_UNLOCK(p);
		return (EINTR);
	}
	PROC_UNLOCK(p);
	return (0);
}

/*
 * Lock a socket against others.
 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
 * and also to avoid race conditions between the processes with nfs requests
 * in progress when a reconnect is necessary.
 */
int
nfs_connect_lock(struct nfsreq *rep)
{
	int *statep = &rep->r_nmp->nm_state;
	struct thread *td;
	int error, slpflag = 0, slptimeo = 0;

	td = rep->r_td;
	if (rep->r_nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	while (*statep & NFSSTA_SNDLOCK) {
		error = nfs_sigintr(rep->r_nmp, rep, td);
		if (error) {
			return (error);
		}
		*statep |= NFSSTA_WANTSND;
		(void) msleep(statep, &rep->r_nmp->nm_mtx,
		    slpflag | (PZERO - 1), "nfsndlck", slptimeo);
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = 2 * hz;
		}
	}
	*statep |= NFSSTA_SNDLOCK;
	return (0);
}

/*
 * Unlock the stream socket for others.
 */
void
nfs_connect_unlock(struct nfsreq *rep)
{
	int *statep = &rep->r_nmp->nm_state;

	if ((*statep & NFSSTA_SNDLOCK) == 0)
		panic("nfs sndunlock");
	*statep &= ~NFSSTA_SNDLOCK;
	if (*statep & NFSSTA_WANTSND) {
		*statep &= ~NFSSTA_WANTSND;
		wakeup(statep);
	}
}

/*
 * nfs_realign:
 *
 * Check for badly aligned mbuf data and realign by copying the unaligned
 * portion of the data into a new mbuf chain and freeing the portions
 * of the old chain that were replaced.
 *
 * We cannot simply realign the data within the existing mbuf chain
 * because the underlying buffers may contain other rpc commands and
 * we cannot afford to overwrite them.
 *
 * We would prefer to avoid this situation entirely.  The situation does
 * not occur with NFS/UDP and is supposed to only occasionally occur
 * with TCP.  Use vfs.nfs.realign_count and realign_test to check this.
 */
static int
nfs_realign(struct mbuf **pm, int hsiz)
{
	struct mbuf *m;
	struct mbuf *n = NULL;
	int off = 0;

	++nfs_realign_test;
	while ((m = *pm) != NULL) {
		if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
			MGET(n, M_DONTWAIT, MT_DATA);
			if (n == NULL)
				return (ENOMEM);
			if (m->m_len >= MINCLSIZE) {
				MCLGET(n, M_DONTWAIT);
				if (n->m_ext.ext_buf == NULL) {
					m_freem(n);
					return (ENOMEM);
				}
			}
			n->m_len = 0;
			break;
		}
		pm = &m->m_next;
	}
	/*
	 * If n is non-NULL, loop on m copying data, then replace the
	 * portion of the chain that had to be realigned.
	 */
	if (n != NULL) {
		++nfs_realign_count;
		while (m) {
			m_copyback(n, off, m->m_len, mtod(m, caddr_t));
			off += m->m_len;
			m = m->m_next;
		}
		m_freem(*pm);
		*pm = n;
	}
	return (0);
}

static int
nfs_msg(struct thread *td, const char *server, const char *msg, int error)
{
	struct proc *p;

	p = td ? td->td_proc : NULL;
	if (error) {
		tprintf(p, LOG_INFO, "nfs server %s: %s, error %d\n", server,
		    msg, error);
	} else {
		tprintf(p, LOG_INFO, "nfs server %s: %s\n", server, msg);
	}
	return (0);
}

void
nfs_down(struct nfsreq *rep, struct nfsmount *nmp, struct thread *td,
    const char *msg, int error, int flags)
{
	if (nmp == NULL)
		return;
	mtx_lock(&nmp->nm_mtx);
	if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) {
		nmp->nm_state |= NFSSTA_TIMEO;
		mtx_unlock(&nmp->nm_mtx);
		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
		    VQ_NOTRESP, 0);
	} else
		mtx_unlock(&nmp->nm_mtx);
#ifdef NFSSTA_LOCKTIMEO
	mtx_lock(&nmp->nm_mtx);
	if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) {
		nmp->nm_state |= NFSSTA_LOCKTIMEO;
		mtx_unlock(&nmp->nm_mtx);
		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
		    VQ_NOTRESPLOCK, 0);
	} else
		mtx_unlock(&nmp->nm_mtx);
#endif
	if (rep != NULL) {
		mtx_lock(&rep->r_mtx);
		rep->r_flags |= R_TPRINTFMSG;
		mtx_unlock(&rep->r_mtx);
	}
	nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, error);
}

void
nfs_up(struct nfsreq *rep, struct nfsmount *nmp, struct thread *td,
    const char *msg, int flags)
{
	if (nmp == NULL || rep == NULL)
		return;
	mtx_lock(&rep->r_mtx);
	if ((rep->r_flags & R_TPRINTFMSG) != 0) {
		mtx_unlock(&rep->r_mtx);
		nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, 0);
	} else
		mtx_unlock(&rep->r_mtx);

	mtx_lock(&nmp->nm_mtx);
	if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) {
		nmp->nm_state &= ~NFSSTA_TIMEO;
		mtx_unlock(&nmp->nm_mtx);
		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
		    VQ_NOTRESP, 1);
	} else
		mtx_unlock(&nmp->nm_mtx);

#ifdef NFSSTA_LOCKTIMEO
	mtx_lock(&nmp->nm_mtx);
	if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) {
		nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
		mtx_unlock(&nmp->nm_mtx);
		vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
		    VQ_NOTRESPLOCK, 1);
	} else
		mtx_unlock(&nmp->nm_mtx);
#endif
}