[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ]

FreeBSD/Linux Kernel Cross Reference
sys/rpc/rpcclnt.c

Version: -  FREEBSD  -  FREEBSD7  -  FREEBSD70  -  FREEBSD6  -  FREEBSD63  -  FREEBSD62  -  FREEBSD61  -  FREEBSD60  -  FREEBSD5  -  FREEBSD55  -  FREEBSD54  -  FREEBSD53  -  FREEBSD52  -  FREEBSD51  -  FREEBSD50  -  FREEBSD4  -  FREEBSD3  -  FREEBSD22  -  linux-2.6  -  linux-2.4.22  -  MK83  -  MK84  -  PLAN9  -  DFBSD  -  NETBSD  -  NETBSD4  -  NETBSD3  -  NETBSD20  -  OPENBSD  -  xnu-517  -  xnu-792  -  xnu-792.6.70  -  xnu-1228  -  OPENSOLARIS  -  minix-3-1-1  -  TRUSTEDBSD-SEBSD  -  FREEBSD-LIBC  -  FREEBSD7-LIBC  -  FREEBSD6-LIBC  -  GLIBC27 
SearchContext: -  none  -  excerpts  -  bigexcerpts 

  1 /* $FreeBSD: src/sys/rpc/rpcclnt.c,v 1.24 2008/10/23 15:53:51 des Exp $ */
  2 /* $Id: rpcclnt.c,v 1.9 2003/11/05 14:59:03 rees Exp $ */
  3 
  4 /*-
  5  * copyright (c) 2003
  6  * the regents of the university of michigan
  7  * all rights reserved
  8  * 
  9  * permission is granted to use, copy, create derivative works and redistribute
 10  * this software and such derivative works for any purpose, so long as the name
 11  * of the university of michigan is not used in any advertising or publicity
 12  * pertaining to the use or distribution of this software without specific,
 13  * written prior authorization.  if the above copyright notice or any other
 14  * identification of the university of michigan is included in any copy of any
 15  * portion of this software, then the disclaimer below must also be included.
 16  * 
 17  * this software is provided as is, without representation from the university
 18  * of michigan as to its fitness for any purpose, and without warranty by the
 19  * university of michigan of any kind, either express or implied, including
 20  * without limitation the implied warranties of merchantability and fitness for
 21  * a particular purpose. the regents of the university of michigan shall not be
 22  * liable for any damages, including special, indirect, incidental, or
 23  * consequential damages, with respect to any claim arising out of or in
 24  * connection with the use of the software, even if it has been or is hereafter
 25  * advised of the possibility of such damages.
 26  */
 27 
 28 /*-
 29  * Copyright (c) 1989, 1991, 1993, 1995 The Regents of the University of
 30  * California.  All rights reserved.
 31  * 
 32  * This code is derived from software contributed to Berkeley by Rick Macklem at
 33  * The University of Guelph.
 34  * 
 35  * Redistribution and use in source and binary forms, with or without
 36  * modification, are permitted provided that the following conditions are
 37  * met: 1. Redistributions of source code must retain the above copyright
 38  * notice, this list of conditions and the following disclaimer. 2.
 39  * Redistributions in binary form must reproduce the above copyright notice,
 40  * this list of conditions and the following disclaimer in the documentation
 41  * and/or other materials provided with the distribution. 3. All advertising
 42  * materials mentioning features or use of this software must display the
 43  * following acknowledgement: This product includes software developed by the
 44  * University of California, Berkeley and its contributors. 4. Neither the
 45  * name of the University nor the names of its contributors may be used to
 46  * endorse or promote products derived from this software without specific
 47  * prior written permission.
 48  * 
 49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY
 50  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 51  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 52  * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
 53  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 55  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 56  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 59  * SUCH DAMAGE.
 60  * 
 61  * @(#)nfs_socket.c     8.5 (Berkeley) 3/30/95
 62  */
 63 
 64 /* XXX: kill ugly debug strings */
 65 /* XXX: get rid of proct, as it is not even being used... (or keep it so v{2,3}
 66  *      can run, but clean it up! */
 67 
 68 #include <sys/param.h>
 69 #include <sys/systm.h>
 70 #include <sys/proc.h>
 71 #include <sys/mount.h>
 72 #include <sys/kernel.h>
 73 #include <sys/mbuf.h>
 74 #include <sys/syslog.h>
 75 #include <sys/malloc.h>
 76 #include <sys/uio.h>
 77 #include <sys/lock.h>
 78 #include <sys/signalvar.h>
 79 #include <sys/sysent.h>
 80 #include <sys/syscall.h>
 81 #include <sys/sysctl.h>
 82 
 83 #include <sys/domain.h>
 84 #include <sys/protosw.h>
 85 #include <sys/socket.h>
 86 #include <sys/socketvar.h>
 87 #include <sys/mutex.h>
 88 
 89 #include <netinet/in.h>
 90 #include <netinet/tcp.h>
 91 
 92 #include <nfs/rpcv2.h>
 93 
 94 #include <rpc/rpcm_subs.h>
 95 #include <rpc/rpcclnt.h>
 96 
 97 /* memory management */
 98 #ifdef __OpenBSD__
 99 struct pool     rpctask_pool;
100 struct pool     rpcclnt_pool;
101 #define RPCTASKPOOL_LWM 10
102 #define RPCTASKPOOL_HWM 40
103 #else
104 static          MALLOC_DEFINE(M_RPCCLNT, "rpcclnt", "rpc state");
105 #endif
106 
/*
 * Log the value that is about to be returned (via RPCDEBUG) and return it
 * from the enclosing function.
 *
 * X is evaluated exactly once and cached in a local, so an argument with side
 * effects (e.g. a function call) is not executed a second time when RPCDEBUG
 * expands to code that also uses the value.  __typeof__ keeps the cached
 * value in the argument's own type.
 */
#define RPC_RETURN(X) do {						\
	__typeof__(X) rpc_return_value_ = (X);				\
	RPCDEBUG("returning %d", rpc_return_value_);			\
	return rpc_return_value_;					\
} while (0)
108 
/*
 * Estimate rto for an nfs rpc sent via an unreliable datagram. Use the mean
 * and mean deviation of rtt for the appropriate type of rpc for the frequent
 * rpcs and a default for the others. The justification for doing "other"
 * this way is that these rpcs happen so infrequently that timer est. would
 * probably be stale. Also, since many of these rpcs are non-idempotent, a
 * conservative timeout is desired.
 *
 *	getattr, lookup	- A+2D		(timer index t == 1 or 2)
 *	read, write	- A+4D		(timer index t >= 3)
 *	other		- rc_timeo	(timer index t == 0)
 *
 * n is a pointer to the client state holding rc_timeo, rc_srtt[] and
 * rc_sdrtt[]; t is the timer index, where 0 selects the fixed default and
 * t >= 1 selects slot t - 1 of the two arrays.  Both arguments are fully
 * parenthesized in the expansion, but t is still evaluated more than once,
 * so it must not have side effects.
 */
#define RPC_RTO(n, t) \
	((t) == 0 ? (n)->rc_timeo : \
	 ((t) < 3 ? \
	  (((((n)->rc_srtt[(t) - 1] + 3) >> 2) + \
	    (n)->rc_sdrtt[(t) - 1] + 1) >> 1) : \
	  ((((n)->rc_srtt[(t) - 1] + 7) >> 3) + \
	    (n)->rc_sdrtt[(t) - 1] + 1)))
123 
/*
 * Select the smoothed-rtt slot (rc_srtt[]) of request r's client.  The slot
 * index is rpcclnt_proct(s, r->r_procnum) - 1.  The expansion is a plain
 * array element, so it can appear on either side of an assignment.
 * NOTE(review): a rpcclnt_proct() result of 0 would index slot -1; confirm
 * that callers only use this when the result is non-zero.
 */
124 #define RPC_SRTT(s,r)   (r)->r_rpcclnt->rc_srtt[rpcclnt_proct((s),\
125                                 (r)->r_procnum) - 1]
126 
/*
 * Same slot selection as RPC_SRTT, applied to the rc_sdrtt[] array.
 */
127 #define RPC_SDRTT(s,r)  (r)->r_rpcclnt->rc_sdrtt[rpcclnt_proct((s),\
128                                 (r)->r_procnum) - 1]
129 
130 
131 /*
132  * There is a congestion window for outstanding rpcs maintained per mount
133  * point. The cwnd size is adjusted in roughly the way that: Van Jacobson,
134  * Congestion avoidance and Control, In "Proceedings of SIGCOMM '88". ACM,
135  * August 1988. describes for TCP. The cwnd size is chopped in half on a
136  * retransmit timeout and incremented by 1/cwnd when each rpc reply is
137  * received and a full cwnd of rpcs is in progress. (The sent count and cwnd
138  * are scaled for integer arith.) Variants of "slow start" were tried and
139  * were found to be too much of a performance hit (ave. rtt 3 times larger),
140  * I suspect due to the large rtt that nfs rpcs have.
141  */
/* Scale factor applied to the congestion window and sent count. */
142 #define RPC_CWNDSCALE   256
/* Upper bound of the scaled window; rpcclnt_connect() starts at half of it. */
143 #define RPC_MAXCWND     (RPC_CWNDSCALE * 32)
/*
 * Table of doubling multipliers (2 .. 256).  Its users are outside this
 * excerpt; presumably a retransmit back-off factor -- TODO confirm.
 */
144 static const int      rpcclnt_backoff[8] = {2, 4, 8, 16, 32, 64, 128, 256,};
145 
146 /* XXX ugly debug strings */
/*
 * Human-readable messages for replies that were accepted but failed.
 * NOTE(review): the indexing code is outside this excerpt; the order looks
 * like the RPC accept status values (0 = success .. 5 = system error) --
 * confirm before relying on it.
 */
147 #define RPC_ERRSTR_ACCEPTED_SIZE 6
148 char *rpc_errstr_accepted[RPC_ERRSTR_ACCEPTED_SIZE] = {
149         "",                     /* no good message... */
150         "remote server hasn't exported program.",
151         "remote server can't support version number.",
152         "program can't support procedure.",
153         "procedure can't decode params.",
154         "remote error.  remote side memory allocation failure?"
155 };
156 
/*
 * Messages for rejected calls: entry 0 is an RPC version mismatch, entry 1
 * an authentication failure (see the strings themselves).
 */
157 char *rpc_errstr_denied[2] = {
158         "remote server doesnt support rpc version 2!",
159         "remote server authentication error."
160 };
161 
/*
 * Messages describing why authentication failed.  Entry 0 is intentionally
 * empty.  NOTE(review): presumably indexed by the RPC auth status code --
 * confirm against the code that consumes this table.
 */
162 #define RPC_ERRSTR_AUTH_SIZE 6
163 char *rpc_errstr_auth[RPC_ERRSTR_AUTH_SIZE] = {
164         "",
165         "auth error: bad credential (seal broken).",
166         "auth error: client must begin new session.",
167         "auth error: bad verifier (seal broken).",
168         "auth error: verifier expired or replayed.",
169         "auth error: rejected for security reasons.",
170 };
171 
172 /*
173  * Static data, mostly RPC constants in XDR form
174  */
/* XDR-encoded RPC_REPLY, RPC_CALL and RPC_VER2; filled in by rpcclnt_init(). */
175 static u_int32_t rpc_reply, rpc_call, rpc_vers;
176 
177 /*
178  * rpc_msgdenied, rpc_mismatch, rpc_auth_unix, rpc_msgaccepted,
179  * rpc_autherr, rpc_auth_kerb;
180  */
181 
/*
 * Transaction-id state.  The code that updates these is outside this
 * excerpt; presumably rpcclnt_xid is the last xid handed out -- TODO confirm.
 */
182 static u_int32_t rpcclnt_xid = 0;
183 static u_int32_t rpcclnt_xid_touched = 0;
/* Event counters, zeroed by rpcclnt_init() and exported via kern.rpc sysctls. */
184 struct rpcstats rpcstats;
/* Timer interval in clock ticks, computed from hz by rpcclnt_init() (>= 1). */
185 int      rpcclnt_ticks;
/* Dummy sleep channel: rpcclnt_reconnect() tsleep()s on it between retries. */
186 static int fake_wchan;
187 
/* kern.rpc sysctl subtree: read-only views of the rpcstats counters. */
188 SYSCTL_NODE(_kern, OID_AUTO, rpc, CTLFLAG_RD, 0, "RPC Subsystem");
189 
190 SYSCTL_UINT(_kern_rpc, OID_AUTO, retries, CTLFLAG_RD, &rpcstats.rpcretries, 0, "retries");
191 SYSCTL_UINT(_kern_rpc, OID_AUTO, request, CTLFLAG_RD, &rpcstats.rpcrequests, 0, "request");
192 SYSCTL_UINT(_kern_rpc, OID_AUTO, timeouts, CTLFLAG_RD, &rpcstats.rpctimeouts, 0, "timeouts");
193 SYSCTL_UINT(_kern_rpc, OID_AUTO, unexpected, CTLFLAG_RD, &rpcstats.rpcunexpected, 0, "unexpected");
194 SYSCTL_UINT(_kern_rpc, OID_AUTO, invalid, CTLFLAG_RD, &rpcstats.rpcinvalid, 0, "invalid");
195 
196 
/*
 * Writable kern.rpc.debug_on switch, only built with RPCCLNT_DEBUG.
 * NOTE(review): rpcdebugon is a signed int but is exported with SYSCTL_UINT;
 * consider SYSCTL_INT.
 */
197 #ifdef RPCCLNT_DEBUG
198 int             rpcdebugon = 0;
199 SYSCTL_UINT(_kern_rpc, OID_AUTO, debug_on, CTLFLAG_RW, &rpcdebugon, 0, "RPC Debug messages");
200 #endif
201 
202 /*
203  * Queue head for rpctask's
204  */
205 static 
206 TAILQ_HEAD(, rpctask) rpctask_q;
/* Callout initialised in rpcclnt_init() and stopped in rpcclnt_uninit(). */
207 struct callout  rpcclnt_callout;
208 
/*
 * Forward declarations of the file-local helpers.  On OpenBSD the peer
 * address is carried in an mbuf; elsewhere it is a struct sockaddr, hence
 * the two variants of rpcclnt_send() and rpcclnt_receive().
 */
209 #ifdef __OpenBSD__
210 static int             rpcclnt_send(struct socket *, struct mbuf *, struct mbuf *, struct rpctask *);
211 static int             rpcclnt_receive(struct rpctask *, struct mbuf **, struct mbuf **, RPC_EXEC_CTX);
212 #else
213 static int             rpcclnt_send(struct socket *, struct sockaddr *, struct mbuf *, struct rpctask *);
214 static int             rpcclnt_receive(struct rpctask *, struct sockaddr **, struct mbuf **, RPC_EXEC_CTX);
215 #endif
216 
217 static int             rpcclnt_msg(RPC_EXEC_CTX, const char *, char *);
218 
219 static int             rpcclnt_reply(struct rpctask *, RPC_EXEC_CTX);
220 static void            rpcclnt_timer(void *);
221 static int             rpcclnt_sndlock(int *, struct rpctask *);
222 static void            rpcclnt_sndunlock(int *);
223 static int             rpcclnt_rcvlock(struct rpctask *);
224 static void            rpcclnt_rcvunlock(int *);
/* The #if 0 arm keeps an older, single-pointer rpcclnt_realign() signature. */
225 #if 0
226 void            rpcclnt_realign(struct mbuf *, int);
227 #else
228 static void     rpcclnt_realign(struct mbuf **, int);
229 #endif
230 
231 static struct mbuf    *rpcclnt_buildheader(struct rpcclnt *, int, struct mbuf *, u_int32_t, int *, struct mbuf **, struct ucred *);
232 static int             rpcm_disct(struct mbuf **, caddr_t *, int, int, caddr_t *);
233 static u_int32_t       rpcclnt_proct(struct rpcclnt *, u_int32_t);
234 static int             rpc_adv(struct mbuf **, caddr_t *, int, int);
235 static void     rpcclnt_softterm(struct rpctask * task);
236 
237 static int rpcauth_buildheader(struct rpc_auth * auth, struct ucred *, struct mbuf **, caddr_t *);
238 
239 void
240 rpcclnt_init(void)
241 {
242 #ifdef __OpenBSD__
243         static struct timeout rpcclnt_timer_to;
244 #endif
245 
246         rpcclnt_ticks = (hz * RPC_TICKINTVL + 500) / 1000;
247         if (rpcclnt_ticks < 1)
248                 rpcclnt_ticks = 1;
249         rpcstats.rpcretries = 0;
250         rpcstats.rpcrequests = 0;
251         rpcstats.rpctimeouts = 0;
252         rpcstats.rpcunexpected = 0;
253         rpcstats.rpcinvalid = 0;
254 
255         /*
256          * rpc constants how about actually using more than one of these!
257          */
258 
259         rpc_reply = txdr_unsigned(RPC_REPLY);
260         rpc_vers = txdr_unsigned(RPC_VER2);
261         rpc_call = txdr_unsigned(RPC_CALL);
262 #if 0
263         rpc_msgdenied = txdr_unsigned(RPC_MSGDENIED);
264         rpc_msgaccepted = txdr_unsigned(RPC_MSGACCEPTED);
265         rpc_mismatch = txdr_unsigned(RPC_MISMATCH);
266         rpc_autherr = txdr_unsigned(RPC_AUTHERR);
267         rpc_auth_unix = txdr_unsigned(RPCAUTH_UNIX);
268         rpc_auth_kerb = txdr_unsigned(RPCAUTH_KERB4);
269 #endif
270 
271         /* initialize rpctask queue */
272         TAILQ_INIT(&rpctask_q);
273 
274 #ifdef __OpenBSD__
275         /* initialize pools */
276         pool_init(&rpctask_pool, sizeof(struct rpctask), 0, 0, RPCTASKPOOL_LWM,
277                   "rpctask_p", NULL);
278         pool_setlowat(&rpctask_pool, RPCTASKPOOL_LWM);
279         pool_sethiwat(&rpctask_pool, RPCTASKPOOL_HWM);
280 
281         pool_init(&rpcclnt_pool, sizeof(struct rpcclnt), 0, 0, 1, "rpcclnt_p", NULL);
282 
283         /* initialize timers */
284         timeout_set(&rpcclnt_timer_to, rpcclnt_timer, &rpcclnt_timer_to);
285         rpcclnt_timer(&rpcclnt_timer_to);
286 #else /* !__OpenBSD__ */
287         callout_init(&rpcclnt_callout, 0);
288 #endif /* !__OpenBSD__ */
289 
290         RPCDEBUG("rpc initialed");
291 
292         return;
293 }
294 
295 void
296 rpcclnt_uninit(void)
297 {
298         RPCDEBUG("uninit");
299         /* XXX delete sysctl variables? */
300         callout_stop(&rpcclnt_callout);
301 }
302 
303 int
304 rpcclnt_setup(clnt, program, addr, sotype, soproto, auth, max_read_size, max_write_size, flags)
305     struct rpcclnt * clnt;
306     struct rpc_program * program;
307     struct sockaddr * addr;
308     int sotype;
309     int soproto;
310     struct rpc_auth * auth;
311     int max_read_size;
312     int max_write_size;
313     int flags;
314 {
315         if (clnt == NULL || program == NULL || addr == NULL || auth == NULL)
316           RPC_RETURN (EFAULT);
317 
318         if (program->prog_name == NULL)
319           RPC_RETURN (EFAULT);
320         clnt->rc_prog = program;
321 
322         clnt->rc_name = addr;
323         clnt->rc_sotype = sotype;
324         clnt->rc_soproto = soproto;
325         clnt->rc_auth = auth;
326         clnt->rc_rsize = max_read_size;
327         clnt->rc_wsize = max_write_size;
328         clnt->rc_flag = flags;
329 
330         clnt->rc_proctlen = 0;
331         clnt->rc_proct = NULL;
332 
333         RPC_RETURN (0);
334 }
335 
336 /*
337  * Initialize sockets and congestion for a new RPC connection. We do not free
338  * the sockaddr if error.
339  */
340 int
341 rpcclnt_connect(rpc, td)
342         struct rpcclnt *rpc;
343         RPC_EXEC_CTX td;
344 {
345         struct socket  *so;
346         int             s, error, rcvreserve, sndreserve;
347         struct sockaddr *saddr;
348 
349 #ifdef __OpenBSD__
350         struct sockaddr_in *sin;
351         struct mbuf    *m;
352 #else
353         struct sockaddr_in sin;
354 
355         int             soarg;
356         struct sockopt  opt;
357 #endif
358 
359         if (rpc == NULL) {
360                 RPCDEBUG("no rpcclnt struct!\n");
361                 RPC_RETURN(EFAULT);
362         }
363 
364         /* create the socket */
365         rpc->rc_so = NULL;
366 
367         saddr = rpc->rc_name;
368 
369         error = socreate(saddr->sa_family, &rpc->rc_so, rpc->rc_sotype,
370                          rpc->rc_soproto, td->td_ucred, td);
371         if (error) {
372                 RPCDEBUG("error %d in socreate()", error);
373                 RPC_RETURN(error);
374         }
375         so = rpc->rc_so;
376         rpc->rc_soflags = so->so_proto->pr_flags;
377 
378         /*
379          * Some servers require that the client port be a reserved port
380          * number. We always allocate a reserved port, as this prevents
381          * filehandle disclosure through UDP port capture.
382          */
383         if (saddr->sa_family == AF_INET) {
384 #ifdef __OpenBSD__
385                 struct mbuf    *mopt;
386                 int            *ip;
387 #endif
388 
389 #ifdef __OpenBSD__
390                 MGET(mopt, M_TRYWAIT, MT_SOOPTS);
391                 mopt->m_len = sizeof(int);
392                 ip = mtod(mopt, int *);
393                 *ip = IP_PORTRANGE_LOW;
394 
395                 error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt);
396 #else
397                 soarg = IP_PORTRANGE_LOW;
398                 bzero(&opt, sizeof(struct sockopt));
399                 opt.sopt_dir = SOPT_SET;
400                 opt.sopt_level = IPPROTO_IP;
401                 opt.sopt_name = IP_PORTRANGE;
402                 opt.sopt_val = &soarg;
403                 opt.sopt_valsize = sizeof(soarg);
404 
405                 error = sosetopt(so, &opt);
406 #endif
407                 if (error)
408                         goto bad;
409 
410 #ifdef __OpenBSD__
411                 MGET(m, M_TRYWAIT, MT_SONAME);
412                 sin = mtod(m, struct sockaddr_in *);
413                 sin->sin_len = m->m_len = sizeof(struct sockaddr_in);
414                 sin->sin_family = AF_INET;
415                 sin->sin_addr.s_addr = INADDR_ANY;
416                 sin->sin_port = htons(0);
417                 error = sobind(so, m);
418                 m_freem(m);
419 #else
420                 sin.sin_len = sizeof(struct sockaddr_in);
421                 sin.sin_family = AF_INET;
422                 sin.sin_addr.s_addr = INADDR_ANY;
423                 sin.sin_port = htons(0);
424                 /*
425                  * &thread0 gives us root credentials to ensure sobind
426                  * will give us a reserved ephemeral port.
427                  */
428                 error = sobind(so, (struct sockaddr *) & sin, &thread0);
429 #endif
430                 if (error)
431                         goto bad;
432 
433 #ifdef __OpenBSD__
434                 MGET(mopt, M_TRYWAIT, MT_SOOPTS);
435                 mopt->m_len = sizeof(int);
436                 ip = mtod(mopt, int *);
437                 *ip = IP_PORTRANGE_DEFAULT;
438                 error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt);
439 #else
440                 soarg = IP_PORTRANGE_DEFAULT;
441                 bzero(&opt, sizeof(struct sockopt));
442                 opt.sopt_dir = SOPT_SET;
443                 opt.sopt_level = IPPROTO_IP;
444                 opt.sopt_name = IP_PORTRANGE;
445                 opt.sopt_val = &soarg;
446                 opt.sopt_valsize = sizeof(soarg);
447                 error = sosetopt(so, &opt);
448 #endif
449                 if (error)
450                         goto bad;
451         }
452         /*
453          * Protocols that do not require connections may be optionally left
454          * unconnected for servers that reply from a port other than
455          * NFS_PORT.
456          */
457         if (rpc->rc_flag & RPCCLNT_NOCONN) {
458                 if (rpc->rc_soflags & PR_CONNREQUIRED) {
459                         error = ENOTCONN;
460                         goto bad;
461                 }
462         } else {
463                 error = soconnect(so, saddr, td);
464                 if (error)
465                         goto bad;
466 
467                 /*
468                  * Wait for the connection to complete. Cribbed from the
469                  * connect system call but with the wait timing out so that
470                  * interruptible mounts don't hang here for a long time.
471                  */
472 #ifdef __OpenBSD__
473                 s = splsoftnet();
474 #else
475                 s = splnet();
476 #endif
477                 while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
478                         (void)tsleep((caddr_t) & so->so_timeo, PSOCK,
479                                      "rpc", 2 * hz);
480 
481                         /*
482                          * XXX needs to catch interrupt signals. something
483                          * like this: if ((so->so_state & SS_ISCONNECTING) &&
484                          * so->so_error == 0 && rep && (error =
485                          * nfs_sigintr(nmp, rep, rep->r_td)) != 0) {
486                          * so->so_state &= ~SS_ISCONNECTING; splx(s); goto
487                          * bad; }
488                          */
489                 }
490                 if (so->so_error) {
491                         error = so->so_error;
492                         so->so_error = 0;
493                         splx(s);
494                         goto bad;
495                 }
496                 splx(s);
497         }
498         if (rpc->rc_flag & (RPCCLNT_SOFT | RPCCLNT_INT)) {
499                 so->so_rcv.sb_timeo = (5 * hz);
500                 so->so_snd.sb_timeo = (5 * hz);
501         } else {
502                 so->so_rcv.sb_timeo = 0;
503                 so->so_snd.sb_timeo = 0;
504         }
505 
506 
507         if (rpc->rc_sotype == SOCK_DGRAM) {
508                 sndreserve = rpc->rc_wsize + RPC_MAXPKTHDR;
509                 rcvreserve = rpc->rc_rsize + RPC_MAXPKTHDR;
510         } else if (rpc->rc_sotype == SOCK_SEQPACKET) {
511                 sndreserve = (rpc->rc_wsize + RPC_MAXPKTHDR) * 2;
512                 rcvreserve = (rpc->rc_rsize + RPC_MAXPKTHDR) * 2;
513         } else {
514                 if (rpc->rc_sotype != SOCK_STREAM)
515                         panic("rpcclnt_connect() bad sotype");
516                 if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
517 #ifdef __OpenBSD__
518                         MGET(m, M_TRYWAIT, MT_SOOPTS);
519                         *mtod(m, int32_t *) = 1;
520                         m->m_len = sizeof(int32_t);
521                         sosetopt(so, SOL_SOCKET, SO_KEEPALIVE, m);
522 #else
523                         soarg = 1;
524 
525                         bzero(&opt, sizeof(struct sockopt));
526                         opt.sopt_dir = SOPT_SET;
527                         opt.sopt_level = SOL_SOCKET;
528                         opt.sopt_name = SO_KEEPALIVE;
529                         opt.sopt_val = &soarg;
530                         opt.sopt_valsize = sizeof(soarg);
531                         sosetopt(so, &opt);
532 #endif
533                 }
534                 if (so->so_proto->pr_protocol == IPPROTO_TCP) {
535 #ifdef __OpenBSD__
536                         MGET(m, M_TRYWAIT, MT_SOOPTS);
537                         *mtod(m, int32_t *) = 1;
538                         m->m_len = sizeof(int32_t);
539                         sosetopt(so, IPPROTO_TCP, TCP_NODELAY, m);
540 #else
541                         soarg = 1;
542 
543                         bzero(&opt, sizeof(struct sockopt));
544                         opt.sopt_dir = SOPT_SET;
545                         opt.sopt_level = IPPROTO_TCP;
546                         opt.sopt_name = TCP_NODELAY;
547                         opt.sopt_val = &soarg;
548                         opt.sopt_valsize = sizeof(soarg);
549                         sosetopt(so, &opt);
550 #endif
551                 }
552                 sndreserve = (rpc->rc_wsize + RPC_MAXPKTHDR +
553                               sizeof(u_int32_t)) * 2;
554                 rcvreserve = (rpc->rc_rsize + RPC_MAXPKTHDR +
555                               sizeof(u_int32_t)) * 2;
556         }
557         error = soreserve(so, sndreserve, rcvreserve);
558         if (error)
559                 goto bad;
560         so->so_rcv.sb_flags |= SB_NOINTR;
561         so->so_snd.sb_flags |= SB_NOINTR;
562 
563         /* Initialize other non-zero congestion variables */
564         rpc->rc_srtt[0] = rpc->rc_srtt[1] = rpc->rc_srtt[2] =
565                  rpc->rc_srtt[3] = (RPC_TIMEO << 3);
566         rpc->rc_sdrtt[0] = rpc->rc_sdrtt[1] = rpc->rc_sdrtt[2] =
567                 rpc->rc_sdrtt[3] = 0;
568         rpc->rc_cwnd = RPC_MAXCWND / 2; /* Initial send window */
569         rpc->rc_sent = 0;
570         rpc->rc_timeouts = 0;
571         RPC_RETURN(0);
572 
573 bad:
574         rpcclnt_disconnect(rpc);
575         RPC_RETURN(error);
576 }
577 
578 
579 /*
580  * Reconnect routine:
581  * Called when a connection is broken on a reliable protocol.
582  * - clean up the old socket
583  * - rpcclnt_connect() again
584  * - set R_MUSTRESEND for all outstanding requests on mount point
585  * If this fails the mount point is DEAD!
586  * nb: Must be called with the rpcclnt_sndlock() set on the mount point.
587  */
588 int
589 rpcclnt_reconnect(rep, td)
590         struct rpctask *rep;
591         RPC_EXEC_CTX td;
592 {
593         struct rpctask *rp;
594         struct rpcclnt *rpc = rep->r_rpcclnt;
595         int             error;
596 
597         rpcclnt_disconnect(rpc);
598         while ((error = rpcclnt_connect(rpc, td)) != 0) {
599                 if (error == EINTR || error == ERESTART)
600                         RPC_RETURN(EINTR);
601                 tsleep(&fake_wchan, PSOCK, "rpccon", hz);
602         }
603 
604         /*
605          * Loop through outstanding request list and fix up all requests on
606          * old socket.
607          */
608         for (rp = TAILQ_FIRST(&rpctask_q); rp != NULL;
609              rp = TAILQ_NEXT(rp, r_chain)) {
610                 if (rp->r_rpcclnt == rpc)
611                         rp->r_flags |= R_MUSTRESEND;
612         }
613         RPC_RETURN(0);
614 }
615 
616 /*
617  * RPC transport disconnect. Clean up and unlink.
618  */
619 void
620 rpcclnt_disconnect(rpc)
621         struct rpcclnt *rpc;
622 {
623         struct socket  *so;
624 
625         if (rpc->rc_so) {
626                 so = rpc->rc_so;
627                 rpc->rc_so = NULL;
628                 soshutdown(so, 2);
629                 soclose(so);
630         }
631 }
632 
633 void
634 rpcclnt_safedisconnect(struct rpcclnt * rpc)
635 {
636         struct rpctask  dummytask;
637 
638         bzero(&dummytask, sizeof(dummytask));
639         dummytask.r_rpcclnt = rpc;
640         rpcclnt_rcvlock(&dummytask);
641         rpcclnt_disconnect(rpc);
642         rpcclnt_rcvunlock(&rpc->rc_flag);
643 }
644 
645 /*
646  * This is the rpc send routine. For connection based socket types, it
647  * must be called with an rpcclnt_sndlock() on the socket.
648  * "rep == NULL" indicates that it has been called from a server.
649  * For the client side:
650  * - return EINTR if the RPC is terminated, 0 otherwise
651  * - set R_MUSTRESEND if the send fails for any reason
652  * - do any cleanup required by recoverable socket errors (?)
653  * For the server side:
654  * - return EINTR or ERESTART if interrupted by a signal
655  * - return EPIPE if a connection is lost for connection based sockets (TCP...)
656  * - do any cleanup required by recoverable socket errors (?)
657  */
658 static int
659 rpcclnt_send(so, nam, top, rep)
660         struct socket  *so;
661 #ifdef __OpenBSD__
662         struct mbuf    *nam;
663 #else
664         struct sockaddr *nam;
665 #endif
666         struct mbuf    *top;
667         struct rpctask *rep;
668 {
669 #ifdef __OpenBSD__
670         struct mbuf    *sendnam;
671 #else
672         struct sockaddr *sendnam;
673         struct thread  *td = curthread;
674 #endif
675         int error, soflags, flags;
676 
677         if (rep) {
678                 if (rep->r_flags & R_SOFTTERM) {
679                         m_freem(top);
680                         RPC_RETURN(EINTR);
681                 }
682                 if ((so = rep->r_rpcclnt->rc_so) == NULL) {
683                         rep->r_flags |= R_MUSTRESEND;
684                         m_freem(top);
685                         RPC_RETURN(0);
686                 }
687                 rep->r_flags &= ~R_MUSTRESEND;
688                 soflags = rep->r_rpcclnt->rc_soflags;
689         } else
690                 soflags = so->so_proto->pr_flags;
691 
692         if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
693                 sendnam = NULL;
694         else
695                 sendnam = nam;
696 
697         if (so->so_type == SOCK_SEQPACKET)
698                 flags = MSG_EOR;
699         else
700                 flags = 0;
701 
702         /*
703          * XXXRW: If/when this code becomes MPSAFE itself, Giant might have
704          * to be conditionally acquired earlier for the stack so has to avoid
705          * lock order reversals with any locks held over rpcclnt_send().
706          */
707         error = sosend(so, sendnam, NULL, top, NULL, flags, td);
708         if (error) {
709                 if (rep) {
710                         log(LOG_INFO, "rpc send error %d for service %s\n", error,
711                             rep->r_rpcclnt->rc_prog->prog_name);
712                         /*
713                          * Deal with errors for the client side.
714                          */
715                         if (rep->r_flags & R_SOFTTERM)
716                                 error = EINTR;
717                         else
718                                 rep->r_flags |= R_MUSTRESEND;
719                 } else
720                         log(LOG_INFO, "rpc service send error %d\n", error);
721 
722                 /*
723                  * Handle any recoverable (soft) socket errors here.
724                  */
725                 if (error != EINTR && error != ERESTART &&
726                     error != EWOULDBLOCK && error != EPIPE)
727                         error = 0;
728         }
729         RPC_RETURN(error);
730 }
731 
732 /*
733  * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all done by
734  * soreceive(), but for SOCK_STREAM we must deal with the Record Mark and
735  * consolidate the data into a new mbuf list. nb: Sometimes TCP passes the
736  * data up to soreceive() in long lists of small mbufs. For SOCK_STREAM we
737  * must be very careful to read an entire record once we have read any of it,
738  * even if the system call has been interrupted.
739  */
740 static int
741 rpcclnt_receive(rep, aname, mp, td)
742         struct rpctask *rep;
743 #ifdef __OpenBSD__
744         struct mbuf   **aname;
745 #else
746         struct sockaddr **aname;
747 #endif
748         struct mbuf   **mp;
749         RPC_EXEC_CTX  td;
750 {
751         struct socket  *so;
752         struct uio      auio;
753         struct iovec    aio;
754         struct mbuf    *m;
755         struct mbuf    *control;
756         u_int32_t       len;
757 #ifdef __OpenBSD__
758         struct mbuf   **getnam;
759 #else
760         struct sockaddr **getnam;
761 #endif
762         int error, sotype, rcvflg;
763 
764         /*
765          * Set up arguments for soreceive()
766          */
767         *mp = NULL;
768         *aname = NULL;
769         sotype = rep->r_rpcclnt->rc_sotype;
770 
771         /*
772          * For reliable protocols, lock against other senders/receivers in
773          * case a reconnect is necessary. For SOCK_STREAM, first get the
774          * Record Mark to find out how much more there is to get. We must
775          * lock the socket against other receivers until we have an entire
776          * rpc request/reply.
777          */
778         if (sotype != SOCK_DGRAM) {
779                 error = rpcclnt_sndlock(&rep->r_rpcclnt->rc_flag, rep);
780                 if (error)
781                         RPC_RETURN(error);
782 tryagain:
783                 /*
784                  * Check for fatal errors and resending request.
785                  */
786                 /*
787                  * Ugh: If a reconnect attempt just happened, rc_so would
788                  * have changed. NULL indicates a failed attempt that has
789                  * essentially shut down this mount point.
790                  */
791                 if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) {
792                         rpcclnt_sndunlock(&rep->r_rpcclnt->rc_flag);
793                         RPC_RETURN(EINTR);
794                 }
795                 so = rep->r_rpcclnt->rc_so;
796                 if (!so) {
797                         error = rpcclnt_reconnect(rep, td);
798                         if (error) {
799                                 rpcclnt_sndunlock(&rep->r_rpcclnt->rc_flag);
800                                 RPC_RETURN(error);
801                         }
802                         goto tryagain;
803                 }
804                 while (rep->r_flags & R_MUSTRESEND) {
805                         m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT);
806                         rpcstats.rpcretries++;
807                         error = rpcclnt_send(so, rep->r_rpcclnt->rc_name, m, rep);
808                         if (error) {
809                                 if (error == EINTR || error == ERESTART ||
810                                     (error = rpcclnt_reconnect(rep, td)) != 0) {
811                                         rpcclnt_sndunlock(&rep->r_rpcclnt->rc_flag);
812                                         RPC_RETURN(error);
813                                 }
814                                 goto tryagain;
815                         }
816                 }
817                 rpcclnt_sndunlock(&rep->r_rpcclnt->rc_flag);
818                 if (sotype == SOCK_STREAM) {
819                         aio.iov_base = (caddr_t) & len;
820                         aio.iov_len = sizeof(u_int32_t);
821                         auio.uio_iov = &aio;
822                         auio.uio_iovcnt = 1;
823                         auio.uio_segflg = UIO_SYSSPACE;
824                         auio.uio_rw = UIO_READ;
825                         auio.uio_offset = 0;
826                         auio.uio_resid = sizeof(u_int32_t);
827 #ifdef __OpenBSD__
828                         auio.uio_procp = td;
829 #else
830                         auio.uio_td = td;
831 #endif
832                         do {
833                                 rcvflg = MSG_WAITALL;
834                                 error = soreceive(so, NULL, &auio, NULL, NULL, &rcvflg);
835                                 if (error == EWOULDBLOCK && rep) {
836                                         if (rep->r_flags & R_SOFTTERM)
837                                                 RPC_RETURN(EINTR);
838                                 }
839                         } while (error == EWOULDBLOCK);
840                         if (!error && auio.uio_resid > 0) {
841                                 log(LOG_INFO,
842                                 "short receive (%zu/%zu) from rpc server %s\n",
843                                     sizeof(u_int32_t) - auio.uio_resid,
844                                     sizeof(u_int32_t),
845                                     rep->r_rpcclnt->rc_prog->prog_name);
846                                 error = EPIPE;
847                         }
848                         if (error)
849                                 goto errout;
850                         len = ntohl(len) & ~0x80000000;
851                         /*
852                          * This is SERIOUS! We are out of sync with the
853                          * sender and forcing a disconnect/reconnect is all I
854                          * can do.
855                          */
856                         if (len > RPC_MAXPACKET) {
857                                 log(LOG_ERR, "%s (%d) from rpc server %s\n",
858                                     "impossible packet length",
859                                     len,
860                                     rep->r_rpcclnt->rc_prog->prog_name);
861                                 error = EFBIG;
862                                 goto errout;
863                         }
864                         auio.uio_resid = len;
865                         do {
866                                 rcvflg = MSG_WAITALL;
867                                 error = soreceive(so, NULL, &auio, mp, NULL, &rcvflg);
868                         } while (error == EWOULDBLOCK || error == EINTR ||
869                                  error == ERESTART);
870                         if (!error && auio.uio_resid > 0) {
871                                 log(LOG_INFO,
872                                 "short receive (%d/%d) from rpc server %s\n",
873                                     len - auio.uio_resid, len,
874                                     rep->r_rpcclnt->rc_prog->prog_name);
875                                 error = EPIPE;
876                         }
877                 } else {
878                         /*
879                          * NB: Since uio_resid is big, MSG_WAITALL is ignored
880                          * and soreceive() will return when it has either a
881                          * control msg or a data msg. We have no use for
882                          * control msg., but must grab them and then throw
883                          * them away so we know what is going on.
884                          */
885                         auio.uio_resid = len = 100000000;       /* Anything Big */
886 #ifdef __OpenBSD__
887                         auio.uio_procp = td;
888 #else
889                         auio.uio_td = td;
890 #endif
891                         do {
892                                 rcvflg = 0;
893                                 error = soreceive(so, NULL, &auio, mp, &control, &rcvflg);
894                                 if (control)
895                                         m_freem(control);
896                                 if (error == EWOULDBLOCK && rep) {
897                                         if (rep->r_flags & R_SOFTTERM)
898                                                 RPC_RETURN(EINTR);
899                                 }
900                         } while (error == EWOULDBLOCK ||
901                                  (!error && *mp == NULL && control));
902                         if ((rcvflg & MSG_EOR) == 0)
903                                 printf("Egad!!\n");
904                         if (!error && *mp == NULL)
905                                 error = EPIPE;
906                         len -= auio.uio_resid;
907                 }
908 errout:
909                 if (error && error != EINTR && error != ERESTART) {
910                         m_freem(*mp);
911                         *mp = (struct mbuf *) 0;
912                         if (error != EPIPE)
913                                 log(LOG_INFO,
914                                     "receive error %d from rpc server %s\n",
915                                     error,
916                                     rep->r_rpcclnt->rc_prog->prog_name);
917                         error = rpcclnt_sndlock(&rep->r_rpcclnt->rc_flag, rep);
918                         if (!error)
919                                 error = rpcclnt_reconnect(rep, td);
920                         if (!error)
921                                 goto tryagain;
922                 }
923         } else {
924                 if ((so = rep->r_rpcclnt->rc_so) == NULL)
925                         RPC_RETURN(EACCES);
926                 if (so->so_state & SS_ISCONNECTED)
927                         getnam = NULL;
928                 else
929                         getnam = aname;
930                 auio.uio_resid = len = 1000000;
931 #ifdef __OpenBSD__
932                 auio.uio_procp = td;
933 #else
934                 auio.uio_td = td;
935 #endif
936 
937                 do {
938                         rcvflg = 0;
939                         error = soreceive(so, getnam, &auio, mp, NULL, &rcvflg);
940                         RPCDEBUG("soreceive returns %d", error);
941                         if (error == EWOULDBLOCK && (rep->r_flags & R_SOFTTERM)) {
942                                 RPCDEBUG("wouldblock && softerm -> EINTR");
943                                 RPC_RETURN(EINTR);
944                         }
945                 } while (error == EWOULDBLOCK);
946                 len -= auio.uio_resid;
947         }
948         if (error) {
949                 m_freem(*mp);
950                 *mp = NULL;
951         } else {
952                 /*
953                  * Search for any mbufs that are not a multiple of 4 bytes
954                  * long or with m_data not longword aligned. These could
955                  * cause pointer alignment problems, so copy them to well
956                  * aligned mbufs.
957                  */
958                 rpcclnt_realign(mp, 5 * RPCX_UNSIGNED);
959         }
960         RPC_RETURN(error);
961 }
962 
963 
964 /*
965  * Implement receipt of reply on a socket. We must search through the list of
966  * received datagrams matching them with outstanding requests using the xid,
967  * until ours is found.
968  */
969 /* ARGSUSED */
970 static int
971 rpcclnt_reply(myrep, td)
972         struct rpctask *myrep;
973         RPC_EXEC_CTX td;
974 {
975         struct rpctask *rep;
976         struct rpcclnt *rpc = myrep->r_rpcclnt;
977         int32_t         t1;
978         struct mbuf    *mrep, *md;
979 #ifdef __OpenBSD__
980         struct mbuf    *nam;
981 #else
982         struct sockaddr *nam;
983 #endif
984         u_int32_t       rxid, *tl;
985         caddr_t         dpos, cp2;
986         int             error;
987 
988         /*
989          * Loop around until we get our own reply
990          */
991         for (;;) {
992                 /*
993                  * Lock against other receivers so that I don't get stuck in
994                  * sbwait() after someone else has received my reply for me.
995                  * Also necessary for connection based protocols to avoid
996