1 /*-
2 * Copyright (c) 1982, 1986, 1988, 1993
3 * The Regents of the University of California. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94
30 * $FreeBSD: releng/6.0/sys/netinet/tcp_usrreq.c 151121 2005-10-09 03:17:41Z delphij $
31 */
32
33 #include "opt_inet.h"
34 #include "opt_inet6.h"
35 #include "opt_tcpdebug.h"
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/malloc.h>
40 #include <sys/kernel.h>
41 #include <sys/sysctl.h>
42 #include <sys/mbuf.h>
43 #ifdef INET6
44 #include <sys/domain.h>
45 #endif /* INET6 */
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 #include <sys/protosw.h>
49 #include <sys/proc.h>
50 #include <sys/jail.h>
51
52 #include <net/if.h>
53 #include <net/route.h>
54
55 #include <netinet/in.h>
56 #include <netinet/in_systm.h>
57 #ifdef INET6
58 #include <netinet/ip6.h>
59 #endif
60 #include <netinet/in_pcb.h>
61 #ifdef INET6
62 #include <netinet6/in6_pcb.h>
63 #endif
64 #include <netinet/in_var.h>
65 #include <netinet/ip_var.h>
66 #ifdef INET6
67 #include <netinet6/ip6_var.h>
68 #endif
69 #include <netinet/tcp.h>
70 #include <netinet/tcp_fsm.h>
71 #include <netinet/tcp_seq.h>
72 #include <netinet/tcp_timer.h>
73 #include <netinet/tcp_var.h>
74 #include <netinet/tcpip.h>
75 #ifdef TCPDEBUG
76 #include <netinet/tcp_debug.h>
77 #endif
78
/*
 * TCP protocol interface to socket abstraction.
 */
extern	char *tcpstates[];	/* XXX ??? */

/*
 * Forward declarations of the file-local helpers that the pru_*()
 * handlers call before their definitions appear.
 */
static int	tcp_attach(struct socket *so);
static int	tcp_connect(struct tcpcb *tp, struct sockaddr *nam,
		    struct thread *td);
#ifdef INET6
static int	tcp6_connect(struct tcpcb *tp, struct sockaddr *nam,
		    struct thread *td);
#endif /* INET6 */
static struct tcpcb *
		tcp_disconnect(struct tcpcb *tp);
static struct tcpcb *
		tcp_usrclosed(struct tcpcb *tp);
static void	tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti);
/*
 * Debug-trace hooks used by every user-request handler in this file.
 *
 *   TCPDEBUG0      declares the local `ostate' that remembers the
 *                  connection state seen on entry.
 *   TCPDEBUG1()    records tp->t_state into `ostate' (0 when tp is NULL).
 *   TCPDEBUG2(req) calls tcp_trace() for request `req' when tp is
 *                  non-NULL and the socket has SO_DEBUG set.
 *
 * The macros expect locals named `tp' and `so' to be in scope.  When the
 * kernel is built without TCPDEBUG all three expand to nothing.
 *
 * TCPDEBUG2() is wrapped in do { } while (0) so that it always behaves
 * as a single statement; a bare `if' expansion would silently capture a
 * following `else' at the call site.
 */
#ifdef TCPDEBUG
#define	TCPDEBUG0	int ostate = 0
#define	TCPDEBUG1()	ostate = tp ? tp->t_state : 0
#define	TCPDEBUG2(req)	do {						\
	if (tp && (so->so_options & SO_DEBUG))				\
		tcp_trace(TA_USER, ostate, tp, 0, 0, req);		\
} while (0)
#else
#define	TCPDEBUG0
#define	TCPDEBUG1()
#define	TCPDEBUG2(req)
#endif
107
108 /*
109 * TCP attaches to socket via pru_attach(), reserving space,
110 * and an internet control block.
111 */
112 static int
113 tcp_usr_attach(struct socket *so, int proto, struct thread *td)
114 {
115 int error;
116 struct inpcb *inp;
117 struct tcpcb *tp = 0;
118 TCPDEBUG0;
119
120 INP_INFO_WLOCK(&tcbinfo);
121 TCPDEBUG1();
122 inp = sotoinpcb(so);
123 if (inp) {
124 error = EISCONN;
125 goto out;
126 }
127
128 error = tcp_attach(so);
129 if (error)
130 goto out;
131
132 if ((so->so_options & SO_LINGER) && so->so_linger == 0)
133 so->so_linger = TCP_LINGERTIME;
134
135 inp = sotoinpcb(so);
136 tp = intotcpcb(inp);
137 out:
138 TCPDEBUG2(PRU_ATTACH);
139 INP_INFO_WUNLOCK(&tcbinfo);
140 return error;
141 }
142
143 /*
144 * pru_detach() detaches the TCP protocol from the socket.
145 * If the protocol state is non-embryonic, then can't
146 * do this directly: have to initiate a pru_disconnect(),
147 * which may finish later; embryonic TCB's can just
148 * be discarded here.
149 */
150 static int
151 tcp_usr_detach(struct socket *so)
152 {
153 int error = 0;
154 struct inpcb *inp;
155 struct tcpcb *tp;
156 TCPDEBUG0;
157
158 INP_INFO_WLOCK(&tcbinfo);
159 inp = sotoinpcb(so);
160 if (inp == NULL) {
161 INP_INFO_WUNLOCK(&tcbinfo);
162 return error;
163 }
164 INP_LOCK(inp);
165 tp = intotcpcb(inp);
166 TCPDEBUG1();
167 tp = tcp_disconnect(tp);
168
169 TCPDEBUG2(PRU_DETACH);
170 if (tp)
171 INP_UNLOCK(inp);
172 INP_INFO_WUNLOCK(&tcbinfo);
173 return error;
174 }
175
/*
 * Shared prologue/epilogue for the simple pru_*() handlers.
 *
 * A handler that uses these macros must declare locals named `so',
 * `inp', `tp', `error' and `inirw'.  `inirw' selects how the global
 * tcbinfo lock is taken:
 *
 *   INI_NOLOCK  tcbinfo is not locked at all;
 *   INI_READ    read-locked just long enough to find and lock the inpcb;
 *   INI_WRITE   write-locked for the whole request.
 */
#define	INI_NOLOCK	0
#define	INI_READ	1
#define	INI_WRITE	2

/*
 * COMMON_START(): take the locks, look up the inpcb (returning EINVAL
 * from the enclosing function if there is none), lock it, and load `tp'.
 */
#define	COMMON_START()							\
	TCPDEBUG0;							\
	do {								\
		if (inirw == INI_WRITE)					\
			INP_INFO_WLOCK(&tcbinfo);			\
		else if (inirw == INI_READ)				\
			INP_INFO_RLOCK(&tcbinfo);			\
		inp = sotoinpcb(so);					\
		if (inp == NULL) {					\
			if (inirw == INI_WRITE)				\
				INP_INFO_WUNLOCK(&tcbinfo);		\
			else if (inirw == INI_READ)			\
				INP_INFO_RUNLOCK(&tcbinfo);		\
			return EINVAL;					\
		}							\
		INP_LOCK(inp);						\
		if (inirw == INI_READ)					\
			INP_INFO_RUNLOCK(&tcbinfo);			\
		tp = intotcpcb(inp);					\
		TCPDEBUG1();						\
	} while (0)

/*
 * COMMON_END(req): defines the `out' label, emits the debug trace,
 * unlocks the inpcb if `tp' is still non-NULL, drops the tcbinfo write
 * lock if it was taken, and returns `error'.  The `goto out' after the
 * return is never executed; it only references the label so that
 * handlers which never jump to `out' themselves still use it.
 */
#define	COMMON_END(req)							\
out:	TCPDEBUG2(req);							\
	do {								\
		if (tp != NULL)						\
			INP_UNLOCK(inp);				\
		if (inirw == INI_WRITE)					\
			INP_INFO_WUNLOCK(&tcbinfo);			\
		return error;						\
		goto out;						\
	} while (0)
212
213 /*
214 * Give the socket an address.
215 */
216 static int
217 tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
218 {
219 int error = 0;
220 struct inpcb *inp;
221 struct tcpcb *tp;
222 struct sockaddr_in *sinp;
223 const int inirw = INI_WRITE;
224
225 sinp = (struct sockaddr_in *)nam;
226 if (nam->sa_len != sizeof (*sinp))
227 return (EINVAL);
228 /*
229 * Must check for multicast addresses and disallow binding
230 * to them.
231 */
232 if (sinp->sin_family == AF_INET &&
233 IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
234 return (EAFNOSUPPORT);
235
236 COMMON_START();
237 error = in_pcbbind(inp, nam, td->td_ucred);
238 if (error)
239 goto out;
240 COMMON_END(PRU_BIND);
241 }
242
#ifdef INET6
/*
 * pru_bind() handler for IPv6 sockets.
 *
 * Returns EINVAL when `nam' is not the size of a sockaddr_in6 and
 * EAFNOSUPPORT for an IPv6 multicast address.  Unless the socket is
 * marked IN6P_IPV6_V6ONLY, an unspecified address leaves the pcb
 * flagged for both IPv4 and IPv6, and a V4-mapped address is converted
 * to a sockaddr_in and bound through in_pcbbind() as IPv4 only.
 * Everything else is bound through in6_pcbbind().
 */
static int
tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	const int inirw = INI_WRITE;
	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
	struct inpcb *inp;
	struct tcpcb *tp;
	int error = 0;

	if (nam->sa_len != sizeof(*sin6))
		return (EINVAL);

	/* TCP endpoints may not be bound to multicast addresses. */
	if (sin6->sin6_family == AF_INET6 &&
	    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
		return (EAFNOSUPPORT);

	COMMON_START();

	/* Start out as an IPv6-only endpoint. */
	inp->inp_vflag &= ~INP_IPV4;
	inp->inp_vflag |= INP_IPV6;

	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
			/* Wildcard bind: accept IPv4 as well. */
			inp->inp_vflag |= INP_IPV4;
		} else if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
			struct sockaddr_in sin4;

			/* V4-mapped address: bind as a plain IPv4 pcb. */
			in6_sin6_2_sin(&sin4, sin6);
			inp->inp_vflag |= INP_IPV4;
			inp->inp_vflag &= ~INP_IPV6;
			error = in_pcbbind(inp, (struct sockaddr *)&sin4,
			    td->td_ucred);
			goto out;
		}
	}
	error = in6_pcbbind(inp, nam, td->td_ucred);
	COMMON_END(PRU_BIND);
}
#endif /* INET6 */
287
288 /*
289 * Prepare to accept connections.
290 */
291 static int
292 tcp_usr_listen(struct socket *so, struct thread *td)
293 {
294 int error = 0;
295 struct inpcb *inp;
296 struct tcpcb *tp;
297 const int inirw = INI_WRITE;
298
299 COMMON_START();
300 SOCK_LOCK(so);
301 error = solisten_proto_check(so);
302 if (error == 0 && inp->inp_lport == 0)
303 error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
304 if (error == 0) {
305 tp->t_state = TCPS_LISTEN;
306 solisten_proto(so);
307 }
308 SOCK_UNLOCK(so);
309 COMMON_END(PRU_LISTEN);
310 }
311
#ifdef INET6
/*
 * pru_listen() handler for IPv6 sockets.
 *
 * Same sequence as the IPv4 handler, except that a socket without a
 * local port first has INP_IPV4 cleared (IN6P_IPV6_V6ONLY set) or set
 * (otherwise) in inp_vflag and is then bound through in6_pcbbind().
 */
static int
tcp6_usr_listen(struct socket *so, struct thread *td)
{
	const int inirw = INI_WRITE;
	struct inpcb *inp;
	struct tcpcb *tp;
	int error = 0;

	COMMON_START();
	SOCK_LOCK(so);
	error = solisten_proto_check(so);
	if (error == 0) {
		if (inp->inp_lport == 0) {
			if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
				inp->inp_vflag &= ~INP_IPV4;
			else
				inp->inp_vflag |= INP_IPV4;
			error = in6_pcbbind(inp, NULL, td->td_ucred);
		}
		if (error == 0) {
			tp->t_state = TCPS_LISTEN;
			solisten_proto(so);
		}
	}
	SOCK_UNLOCK(so);
	COMMON_END(PRU_LISTEN);
}
#endif /* INET6 */
338
339 /*
340 * Initiate connection to peer.
341 * Create a template for use in transmissions on this connection.
342 * Enter SYN_SENT state, and mark socket as connecting.
343 * Start keep-alive timer, and seed output sequence space.
344 * Send initial segment on connection.
345 */
346 static int
347 tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
348 {
349 int error = 0;
350 struct inpcb *inp;
351 struct tcpcb *tp;
352 struct sockaddr_in *sinp;
353 const int inirw = INI_WRITE;
354
355 sinp = (struct sockaddr_in *)nam;
356 if (nam->sa_len != sizeof (*sinp))
357 return (EINVAL);
358 /*
359 * Must disallow TCP ``connections'' to multicast addresses.
360 */
361 if (sinp->sin_family == AF_INET
362 && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
363 return (EAFNOSUPPORT);
364 if (jailed(td->td_ucred))
365 prison_remote_ip(td->td_ucred, 0, &sinp->sin_addr.s_addr);
366
367 COMMON_START();
368 if ((error = tcp_connect(tp, nam, td)) != 0)
369 goto out;
370 error = tcp_output(tp);
371 COMMON_END(PRU_CONNECT);
372 }
373
#ifdef INET6
/*
 * pru_connect() handler for IPv6 sockets.
 *
 * Returns EINVAL when `nam' is not the size of a sockaddr_in6 and
 * EAFNOSUPPORT for a multicast destination.  A V4-mapped destination
 * is rejected with EINVAL on an IN6P_IPV6_V6ONLY socket; otherwise it
 * is converted to a sockaddr_in and connected as IPv4 through
 * tcp_connect().  Any other destination goes through tcp6_connect().
 * When the connect step succeeds, tcp_output() sends the first segment.
 */
static int
tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	const int inirw = INI_WRITE;
	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
	struct inpcb *inp;
	struct tcpcb *tp;
	int error = 0;

	if (nam->sa_len != sizeof(*sin6))
		return (EINVAL);

	/* TCP ``connections'' to multicast addresses are not allowed. */
	if (sin6->sin6_family == AF_INET6 &&
	    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
		return (EAFNOSUPPORT);

	COMMON_START();
	if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
		struct sockaddr_in sin4;

		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
			error = EINVAL;
			goto out;
		}

		/* Treat the pcb as IPv4 for this connection. */
		in6_sin6_2_sin(&sin4, sin6);
		inp->inp_vflag |= INP_IPV4;
		inp->inp_vflag &= ~INP_IPV6;
		error = tcp_connect(tp, (struct sockaddr *)&sin4, td);
	} else {
		inp->inp_vflag &= ~INP_IPV4;
		inp->inp_vflag |= INP_IPV6;
		inp->inp_inc.inc_isipv6 = 1;
		error = tcp6_connect(tp, nam, td);
	}
	if (error == 0)
		error = tcp_output(tp);
	COMMON_END(PRU_CONNECT);
}
#endif /* INET6 */
420
421 /*
422 * Initiate disconnect from peer.
423 * If connection never passed embryonic stage, just drop;
424 * else if don't need to let data drain, then can just drop anyways,
425 * else have to begin TCP shutdown process: mark socket disconnecting,
426 * drain unread data, state switch to reflect user close, and
427 * send segment (e.g. FIN) to peer. Socket will be really disconnected
428 * when peer sends FIN and acks ours.
429 *
430 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
431 */
432 static int
433 tcp_usr_disconnect(struct socket *so)
434 {
435 int error = 0;
436 struct inpcb *inp;
437 struct tcpcb *tp;
438 const int inirw = INI_WRITE;
439
440 COMMON_START();
441 tp = tcp_disconnect(tp);
442 COMMON_END(PRU_DISCONNECT);
443 }
444
445 /*
446 * Accept a connection. Essentially all the work is
447 * done at higher levels; just return the address
448 * of the peer, storing through addr.
449 */
450 static int
451 tcp_usr_accept(struct socket *so, struct sockaddr **nam)
452 {
453 int error = 0;
454 struct inpcb *inp = NULL;
455 struct tcpcb *tp = NULL;
456 struct in_addr addr;
457 in_port_t port = 0;
458 TCPDEBUG0;
459
460 if (so->so_state & SS_ISDISCONNECTED) {
461 error = ECONNABORTED;
462 goto out;
463 }
464
465 INP_INFO_RLOCK(&tcbinfo);
466 inp = sotoinpcb(so);
467 if (!inp) {
468 INP_INFO_RUNLOCK(&tcbinfo);
469 return (EINVAL);
470 }
471 INP_LOCK(inp);
472 INP_INFO_RUNLOCK(&tcbinfo);
473 tp = intotcpcb(inp);
474 TCPDEBUG1();
475
476 /*
477 * We inline in_setpeeraddr and COMMON_END here, so that we can
478 * copy the data of interest and defer the malloc until after we
479 * release the lock.
480 */
481 port = inp->inp_fport;
482 addr = inp->inp_faddr;
483
484 out: TCPDEBUG2(PRU_ACCEPT);
485 if (tp)
486 INP_UNLOCK(inp);
487 if (error == 0)
488 *nam = in_sockaddr(port, &addr);
489 return error;
490 }
491
#ifdef INET6
/*
 * pru_accept() handler for IPv6 sockets: report the peer's address.
 *
 * Returns ECONNABORTED if the socket is already marked disconnected and
 * EINVAL if it has no inpcb.  The foreign port and address are copied
 * while the inpcb is locked; the sockaddr is allocated after unlocking.
 * A pcb flagged INP_IPV4 is reported as a V4-mapped IPv6 address via
 * in6_v4mapsin6_sockaddr(), any other pcb via in6_sockaddr().
 */
static int
tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
{
	struct in6_addr faddr6;
	struct in_addr faddr4;
	struct inpcb *inp;
	struct tcpcb *tp;
	in_port_t fport;
	int is_v4;
	TCPDEBUG0;

	/* No lock is held and there is nothing to trace yet. */
	if (so->so_state & SS_ISDISCONNECTED)
		return (ECONNABORTED);

	INP_INFO_RLOCK(&tcbinfo);
	inp = sotoinpcb(so);
	if (inp == NULL) {
		INP_INFO_RUNLOCK(&tcbinfo);
		return (EINVAL);
	}
	INP_LOCK(inp);
	INP_INFO_RUNLOCK(&tcbinfo);
	tp = intotcpcb(inp);
	TCPDEBUG1();

	/* Snapshot the peer's identity while the inpcb is locked. */
	fport = inp->inp_fport;
	is_v4 = (inp->inp_vflag & INP_IPV4) != 0;
	if (is_v4)
		faddr4 = inp->inp_faddr;
	else
		faddr6 = inp->in6p_faddr;

	TCPDEBUG2(PRU_ACCEPT);
	if (tp != NULL)
		INP_UNLOCK(inp);
	if (is_v4)
		*nam = in6_v4mapsin6_sockaddr(fport, &faddr4);
	else
		*nam = in6_sockaddr(fport, &faddr6);
	return (0);
}
#endif /* INET6 */
546
547 /*
548 * This is the wrapper function for in_setsockaddr. We just pass down
549 * the pcbinfo for in_setsockaddr to lock. We don't want to do the locking
550 * here because in_setsockaddr will call malloc and can block.
551 */
552 static int
553 tcp_sockaddr(struct socket *so, struct sockaddr **nam)
554 {
555 return (in_setsockaddr(so, nam, &tcbinfo));
556 }
557
558 /*
559 * This is the wrapper function for in_setpeeraddr. We just pass down
560 * the pcbinfo for in_setpeeraddr to lock.
561 */
562 static int
563 tcp_peeraddr(struct socket *so, struct sockaddr **nam)
564 {
565 return (in_setpeeraddr(so, nam, &tcbinfo));
566 }
567
568 /*
569 * Mark the connection as being incapable of further output.
570 */
571 static int
572 tcp_usr_shutdown(struct socket *so)
573 {
574 int error = 0;
575 struct inpcb *inp;
576 struct tcpcb *tp;
577 const int inirw = INI_WRITE;
578
579 COMMON_START();
580 socantsendmore(so);
581 tp = tcp_usrclosed(tp);
582 if (tp)
583 error = tcp_output(tp);
584 COMMON_END(PRU_SHUTDOWN);
585 }
586
587 /*
588 * After a receive, possibly send window update to peer.
589 */
590 static int
591 tcp_usr_rcvd(struct socket *so, int flags)
592 {
593 int error = 0;
594 struct inpcb *inp;
595 struct tcpcb *tp;
596 const int inirw = INI_READ;
597
598 COMMON_START();
599 tcp_output(tp);
600 COMMON_END(PRU_RCVD);
601 }
602
603 /*
604 * Do a send by putting data in output queue and updating urgent
605 * marker if URG set. Possibly send more data. Unlike the other
606 * pru_*() routines, the mbuf chains are our responsibility. We
607 * must either enqueue them or free them. The other pru_* routines
608 * generally are caller-frees.
609 */
610 static int
611 tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
612 struct sockaddr *nam, struct mbuf *control, struct thread *td)
613 {
614 int error = 0;
615 struct inpcb *inp;
616 struct tcpcb *tp;
617 int unlocked = 0;
618 #ifdef INET6
619 int isipv6;
620 #endif
621 TCPDEBUG0;
622
623 /*
624 * Need write lock here because this function might call
625 * tcp_connect or tcp_usrclosed.
626 * We really want to have to this function upgrade from read lock
627 * to write lock. XXX
628 */
629 INP_INFO_WLOCK(&tcbinfo);
630 inp = sotoinpcb(so);
631 if (inp == NULL) {
632 /*
633 * OOPS! we lost a race, the TCP session got reset after
634 * we checked SBS_CANTSENDMORE, eg: while doing uiomove or a
635 * network interrupt in the non-splnet() section of sosend().
636 */
637 if (m)
638 m_freem(m);
639 if (control)
640 m_freem(control);
641 error = ECONNRESET; /* XXX EPIPE? */
642 tp = NULL;
643 TCPDEBUG1();
644 goto out;
645 }
646 INP_LOCK(inp);
647 #ifdef INET6
648 isipv6 = nam && nam->sa_family == AF_INET6;
649 #endif /* INET6 */
650 tp = intotcpcb(inp);
651 TCPDEBUG1();
652 if (control) {
653 /* TCP doesn't do control messages (rights, creds, etc) */
654 if (control->m_len) {
655 m_freem(control);
656 if (m)
657 m_freem(m);
658 error = EINVAL;
659 goto out;
660 }
661 m_freem(control); /* empty control, just free it */
662 }
663 if (!(flags & PRUS_OOB)) {
664 sbappendstream(&so->so_snd, m);
665 if (nam && tp->t_state < TCPS_SYN_SENT) {
666 /*
667 * Do implied connect if not yet connected,
668 * initialize window to default value, and
669 * initialize maxseg/maxopd using peer's cached
670 * MSS.
671 */
672 #ifdef INET6
673 if (isipv6)
674 error = tcp6_connect(tp, nam, td);
675 else
676 #endif /* INET6 */
677 error = tcp_connect(tp, nam, td);
678 if (error)
679 goto out;
680 tp->snd_wnd = TTCP_CLIENT_SND_WND;
681 tcp_mss(tp, -1);
682 }
683
684 if (flags & PRUS_EOF) {
685 /*
686 * Close the send side of the connection after
687 * the data is sent.
688 */
689 socantsendmore(so);
690 tp = tcp_usrclosed(tp);
691 }
692 INP_INFO_WUNLOCK(&tcbinfo);
693 unlocked = 1;
694 if (tp != NULL) {
695 if (flags & PRUS_MORETOCOME)
696 tp->t_flags |= TF_MORETOCOME;
697 error = tcp_output(tp);
698 if (flags & PRUS_MORETOCOME)
699 tp->t_flags &= ~TF_MORETOCOME;
700 }
701 } else {
702 SOCKBUF_LOCK(&so->so_snd);
703 if (sbspace(&so->so_snd) < -512) {
704 SOCKBUF_UNLOCK(&so->so_snd);
705 m_freem(m);
706 error = ENOBUFS;
707 goto out;
708 }
709 /*
710 * According to RFC961 (Assigned Protocols),
711 * the urgent pointer points to the last octet
712 * of urgent data. We continue, however,
713 * to consider it to indicate the first octet
714 * of data past the urgent section.
715 * Otherwise, snd_up should be one lower.
716 */
717 sbappendstream_locked(&so->so_snd, m);
718 SOCKBUF_UNLOCK(&so->so_snd);
719 if (nam && tp->t_state < TCPS_SYN_SENT) {
720 /*
721 * Do implied connect if not yet connected,
722 * initialize window to default value, and
723 * initialize maxseg/maxopd using peer's cached
724 * MSS.
725 */
726 #ifdef INET6
727 if (isipv6)
728 error = tcp6_connect(tp, nam, td);
729 else
730 #endif /* INET6 */
731 error = tcp_connect(tp, nam, td);
732 if (error)
733 goto out;
734 tp->snd_wnd = TTCP_CLIENT_SND_WND;
735 tcp_mss(tp, -1);
736 }
737 INP_INFO_WUNLOCK(&tcbinfo);
738 unlocked = 1;
739 tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
740 tp->t_flags |= TF_FORCEDATA;
741 error = tcp_output(tp);
742 tp->t_flags &= ~TF_FORCEDATA;
743 }
744 out:
745 TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB :
746 ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
747 if (tp)
748 INP_UNLOCK(inp);
749 if (!unlocked)
750 INP_INFO_WUNLOCK(&tcbinfo);
751 return (error);
752 }
753
754 /*
755 * Abort the TCP.
756 */
757 static int
758 tcp_usr_abort(struct socket *so)
759 {
760 int error = 0;
761 struct inpcb *inp;
762 struct tcpcb *tp;
763 const int inirw = INI_WRITE;
764
765 COMMON_START();
766 tp = tcp_drop(tp, ECONNABORTED);
767 COMMON_END(PRU_ABORT);
768 }
769
770 /*
771 * Receive out-of-band data.
772 */
773 static int
774 tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
775 {
776 int error = 0;
777 struct inpcb *inp;
778 struct tcpcb *tp;
779 const int inirw = INI_READ;
780
781 COMMON_START();
782 if ((so->so_oobmark == 0 &&
783 (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
784 so->so_options & SO_OOBINLINE ||
785 tp->t_oobflags & TCPOOB_HADDATA) {
786 error = EINVAL;
787 goto out;
788 }
789 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
790 error = EWOULDBLOCK;
791 goto out;
792 }
793 m->m_len = 1;
794 *mtod(m, caddr_t) = tp->t_iobc;
795 if ((flags & MSG_PEEK) == 0)
796 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
797 COMMON_END(PRU_RCVOOB);
798 }
799
800 struct pr_usrreqs tcp_usrreqs = {
801 .pru_abort = tcp_usr_abort,
802 .pru_accept = tcp_usr_accept,
803 .pru_attach = tcp_usr_attach,
804 .pru_bind = tcp_usr_bind,
805 .pru_connect = tcp_usr_connect,
806 .pru_control = in_control,
807 .pru_detach = tcp_usr_detach,
808 .pru_disconnect = tcp_usr_disconnect,
809 .pru_listen = tcp_usr_listen,
810 .pru_peeraddr = tcp_peeraddr,
811 .pru_rcvd = tcp_usr_rcvd,
812 .pru_rcvoob = tcp_usr_rcvoob,
813 .pru_send = tcp_usr_send,
814 .pru_shutdown = tcp_usr_shutdown,
815 .pru_sockaddr = tcp_sockaddr,
816 .pru_sosetlabel = in_pcbsosetlabel
817 };
818
#ifdef INET6
/*
 * User-request dispatch table for TCP over IPv6.  Handlers without an
 * IPv6-specific variant are the same functions the IPv4 table uses.
 */
struct pr_usrreqs tcp6_usrreqs = {
	/* Attach and teardown. */
	.pru_attach =		tcp_usr_attach,
	.pru_detach =		tcp_usr_detach,
	.pru_disconnect =	tcp_usr_disconnect,
	.pru_shutdown =		tcp_usr_shutdown,
	.pru_abort =		tcp_usr_abort,

	/* Connection setup. */
	.pru_bind =		tcp6_usr_bind,
	.pru_listen =		tcp6_usr_listen,
	.pru_connect =		tcp6_usr_connect,
	.pru_accept =		tcp6_usr_accept,

	/* Data transfer. */
	.pru_send =		tcp_usr_send,
	.pru_rcvd =		tcp_usr_rcvd,
	.pru_rcvoob =		tcp_usr_rcvoob,

	/* Addresses, control and labelling. */
	.pru_sockaddr =		in6_mapped_sockaddr,
	.pru_peeraddr =		in6_mapped_peeraddr,
	.pru_control =		in6_control,
	.pru_sosetlabel =	in_pcbsosetlabel
};
#endif /* INET6 */
839
840 /*
841 * Common subroutine to open a TCP connection to remote host specified
842 * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local
843 * port number if needed. Call in_pcbconnect_setup to do the routing and
844 * to choose a local host address (interface). If there is an existing
845 * incarnation of the same connection in TIME-WAIT state and if the remote
846 * host was sending CC options and if the connection duration was < MSL, then
847 * truncate the previous TIME-WAIT state and proceed.
848 * Initialize connection parameters and enter SYN-SENT state.
849 */
850 static int
851 tcp_connect(tp, nam, td)
852 register struct tcpcb *tp;
853 struct sockaddr *nam;
854 struct thread *td;
855 {
856 struct inpcb *inp = tp->t_inpcb, *oinp;
857 struct socket *so = inp->inp_socket;
858 struct in_addr laddr;
859 u_short lport;
860 int error;
861
862 if (inp->inp_lport == 0) {
863 error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
864 if (error)
865 return error;
866 }
867
868 /*
869 * Cannot simply call in_pcbconnect, because there might be an
870 * earlier incarnation of this same connection still in
871 * TIME_WAIT state, creating an ADDRINUSE error.
872 */
873 laddr = inp->inp_laddr;
874 lport = inp->inp_lport;
875 error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport,
876 &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred);
877 if (error && oinp == NULL)
878 return error;
879 if (oinp)
880 return EADDRINUSE;
881 inp->inp_laddr = laddr;
882 in_pcbrehash(inp);
883
884 /* Compute window scaling to request. */
885 while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
886 (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
887 tp->request_r_scale++;
888
889 soisconnecting(so);
890 tcpstat.tcps_connattempt++;
891 tp->t_state = TCPS_SYN_SENT;
892 callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
893 tp->iss = tcp_new_isn(tp);
894 tp->t_bw_rtseq = tp->iss;
895 tcp_sendseqinit(tp);
896
897 return 0;
898 }
899
#ifdef INET6
/*
 * IPv6 counterpart of tcp_connect(): open a TCP connection to the
 * remote host named by the sockaddr_in6 in `nam'.
 *
 * Calls in6_pcbbind() to assign a local port if the pcb has none and
 * in6_pcbladdr() to choose a local address.  If a pcb for the same
 * address/port pair already exists, EADDRINUSE is returned.  Otherwise
 * the local (if still unspecified) and foreign addresses are stored,
 * the flow label is refreshed, the pcb is rehashed, and the connection
 * is initialized and moved to SYN-SENT with the keep timer armed.
 *
 * Returns 0 on success or an errno value.
 */
static int
tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
{
	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
	struct in6_addr *addr6, *laddr6;
	int error;

	if (inp->inp_lport == 0) {
		error = in6_pcbbind(inp, NULL, td->td_ucred);
		if (error != 0)
			return (error);
	}

	/*
	 * Cannot simply call in_pcbconnect, because there might be an
	 * earlier incarnation of this same connection still in
	 * TIME_WAIT state, creating an ADDRINUSE error.
	 */
	error = in6_pcbladdr(inp, nam, &addr6);
	if (error != 0)
		return (error);

	/* Look up with the local address the connection would end up using. */
	if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
		laddr6 = addr6;
	else
		laddr6 = &inp->in6p_laddr;
	if (in6_pcblookup_hash(inp->inp_pcbinfo, &sin6->sin6_addr,
	    sin6->sin6_port, laddr6, inp->inp_lport, 0, NULL) != NULL)
		return (EADDRINUSE);

	if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
		inp->in6p_laddr = *addr6;
	inp->in6p_faddr = sin6->sin6_addr;
	inp->inp_fport = sin6->sin6_port;

	/* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */
	inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK;
	if (inp->in6p_flags & IN6P_AUTOFLOWLABEL)
		inp->in6p_flowinfo |=
		    (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
	in_pcbrehash(inp);

	/* Compute window scaling to request. */
	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
	    (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
		tp->request_r_scale++;

	soisconnecting(so);
	tcpstat.tcps_connattempt++;
	tp->t_state = TCPS_SYN_SENT;
	callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
	tp->iss = tcp_new_isn(tp);
	tp->t_bw_rtseq = tp->iss;
	tcp_sendseqinit(tp);

	return (0);
}
#endif /* INET6 */
962
963 /*
964 * Export TCP internal state information via a struct tcp_info, based on the
965 * Linux 2.6 API. Not ABI compatible as our constants are mapped differently
966 * (TCP state machine, etc). We export all information using FreeBSD-native
967 * constants -- for example, the numeric values for tcpi_state will differ
968 * from Linux.
969 */
970 static void
971 tcp_fill_info(tp, ti)
972 struct tcpcb *tp;
973 struct tcp_info *ti;
974 {
975
976 INP_LOCK_ASSERT(tp->t_inpcb);
977 bzero(ti, sizeof(*ti));
978
979 ti->tcpi_state = tp->t_state;
980 if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
981 ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
982 if (tp->sack_enable)
983 ti->tcpi_options |= TCPI_OPT_SACK;
984 if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
985 ti->tcpi_options |= TCPI_OPT_WSCALE;
986 ti->tcpi_snd_wscale = tp->snd_scale;
987 ti->tcpi_rcv_wscale = tp->rcv_scale;
988 }
989 ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
990 ti->tcpi_snd_cwnd = tp->snd_cwnd;
991
992 /*
993 * FreeBSD-specific extension fields for tcp_info.
994 */
995 ti->tcpi_rcv_space = tp->rcv_wnd;
996 ti->tcpi_snd_wnd = tp->snd_wnd;
997 ti->tcpi_snd_bwnd = tp->snd_bwnd;
998 }
999
1000 /*
1001 * The new sockopt interface makes it possible for us to block in the
1002 * copyin/out step (if we take a page fault). Taking a page fault at
1003 * splnet() is probably a Bad Thing. (Since sockets and pcbs both now
1004 * use TSM, there probably isn't any need for this function to run at
1005 * splnet() any more. This needs more examination.)
1006 *
1007 * XXXRW: The locking here is wrong; we may take a page fault while holding
1008 * the inpcb lock.
1009 */
1010 int
1011 tcp_ctloutput(so, sopt)
1012 struct socket *so;
1013 struct sockopt *sopt;
1014 {
1015 int error, opt, optval;
1016 struct inpcb *inp;
1017 struct tcpcb *tp;
1018 struct tcp_info ti;
1019
1020 error = 0;
1021 INP_INFO_RLOCK(&tcbinfo);
1022 inp = sotoinpcb(so);
1023 if (inp == NULL) {
1024 INP_INFO_RUNLOCK(&tcbinfo);
1025 return (ECONNRESET);
1026 }
1027 INP_LOCK(inp);
1028 INP_INFO_RUNLOCK(&tcbinfo);
1029 if (sopt->sopt_level != IPPROTO_TCP) {
1030 INP_UNLOCK(inp);
1031 #ifdef INET6
1032 if (INP_CHECK_SOCKAF(so, AF_INET6))
1033 error = ip6_ctloutput(so, sopt);
1034 else
1035 #endif /* INET6 */
1036 error = ip_ctloutput(so, sopt);
1037 return (error);
1038 }
1039 tp = intotcpcb(inp);
1040
1041 switch (sopt->sopt_dir) {
1042 case SOPT_SET:
1043 switch (sopt->sopt_name) {
1044 #ifdef TCP_SIGNATURE
1045 case TCP_MD5SIG:
1046 error = sooptcopyin(sopt, &optval, sizeof optval,
1047 sizeof optval);
1048 if (error)
1049 break;
1050
1051 if (optval > 0)
1052 tp->t_flags |= TF_SIGNATURE;
1053 else
1054 tp->t_flags &= ~TF_SIGNATURE;
1055 break;
1056 #endif /* TCP_SIGNATURE */
1057 case TCP_NODELAY:
1058 case TCP_NOOPT:
1059 error = sooptcopyin(sopt, &optval, sizeof optval,
1060 sizeof optval);
1061 if (error)
1062 break;
1063
1064 switch (sopt->sopt_name) {
1065 case TCP_NODELAY:
1066 opt = TF_NODELAY;
1067 break;
1068 case TCP_NOOPT:
1069 opt = TF_NOOPT;
1070 break;
1071 default:
1072 opt = 0; /* dead code to fool gcc */
1073 break;
1074 }
1075
1076 if (optval)
1077 tp->t_flags |= opt;
1078 else
1079 tp->t_flags &= ~opt;
1080 break;
1081
1082 case TCP_NOPUSH:
1083 error = sooptcopyin(sopt, &optval, sizeof optval,
1084 sizeof optval);
1085 if (error)
1086 break;
1087
1088 if (optval)
1089 tp->t_flags |= TF_NOPUSH;
1090 else {
1091 tp->t_flags &= ~TF_NOPUSH;
1092 error = tcp_output(tp);
1093 }
1094 break;
1095
1096 case TCP_MAXSEG:
1097 error = sooptcopyin(sopt, &optval, sizeof optval,
1098 sizeof optval);
1099 if (error)
1100 break;
1101
1102 if (optval > 0 && optval <= tp->t_maxseg &&
1103 optval + 40 >= tcp_minmss)
1104 tp->t_maxseg = optval;
1105 else
1106 error = EINVAL;
1107 break;
1108
1109 case TCP_INFO:
1110 error = EINVAL;
1111 break;
1112
1113 default:
1114 error = ENOPROTOOPT;
1115 break;
1116 }
1117 break;
1118
1119 case SOPT_GET:
1120 switch (sopt->sopt_name) {
1121 #ifdef TCP_SIGNATURE
1122 case TCP_MD5SIG:
1123 optval = (tp->t_flags & TF_SIGNATURE) ? 1 : 0;
1124 error = sooptcopyout(sopt, &optval, sizeof optval);
1125 break;
1126 #endif
1127 case TCP_NODELAY:
1128 optval = tp->t_flags & TF_NODELAY;
1129 error = sooptcopyout(sopt, &optval, sizeof optval);
1130 break;
1131 case TCP_MAXSEG:
1132 optval = tp->t_maxseg;
1133 error = sooptcopyout(sopt, &optval, sizeof optval);
1134 break;
1135 case TCP_NOOPT:
1136 optval = tp->t_flags & TF_NOOPT;
1137 error = sooptcopyout(sopt, &optval, sizeof optval);
1138 break;
1139 case TCP_NOPUSH:
1140 optval = tp->t_flags & TF_NOPUSH;
1141 error = sooptcopyout(sopt, &optval, sizeof optval);
1142 break;
1143 case TCP_INFO:
1144 tcp_fill_info(tp, &ti);
1145 error = sooptcopyout(sopt, &ti, sizeof ti);
1146 break;
1147 default:
1148 error = ENOPROTOOPT;
1149 break;
1150 }
1151 break;
1152 }
1153 INP_UNLOCK(inp);
1154 return (error);
1155 }
1156
1157 /*
1158 * tcp_sendspace and tcp_recvspace are the default send and receive window
1159 * sizes, respectively. These are obsolescent (this information should
1160 * be set by the route).
1161 */
1162 u_long tcp_sendspace = 1024*32;
1163 SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW,
1164 &tcp_sendspace , 0, "Maximum outgoing TCP datagram size");
1165 u_long tcp_recvspace = 1024*64;
1166 SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
1167 &tcp_recvspace , 0, "Maximum incoming TCP datagram size");
1168
1169 /*
1170 * Attach TCP protocol to socket, allocating
1171 * internet protocol control block, tcp control block,
1172 * bufer space, and entering LISTEN state if to accept connections.
1173 */
1174 static int
1175 tcp_attach(so)
1176 struct socket *so;
1177 {
1178 register struct tcpcb *tp;
1179 struct inpcb *inp;
1180 int error;
1181 #ifdef INET6
1182 int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0;
1183 #endif
1184
1185 INP_INFO_WLOCK_ASSERT(&tcbinfo);
1186
1187 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
1188 error = soreserve(so, tcp_sendspace, tcp_recvspace);
1189 if (error)
1190 return (error);
1191 }
1192 error = in_pcballoc(so, &tcbinfo, "tcpinp");
1193 if (error)
1194 return (error);
1195 inp = sotoinpcb(so);
1196 #ifdef INET6
1197 if (isipv6) {
1198 inp->inp_vflag |= INP_IPV6;
1199 inp->in6p_hops = -1; /* use kernel default */
1200 }
1201 else
1202 #endif
1203 inp->inp_vflag |= INP_IPV4;
1204 tp = tcp_newtcpcb(inp);
1205 if (tp == 0) {
1206 int nofd = so->so_state & SS_NOFDREF; /* XXX */
1207
1208 so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */
1209
1210 INP_LOCK(inp);
1211 #ifdef INET6
1212 if (isipv6)
1213 in6_pcbdetach(inp);
1214 else
1215 #endif
1216 in_pcbdetach(inp);
1217 so->so_state |= nofd;
1218 return (ENOBUFS);
1219 }
1220 tp->t_state = TCPS_CLOSED;
1221 return (0);
1222 }
1223
1224 /*
1225 * Initiate (or continue) disconnect.
1226 * If embryonic state, just send reset (once).
1227 * If in ``let data drain'' option and linger null, just drop.
1228 * Otherwise (hard), mark socket disconnecting and drop
1229 * current input data; switch states based on user close, and
1230 * send segment to peer (with FIN).
1231 */
1232 static struct tcpcb *
1233 tcp_disconnect(tp)
1234 register struct tcpcb *tp;
1235 {
1236 struct inpcb *inp = tp->t_inpcb;
1237 struct socket *so = inp->inp_socket;
1238
1239 INP_INFO_WLOCK_ASSERT(&tcbinfo);
1240 INP_LOCK_ASSERT(inp);
1241
1242 if (tp->t_state < TCPS_ESTABLISHED)
1243 tp = tcp_close(tp);
1244 else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
1245 tp = tcp_drop(tp, 0);
1246 else {
1247 soisdisconnecting(so);
1248 sbflush(&so->so_rcv);
1249 tp = tcp_usrclosed(tp);
1250 if (tp)
1251 (void) tcp_output(tp);
1252 }
1253 return (tp);
1254 }
1255
1256 /*
1257 * User issued close, and wish to trail through shutdown states:
1258 * if never received SYN, just forget it. If got a SYN from peer,
1259 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
1260 * If already got a FIN from peer, then almost done; go to LAST_ACK
1261 * state. In all other cases, have already sent FIN to peer (e.g.
1262 * after PRU_SHUTDOWN), and just have to play tedious game waiting
1263 * for peer to send FIN or not respond to keep-alives, etc.
1264 * We can let the user exit from the close as soon as the FIN is acked.
1265 */
1266 static struct tcpcb *
1267 tcp_usrclosed(tp)
1268 register struct tcpcb *tp;
1269 {
1270
1271 INP_INFO_WLOCK_ASSERT(&tcbinfo);
1272 INP_LOCK_ASSERT(tp->t_inpcb);
1273
1274 switch (tp->t_state) {
1275
1276 case TCPS_CLOSED:
1277 case TCPS_LISTEN:
1278 tp->t_state = TCPS_CLOSED;
1279 tp = tcp_close(tp);
1280 break;
1281
1282 case TCPS_SYN_SENT:
1283 case TCPS_SYN_RECEIVED:
1284 tp->t_flags |= TF_NEEDFIN;
1285 break;
1286
1287 case TCPS_ESTABLISHED:
1288 tp->t_state = TCPS_FIN_WAIT_1;
1289 break;
1290
1291 case TCPS_CLOSE_WAIT:
1292 tp->t_state = TCPS_LAST_ACK;
1293 break;
1294 }
1295 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
1296 soisdisconnected(tp->t_inpcb->inp_socket);
1297 /* To prevent the connection hanging in FIN_WAIT_2 forever. */
1298 if (tp->t_state == TCPS_FIN_WAIT_2)
1299 callout_reset(tp->tt_2msl, tcp_maxidle,
1300 tcp_timer_2msl, tp);
1301 }
1302 return (tp);
1303 }
Cache object: dea40ef66f7e7d2ed5f69eff76816086
|