The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/bsd/netinet/tcp_usrreq.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * Copyright (c) 2000-2011 Apple Inc. All rights reserved.
    3  *
    4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
    5  * 
    6  * This file contains Original Code and/or Modifications of Original Code
    7  * as defined in and that are subject to the Apple Public Source License
    8  * Version 2.0 (the 'License'). You may not use this file except in
    9  * compliance with the License. The rights granted to you under the License
   10  * may not be used to create, or enable the creation or redistribution of,
   11  * unlawful or unlicensed copies of an Apple operating system, or to
   12  * circumvent, violate, or enable the circumvention or violation of, any
   13  * terms of an Apple operating system software license agreement.
   14  * 
   15  * Please obtain a copy of the License at
   16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
   17  * 
   18  * The Original Code and all software distributed under the License are
   19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
   20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
   21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
   22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
   23  * Please see the License for the specific language governing rights and
   24  * limitations under the License.
   25  * 
   26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
   27  */
   28 /*
   29  * Copyright (c) 1982, 1986, 1988, 1993
   30  *      The Regents of the University of California.  All rights reserved.
   31  *
   32  * Redistribution and use in source and binary forms, with or without
   33  * modification, are permitted provided that the following conditions
   34  * are met:
   35  * 1. Redistributions of source code must retain the above copyright
   36  *    notice, this list of conditions and the following disclaimer.
   37  * 2. Redistributions in binary form must reproduce the above copyright
   38  *    notice, this list of conditions and the following disclaimer in the
   39  *    documentation and/or other materials provided with the distribution.
   40  * 3. All advertising materials mentioning features or use of this software
   41  *    must display the following acknowledgement:
   42  *      This product includes software developed by the University of
   43  *      California, Berkeley and its contributors.
   44  * 4. Neither the name of the University nor the names of its contributors
   45  *    may be used to endorse or promote products derived from this software
   46  *    without specific prior written permission.
   47  *
   48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   58  * SUCH DAMAGE.
   59  *
   60  *      From: @(#)tcp_usrreq.c  8.2 (Berkeley) 1/3/94
   61  * $FreeBSD: src/sys/netinet/tcp_usrreq.c,v 1.51.2.9 2001/08/22 00:59:12 silby Exp $
   62  */
   63 
   64 
   65 #include <sys/param.h>
   66 #include <sys/systm.h>
   67 #include <sys/kernel.h>
   68 #include <sys/sysctl.h>
   69 #include <sys/mbuf.h>
   70 #if INET6
   71 #include <sys/domain.h>
   72 #endif /* INET6 */
   73 #include <sys/socket.h>
   74 #include <sys/socketvar.h>
   75 #include <sys/protosw.h>
   76 
   77 #include <net/if.h>
   78 #include <net/route.h>
   79 #include <net/ntstat.h>
   80 
   81 #include <netinet/in.h>
   82 #include <netinet/in_systm.h>
   83 #if INET6
   84 #include <netinet/ip6.h>
   85 #endif
   86 #include <netinet/in_pcb.h>
   87 #if INET6
   88 #include <netinet6/in6_pcb.h>
   89 #endif
   90 #include <netinet/in_var.h>
   91 #include <netinet/ip_var.h>
   92 #if INET6
   93 #include <netinet6/ip6_var.h>
   94 #endif
   95 #include <netinet/tcp.h>
   96 #include <netinet/tcp_fsm.h>
   97 #include <netinet/tcp_seq.h>
   98 #include <netinet/tcp_timer.h>
   99 #include <netinet/tcp_var.h>
  100 #include <netinet/tcpip.h>
  101 #if TCPDEBUG
  102 #include <netinet/tcp_debug.h>
  103 #endif
  104 
  105 #if IPSEC
  106 #include <netinet6/ipsec.h>
  107 #endif /*IPSEC*/
  108 
  109 void    tcp_fill_info(struct tcpcb *, struct tcp_info *);
  110 errno_t tcp_fill_info_for_info_tuple(struct info_tuple *, struct tcp_info *);
  111 
  112 int tcp_sysctl_info(struct sysctl_oid *, void *, int , struct sysctl_req *);
  113 
  114 /*
  115  * TCP protocol interface to socket abstraction.
  116  */
  117 extern  char *tcpstates[];      /* XXX ??? */
  118 
  119 static int      tcp_attach(struct socket *, struct proc *);
  120 static int      tcp_connect(struct tcpcb *, struct sockaddr *, struct proc *);
  121 #if INET6
  122 static int      tcp6_connect(struct tcpcb *, struct sockaddr *, struct proc *);
  123 #endif /* INET6 */
  124 static struct tcpcb *
  125                 tcp_disconnect(struct tcpcb *);
  126 static struct tcpcb *
  127                 tcp_usrclosed(struct tcpcb *);
  128 
  129 __private_extern__ int  tcp_win_scale = 3;
  130 SYSCTL_INT(_net_inet_tcp, OID_AUTO, win_scale_factor, CTLFLAG_RW | CTLFLAG_LOCKED,
  131     &tcp_win_scale, 0, "Window scaling factor");
  132 
  133 static u_int32_t tcps_in_sw_cksum;
  134 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, in_sw_cksum, CTLFLAG_RD | CTLFLAG_LOCKED,
  135     &tcps_in_sw_cksum, 0,
  136     "Number of received packets checksummed in software");
  137 
  138 static u_int64_t tcps_in_sw_cksum_bytes;
  139 SYSCTL_QUAD(_net_inet_tcp, OID_AUTO, in_sw_cksum_bytes, CTLFLAG_RD | CTLFLAG_LOCKED,
  140     &tcps_in_sw_cksum_bytes,
  141     "Amount of received data checksummed in software");
  142 
  143 static u_int32_t tcps_out_sw_cksum;
  144 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, out_sw_cksum, CTLFLAG_RD | CTLFLAG_LOCKED,
  145     &tcps_out_sw_cksum, 0,
  146     "Number of transmitted packets checksummed in software");
  147 
  148 static u_int64_t tcps_out_sw_cksum_bytes;
  149 SYSCTL_QUAD(_net_inet_tcp, OID_AUTO, out_sw_cksum_bytes, CTLFLAG_RD | CTLFLAG_LOCKED,
  150     &tcps_out_sw_cksum_bytes,
  151     "Amount of transmitted data checksummed in software");
  152 
  /*
   * Debug-trace helpers shared by the pru_*() handlers.
   *
   *   TCPDEBUG0       declares the local that remembers the previous state.
   *   TCPDEBUG1()     records tp->t_state (0 when tp is NULL) into it.
   *   TCPDEBUG2(req)  calls tcp_trace() for request `req' when tp is
   *                   non-NULL and the socket has SO_DEBUG set.
   *
   * The macros expect locals named `tp' and `so' at the point of use.
   * TCPDEBUG2 is wrapped in do { } while (0) so that it is a single
   * statement and cannot capture an `else' that follows it at the call site.
   * When TCPDEBUG is off all three expand to nothing.
   */
  #if TCPDEBUG
  #define TCPDEBUG0       int ostate = 0
  #define TCPDEBUG1()     ostate = tp ? tp->t_state : 0
  #define TCPDEBUG2(req)  do { \
                                  if (tp && (so->so_options & SO_DEBUG)) \
                                          tcp_trace(TA_USER, ostate, tp, 0, 0, req); \
                          } while (0)
  #else
  #define TCPDEBUG0
  #define TCPDEBUG1()
  #define TCPDEBUG2(req)
  #endif
  163 
  164 #if CONFIG_USESOCKTHRESHOLD
  165 __private_extern__ unsigned int tcp_sockthreshold = 64;
  166 #else
  167 __private_extern__ unsigned int tcp_sockthreshold = 0;
  168 #endif
  169 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sockthreshold, CTLFLAG_RW | CTLFLAG_LOCKED,
  170     &tcp_sockthreshold , 0, "TCP Socket size increased if less than threshold");
  171 
  172 
  173 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, info, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY,
  174     0 , 0, tcp_sysctl_info, "S", "TCP info per tuple");
  175 
  176 /*
  177  * TCP attaches to socket via pru_attach(), reserving space,
  178  * and an internet control block.
  179  *
  180  * Returns:     0                       Success
  181  *              EISCONN
  182  *      tcp_attach:ENOBUFS
  183  *      tcp_attach:ENOMEM
  184  *      tcp_attach:???                  [IPSEC specific]
  185  */
  186 static int
  187 tcp_usr_attach(struct socket *so, __unused int proto, struct proc *p)
  188 {
  189         int error;
  190         struct inpcb *inp = sotoinpcb(so);
  191         struct tcpcb *tp = 0;
  192         TCPDEBUG0;
  193 
  194         TCPDEBUG1();
  195         if (inp) {
  196                 error = EISCONN;
  197                 goto out;
  198         }
  199         
  200         error = tcp_attach(so, p);
  201         if (error)
  202                 goto out;
  203 
  204         if ((so->so_options & SO_LINGER) && so->so_linger == 0)
  205                 so->so_linger = TCP_LINGERTIME * hz;
  206         tp = sototcpcb(so);
  207 out:
  208         TCPDEBUG2(PRU_ATTACH);
  209         return error;
  210 }
  211 
  212 /*
  213  * pru_detach() detaches the TCP protocol from the socket.
  214  * If the protocol state is non-embryonic, then can't
  215  * do this directly: have to initiate a pru_disconnect(),
  216  * which may finish later; embryonic TCB's can just
  217  * be discarded here.
  218  */
  219 static int
  220 tcp_usr_detach(struct socket *so)
  221 {
  222         int error = 0;
  223         struct inpcb *inp = sotoinpcb(so);
  224         struct tcpcb *tp;
  225         TCPDEBUG0;
  226 
  227         if (inp == 0 || (inp->inp_state == INPCB_STATE_DEAD)) {
  228                 return EINVAL;  /* XXX */
  229         }
  230         lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
  231         tp = intotcpcb(inp);
  232         /* In case we got disconnected from the peer */
  233         if (tp == 0) 
  234             goto out;
  235         TCPDEBUG1();
  236 
  237         calculate_tcp_clock();
  238 
  239         tp = tcp_disconnect(tp);
  240 out:
  241         TCPDEBUG2(PRU_DETACH);
  242         return error;
  243 }
  244 
  /*
   * Shared prologue/epilogue for the pru_*() handlers.  Both macros rely on
   * locals named `inp', `tp', `so' and `error' in the enclosing function.
   *
   * COMMON_START() declares the debug state, returns EINVAL straight away
   * when the pcb is missing or dead, and otherwise loads `tp', records the
   * debug state and calls calculate_tcp_clock().
   *
   * COMMON_END(req) supplies the `out:' label, emits the debug trace for
   * `req' and returns `error'.  The trailing `goto out' keeps the label
   * referenced in handlers that never jump to it themselves.
   */
  #define COMMON_START() \
          TCPDEBUG0; \
          do { \
                  if (inp == NULL || inp->inp_state == INPCB_STATE_DEAD) \
                          return EINVAL; \
                  tp = intotcpcb(inp); \
                  TCPDEBUG1(); \
                  calculate_tcp_clock(); \
          } while (0)

  #define COMMON_END(req) \
          out: TCPDEBUG2(req); return error; goto out
  256 
  257 
  258 /*
  259  * Give the socket an address.
  260  *
  261  * Returns:     0                       Success
  262  *              EINVAL                  Invalid argument [COMMON_START]
  263  *              EAFNOSUPPORT            Address family not supported
  264  *      in_pcbbind:EADDRNOTAVAIL        Address not available.
  265  *      in_pcbbind:EINVAL               Invalid argument
  266  *      in_pcbbind:EAFNOSUPPORT         Address family not supported [notdef]
  267  *      in_pcbbind:EACCES               Permission denied
  268  *      in_pcbbind:EADDRINUSE           Address in use
  269  *      in_pcbbind:EAGAIN               Resource unavailable, try again
  270  *      in_pcbbind:EPERM                Operation not permitted
  271  */
  272 static int
  273 tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct proc *p)
  274 {
  275         int error = 0;
  276         struct inpcb *inp = sotoinpcb(so);
  277         struct tcpcb *tp;
  278         struct sockaddr_in *sinp;
  279 
  280         COMMON_START();
  281 
  282         if (nam->sa_family != 0 && nam->sa_family != AF_INET) {
  283                 error = EAFNOSUPPORT;
  284                 goto out;
  285         }
  286 
  287         /*
  288          * Must check for multicast addresses and disallow binding
  289          * to them.
  290          */
  291         sinp = (struct sockaddr_in *)nam;
  292         if (sinp->sin_family == AF_INET &&
  293             IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
  294                 error = EAFNOSUPPORT;
  295                 goto out;
  296         }
  297         error = in_pcbbind(inp, nam, p);
  298         if (error)
  299                 goto out;
  300         COMMON_END(PRU_BIND);
  301 
  302 }
  303 
  #if INET6
  /*
   * pru_bind() handler for IPv6 sockets.
   *
   * The family must be 0 or AF_INET6 and the address must not be multicast
   * (EAFNOSUPPORT otherwise).  Unless the socket is V6ONLY, an unspecified
   * address leaves the pcb usable for IPv4 as well, and a v4-mapped address
   * is converted and bound through the IPv4 in_pcbbind() path.
   */
  static int
  tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct proc *p)
  {
          struct inpcb *inp = sotoinpcb(so);
          struct sockaddr_in6 *sin6p;
          struct tcpcb *tp;
          int error = 0;

          COMMON_START();

          sin6p = (struct sockaddr_in6 *)nam;
          if ((nam->sa_family != 0 && nam->sa_family != AF_INET6) ||
              (sin6p->sin6_family == AF_INET6 &&
              IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr))) {
                  error = EAFNOSUPPORT;
                  goto out;
          }

          /* Start out as a pure IPv6 pcb. */
          inp->inp_vflag = (inp->inp_vflag & ~INP_IPV4) | INP_IPV6;

          if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
                  if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr)) {
                          inp->inp_vflag |= INP_IPV4;
                  } else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
                          struct sockaddr_in sin;

                          /* v4-mapped: bind as a plain IPv4 pcb instead. */
                          in6_sin6_2_sin(&sin, sin6p);
                          inp->inp_vflag = (inp->inp_vflag | INP_IPV4) & ~INP_IPV6;
                          error = in_pcbbind(inp, (struct sockaddr *)&sin, p);
                          goto out;
                  }
          }

          error = in6_pcbbind(inp, nam, p);
          COMMON_END(PRU_BIND);
  }
  #endif /* INET6 */
  351 
  352 /*
  353  * Prepare to accept connections.
  354  *
  355  * Returns:     0                       Success
  356  *              EINVAL [COMMON_START]
  357  *      in_pcbbind:EADDRNOTAVAIL        Address not available.
  358  *      in_pcbbind:EINVAL               Invalid argument
  359  *      in_pcbbind:EAFNOSUPPORT         Address family not supported [notdef]
  360  *      in_pcbbind:EACCES               Permission denied
  361  *      in_pcbbind:EADDRINUSE           Address in use
  362  *      in_pcbbind:EAGAIN               Resource unavailable, try again
  363  *      in_pcbbind:EPERM                Operation not permitted
  364  */
  365 static int
  366 tcp_usr_listen(struct socket *so, struct proc *p)
  367 {
  368         int error = 0;
  369         struct inpcb *inp = sotoinpcb(so);
  370         struct tcpcb *tp;
  371 
  372         COMMON_START();
  373         if (inp->inp_lport == 0)
  374                 error = in_pcbbind(inp, (struct sockaddr *)0, p);
  375         if (error == 0)
  376                 tp->t_state = TCPS_LISTEN;
  377         COMMON_END(PRU_LISTEN);
  378 }
  379 
  #if INET6
  /*
   * pru_listen() handler for IPv6 sockets.  When no local port is set the
   * pcb is bound through in6_pcbbind() with a NULL address; before that,
   * INP_IPV4 is cleared for V6ONLY sockets and set otherwise.  The TCB
   * enters TCPS_LISTEN only if no error occurred.
   */
  static int
  tcp6_usr_listen(struct socket *so, struct proc *p)
  {
          struct inpcb *inp = sotoinpcb(so);
          struct tcpcb *tp;
          int error = 0;

          COMMON_START();
          if (inp->inp_lport == 0) {
                  if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
                          inp->inp_vflag &= ~INP_IPV4;
                  else
                          inp->inp_vflag |= INP_IPV4;
                  error = in6_pcbbind(inp, (struct sockaddr *)0, p);
          }
          if (error == 0)
                  tp->t_state = TCPS_LISTEN;
          COMMON_END(PRU_LISTEN);
  }
  #endif /* INET6 */
  400 
  401 /*
  402  * Initiate connection to peer.
  403  * Create a template for use in transmissions on this connection.
  404  * Enter SYN_SENT state, and mark socket as connecting.
  405  * Start keep-alive timer, and seed output sequence space.
  406  * Send initial segment on connection.
  407  */
  408 static int
  409 tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
  410 {
  411         int error = 0;
  412         struct inpcb *inp = sotoinpcb(so);
  413         struct tcpcb *tp;
  414         struct sockaddr_in *sinp;
  415 
  416         TCPDEBUG0;
  417         if (inp == 0)
  418                 return EINVAL;
  419         else if (inp->inp_state == INPCB_STATE_DEAD) {
  420                 if (so->so_error) {
  421                         error = so->so_error;
  422                         so->so_error = 0;
  423                         return error;
  424                 } else
  425                         return EINVAL;
  426         }
  427         tp = intotcpcb(inp);
  428         TCPDEBUG1();
  429 
  430         calculate_tcp_clock();
  431 
  432         if (nam->sa_family != 0 && nam->sa_family != AF_INET) {
  433                 error = EAFNOSUPPORT;
  434                 goto out;
  435         }
  436         /*
  437          * Must disallow TCP ``connections'' to multicast addresses.
  438          */
  439         sinp = (struct sockaddr_in *)nam;
  440         if (sinp->sin_family == AF_INET
  441             && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
  442                 error = EAFNOSUPPORT;
  443                 goto out;
  444         }
  445 
  446 
  447         if ((error = tcp_connect(tp, nam, p)) != 0)
  448                 goto out;
  449         error = tcp_output(tp);
  450         COMMON_END(PRU_CONNECT);
  451 }
  452 
  #if INET6
  /*
   * pru_connect() handler for IPv6 sockets.
   *
   * The destination family must be 0 or AF_INET6 and the address must not
   * be multicast (EAFNOSUPPORT otherwise).  A v4-mapped destination is
   * converted to a sockaddr_in and connected through tcp_connect(); that
   * is refused with EINVAL on a V6ONLY socket.  Every other destination
   * goes through tcp6_connect().  On success the first segment is sent
   * with tcp_output() and its result is returned.
   *
   * All failures after COMMON_START() leave through the common `out:'
   * exit so that the PRU_CONNECT debug trace is emitted for them too.
   */
  static int
  tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
  {
          int error = 0;
          struct inpcb *inp = sotoinpcb(so);
          struct tcpcb *tp;
          struct sockaddr_in6 *sin6p;

          COMMON_START();

          if (nam->sa_family != 0 && nam->sa_family != AF_INET6) {
                  error = EAFNOSUPPORT;
                  goto out;
          }

          /*
           * Must disallow TCP ``connections'' to multicast addresses.
           */
          sin6p = (struct sockaddr_in6 *)nam;
          if (sin6p->sin6_family == AF_INET6
              && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) {
                  error = EAFNOSUPPORT;
                  goto out;
          }

          if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
                  struct sockaddr_in sin;

                  /* A V6ONLY socket may not reach IPv4 peers. */
                  if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
                          error = EINVAL;
                          goto out;
                  }

                  in6_sin6_2_sin(&sin, sin6p);
                  inp->inp_vflag |= INP_IPV4;
                  inp->inp_vflag &= ~INP_IPV6;
                  if ((error = tcp_connect(tp, (struct sockaddr *)&sin, p)) != 0)
                          goto out;
                  error = tcp_output(tp);
                  goto out;
          }

          /* Native IPv6 destination. */
          inp->inp_vflag &= ~INP_IPV4;
          inp->inp_vflag |= INP_IPV6;
          if ((error = tcp6_connect(tp, nam, p)) != 0)
                  goto out;
          error = tcp_output(tp);
          COMMON_END(PRU_CONNECT);
  }
  #endif /* INET6 */
  503 
  504 /*
  505  * Initiate disconnect from peer.
  506  * If connection never passed embryonic stage, just drop;
  507  * else if don't need to let data drain, then can just drop anyways,
  508  * else have to begin TCP shutdown process: mark socket disconnecting,
  509  * drain unread data, state switch to reflect user close, and
  510  * send segment (e.g. FIN) to peer.  Socket will be really disconnected
  511  * when peer sends FIN and acks ours.
  512  *
  513  * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
  514  */
  515 static int
  516 tcp_usr_disconnect(struct socket *so)
  517 {
  518         int error = 0;
  519         struct inpcb *inp = sotoinpcb(so);
  520         struct tcpcb *tp;
  521         
  522         lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
  523         COMMON_START();
  524         /* In case we got disconnected from the peer */
  525         if (tp == 0)
  526             goto out;
  527         tp = tcp_disconnect(tp);
  528         COMMON_END(PRU_DISCONNECT);
  529 }
  530 
  531 /*
  532  * Accept a connection.  Essentially all the work is
  533  * done at higher levels; just return the address
  534  * of the peer, storing through addr.
  535  */
  536 static int
  537 tcp_usr_accept(struct socket *so, struct sockaddr **nam)
  538 {
  539         int error = 0;
  540         struct inpcb *inp = sotoinpcb(so);
  541         struct tcpcb *tp = NULL;
  542         TCPDEBUG0;
  543 
  544         in_setpeeraddr(so, nam);
  545                 
  546         if (so->so_state & SS_ISDISCONNECTED) {
  547                 error = ECONNABORTED;
  548                 goto out;
  549         }
  550         if (inp == 0 || (inp->inp_state == INPCB_STATE_DEAD)) {
  551                 return (EINVAL);
  552         }
  553         tp = intotcpcb(inp);
  554         TCPDEBUG1();
  555 
  556         calculate_tcp_clock();
  557 
  558         COMMON_END(PRU_ACCEPT);
  559 }
  560 
  #if INET6
  /*
   * pru_accept() handler for IPv6 sockets.  A disconnected socket yields
   * ECONNABORTED and a missing or dead pcb yields EINVAL; otherwise the
   * peer address is stored through `nam' via in6_mapped_peeraddr().
   */
  static int
  tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
  {
          struct inpcb *inp = sotoinpcb(so);
          struct tcpcb *tp = NULL;
          int error = 0;
          TCPDEBUG0;

          if (so->so_state & SS_ISDISCONNECTED) {
                  error = ECONNABORTED;
          } else {
                  if (inp == NULL || inp->inp_state == INPCB_STATE_DEAD)
                          return (EINVAL);
                  tp = intotcpcb(inp);
                  TCPDEBUG1();

                  calculate_tcp_clock();

                  in6_mapped_peeraddr(so, nam);
          }
          COMMON_END(PRU_ACCEPT);
  }
  #endif /* INET6 */
  586 
  587 /*
  588  * Mark the connection as being incapable of further output.
  589  *
  590  * Returns:     0                       Success
  591  *              EINVAL [COMMON_START]
  592  *      tcp_output:EADDRNOTAVAIL
  593  *      tcp_output:ENOBUFS
  594  *      tcp_output:EMSGSIZE
  595  *      tcp_output:EHOSTUNREACH
  596  *      tcp_output:ENETUNREACH
  597  *      tcp_output:ENETDOWN
  598  *      tcp_output:ENOMEM
  599  *      tcp_output:EACCES
  600  *      tcp_output:EMSGSIZE
  601  *      tcp_output:ENOBUFS
  602  *      tcp_output:???                  [ignorable: mostly IPSEC/firewall/DLIL]
  603  */
  604 static int
  605 tcp_usr_shutdown(struct socket *so)
  606 {
  607         int error = 0;
  608         struct inpcb *inp = sotoinpcb(so);
  609         struct tcpcb *tp;
  610 
  611         COMMON_START();
  612         socantsendmore(so);
  613         /* In case we got disconnected from the peer */
  614         if (tp == 0)
  615             goto out;
  616         tp = tcp_usrclosed(tp);
  617         if (tp)
  618                 error = tcp_output(tp);
  619         COMMON_END(PRU_SHUTDOWN);
  620 }
  621 
  622 /*
  623  * After a receive, possibly send window update to peer.
  624  */
  625 static int
  626 tcp_usr_rcvd(struct socket *so, __unused int flags)
  627 {
  628         int error = 0;
  629         struct inpcb *inp = sotoinpcb(so);
  630         struct tcpcb *tp;
  631 
  632         COMMON_START();
  633         /* In case we got disconnected from the peer */
  634         if (tp == 0)
  635             goto out;
  636         tcp_output(tp);
  637         COMMON_END(PRU_RCVD);
  638 }
  639 
  640 /*
  641  * Do a send by putting data in output queue and updating urgent
  642  * marker if URG set.  Possibly send more data.  Unlike the other
  643  * pru_*() routines, the mbuf chains are our responsibility.  We
  644  * must either enqueue them or free them.  The other pru_* routines
  645  * generally are caller-frees.
  646  *
  647  * Returns:     0                       Success
  648  *              ECONNRESET
  649  *              EINVAL
  650  *              ENOBUFS
  651  *      tcp_connect:EADDRINUSE          Address in use
  652  *      tcp_connect:EADDRNOTAVAIL       Address not available.
  653  *      tcp_connect:EINVAL              Invalid argument
  654  *      tcp_connect:EAFNOSUPPORT        Address family not supported [notdef]
  655  *      tcp_connect:EACCES              Permission denied
  656  *      tcp_connect:EAGAIN              Resource unavailable, try again
  657  *      tcp_connect:EPERM               Operation not permitted
  658  *      tcp_output:EADDRNOTAVAIL
  659  *      tcp_output:ENOBUFS
  660  *      tcp_output:EMSGSIZE
  661  *      tcp_output:EHOSTUNREACH
  662  *      tcp_output:ENETUNREACH
  663  *      tcp_output:ENETDOWN
  664  *      tcp_output:ENOMEM
  665  *      tcp_output:EACCES
  666  *      tcp_output:EMSGSIZE
  667  *      tcp_output:ENOBUFS
  668  *      tcp_output:???                  [ignorable: mostly IPSEC/firewall/DLIL]
  669  *      tcp6_connect:???                [IPV6 only]
  670  */
  671 static int
  672 tcp_usr_send(struct socket *so, int flags, struct mbuf *m, 
  673              struct sockaddr *nam, struct mbuf *control, struct proc *p)
  674 {
  675         int error = 0;
  676         struct inpcb *inp = sotoinpcb(so);
  677         struct tcpcb *tp;
  678 #if INET6
  679         int isipv6;
  680 #endif
  681         TCPDEBUG0;
  682 
  683         if (inp == NULL || inp->inp_state == INPCB_STATE_DEAD) {
  684                 /*
  685                  * OOPS! we lost a race, the TCP session got reset after
  686                  * we checked SS_CANTSENDMORE, eg: while doing uiomove or a
  687                  * network interrupt in the non-splnet() section of sosend().
  688                  */
  689                 if (m)
  690                         m_freem(m);
  691                 if (control)
  692                         m_freem(control);
  693                 error = ECONNRESET;     /* XXX EPIPE? */
  694                 tp = NULL;
  695                 TCPDEBUG1();
  696                 goto out;
  697         }
  698 #if INET6
  699         isipv6 = nam && nam->sa_family == AF_INET6;
  700 #endif /* INET6 */
  701         tp = intotcpcb(inp);
  702         TCPDEBUG1();
  703 
  704         calculate_tcp_clock();
  705 
  706         if (control) {
  707                 /* TCP doesn't do control messages (rights, creds, etc) */
  708                 if (control->m_len) {
  709                         m_freem(control);
  710                         if (m)
  711                                 m_freem(m);
  712                         error = EINVAL;
  713                         goto out;
  714                 }
  715                 m_freem(control);       /* empty control, just free it */
  716         }
  717         if(!(flags & PRUS_OOB)) {
  718                 sbappendstream(&so->so_snd, m);
  719                 if (nam && tp->t_state < TCPS_SYN_SENT) {
  720                         /*
  721                          * Do implied connect if not yet connected,
  722                          * initialize window to default value, and
  723                          * initialize maxseg/maxopd using peer's cached
  724                          * MSS.
  725                          */
  726 #if INET6
  727                         if (isipv6)
  728                                 error = tcp6_connect(tp, nam, p);
  729                         else
  730 #endif /* INET6 */
  731                         error = tcp_connect(tp, nam, p);
  732                         if (error)
  733                                 goto out;
  734                         tp->snd_wnd = TTCP_CLIENT_SND_WND;
  735                         tcp_mss(tp, -1, IFSCOPE_NONE);
  736                 }
  737 
  738                 if (flags & PRUS_EOF) {
  739                         /*
  740                          * Close the send side of the connection after
  741                          * the data is sent.
  742                          */
  743                         socantsendmore(so);
  744                         tp = tcp_usrclosed(tp);
  745                 }
  746                 if (tp != NULL) {
  747                         if (flags & PRUS_MORETOCOME)
  748                                 tp->t_flags |= TF_MORETOCOME;
  749                         error = tcp_output(tp);
  750                         if (flags & PRUS_MORETOCOME)
  751                                 tp->t_flags &= ~TF_MORETOCOME;
  752                 }
  753         } else {
  754                 if (sbspace(&so->so_snd) == 0) { 
  755                         /* if no space is left in sockbuf, 
  756                          * do not try to squeeze in OOB traffic */
  757                         m_freem(m);
  758                         error = ENOBUFS;
  759                         goto out;
  760                 }
  761                 /*
  762                  * According to RFC961 (Assigned Protocols),
  763                  * the urgent pointer points to the last octet
  764                  * of urgent data.  We continue, however,
  765                  * to consider it to indicate the first octet
  766                  * of data past the urgent section.
  767                  * Otherwise, snd_up should be one lower.
  768                  */
  769                 sbappendstream(&so->so_snd, m);
  770                 if (nam && tp->t_state < TCPS_SYN_SENT) {
  771                         /*
  772                          * Do implied connect if not yet connected,
  773                          * initialize window to default value, and
  774                          * initialize maxseg/maxopd using peer's cached
  775                          * MSS.
  776                          */
  777 #if INET6
  778                         if (isipv6)
  779                                 error = tcp6_connect(tp, nam, p);
  780                         else
  781 #endif /* INET6 */
  782                         error = tcp_connect(tp, nam, p);
  783                         if (error)
  784                                 goto out;
  785                         tp->snd_wnd = TTCP_CLIENT_SND_WND;
  786                         tcp_mss(tp, -1, IFSCOPE_NONE);
  787                 }
  788                 tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
  789                 tp->t_force = 1;
  790                 error = tcp_output(tp);
  791                 tp->t_force = 0;
  792         }
  793         COMMON_END((flags & PRUS_OOB) ? PRU_SENDOOB : 
  794                    ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
  795 }
  796 
  797 /*
  798  * Abort the TCP.
  799  */
  800 static int
  801 tcp_usr_abort(struct socket *so)
  802 {
  803         int error = 0;
  804         struct inpcb *inp = sotoinpcb(so);
  805         struct tcpcb *tp;
  806 
  807         COMMON_START();
  808         /* In case we got disconnected from the peer */
  809         if (tp == 0)
  810             goto out;
  811         tp = tcp_drop(tp, ECONNABORTED);
  812         so->so_usecount--;
  813         COMMON_END(PRU_ABORT);
  814 }
  815 
  816 /*
  817  * Receive out-of-band data.
  818  *
  819  * Returns:     0                       Success
  820  *              EINVAL [COMMON_START]
  821  *              EINVAL
  822  *              EWOULDBLOCK
  823  */
  824 static int
  825 tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
  826 {
  827         int error = 0;
  828         struct inpcb *inp = sotoinpcb(so);
  829         struct tcpcb *tp;
  830 
  831         COMMON_START();
  832         if ((so->so_oobmark == 0 &&
  833              (so->so_state & SS_RCVATMARK) == 0) ||
  834             so->so_options & SO_OOBINLINE ||
  835             tp->t_oobflags & TCPOOB_HADDATA) {
  836                 error = EINVAL;
  837                 goto out;
  838         }
  839         if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
  840                 error = EWOULDBLOCK;
  841                 goto out;
  842         }
  843         m->m_len = 1;
  844         *mtod(m, caddr_t) = tp->t_iobc;
  845         if ((flags & MSG_PEEK) == 0)
  846                 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
  847         COMMON_END(PRU_RCVOOB);
  848 }
  849 
/*
 * User-request switch for IPv4 TCP sockets.
 *
 * This is a positional initializer: the entries must stay in the same
 * order as the members of struct pr_usrreqs.  Requests that have no
 * TCP-specific handler in this file are routed to generic helpers
 * (pru_*_notsupp, pru_sense_null, in_control, in_setpeeraddr,
 * in_setsockaddr, sosend, soreceive).
 *
 * xxx - should be const
 */
struct pr_usrreqs tcp_usrreqs = {
        tcp_usr_abort, tcp_usr_accept, tcp_usr_attach, tcp_usr_bind,
        tcp_usr_connect, pru_connect2_notsupp, in_control, tcp_usr_detach,
        tcp_usr_disconnect, tcp_usr_listen, in_setpeeraddr, tcp_usr_rcvd,
        tcp_usr_rcvoob, tcp_usr_send, pru_sense_null, tcp_usr_shutdown,
        in_setsockaddr, sosend, soreceive, pru_sopoll_notsupp
};
  858 
#if INET6
/*
 * User-request switch for IPv6 TCP sockets (only built when INET6 is
 * enabled).
 *
 * Positional initializer: the entries must stay in the same order as
 * the members of struct pr_usrreqs.  The table uses the tcp6_ / in6_
 * handlers for accept, bind, connect, control, listen and the address
 * getters; the remaining entries are the same handlers the IPv4 table
 * uses.
 */
struct pr_usrreqs tcp6_usrreqs = {
        tcp_usr_abort, tcp6_usr_accept, tcp_usr_attach, tcp6_usr_bind,
        tcp6_usr_connect, pru_connect2_notsupp, in6_control, tcp_usr_detach,
        tcp_usr_disconnect, tcp6_usr_listen, in6_mapped_peeraddr, tcp_usr_rcvd,
        tcp_usr_rcvoob, tcp_usr_send, pru_sense_null, tcp_usr_shutdown,
        in6_mapped_sockaddr, sosend, soreceive, pru_sopoll_notsupp
};
#endif /* INET6 */
  868 
  869 /*
  870  * Common subroutine to open a TCP connection to remote host specified
  871  * by struct sockaddr_in in mbuf *nam.  Call in_pcbbind to assign a local
  872  * port number if needed.  Call in_pcbladdr to do the routing and to choose
  873  * a local host address (interface).  If there is an existing incarnation
  874  * of the same connection in TIME-WAIT state and if the remote host was
  875  * sending CC options and if the connection duration was < MSL, then
  876  * truncate the previous TIME-WAIT state and proceed.
  877  * Initialize connection parameters and enter SYN-SENT state.
  878  *
  879  * Returns:     0                       Success
  880  *              EADDRINUSE
  881  *              EINVAL
  882  *      in_pcbbind:EADDRNOTAVAIL        Address not available.
  883  *      in_pcbbind:EINVAL               Invalid argument
  884  *      in_pcbbind:EAFNOSUPPORT         Address family not supported [notdef]
  885  *      in_pcbbind:EACCES               Permission denied
  886  *      in_pcbbind:EADDRINUSE           Address in use
  887  *      in_pcbbind:EAGAIN               Resource unavailable, try again
  888  *      in_pcbbind:EPERM                Operation not permitted
  889  *      in_pcbladdr:EINVAL              Invalid argument
  890  *      in_pcbladdr:EAFNOSUPPORT        Address family not supported
  891  *      in_pcbladdr:EADDRNOTAVAIL       Address not available
  892  */
  893 static int
  894 tcp_connect(tp, nam, p)
  895         register struct tcpcb *tp;
  896         struct sockaddr *nam;
  897         struct proc *p;
  898 {
  899         struct inpcb *inp = tp->t_inpcb, *oinp;
  900         struct socket *so = inp->inp_socket;
  901         struct tcpcb *otp;
  902         struct sockaddr_in *sin = (struct sockaddr_in *)nam;
  903         struct sockaddr_in ifaddr;
  904         struct rmxp_tao *taop;
  905         struct rmxp_tao tao_noncached;
  906         int error;
  907         unsigned int outif = 0;
  908 
  909         if (inp->inp_lport == 0) {
  910                 error = in_pcbbind(inp, (struct sockaddr *)0, p);
  911                 if (error)
  912                         return error;
  913         }
  914 
  915         /*
  916          * Cannot simply call in_pcbconnect, because there might be an
  917          * earlier incarnation of this same connection still in
  918          * TIME_WAIT state, creating an ADDRINUSE error.
  919          */
  920         error = in_pcbladdr(inp, nam, &ifaddr, &outif);
  921         if (error)
  922                 return error;
  923 
  924         tcp_unlock(inp->inp_socket, 0, 0);
  925         oinp = in_pcblookup_hash(inp->inp_pcbinfo,
  926             sin->sin_addr, sin->sin_port,
  927             inp->inp_laddr.s_addr != INADDR_ANY ? inp->inp_laddr
  928                                                 : ifaddr.sin_addr,
  929             inp->inp_lport,  0, NULL);
  930 
  931         tcp_lock(inp->inp_socket, 0, 0);
  932         if (oinp) {
  933                 if (oinp != inp) /* 4143933: avoid deadlock if inp == oinp */
  934                         tcp_lock(oinp->inp_socket, 1, 0);
  935                 if (in_pcb_checkstate(oinp, WNT_RELEASE, 1) == WNT_STOPUSING) {
  936                         if (oinp != inp)
  937                                 tcp_unlock(oinp->inp_socket, 1, 0);
  938                         goto skip_oinp;
  939                 }
  940 
  941                 if (oinp != inp && (otp = intotcpcb(oinp)) != NULL &&
  942                 otp->t_state == TCPS_TIME_WAIT &&
  943                     ((int)(tcp_now - otp->t_starttime)) < tcp_msl &&
  944                     (otp->t_flags & TF_RCVD_CC))
  945                         otp = tcp_close(otp);
  946                 else {
  947                         printf("tcp_connect: inp=%p err=EADDRINUSE\n", inp);
  948                         if (oinp != inp)
  949                                 tcp_unlock(oinp->inp_socket, 1, 0);
  950                         return EADDRINUSE;
  951                 }
  952                 if (oinp != inp)
  953                         tcp_unlock(oinp->inp_socket, 1, 0);
  954         }
  955 skip_oinp:
  956         if ((inp->inp_laddr.s_addr == INADDR_ANY ? ifaddr.sin_addr.s_addr :
  957                  inp->inp_laddr.s_addr) == sin->sin_addr.s_addr &&
  958             inp->inp_lport == sin->sin_port)
  959                         return EINVAL;
  960         if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) {
  961                 /*lock inversion issue, mostly with udp multicast packets */
  962                 socket_unlock(inp->inp_socket, 0);
  963                 lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx);
  964                 socket_lock(inp->inp_socket, 0);
  965         }
  966         if (inp->inp_laddr.s_addr == INADDR_ANY) {
  967                 inp->inp_laddr = ifaddr.sin_addr;
  968                 inp->inp_last_outif = outif;
  969         }
  970         inp->inp_faddr = sin->sin_addr;
  971         inp->inp_fport = sin->sin_port;
  972         in_pcbrehash(inp);
  973         lck_rw_done(inp->inp_pcbinfo->mtx);
  974 
  975         /* Compute window scaling to requesti according to sb_hiwat
  976          * or leave us some room to increase potentially increase the window size depending
  977          * on the default win scale
  978          */
  979         while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
  980          (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
  981                 tp->request_r_scale++;
  982 
  983         /*
  984          * Inflate window size only if no setsockopt was performed on the recv sockbuf and
  985          * if we're not over our number of active pcbs.
  986          */
  987 
  988         if (((so->so_rcv.sb_flags & SB_USRSIZE) == 0) && (inp->inp_pcbinfo->ipi_count < tcp_sockthreshold)) {
  989                 tp->request_r_scale = max(tcp_win_scale, tp->request_r_scale);
  990                 so->so_rcv.sb_hiwat = min(TCP_MAXWIN << tp->request_r_scale, (sb_max / (MSIZE+MCLBYTES)) * MCLBYTES);  
  991         }
  992 
  993         soisconnecting(so);
  994         tcpstat.tcps_connattempt++;
  995         tp->t_state = TCPS_SYN_SENT;
  996         tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, 
  997                 tp->t_keepinit ? tp->t_keepinit : tcp_keepinit);
  998         tp->iss = tcp_new_isn(tp);
  999         tcp_sendseqinit(tp);
 1000         if (nstat_collect)
 1001                 nstat_route_connect_attempt(inp->inp_route.ro_rt);
 1002 
 1003         /*
 1004          * Generate a CC value for this connection and
 1005          * check whether CC or CCnew should be used.
 1006          */
 1007         if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) {
 1008                 taop = &tao_noncached;
 1009                 bzero(taop, sizeof(*taop));
 1010         }
 1011 
 1012         tp->cc_send = CC_INC(tcp_ccgen);
 1013         if (taop->tao_ccsent != 0 &&
 1014             CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
 1015                 taop->tao_ccsent = tp->cc_send;
 1016         } else {
 1017                 taop->tao_ccsent = 0;
 1018                 tp->t_flags |= TF_SENDCCNEW;
 1019         }
 1020 
 1021         return 0;
 1022 }
 1023 
 1024 #if INET6
 1025 static int
 1026 tcp6_connect(tp, nam, p)
 1027         register struct tcpcb *tp;
 1028         struct sockaddr *nam;
 1029         struct proc *p;
 1030 {
 1031         struct inpcb *inp = tp->t_inpcb, *oinp;
 1032         struct socket *so = inp->inp_socket;
 1033         struct tcpcb *otp;
 1034         struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
 1035         struct in6_addr addr6;
 1036         struct rmxp_tao *taop;
 1037         struct rmxp_tao tao_noncached;
 1038         int error;
 1039         unsigned int outif = 0;
 1040 
 1041         if (inp->inp_lport == 0) {
 1042                 error = in6_pcbbind(inp, (struct sockaddr *)0, p);
 1043                 if (error)
 1044                         return error;
 1045         }
 1046 
 1047         /*
 1048          * Cannot simply call in_pcbconnect, because there might be an
 1049          * earlier incarnation of this same connection still in
 1050          * TIME_WAIT state, creating an ADDRINUSE error.
 1051          */
 1052         error = in6_pcbladdr(inp, nam, &addr6, &outif);
 1053         if (error)
 1054                 return error;
 1055         tcp_unlock(inp->inp_socket, 0, 0);
 1056         oinp = in6_pcblookup_hash(inp->inp_pcbinfo,
 1057                                   &sin6->sin6_addr, sin6->sin6_port,
 1058                                   IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)
 1059                                   ? &addr6
 1060                                   : &inp->in6p_laddr,
 1061                                   inp->inp_lport,  0, NULL);
 1062         tcp_lock(inp->inp_socket, 0, 0);
 1063         if (oinp) {
 1064                 if (oinp != inp && (otp = intotcpcb(oinp)) != NULL &&
 1065                     otp->t_state == TCPS_TIME_WAIT &&
 1066                     ((int)(tcp_now - otp->t_starttime)) < tcp_msl &&
 1067                     (otp->t_flags & TF_RCVD_CC))
 1068                         otp = tcp_close(otp);
 1069                 else
 1070                         return EADDRINUSE;
 1071         }
 1072         if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) {
 1073                 /*lock inversion issue, mostly with udp multicast packets */
 1074                 socket_unlock(inp->inp_socket, 0);
 1075                 lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx);
 1076                 socket_lock(inp->inp_socket, 0);
 1077         }
 1078         if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
 1079                 inp->in6p_laddr = addr6;
 1080                 inp->in6p_last_outif = outif;
 1081         }
 1082         inp->in6p_faddr = sin6->sin6_addr;
 1083         inp->inp_fport = sin6->sin6_port;
 1084         if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != 0)
 1085                 inp->in6p_flowinfo = sin6->sin6_flowinfo;
 1086         in_pcbrehash(inp);
 1087         lck_rw_done(inp->inp_pcbinfo->mtx);
 1088 
 1089         /* Compute window scaling to request.  */
 1090         while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
 1091             (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
 1092                 tp->request_r_scale++;
 1093 
 1094         soisconnecting(so);
 1095         tcpstat.tcps_connattempt++;
 1096         tp->t_state = TCPS_SYN_SENT;
 1097         tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, 
 1098                 tp->t_keepinit ? tp->t_keepinit : tcp_keepinit);
 1099         tp->iss = tcp_new_isn(tp);
 1100         tcp_sendseqinit(tp);
 1101         if (nstat_collect)
 1102                 nstat_route_connect_attempt(inp->inp_route.ro_rt);
 1103 
 1104         /*
 1105          * Generate a CC value for this connection and
 1106          * check whether CC or CCnew should be used.
 1107          */
 1108         if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) {
 1109                 taop = &tao_noncached;
 1110                 bzero(taop, sizeof(*taop));
 1111         }
 1112 
 1113         tp->cc_send = CC_INC(tcp_ccgen);
 1114         if (taop->tao_ccsent != 0 &&
 1115             CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
 1116                 taop->tao_ccsent = tp->cc_send;
 1117         } else {
 1118                 taop->tao_ccsent = 0;
 1119                 tp->t_flags |= TF_SENDCCNEW;
 1120         }
 1121 
 1122         return 0;
 1123 }
 1124 #endif /* INET6 */
 1125 
 1126 /*
 1127  * Export TCP internal state information via a struct tcp_info
 1128  */
 1129 __private_extern__ void
 1130 tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti)
 1131 {
 1132         bzero(ti, sizeof(*ti));
 1133 
 1134         ti->tcpi_state = tp->t_state;
 1135         
 1136     if (tp->t_state > TCPS_LISTEN) {
 1137                 if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
 1138                         ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
 1139                 if (tp->t_flags & TF_SACK_PERMIT)
 1140                         ti->tcpi_options |= TCPI_OPT_SACK;
 1141                 if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
 1142                         ti->tcpi_options |= TCPI_OPT_WSCALE;
 1143                         ti->tcpi_snd_wscale = tp->snd_scale;
 1144                         ti->tcpi_rcv_wscale = tp->rcv_scale;
 1145                 }
 1146                 
 1147                 ti->tcpi_snd_mss = tp->t_maxseg;
 1148                 ti->tcpi_rcv_mss = tp->t_maxseg;
 1149 
 1150                 ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
 1151                 ti->tcpi_snd_cwnd = tp->snd_cwnd;
 1152         
 1153                 ti->tcpi_rcv_space = tp->rcv_wnd;
 1154 
 1155                 ti->tcpi_snd_wnd = tp->snd_wnd;
 1156                 ti->tcpi_snd_bwnd = tp->snd_bwnd;
 1157                 ti->tcpi_snd_nxt = tp->snd_nxt;
 1158                 ti->tcpi_rcv_nxt = tp->rcv_nxt;
 1159                 
 1160                 ti->tcpi_last_outif = tp->t_inpcb->inp_last_outif;
 1161         }
 1162 }
 1163 
 1164 __private_extern__ errno_t
 1165 tcp_fill_info_for_info_tuple(struct info_tuple *itpl, struct tcp_info *ti)
 1166 {
 1167         struct inpcbinfo *pcbinfo = NULL;
 1168         struct inpcb *inp = NULL;
 1169         struct socket *so;
 1170         struct tcpcb *tp;
 1171         
 1172         if (itpl->itpl_proto == IPPROTO_TCP)
 1173                 pcbinfo = &tcbinfo;
 1174         else
 1175                 return EINVAL;
 1176         
 1177         if (itpl->itpl_local_sa.sa_family == AF_INET &&
 1178                 itpl->itpl_remote_sa.sa_family == AF_INET) {
 1179                 inp = in_pcblookup_hash(pcbinfo, 
 1180                                                                 itpl->itpl_remote_sin.sin_addr,
 1181                                                                 itpl->itpl_remote_sin.sin_port,
 1182                                                                 itpl->itpl_local_sin.sin_addr,
 1183                                                                 itpl->itpl_local_sin.sin_port,
 1184                                                                 0, NULL);
 1185         } else if (itpl->itpl_local_sa.sa_family == AF_INET6 &&
 1186                 itpl->itpl_remote_sa.sa_family == AF_INET6) {
 1187                 struct in6_addr ina6_local;
 1188                 struct in6_addr ina6_remote;
 1189                 
 1190                 ina6_local = itpl->itpl_local_sin6.sin6_addr;
 1191                 if (IN6_IS_SCOPE_LINKLOCAL(&ina6_local) && itpl->itpl_local_sin6.sin6_scope_id)
 1192                         ina6_local.s6_addr16[1] = htons(itpl->itpl_local_sin6.sin6_scope_id);
 1193 
 1194                 ina6_remote = itpl->itpl_remote_sin6.sin6_addr;
 1195                 if (IN6_IS_SCOPE_LINKLOCAL(&ina6_remote) && itpl->itpl_remote_sin6.sin6_scope_id)
 1196                         ina6_remote.s6_addr16[1] = htons(itpl->itpl_remote_sin6.sin6_scope_id);
 1197                 
 1198                 inp = in6_pcblookup_hash(pcbinfo, 
 1199                                                                 &ina6_remote,
 1200                                                                 itpl->itpl_remote_sin6.sin6_port,
 1201                                                                 &ina6_local,
 1202                                                                 itpl->itpl_local_sin6.sin6_port,
 1203                                                                 0, NULL);
 1204         } else
 1205                 return EINVAL;
 1206         if (inp == NULL || (so = inp->inp_socket) == NULL)
 1207                 return ENOENT;
 1208 
 1209         socket_lock(so, 0);
 1210         if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
 1211                 socket_unlock(so, 0);
 1212                 return ENOENT;
 1213         }
 1214         tp = intotcpcb(inp);
 1215 
 1216         tcp_fill_info(tp, ti);
 1217         socket_unlock(so, 0);
 1218 
 1219         return 0;
 1220 }
 1221 
 1222 
 1223 __private_extern__ int 
 1224 tcp_sysctl_info(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
 1225 {
 1226         int error;
 1227         struct tcp_info ti;
 1228         struct info_tuple itpl;
 1229         
 1230         if (req->newptr == USER_ADDR_NULL) {
 1231                 return EINVAL;
 1232         }
 1233         if (req->newlen < sizeof(struct info_tuple)) {
 1234                 return EINVAL;
 1235         }
 1236         error = SYSCTL_IN(req, &itpl, sizeof(struct info_tuple));
 1237         if (error != 0) {
 1238                 return error;
 1239         }
 1240         error = tcp_fill_info_for_info_tuple(&itpl, &ti);
 1241         if (error != 0) {
 1242                 return error;
 1243         }
 1244         error = SYSCTL_OUT(req, &ti, sizeof(struct tcp_info));
 1245         if (error != 0) {
 1246                 return error;
 1247         }
 1248         
 1249         return 0;
 1250 }
 1251 
 1252 /*
 1253  * The new sockopt interface makes it possible for us to block in the
 1254  * copyin/out step (if we take a page fault).  Taking a page fault at
 1255  * splnet() is probably a Bad Thing.  (Since sockets and pcbs both now
 1256  * use TSM, there probably isn't any need for this function to run at
 1257  * splnet() any more.  This needs more examination.)
 1258  */
 1259 int
 1260 tcp_ctloutput(so, sopt)
 1261         struct socket *so;
 1262         struct sockopt *sopt;
 1263 {
 1264         int     error, opt, optval;
 1265         struct  inpcb *inp;
 1266         struct  tcpcb *tp;
 1267 
 1268         error = 0;
 1269         inp = sotoinpcb(so);
 1270         if (inp == NULL) {
 1271                 return (ECONNRESET);
 1272         }
 1273         if (sopt->sopt_level != IPPROTO_TCP) {
 1274 #if INET6
 1275                 if (INP_CHECK_SOCKAF(so, AF_INET6))
 1276                         error = ip6_ctloutput(so, sopt);
 1277                 else
 1278 #endif /* INET6 */
 1279                 error = ip_ctloutput(so, sopt);
 1280                 return (error);
 1281         }
 1282         tp = intotcpcb(inp);
 1283         if (tp == NULL) {
 1284                 return (ECONNRESET);
 1285         }
 1286 
 1287         calculate_tcp_clock();
 1288 
 1289         switch (sopt->sopt_dir) {
 1290         case SOPT_SET:
 1291                 switch (sopt->sopt_name) {
 1292                 case TCP_NODELAY:
 1293                 case TCP_NOOPT:
 1294                 case TCP_NOPUSH:
 1295                         error = sooptcopyin(sopt, &optval, sizeof optval,
 1296                                             sizeof optval);
 1297                         if (error)
 1298                                 break;
 1299 
 1300                         switch (sopt->sopt_name) {
 1301                         case TCP_NODELAY:
 1302                                 opt = TF_NODELAY;
 1303                                 break;
 1304                         case TCP_NOOPT:
 1305                                 opt = TF_NOOPT;
 1306                                 break;
 1307                         case TCP_NOPUSH:
 1308                                 opt = TF_NOPUSH;
 1309                                 break;
 1310                         default:
 1311                                 opt = 0; /* dead code to fool gcc */
 1312                                 break;
 1313                         }
 1314 
 1315                         if (optval)
 1316                                 tp->t_flags |= opt;
 1317                         else
 1318                                 tp->t_flags &= ~opt;
 1319                         break;
 1320                 case TCP_RXT_FINDROP:
 1321                         error = sooptcopyin(sopt, &optval, sizeof optval,
 1322                                 sizeof optval);
 1323                         if (error)
 1324                                 break;
 1325                         opt = TF_RXTFINDROP;
 1326                         if (optval)
 1327                                 tp->t_flagsext |= opt;
 1328                         else
 1329                                 tp->t_flagsext &= ~opt;
 1330                         break;
 1331                 case TCP_MAXSEG:
 1332                         error = sooptcopyin(sopt, &optval, sizeof optval,
 1333                                             sizeof optval);
 1334                         if (error)
 1335                                 break;
 1336 
 1337                         if (optval > 0 && optval <= tp->t_maxseg &&
 1338                             optval + 40 >= tcp_minmss)
 1339                                 tp->t_maxseg = optval;
 1340                         else
 1341                                 error = EINVAL;
 1342                         break;
 1343 
 1344                 case TCP_KEEPALIVE:
 1345                         error = sooptcopyin(sopt, &optval, sizeof optval,
 1346                                                 sizeof optval);
 1347                         if (error)
 1348                                 break;
 1349                         if (optval < 0)
 1350                                 error = EINVAL;
 1351                         else {
 1352                                 tp->t_keepidle = optval * TCP_RETRANSHZ;
 1353                                 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, 
 1354                                         TCP_KEEPIDLE(tp)); /* reset the timer to new value */
 1355                                 tcp_check_timer_state(tp);
 1356                         }
 1357                         break;
 1358 
 1359                 case TCP_CONNECTIONTIMEOUT:
 1360                         error = sooptcopyin(sopt, &optval, sizeof optval,
 1361                                                 sizeof optval);
 1362                         if (error)
 1363                                 break;
 1364                         if (optval < 0)
 1365                                 error = EINVAL;
 1366                         else 
 1367                                 tp->t_keepinit = optval * TCP_RETRANSHZ;
 1368                         break;
 1369                 
 1370                 case PERSIST_TIMEOUT:
 1371                         error = sooptcopyin(sopt, &optval, sizeof optval,
 1372                                                 sizeof optval);
 1373                         if (error)
 1374                                 break;
 1375                         if (optval < 0)
 1376                                 error = EINVAL;
 1377                         else 
 1378                                 tp->t_persist_timeout = optval * TCP_RETRANSHZ;
 1379                         break;
 1380                 case TCP_RXT_CONNDROPTIME:
 1381                         error = sooptcopyin(sopt, &optval, sizeof(optval),
 1382                                         sizeof(optval));
 1383                         if (error)
 1384                                 break;
 1385                         if (optval < 0)
 1386                                 error = EINVAL;
 1387                         else
 1388                                 tp->rxt_conndroptime = optval * TCP_RETRANSHZ;
 1389                         break;
 1390                 default:
 1391                         error = ENOPROTOOPT;
 1392                         break;
 1393                 }
 1394                 break;
 1395 
 1396         case SOPT_GET:
 1397                 switch (sopt->sopt_name) {
 1398                 case TCP_NODELAY:
 1399                         optval = tp->t_flags & TF_NODELAY;
 1400                         break;
 1401                 case TCP_MAXSEG:
 1402                         optval = tp->t_maxseg;
 1403                         break;
 1404                 case TCP_KEEPALIVE:
 1405                         optval = tp->t_keepidle / TCP_RETRANSHZ;
 1406                         break;
 1407                 case TCP_NOOPT:
 1408                         optval = tp->t_flags & TF_NOOPT;
 1409                         break;
 1410                 case TCP_NOPUSH:
 1411                         optval = tp->t_flags & TF_NOPUSH;
 1412                         break;
 1413                 case TCP_CONNECTIONTIMEOUT:
 1414                         optval = tp->t_keepinit / TCP_RETRANSHZ;
 1415                         break;
 1416                 case PERSIST_TIMEOUT:
 1417                         optval = tp->t_persist_timeout / TCP_RETRANSHZ;
 1418                         break;
 1419                 case TCP_RXT_CONNDROPTIME:
 1420                         optval = tp->rxt_conndroptime / TCP_RETRANSHZ;
 1421                         break;
 1422                 case TCP_RXT_FINDROP:
 1423                         optval = tp->t_flagsext & TF_RXTFINDROP;
 1424                         break; 
 1425                 case TCP_INFO: {
 1426                         struct tcp_info ti;
 1427 
 1428                         tcp_fill_info(tp, &ti);
 1429                         error = sooptcopyout(sopt, &ti, sizeof(struct tcp_info));
 1430                         goto done;
 1431                 }
 1432                 default:
 1433                         error = ENOPROTOOPT;
 1434                         break;
 1435                 }
 1436                 if (error == 0)
 1437                         error = sooptcopyout(sopt, &optval, sizeof optval);
 1438                 break;
 1439         }
 1440 done:
 1441         return (error);
 1442 }
 1443 
/*
 * tcp_sendspace and tcp_recvspace are the default send and receive window
 * sizes, respectively.  These are obsolescent (this information should
 * be set by the route).
 *
 * Both defaults are written as a multiple of 1448 bytes
 * (NOTE(review): presumably a typical TCP payload size per segment --
 * confirm).
 */
u_int32_t       tcp_sendspace = 1448*256;
u_int32_t       tcp_recvspace = 1448*384;
 1451 
 1452 /* During attach, the size of socket buffer allocated is limited to
 1453  * sb_max in sbreserve. Disallow setting the tcp send and recv space
 1454  * to be more than sb_max because that will cause tcp_attach to fail
 1455  * (see radar 5713060)
 1456  */  
 1457 static int
 1458 sysctl_tcp_sospace(struct sysctl_oid *oidp, __unused void *arg1,
 1459         __unused int arg2, struct sysctl_req *req) {
 1460         u_int32_t new_value = 0, *space_p = NULL;
 1461         int changed = 0, error = 0;
 1462         u_quad_t sb_effective_max = (sb_max / (MSIZE+MCLBYTES)) * MCLBYTES;
 1463 
 1464         switch (oidp->oid_number) {
 1465                 case TCPCTL_SENDSPACE:
 1466                         space_p = &tcp_sendspace;
 1467                         break;
 1468                 case TCPCTL_RECVSPACE:
 1469                         space_p = &tcp_recvspace;
 1470                         break;
 1471                 default:
 1472                         return EINVAL;
 1473         }
 1474         error = sysctl_io_number(req, *space_p, sizeof(u_int32_t),
 1475                 &new_value, &changed);
 1476         if (changed) {
 1477                 if (new_value > 0 && new_value <= sb_effective_max) {
 1478                         *space_p = new_value;
 1479                 } else {
 1480                         error = ERANGE;
 1481                 }
 1482         }
 1483         return error;
 1484 }
 1485 
/*
 * Register the net.inet.tcp sendspace and recvspace nodes.  Both are
 * read-write and share the sysctl_tcp_sospace handler, which selects
 * tcp_sendspace or tcp_recvspace from the node's oid number.
 *
 * NOTE(review): the description strings speak of a "datagram size",
 * while the variables hold the default send/receive space -- confirm
 * whether the wording should be updated.
 */
SYSCTL_PROC(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_sendspace , 0, &sysctl_tcp_sospace, "IU", "Maximum outgoing TCP datagram size");
SYSCTL_PROC(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_recvspace , 0, &sysctl_tcp_sospace, "IU", "Maximum incoming TCP datagram size");
 1490 
 1491 
 1492 /*
 1493  * Attach TCP protocol to socket, allocating
 1494  * internet protocol control block, tcp control block,
 1495  * buffer space, and entering LISTEN state if to accept connections.
 1496  *
 1497  * Returns:     0                       Success
 1498  *      in_pcballoc:ENOBUFS
 1499  *      in_pcballoc:ENOMEM
 1500  *      in_pcballoc:???                 [IPSEC specific]
 1501  *      soreserve:ENOBUFS
 1502  */
 1503 static int
 1504 tcp_attach(so, p)
 1505         struct socket *so;
 1506         struct proc *p;
 1507 {
 1508         register struct tcpcb *tp;
 1509         struct inpcb *inp;
 1510         int error;
 1511         u_long sb_effective_max;
 1512 #if INET6
 1513         int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0;
 1514 #endif
 1515 
 1516         error = in_pcballoc(so, &tcbinfo, p);
 1517         if (error)
 1518                 return (error);
 1519 
 1520         inp = sotoinpcb(so);
 1521 
 1522         if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
 1523                 /*
 1524                  * The goal is to let clients machines use large send/rcv default windows to compensate for link
 1525                  * latency and make sure the receiver is not constraining the sender window.
 1526                  * But we doon't want to have a few connections use all our mbuf space for servers.
 1527                  * This is done by watching a threshold of tcpcbs in use and bumping the default send and rcvspace
 1528                  * only if that threshold isn't reached.
 1529                  * We're also advertising a much bigger window size (tuneable by sysctl) in correlation with                             * the max socket buffer size if 
 1530                  * we consider that we have enough ressources for it. This window will be adjusted depending on the
 1531                  * global socket layer buffer use with the use of tcp_sbpace
 1532                  */
 1533 
 1534                 if (inp->inp_pcbinfo->ipi_count < tcp_sockthreshold) {
 1535                         sb_effective_max = (sb_max / (MSIZE+MCLBYTES)) * MCLBYTES;  
 1536                         error = soreserve(so, max(min((TCP_MAXWIN << tcp_win_scale)/4, sb_effective_max), tcp_sendspace),
 1537                                         max(min((TCP_MAXWIN << tcp_win_scale)/2, sb_effective_max), tcp_recvspace));
 1538                 }
 1539                 else    
 1540                         error = soreserve(so, tcp_sendspace, tcp_recvspace);
 1541                 if (error)
 1542                         return (error);
 1543         }
 1544 
 1545 #if INET6
 1546         if (isipv6) {
 1547                 inp->inp_vflag |= INP_IPV6;
 1548                 inp->in6p_hops = -1;    /* use kernel default */
 1549         }
 1550         else
 1551 #endif /* INET6 */
 1552         inp->inp_vflag |= INP_IPV4;
 1553         tp = tcp_newtcpcb(inp);
 1554         if (tp == 0) {
 1555                 int nofd = so->so_state & SS_NOFDREF;   /* XXX */
 1556 
 1557                 so->so_state &= ~SS_NOFDREF;    /* don't free the socket yet */
 1558 #if INET6
 1559                 if (isipv6)
 1560                         in6_pcbdetach(inp);
 1561                 else
 1562 #endif /* INET6 */
 1563                 in_pcbdetach(inp);
 1564                 so->so_state |= nofd;
 1565                 return (ENOBUFS);
 1566         }
 1567         if (nstat_collect) {
 1568                 nstat_tcp_new_pcb(inp);
 1569         }
 1570         tp->t_state = TCPS_CLOSED;
 1571         return (0);
 1572 }
 1573 
 1574 /*
 1575  * Initiate (or continue) disconnect.
 1576  * If embryonic state, just send reset (once).
 1577  * If in ``let data drain'' option and linger null, just drop.
 1578  * Otherwise (hard), mark socket disconnecting and drop
 1579  * current input data; switch states based on user close, and
 1580  * send segment to peer (with FIN).
 1581  */
 1582 static struct tcpcb *
 1583 tcp_disconnect(tp)
 1584         register struct tcpcb *tp;
 1585 {
 1586         struct socket *so = tp->t_inpcb->inp_socket;
 1587 
 1588         if (tp->t_state < TCPS_ESTABLISHED)
 1589                 tp = tcp_close(tp);
 1590         else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
 1591                 tp = tcp_drop(tp, 0);
 1592         else {
 1593                 soisdisconnecting(so);
 1594                 sbflush(&so->so_rcv);
 1595                 tp = tcp_usrclosed(tp);
 1596                 if (tp)
 1597                         (void) tcp_output(tp);
 1598         }
 1599         return (tp);
 1600 }
 1601 
 1602 /*
 1603  * User issued close, and wish to trail through shutdown states:
 1604  * if never received SYN, just forget it.  If got a SYN from peer,
 1605  * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 1606  * If already got a FIN from peer, then almost done; go to LAST_ACK
 1607  * state.  In all other cases, have already sent FIN to peer (e.g.
 1608  * after PRU_SHUTDOWN), and just have to play tedious game waiting
 1609  * for peer to send FIN or not respond to keep-alives, etc.
 1610  * We can let the user exit from the close as soon as the FIN is acked.
 1611  */
 1612 static struct tcpcb *
 1613 tcp_usrclosed(tp)
 1614         register struct tcpcb *tp;
 1615 {
 1616 
 1617         switch (tp->t_state) {
 1618 
 1619         case TCPS_CLOSED:
 1620         case TCPS_LISTEN:
 1621                 tp->t_state = TCPS_CLOSED;
 1622                 tp = tcp_close(tp);
 1623                 break;
 1624 
 1625         case TCPS_SYN_SENT:
 1626         case TCPS_SYN_RECEIVED:
 1627                 tp->t_flags |= TF_NEEDFIN;
 1628                 break;
 1629 
 1630         case TCPS_ESTABLISHED:
 1631                 tp->t_state = TCPS_FIN_WAIT_1;
 1632                 break;
 1633 
 1634         case TCPS_CLOSE_WAIT:
 1635                 tp->t_state = TCPS_LAST_ACK;
 1636                 break;
 1637         }
 1638         if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
 1639                 soisdisconnected(tp->t_inpcb->inp_socket);
 1640                 /* To prevent the connection hanging in FIN_WAIT_2 forever. */
 1641                 if (tp->t_state == TCPS_FIN_WAIT_2)
 1642                         tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp, tcp_maxidle);
 1643         }
 1644         return (tp);
 1645 }
 1646 
 1647 void
 1648 tcp_in_cksum_stats(u_int32_t len)
 1649 {
 1650         tcps_in_sw_cksum++;
 1651         tcps_in_sw_cksum_bytes += len;
 1652 }
 1653 
 1654 void
 1655 tcp_out_cksum_stats(u_int32_t len)
 1656 {
 1657         tcps_out_sw_cksum++;
 1658         tcps_out_sw_cksum_bytes += len;
 1659 }

Cache object: 85a66babb9075919703d5cad88aaff98


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.