The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/netinet/tcp_usrreq.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*      $OpenBSD: tcp_usrreq.c,v 1.216 2023/01/22 12:05:44 mvs Exp $    */
    2 /*      $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */
    3 
    4 /*
    5  * Copyright (c) 1982, 1986, 1988, 1993
    6  *      The Regents of the University of California.  All rights reserved.
    7  *
    8  * Redistribution and use in source and binary forms, with or without
    9  * modification, are permitted provided that the following conditions
   10  * are met:
   11  * 1. Redistributions of source code must retain the above copyright
   12  *    notice, this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in the
   15  *    documentation and/or other materials provided with the distribution.
   16  * 3. Neither the name of the University nor the names of its contributors
   17  *    may be used to endorse or promote products derived from this software
   18  *    without specific prior written permission.
   19  *
   20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   30  * SUCH DAMAGE.
   31  *
   32  *      @(#)COPYRIGHT   1.1 (NRL) 17 January 1995
   33  *
   34  * NRL grants permission for redistribution and use in source and binary
   35  * forms, with or without modification, of the software and documentation
   36  * created at NRL provided that the following conditions are met:
   37  *
   38  * 1. Redistributions of source code must retain the above copyright
   39  *    notice, this list of conditions and the following disclaimer.
   40  * 2. Redistributions in binary form must reproduce the above copyright
   41  *    notice, this list of conditions and the following disclaimer in the
   42  *    documentation and/or other materials provided with the distribution.
   43  * 3. All advertising materials mentioning features or use of this software
   44  *    must display the following acknowledgements:
   45  *      This product includes software developed by the University of
   46  *      California, Berkeley and its contributors.
   47  *      This product includes software developed at the Information
   48  *      Technology Division, US Naval Research Laboratory.
   49  * 4. Neither the name of the NRL nor the names of its contributors
   50  *    may be used to endorse or promote products derived from this software
   51  *    without specific prior written permission.
   52  *
   53  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
   54  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   55  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
   56  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
   57  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   58  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   59  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   60  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   61  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   62  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   63  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   64  *
   65  * The views and conclusions contained in the software and documentation
   66  * are those of the authors and should not be interpreted as representing
   67  * official policies, either expressed or implied, of the US Naval
   68  * Research Laboratory (NRL).
   69  */
   70 
   71 #include <sys/param.h>
   72 #include <sys/systm.h>
   73 #include <sys/mbuf.h>
   74 #include <sys/socket.h>
   75 #include <sys/socketvar.h>
   76 #include <sys/protosw.h>
   77 #include <sys/stat.h>
   78 #include <sys/sysctl.h>
   79 #include <sys/domain.h>
   80 #include <sys/kernel.h>
   81 #include <sys/pool.h>
   82 #include <sys/proc.h>
   83 
   84 #include <net/if.h>
   85 #include <net/if_var.h>
   86 #include <net/route.h>
   87 
   88 #include <netinet/in.h>
   89 #include <netinet/in_var.h>
   90 #include <netinet/ip.h>
   91 #include <netinet/in_pcb.h>
   92 #include <netinet/ip_var.h>
   93 #include <netinet/tcp.h>
   94 #include <netinet/tcp_fsm.h>
   95 #include <netinet/tcp_seq.h>
   96 #include <netinet/tcp_timer.h>
   97 #include <netinet/tcp_var.h>
   98 #include <netinet/tcp_debug.h>
   99 
  100 #ifdef INET6
  101 #include <netinet6/in6_var.h>
  102 #endif
  103 
  104 #ifndef TCP_SENDSPACE
  105 #define TCP_SENDSPACE   1024*16
  106 #endif
  107 u_int   tcp_sendspace = TCP_SENDSPACE;
  108 #ifndef TCP_RECVSPACE
  109 #define TCP_RECVSPACE   1024*16
  110 #endif
  111 u_int   tcp_recvspace = TCP_RECVSPACE;
  112 u_int   tcp_autorcvbuf_inc = 16 * 1024;
  113 
  114 const struct pr_usrreqs tcp_usrreqs = {
  115         .pru_attach     = tcp_attach,
  116         .pru_detach     = tcp_detach,
  117         .pru_bind       = tcp_bind,
  118         .pru_listen     = tcp_listen,
  119         .pru_connect    = tcp_connect,
  120         .pru_accept     = tcp_accept,
  121         .pru_disconnect = tcp_disconnect,
  122         .pru_shutdown   = tcp_shutdown,
  123         .pru_rcvd       = tcp_rcvd,
  124         .pru_send       = tcp_send,
  125         .pru_abort      = tcp_abort,
  126         .pru_sense      = tcp_sense,
  127         .pru_rcvoob     = tcp_rcvoob,
  128         .pru_sendoob    = tcp_sendoob,
  129         .pru_control    = in_control,
  130         .pru_sockaddr   = tcp_sockaddr,
  131         .pru_peeraddr   = tcp_peeraddr,
  132 };
  133 
  134 #ifdef INET6
  135 const struct pr_usrreqs tcp6_usrreqs = {
  136         .pru_attach     = tcp_attach,
  137         .pru_detach     = tcp_detach,
  138         .pru_bind       = tcp_bind,
  139         .pru_listen     = tcp_listen,
  140         .pru_connect    = tcp_connect,
  141         .pru_accept     = tcp_accept,
  142         .pru_disconnect = tcp_disconnect,
  143         .pru_shutdown   = tcp_shutdown,
  144         .pru_rcvd       = tcp_rcvd,
  145         .pru_send       = tcp_send,
  146         .pru_abort      = tcp_abort,
  147         .pru_sense      = tcp_sense,
  148         .pru_rcvoob     = tcp_rcvoob,
  149         .pru_sendoob    = tcp_sendoob,
  150         .pru_control    = in6_control,
  151         .pru_sockaddr   = tcp_sockaddr,
  152         .pru_peeraddr   = tcp_peeraddr,
  153 };
  154 #endif
  155 
  156 const struct sysctl_bounded_args tcpctl_vars[] = {
  157         { TCPCTL_RFC1323, &tcp_do_rfc1323, 0, 1 },
  158         { TCPCTL_KEEPINITTIME, &tcptv_keep_init, 1, 3 * TCPTV_KEEP_INIT },
  159         { TCPCTL_KEEPIDLE, &tcp_keepidle, 1, 5 * TCPTV_KEEP_IDLE },
  160         { TCPCTL_KEEPINTVL, &tcp_keepintvl, 1, 3 * TCPTV_KEEPINTVL },
  161         { TCPCTL_SACK, &tcp_do_sack, 0, 1 },
  162         { TCPCTL_MSSDFLT, &tcp_mssdflt, TCP_MSS, 65535 },
  163         { TCPCTL_RSTPPSLIMIT, &tcp_rst_ppslim, 1, 1000 * 1000 },
  164         { TCPCTL_ACK_ON_PUSH, &tcp_ack_on_push, 0, 1 },
  165 #ifdef TCP_ECN
  166         { TCPCTL_ECN, &tcp_do_ecn, 0, 1 },
  167 #endif
  168         { TCPCTL_SYN_CACHE_LIMIT, &tcp_syn_cache_limit, 1, 1000 * 1000 },
  169         { TCPCTL_SYN_BUCKET_LIMIT, &tcp_syn_bucket_limit, 1, INT_MAX },
  170         { TCPCTL_RFC3390, &tcp_do_rfc3390, 0, 2 },
  171         { TCPCTL_ALWAYS_KEEPALIVE, &tcp_always_keepalive, 0, 1 },
  172 };
  173 
  174 struct  inpcbtable tcbtable;
  175 
  176 int     tcp_fill_info(struct tcpcb *, struct socket *, struct mbuf *);
  177 int     tcp_ident(void *, size_t *, void *, size_t, int);
  178 
  179 static inline int tcp_sogetpcb(struct socket *, struct inpcb **,
  180                       struct tcpcb **);
  181 
  182 static inline int
  183 tcp_sogetpcb(struct socket *so, struct inpcb **rinp, struct tcpcb **rtp)
  184 {
  185         struct inpcb *inp;
  186         struct tcpcb *tp;
  187 
  188         /*
  189          * When a TCP is attached to a socket, then there will be
  190          * a (struct inpcb) pointed at by the socket, and this
  191          * structure will point at a subsidiary (struct tcpcb).
  192          */
  193         if ((inp = sotoinpcb(so)) == NULL || (tp = intotcpcb(inp)) == NULL) {
  194                 if (so->so_error)
  195                         return so->so_error;
  196                 return EINVAL;
  197         }
  198 
  199         *rinp = inp;
  200         *rtp = tp;
  201 
  202         return 0;
  203 }
  204 
  205 /*
  206  * Export internal TCP state information via a struct tcp_info without
  207  * leaking any sensitive information. Sequence numbers are reported
  208  * relative to the initial sequence number.
  209  */
  210 int
  211 tcp_fill_info(struct tcpcb *tp, struct socket *so, struct mbuf *m)
  212 {
  213         struct proc *p = curproc;
  214         struct tcp_info *ti;
  215         u_int t = 1000;         /* msec => usec */
  216         uint32_t now;
  217 
  218         if (sizeof(*ti) > MLEN) {
  219                 MCLGETL(m, M_WAITOK, sizeof(*ti));
  220                 if (!ISSET(m->m_flags, M_EXT))
  221                         return ENOMEM;
  222         }
  223         ti = mtod(m, struct tcp_info *);
  224         m->m_len = sizeof(*ti);
  225         memset(ti, 0, sizeof(*ti));
  226         now = tcp_now();
  227 
  228         ti->tcpi_state = tp->t_state;
  229         if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
  230                 ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
  231         if (tp->t_flags & TF_SACK_PERMIT)
  232                 ti->tcpi_options |= TCPI_OPT_SACK;
  233         if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
  234                 ti->tcpi_options |= TCPI_OPT_WSCALE;
  235                 ti->tcpi_snd_wscale = tp->snd_scale;
  236                 ti->tcpi_rcv_wscale = tp->rcv_scale;
  237         }
  238 #ifdef TCP_ECN
  239         if (tp->t_flags & TF_ECN_PERMIT)
  240                 ti->tcpi_options |= TCPI_OPT_ECN;
  241 #endif
  242 
  243         ti->tcpi_rto = tp->t_rxtcur * t;
  244         ti->tcpi_snd_mss = tp->t_maxseg;
  245         ti->tcpi_rcv_mss = tp->t_peermss;
  246 
  247         ti->tcpi_last_data_sent = (now - tp->t_sndtime) * t;
  248         ti->tcpi_last_ack_sent = (now - tp->t_sndacktime) * t;
  249         ti->tcpi_last_data_recv = (now - tp->t_rcvtime) * t;
  250         ti->tcpi_last_ack_recv = (now - tp->t_rcvacktime) * t;
  251 
  252         ti->tcpi_rtt = ((uint64_t)tp->t_srtt * t) >>
  253             (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT);
  254         ti->tcpi_rttvar = ((uint64_t)tp->t_rttvar * t) >>
  255             (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT);
  256         ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
  257         ti->tcpi_snd_cwnd = tp->snd_cwnd;
  258 
  259         ti->tcpi_rcv_space = tp->rcv_wnd;
  260 
  261         /*
  262          * Provide only minimal information for unprivileged processes.
  263          */
  264         if (suser(p) != 0)
  265                 return 0;
  266 
  267         /* FreeBSD-specific extension fields for tcp_info.  */
  268         ti->tcpi_snd_wnd = tp->snd_wnd;
  269         ti->tcpi_snd_nxt = tp->snd_nxt - tp->iss;
  270         ti->tcpi_rcv_nxt = tp->rcv_nxt - tp->irs;
  271         /* missing tcpi_toe_tid */
  272         ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
  273         ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
  274         ti->tcpi_snd_zerowin = tp->t_sndzerowin;
  275 
  276         /* OpenBSD extensions */
  277         ti->tcpi_rttmin = tp->t_rttmin * t;
  278         ti->tcpi_max_sndwnd = tp->max_sndwnd;
  279         ti->tcpi_rcv_adv = tp->rcv_adv - tp->irs;
  280         ti->tcpi_rcv_up = tp->rcv_up - tp->irs;
  281         ti->tcpi_snd_una = tp->snd_una - tp->iss;
  282         ti->tcpi_snd_up = tp->snd_up - tp->iss;
  283         ti->tcpi_snd_wl1 = tp->snd_wl1 - tp->iss;
  284         ti->tcpi_snd_wl2 = tp->snd_wl2 - tp->iss;
  285         ti->tcpi_snd_max = tp->snd_max - tp->iss;
  286 
  287         ti->tcpi_ts_recent = tp->ts_recent; /* XXX value from the wire */
  288         ti->tcpi_ts_recent_age = (now - tp->ts_recent_age) * t;
  289         ti->tcpi_rfbuf_cnt = tp->rfbuf_cnt;
  290         ti->tcpi_rfbuf_ts = (now - tp->rfbuf_ts) * t;
  291 
  292         ti->tcpi_so_rcv_sb_cc = so->so_rcv.sb_cc;
  293         ti->tcpi_so_rcv_sb_hiwat = so->so_rcv.sb_hiwat;
  294         ti->tcpi_so_rcv_sb_lowat = so->so_rcv.sb_lowat;
  295         ti->tcpi_so_rcv_sb_wat = so->so_rcv.sb_wat;
  296         ti->tcpi_so_snd_sb_cc = so->so_snd.sb_cc;
  297         ti->tcpi_so_snd_sb_hiwat = so->so_snd.sb_hiwat;
  298         ti->tcpi_so_snd_sb_lowat = so->so_snd.sb_lowat;
  299         ti->tcpi_so_snd_sb_wat = so->so_snd.sb_wat;
  300 
  301         return 0;
  302 }
  303 
  304 int
  305 tcp_ctloutput(int op, struct socket *so, int level, int optname,
  306     struct mbuf *m)
  307 {
  308         int error = 0;
  309         struct inpcb *inp;
  310         struct tcpcb *tp;
  311         int i;
  312 
  313         inp = sotoinpcb(so);
  314         if (inp == NULL)
  315                 return (ECONNRESET);
  316         if (level != IPPROTO_TCP) {
  317                 switch (so->so_proto->pr_domain->dom_family) {
  318 #ifdef INET6
  319                 case PF_INET6:
  320                         error = ip6_ctloutput(op, so, level, optname, m);
  321                         break;
  322 #endif /* INET6 */
  323                 case PF_INET:
  324                         error = ip_ctloutput(op, so, level, optname, m);
  325                         break;
  326                 default:
  327                         error = EAFNOSUPPORT;   /*?*/
  328                         break;
  329                 }
  330                 return (error);
  331         }
  332         tp = intotcpcb(inp);
  333 
  334         switch (op) {
  335 
  336         case PRCO_SETOPT:
  337                 switch (optname) {
  338 
  339                 case TCP_NODELAY:
  340                         if (m == NULL || m->m_len < sizeof (int))
  341                                 error = EINVAL;
  342                         else if (*mtod(m, int *))
  343                                 tp->t_flags |= TF_NODELAY;
  344                         else
  345                                 tp->t_flags &= ~TF_NODELAY;
  346                         break;
  347 
  348                 case TCP_NOPUSH:
  349                         if (m == NULL || m->m_len < sizeof (int))
  350                                 error = EINVAL;
  351                         else if (*mtod(m, int *))
  352                                 tp->t_flags |= TF_NOPUSH;
  353                         else if (tp->t_flags & TF_NOPUSH) {
  354                                 tp->t_flags &= ~TF_NOPUSH;
  355                                 if (TCPS_HAVEESTABLISHED(tp->t_state))
  356                                         error = tcp_output(tp);
  357                         }
  358                         break;
  359 
  360                 case TCP_MAXSEG:
  361                         if (m == NULL || m->m_len < sizeof (int)) {
  362                                 error = EINVAL;
  363                                 break;
  364                         }
  365 
  366                         i = *mtod(m, int *);
  367                         if (i > 0 && i <= tp->t_maxseg)
  368                                 tp->t_maxseg = i;
  369                         else
  370                                 error = EINVAL;
  371                         break;
  372 
  373                 case TCP_SACK_ENABLE:
  374                         if (m == NULL || m->m_len < sizeof (int)) {
  375                                 error = EINVAL;
  376                                 break;
  377                         }
  378 
  379                         if (TCPS_HAVEESTABLISHED(tp->t_state)) {
  380                                 error = EPERM;
  381                                 break;
  382                         }
  383 
  384                         if (tp->t_flags & TF_SIGNATURE) {
  385                                 error = EPERM;
  386                                 break;
  387                         }
  388 
  389                         if (*mtod(m, int *))
  390                                 tp->sack_enable = 1;
  391                         else
  392                                 tp->sack_enable = 0;
  393                         break;
  394 #ifdef TCP_SIGNATURE
  395                 case TCP_MD5SIG:
  396                         if (m == NULL || m->m_len < sizeof (int)) {
  397                                 error = EINVAL;
  398                                 break;
  399                         }
  400 
  401                         if (TCPS_HAVEESTABLISHED(tp->t_state)) {
  402                                 error = EPERM;
  403                                 break;
  404                         }
  405 
  406                         if (*mtod(m, int *)) {
  407                                 tp->t_flags |= TF_SIGNATURE;
  408                                 tp->sack_enable = 0;
  409                         } else
  410                                 tp->t_flags &= ~TF_SIGNATURE;
  411                         break;
  412 #endif /* TCP_SIGNATURE */
  413                 default:
  414                         error = ENOPROTOOPT;
  415                         break;
  416                 }
  417                 break;
  418 
  419         case PRCO_GETOPT:
  420                 switch (optname) {
  421                 case TCP_NODELAY:
  422                         m->m_len = sizeof(int);
  423                         *mtod(m, int *) = tp->t_flags & TF_NODELAY;
  424                         break;
  425                 case TCP_NOPUSH:
  426                         m->m_len = sizeof(int);
  427                         *mtod(m, int *) = tp->t_flags & TF_NOPUSH;
  428                         break;
  429                 case TCP_MAXSEG:
  430                         m->m_len = sizeof(int);
  431                         *mtod(m, int *) = tp->t_maxseg;
  432                         break;
  433                 case TCP_SACK_ENABLE:
  434                         m->m_len = sizeof(int);
  435                         *mtod(m, int *) = tp->sack_enable;
  436                         break;
  437                 case TCP_INFO:
  438                         error = tcp_fill_info(tp, so, m);
  439                         break;
  440 #ifdef TCP_SIGNATURE
  441                 case TCP_MD5SIG:
  442                         m->m_len = sizeof(int);
  443                         *mtod(m, int *) = tp->t_flags & TF_SIGNATURE;
  444                         break;
  445 #endif
  446                 default:
  447                         error = ENOPROTOOPT;
  448                         break;
  449                 }
  450                 break;
  451         }
  452         return (error);
  453 }
  454 
  455 /*
  456  * Attach TCP protocol to socket, allocating
  457  * internet protocol control block, tcp control block,
  458  * buffer space, and entering LISTEN state to accept connections.
  459  */
  460 int
  461 tcp_attach(struct socket *so, int proto, int wait)
  462 {
  463         struct tcpcb *tp;
  464         struct inpcb *inp;
  465         int error;
  466 
  467         if (so->so_pcb)
  468                 return EISCONN;
  469         if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 ||
  470             sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) ||
  471             sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) {
  472                 error = soreserve(so, tcp_sendspace, tcp_recvspace);
  473                 if (error)
  474                         return (error);
  475         }
  476 
  477         NET_ASSERT_LOCKED();
  478         error = in_pcballoc(so, &tcbtable, wait);
  479         if (error)
  480                 return (error);
  481         inp = sotoinpcb(so);
  482         tp = tcp_newtcpcb(inp, wait);
  483         if (tp == NULL) {
  484                 unsigned int nofd = so->so_state & SS_NOFDREF;  /* XXX */
  485 
  486                 so->so_state &= ~SS_NOFDREF;    /* don't free the socket yet */
  487                 in_pcbdetach(inp);
  488                 so->so_state |= nofd;
  489                 return (ENOBUFS);
  490         }
  491         tp->t_state = TCPS_CLOSED;
  492 #ifdef INET6
  493         /* we disallow IPv4 mapped address completely. */
  494         if (inp->inp_flags & INP_IPV6)
  495                 tp->pf = PF_INET6;
  496         else
  497                 tp->pf = PF_INET;
  498 #else
  499         tp->pf = PF_INET;
  500 #endif
  501         if ((so->so_options & SO_LINGER) && so->so_linger == 0)
  502                 so->so_linger = TCP_LINGERTIME;
  503 
  504         if (so->so_options & SO_DEBUG)
  505                 tcp_trace(TA_USER, TCPS_CLOSED, tp, tp, NULL, PRU_ATTACH, 0);
  506         return (0);
  507 }
  508 
  509 int
  510 tcp_detach(struct socket *so)
  511 {
  512         struct inpcb *inp;
  513         struct tcpcb *otp = NULL, *tp;
  514         int error = 0;
  515         short ostate;
  516 
  517         soassertlocked(so);
  518 
  519         if ((error = tcp_sogetpcb(so, &inp, &tp)))
  520                 return (error);
  521 
  522         if (so->so_options & SO_DEBUG) {
  523                 otp = tp;
  524                 ostate = tp->t_state;
  525         }
  526 
  527         /*
  528          * Detach the TCP protocol from the socket.
  529          * If the protocol state is non-embryonic, then can't
  530          * do this directly: have to initiate a PRU_DISCONNECT,
  531          * which may finish later; embryonic TCB's can just
  532          * be discarded here.
  533          */
  534         tp = tcp_dodisconnect(tp);
  535 
  536         if (otp)
  537                 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DETACH, 0);
  538         return (error);
  539 }
  540 
  541 /*
  542  * Give the socket an address.
  543  */
  544 int
  545 tcp_bind(struct socket *so, struct mbuf *nam, struct proc *p)
  546 {
  547         struct inpcb *inp;
  548         struct tcpcb *tp;
  549         int error;
  550         short ostate;
  551 
  552         soassertlocked(so);
  553 
  554         if ((error = tcp_sogetpcb(so, &inp, &tp)))
  555                 return (error);
  556 
  557         if (so->so_options & SO_DEBUG)
  558                 ostate = tp->t_state;
  559 
  560         error = in_pcbbind(inp, nam, p);
  561 
  562         if (so->so_options & SO_DEBUG)
  563                 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_BIND, 0);
  564         return (error);
  565 }
  566 
  567 /*
  568  * Prepare to accept connections.
  569  */
  570 int
  571 tcp_listen(struct socket *so)
  572 {
  573         struct inpcb *inp;
  574         struct tcpcb *tp, *otp = NULL;
  575         int error;
  576         short ostate;
  577 
  578         soassertlocked(so);
  579 
  580         if ((error = tcp_sogetpcb(so, &inp, &tp)))
  581                 return (error);
  582 
  583         if (so->so_options & SO_DEBUG) {
  584                 otp = tp;
  585                 ostate = tp->t_state;
  586         }
  587 
  588         if (inp->inp_lport == 0)
  589                 if ((error = in_pcbbind(inp, NULL, curproc)))
  590                         goto out;
  591         
  592         /*
  593          * If the in_pcbbind() above is called, the tp->pf
  594          * should still be whatever it was before.
  595          */
  596         tp->t_state = TCPS_LISTEN;
  597 
  598 out:
  599         if (otp)
  600                 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_LISTEN, 0);
  601         return (error);
  602 }
  603 
  604 /*
  605  * Initiate connection to peer.
  606  * Create a template for use in transmissions on this connection.
  607  * Enter SYN_SENT state, and mark socket as connecting.
  608  * Start keep-alive timer, and seed output sequence space.
  609  * Send initial segment on connection.
  610  */
  611 int
  612 tcp_connect(struct socket *so, struct mbuf *nam)
  613 {
  614         struct inpcb *inp;
  615         struct tcpcb *tp, *otp = NULL;
  616         int error;
  617         short ostate;
  618 
  619         soassertlocked(so);
  620 
  621         if ((error = tcp_sogetpcb(so, &inp, &tp)))
  622                 return (error);
  623 
  624         if (so->so_options & SO_DEBUG) {
  625                 otp = tp;
  626                 ostate = tp->t_state;
  627         }
  628 
  629 #ifdef INET6
  630         if (inp->inp_flags & INP_IPV6) {
  631                 struct sockaddr_in6 *sin6;
  632 
  633                 if ((error = in6_nam2sin6(nam, &sin6)))
  634                         goto out;
  635                 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
  636                     IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
  637                         error = EINVAL;
  638                         goto out;
  639                 }
  640                 error = in6_pcbconnect(inp, nam);
  641         } else
  642 #endif /* INET6 */
  643         {
  644                 struct sockaddr_in *sin;
  645 
  646                 if ((error = in_nam2sin(nam, &sin)))
  647                         goto out;
  648                 if ((sin->sin_addr.s_addr == INADDR_ANY) ||
  649                     (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
  650                     IN_MULTICAST(sin->sin_addr.s_addr) ||
  651                     in_broadcast(sin->sin_addr, inp->inp_rtableid)) {
  652                         error = EINVAL;
  653                         goto out;
  654                 }
  655                 error = in_pcbconnect(inp, nam);
  656         }
  657         if (error)
  658                 goto out;
  659 
  660         tp->t_template = tcp_template(tp);
  661         if (tp->t_template == 0) {
  662                 in_pcbdisconnect(inp);
  663                 error = ENOBUFS;
  664                 goto out;
  665         }
  666 
  667         so->so_state |= SS_CONNECTOUT;
  668 
  669         /* Compute window scaling to request.  */
  670         tcp_rscale(tp, sb_max);
  671 
  672         soisconnecting(so);
  673         tcpstat_inc(tcps_connattempt);
  674         tp->t_state = TCPS_SYN_SENT;
  675         TCP_TIMER_ARM(tp, TCPT_KEEP, TCP_TIME(tcptv_keep_init));
  676         tcp_set_iss_tsm(tp);
  677         tcp_sendseqinit(tp);
  678         tp->snd_last = tp->snd_una;
  679         error = tcp_output(tp);
  680 
  681 out:
  682         if (otp)
  683                 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_CONNECT, 0);
  684         return (error);
  685 }
  686 
  687 /*
  688  * Accept a connection.  Essentially all the work is done at higher
  689  * levels; just return the address of the peer, storing through addr.
  690  */
  691 int
  692 tcp_accept(struct socket *so, struct mbuf *nam)
  693 {
  694         struct inpcb *inp;
  695         struct tcpcb *tp;
  696         int error;
  697         short ostate;
  698 
  699         soassertlocked(so);
  700 
  701         if ((error = tcp_sogetpcb(so, &inp, &tp)))
  702                 return (error);
  703 
  704         if (so->so_options & SO_DEBUG)
  705                 ostate = tp->t_state;
  706 
  707 #ifdef INET6
  708         if (inp->inp_flags & INP_IPV6)
  709                 in6_setpeeraddr(inp, nam);
  710         else
  711 #endif
  712                 in_setpeeraddr(inp, nam);
  713 
  714         if (so->so_options & SO_DEBUG)
  715                 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_ACCEPT, 0);
  716         return (error);
  717 }
  718 
  719 /*
  720  * Initiate disconnect from peer.
  721  * If connection never passed embryonic stage, just drop;
  722  * else if don't need to let data drain, then can just drop anyways,
  723  * else have to begin TCP shutdown process: mark socket disconnecting,
  724  * drain unread data, state switch to reflect user close, and
  725  * send segment (e.g. FIN) to peer.  Socket will be really disconnected
  726  * when peer sends FIN and acks ours.
  727  *
  728  * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
  729  */
  730 int
  731 tcp_disconnect(struct socket *so)
  732 {
  733         struct inpcb *inp;
  734         struct tcpcb *tp, *otp = NULL;
  735         int error;
  736         short ostate;
  737 
  738         soassertlocked(so);
  739 
  740         if ((error = tcp_sogetpcb(so, &inp, &tp)))
  741                 return (error);
  742 
  743         if (so->so_options & SO_DEBUG) {
  744                 otp = tp;
  745                 ostate = tp->t_state;
  746         }
  747 
  748         tp = tcp_dodisconnect(tp);
  749 
  750         if (otp)
  751                 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DISCONNECT, 0);
  752         return (0);
  753 }
  754 
  755 /*
  756  * Mark the connection as being incapable of further output.
  757  */
  758 int
  759 tcp_shutdown(struct socket *so)
  760 {
  761         struct inpcb *inp;
  762         struct tcpcb *tp, *otp = NULL;
  763         int error;
  764         short ostate;
  765 
  766         soassertlocked(so);
  767 
  768         if ((error = tcp_sogetpcb(so, &inp, &tp)))
  769                 return (error);
  770 
  771         if (so->so_options & SO_DEBUG) {
  772                 otp = tp;
  773                 ostate = tp->t_state;
  774         }
  775 
  776         if (so->so_snd.sb_state & SS_CANTSENDMORE)
  777                 goto out;
  778 
  779         socantsendmore(so);
  780         tp = tcp_usrclosed(tp);
  781         if (tp)
  782                 error = tcp_output(tp);
  783 
  784 out:
  785         if (otp)
  786                 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_SHUTDOWN, 0);
  787         return (error);
  788 }
  789 
  790 /*
  791  * After a receive, possibly send window update to peer.
  792  */
  793 void
  794 tcp_rcvd(struct socket *so)
  795 {
  796         struct inpcb *inp;
  797         struct tcpcb *tp;
  798         short ostate;
  799 
  800         soassertlocked(so);
  801 
  802         if (tcp_sogetpcb(so, &inp, &tp))
  803                 return;
  804 
  805         if (so->so_options & SO_DEBUG)
  806                 ostate = tp->t_state;
  807 
  808         /*
  809          * soreceive() calls this function when a user receives
  810          * ancillary data on a listening socket. We don't call
  811          * tcp_output in such a case, since there is no header
  812          * template for a listening socket and hence the kernel
  813          * will panic.
  814          */
  815         if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0)
  816                 (void) tcp_output(tp);
  817 
  818         if (so->so_options & SO_DEBUG)
  819                 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_RCVD, 0);
  820 }
  821 
  822 /*
  823  * Do a send by putting data in output queue and updating urgent
  824  * marker if URG set.  Possibly send more data.
  825  */
  826 int
  827 tcp_send(struct socket *so, struct mbuf *m, struct mbuf *nam,
  828     struct mbuf *control)
  829 {
  830         struct inpcb *inp;
  831         struct tcpcb *tp;
  832         int error;
  833         short ostate;
  834 
  835         soassertlocked(so);
  836 
  837         if (control && control->m_len) {
  838                 error = EINVAL;
  839                 goto out;
  840         }
  841 
  842         if ((error = tcp_sogetpcb(so, &inp, &tp)))
  843                 goto out;
  844 
  845         if (so->so_options & SO_DEBUG)
  846                 ostate = tp->t_state;
  847 
  848         sbappendstream(so, &so->so_snd, m);
  849         m = NULL;
  850 
  851         error = tcp_output(tp);
  852 
  853         if (so->so_options & SO_DEBUG)
  854                 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SEND, 0);
  855 
  856 out:
  857         m_freem(control);
  858         m_freem(m);
  859 
  860         return (error);
  861 }
  862 
  863 /*
  864  * Abort the TCP.
  865  */
  866 void
  867 tcp_abort(struct socket *so)
  868 {
  869         struct inpcb *inp;
  870         struct tcpcb *tp, *otp = NULL;
  871         short ostate;
  872 
  873         soassertlocked(so);
  874 
  875         if (tcp_sogetpcb(so, &inp, &tp))
  876                 return;
  877 
  878         if (so->so_options & SO_DEBUG) {
  879                 otp = tp;
  880                 ostate = tp->t_state;
  881         }
  882 
  883         tp = tcp_drop(tp, ECONNABORTED);
  884 
  885         if (otp)
  886                 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_ABORT, 0);
  887 }
  888 
  889 int
  890 tcp_sense(struct socket *so, struct stat *ub)
  891 {
  892         struct inpcb *inp;
  893         struct tcpcb *tp;
  894         int error;
  895 
  896         soassertlocked(so);
  897 
  898         if ((error = tcp_sogetpcb(so, &inp, &tp)))
  899                 return (error);
  900 
  901         ub->st_blksize = so->so_snd.sb_hiwat;
  902 
  903         if (so->so_options & SO_DEBUG)
  904                 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_SENSE, 0);
  905         return (0);
  906 }
  907 
  908 int
  909 tcp_rcvoob(struct socket *so, struct mbuf *m, int flags)
  910 {
  911         struct inpcb *inp;
  912         struct tcpcb *tp;
  913         int error;
  914 
  915         soassertlocked(so);
  916 
  917         if ((error = tcp_sogetpcb(so, &inp, &tp)))
  918                 return (error);
  919 
  920         if ((so->so_oobmark == 0 &&
  921             (so->so_rcv.sb_state & SS_RCVATMARK) == 0) ||
  922             so->so_options & SO_OOBINLINE ||
  923             tp->t_oobflags & TCPOOB_HADDATA) {
  924                 error = EINVAL;
  925                 goto out;
  926         }
  927         if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
  928                 error = EWOULDBLOCK;
  929                 goto out;
  930         }
  931         m->m_len = 1;
  932         *mtod(m, caddr_t) = tp->t_iobc;
  933         if ((flags & MSG_PEEK) == 0)
  934                 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
  935 out:
  936         if (so->so_options & SO_DEBUG)
  937                 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_RCVOOB, 0);
  938         return (error);
  939 }
  940 
  941 int
  942 tcp_sendoob(struct socket *so, struct mbuf *m, struct mbuf *nam,
  943     struct mbuf *control)
  944 {
  945         struct inpcb *inp;
  946         struct tcpcb *tp;
  947         int error;
  948         short ostate;
  949 
  950         soassertlocked(so);
  951 
  952         if (control && control->m_len) {
  953                 error = EINVAL;
  954                 goto release;
  955         }
  956 
  957         if ((error = tcp_sogetpcb(so, &inp, &tp)))
  958                 goto release;
  959 
  960         if (so->so_options & SO_DEBUG)
  961                 ostate = tp->t_state;
  962 
  963         if (sbspace(so, &so->so_snd) < -512) {
  964                 error = ENOBUFS;
  965                 goto out;
  966         }
  967 
  968         /*
  969          * According to RFC961 (Assigned Protocols),
  970          * the urgent pointer points to the last octet
  971          * of urgent data.  We continue, however,
  972          * to consider it to indicate the first octet
  973          * of data past the urgent section.
  974          * Otherwise, snd_up should be one lower.
  975          */
  976         sbappendstream(so, &so->so_snd, m);
  977         m = NULL;
  978         tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
  979         tp->t_force = 1;
  980         error = tcp_output(tp);
  981         tp->t_force = 0;
  982 
  983 out:
  984         if (so->so_options & SO_DEBUG)
  985                 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SENDOOB, 0);
  986 
  987 release:
  988         m_freem(control);
  989         m_freem(m);
  990 
  991         return (error);
  992 }
  993 
  994 int
  995 tcp_sockaddr(struct socket *so, struct mbuf *nam)
  996 {
  997         struct inpcb *inp;
  998         struct tcpcb *tp;
  999         int error;
 1000 
 1001         soassertlocked(so);
 1002 
 1003         if ((error = tcp_sogetpcb(so, &inp, &tp)))
 1004                 return (error);
 1005 
 1006 #ifdef INET6
 1007         if (inp->inp_flags & INP_IPV6)
 1008                 in6_setsockaddr(inp, nam);
 1009         else
 1010 #endif
 1011                 in_setsockaddr(inp, nam);
 1012 
 1013         if (so->so_options & SO_DEBUG)
 1014                 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL,
 1015                     PRU_SOCKADDR, 0);
 1016         return (0);
 1017 }
 1018 
 1019 int
 1020 tcp_peeraddr(struct socket *so, struct mbuf *nam)
 1021 {
 1022         struct inpcb *inp;
 1023         struct tcpcb *tp;
 1024         int error;
 1025 
 1026         soassertlocked(so);
 1027 
 1028         if ((error = tcp_sogetpcb(so, &inp, &tp)))
 1029                 return (error);
 1030 
 1031 #ifdef INET6
 1032         if (inp->inp_flags & INP_IPV6)
 1033                 in6_setpeeraddr(inp, nam);
 1034         else
 1035 #endif
 1036                 in_setpeeraddr(inp, nam);
 1037 
 1038         if (so->so_options & SO_DEBUG)
 1039                 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL,
 1040                     PRU_PEERADDR, 0);
 1041         return (0);
 1042 }
 1043 
 1044 /*
 1045  * Initiate (or continue) disconnect.
 1046  * If embryonic state, just send reset (once).
 1047  * If in ``let data drain'' option and linger null, just drop.
 1048  * Otherwise (hard), mark socket disconnecting and drop
 1049  * current input data; switch states based on user close, and
 1050  * send segment to peer (with FIN).
 1051  */
 1052 struct tcpcb *
 1053 tcp_dodisconnect(struct tcpcb *tp)
 1054 {
 1055         struct socket *so = tp->t_inpcb->inp_socket;
 1056 
 1057         if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
 1058                 tp = tcp_close(tp);
 1059         else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
 1060                 tp = tcp_drop(tp, 0);
 1061         else {
 1062                 soisdisconnecting(so);
 1063                 sbflush(so, &so->so_rcv);
 1064                 tp = tcp_usrclosed(tp);
 1065                 if (tp)
 1066                         (void) tcp_output(tp);
 1067         }
 1068         return (tp);
 1069 }
 1070 
 1071 /*
 1072  * User issued close, and wish to trail through shutdown states:
 1073  * if never received SYN, just forget it.  If got a SYN from peer,
 1074  * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 1075  * If already got a FIN from peer, then almost done; go to LAST_ACK
 1076  * state.  In all other cases, have already sent FIN to peer (e.g.
 1077  * after PRU_SHUTDOWN), and just have to play tedious game waiting
 1078  * for peer to send FIN or not respond to keep-alives, etc.
 1079  * We can let the user exit from the close as soon as the FIN is acked.
 1080  */
 1081 struct tcpcb *
 1082 tcp_usrclosed(struct tcpcb *tp)
 1083 {
 1084 
 1085         switch (tp->t_state) {
 1086 
 1087         case TCPS_CLOSED:
 1088         case TCPS_LISTEN:
 1089         case TCPS_SYN_SENT:
 1090                 tp->t_state = TCPS_CLOSED;
 1091                 tp = tcp_close(tp);
 1092                 break;
 1093 
 1094         case TCPS_SYN_RECEIVED:
 1095         case TCPS_ESTABLISHED:
 1096                 tp->t_state = TCPS_FIN_WAIT_1;
 1097                 break;
 1098 
 1099         case TCPS_CLOSE_WAIT:
 1100                 tp->t_state = TCPS_LAST_ACK;
 1101                 break;
 1102         }
 1103         if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
 1104                 soisdisconnected(tp->t_inpcb->inp_socket);
 1105                 /*
 1106                  * If we are in FIN_WAIT_2, we arrived here because the
 1107                  * application did a shutdown of the send side.  Like the
 1108                  * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
 1109                  * a full close, we start a timer to make sure sockets are
 1110                  * not left in FIN_WAIT_2 forever.
 1111                  */
 1112                 if (tp->t_state == TCPS_FIN_WAIT_2)
 1113                         TCP_TIMER_ARM(tp, TCPT_2MSL, TCP_TIME(tcp_maxidle));
 1114         }
 1115         return (tp);
 1116 }
 1117 
 1118 /*
 1119  * Look up a socket for ident or tcpdrop, ...
 1120  */
 1121 int
 1122 tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop)
 1123 {
 1124         int error = 0;
 1125         struct tcp_ident_mapping tir;
 1126         struct inpcb *inp;
 1127         struct tcpcb *tp = NULL;
 1128         struct sockaddr_in *fin, *lin;
 1129 #ifdef INET6
 1130         struct sockaddr_in6 *fin6, *lin6;
 1131         struct in6_addr f6, l6;
 1132 #endif
 1133 
 1134         NET_ASSERT_LOCKED();
 1135 
 1136         if (dodrop) {
 1137                 if (oldp != NULL || *oldlenp != 0)
 1138                         return (EINVAL);
 1139                 if (newp == NULL)
 1140                         return (EPERM);
 1141                 if (newlen < sizeof(tir))
 1142                         return (ENOMEM);
 1143                 if ((error = copyin(newp, &tir, sizeof (tir))) != 0 )
 1144                         return (error);
 1145         } else {
 1146                 if (oldp == NULL)
 1147                         return (EINVAL);
 1148                 if (*oldlenp < sizeof(tir))
 1149                         return (ENOMEM);
 1150                 if (newp != NULL || newlen != 0)
 1151                         return (EINVAL);
 1152                 if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 )
 1153                         return (error);
 1154         }
 1155         switch (tir.faddr.ss_family) {
 1156 #ifdef INET6
 1157         case AF_INET6:
 1158                 fin6 = (struct sockaddr_in6 *)&tir.faddr;
 1159                 error = in6_embedscope(&f6, fin6, NULL);
 1160                 if (error)
 1161                         return EINVAL;  /*?*/
 1162                 lin6 = (struct sockaddr_in6 *)&tir.laddr;
 1163                 error = in6_embedscope(&l6, lin6, NULL);
 1164                 if (error)
 1165                         return EINVAL;  /*?*/
 1166                 break;
 1167 #endif
 1168         case AF_INET:
 1169                 fin = (struct sockaddr_in *)&tir.faddr;
 1170                 lin = (struct sockaddr_in *)&tir.laddr;
 1171                 break;
 1172         default:
 1173                 return (EINVAL);
 1174         }
 1175 
 1176         switch (tir.faddr.ss_family) {
 1177 #ifdef INET6
 1178         case AF_INET6:
 1179                 inp = in6_pcblookup(&tcbtable, &f6,
 1180                     fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain);
 1181                 break;
 1182 #endif
 1183         case AF_INET:
 1184                 inp = in_pcblookup(&tcbtable, fin->sin_addr,
 1185                     fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain);
 1186                 break;
 1187         default:
 1188                 unhandled_af(tir.faddr.ss_family);
 1189         }
 1190 
 1191         if (dodrop) {
 1192                 if (inp && (tp = intotcpcb(inp)) &&
 1193                     ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0))
 1194                         tp = tcp_drop(tp, ECONNABORTED);
 1195                 else
 1196                         error = ESRCH;
 1197                 in_pcbunref(inp);
 1198                 return (error);
 1199         }
 1200 
 1201         if (inp == NULL) {
 1202                 tcpstat_inc(tcps_pcbhashmiss);
 1203                 switch (tir.faddr.ss_family) {
 1204 #ifdef INET6
 1205                 case AF_INET6:
 1206                         inp = in6_pcblookup_listen(&tcbtable,
 1207                             &l6, lin6->sin6_port, NULL, tir.rdomain);
 1208                         break;
 1209 #endif
 1210                 case AF_INET:
 1211                         inp = in_pcblookup_listen(&tcbtable,
 1212                             lin->sin_addr, lin->sin_port, NULL, tir.rdomain);
 1213                         break;
 1214                 }
 1215         }
 1216 
 1217         if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) {
 1218                 tir.ruid = inp->inp_socket->so_ruid;
 1219                 tir.euid = inp->inp_socket->so_euid;
 1220         } else {
 1221                 tir.ruid = -1;
 1222                 tir.euid = -1;
 1223         }
 1224 
 1225         *oldlenp = sizeof (tir);
 1226         error = copyout((void *)&tir, oldp, sizeof (tir));
 1227         in_pcbunref(inp);
 1228         return (error);
 1229 }
 1230 
 1231 int
 1232 tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp)
 1233 {
 1234         uint64_t counters[tcps_ncounters];
 1235         struct tcpstat tcpstat;
 1236         struct syn_cache_set *set;
 1237         int i = 0;
 1238 
 1239 #define ASSIGN(field)   do { tcpstat.field = counters[i++]; } while (0)
 1240 
 1241         memset(&tcpstat, 0, sizeof tcpstat);
 1242         counters_read(tcpcounters, counters, nitems(counters));
 1243         ASSIGN(tcps_connattempt);
 1244         ASSIGN(tcps_accepts);
 1245         ASSIGN(tcps_connects);
 1246         ASSIGN(tcps_drops);
 1247         ASSIGN(tcps_conndrops);
 1248         ASSIGN(tcps_closed);
 1249         ASSIGN(tcps_segstimed);
 1250         ASSIGN(tcps_rttupdated);
 1251         ASSIGN(tcps_delack);
 1252         ASSIGN(tcps_timeoutdrop);
 1253         ASSIGN(tcps_rexmttimeo);
 1254         ASSIGN(tcps_persisttimeo);
 1255         ASSIGN(tcps_persistdrop);
 1256         ASSIGN(tcps_keeptimeo);
 1257         ASSIGN(tcps_keepprobe);
 1258         ASSIGN(tcps_keepdrops);
 1259         ASSIGN(tcps_sndtotal);
 1260         ASSIGN(tcps_sndpack);
 1261         ASSIGN(tcps_sndbyte);
 1262         ASSIGN(tcps_sndrexmitpack);
 1263         ASSIGN(tcps_sndrexmitbyte);
 1264         ASSIGN(tcps_sndrexmitfast);
 1265         ASSIGN(tcps_sndacks);
 1266         ASSIGN(tcps_sndprobe);
 1267         ASSIGN(tcps_sndurg);
 1268         ASSIGN(tcps_sndwinup);
 1269         ASSIGN(tcps_sndctrl);
 1270         ASSIGN(tcps_rcvtotal);
 1271         ASSIGN(tcps_rcvpack);
 1272         ASSIGN(tcps_rcvbyte);
 1273         ASSIGN(tcps_rcvbadsum);
 1274         ASSIGN(tcps_rcvbadoff);
 1275         ASSIGN(tcps_rcvmemdrop);
 1276         ASSIGN(tcps_rcvnosec);
 1277         ASSIGN(tcps_rcvshort);
 1278         ASSIGN(tcps_rcvduppack);
 1279         ASSIGN(tcps_rcvdupbyte);
 1280         ASSIGN(tcps_rcvpartduppack);
 1281         ASSIGN(tcps_rcvpartdupbyte);
 1282         ASSIGN(tcps_rcvoopack);
 1283         ASSIGN(tcps_rcvoobyte);
 1284         ASSIGN(tcps_rcvpackafterwin);
 1285         ASSIGN(tcps_rcvbyteafterwin);
 1286         ASSIGN(tcps_rcvafterclose);
 1287         ASSIGN(tcps_rcvwinprobe);
 1288         ASSIGN(tcps_rcvdupack);
 1289         ASSIGN(tcps_rcvacktoomuch);
 1290         ASSIGN(tcps_rcvacktooold);
 1291         ASSIGN(tcps_rcvackpack);
 1292         ASSIGN(tcps_rcvackbyte);
 1293         ASSIGN(tcps_rcvwinupd);
 1294         ASSIGN(tcps_pawsdrop);
 1295         ASSIGN(tcps_predack);
 1296         ASSIGN(tcps_preddat);
 1297         ASSIGN(tcps_pcbhashmiss);
 1298         ASSIGN(tcps_noport);
 1299         ASSIGN(tcps_badsyn);
 1300         ASSIGN(tcps_dropsyn);
 1301         ASSIGN(tcps_rcvbadsig);
 1302         ASSIGN(tcps_rcvgoodsig);
 1303         ASSIGN(tcps_inswcsum);
 1304         ASSIGN(tcps_outswcsum);
 1305         ASSIGN(tcps_ecn_accepts);
 1306         ASSIGN(tcps_ecn_rcvece);
 1307         ASSIGN(tcps_ecn_rcvcwr);
 1308         ASSIGN(tcps_ecn_rcvce);
 1309         ASSIGN(tcps_ecn_sndect);
 1310         ASSIGN(tcps_ecn_sndece);
 1311         ASSIGN(tcps_ecn_sndcwr);
 1312         ASSIGN(tcps_cwr_ecn);
 1313         ASSIGN(tcps_cwr_frecovery);
 1314         ASSIGN(tcps_cwr_timeout);
 1315         ASSIGN(tcps_sc_added);
 1316         ASSIGN(tcps_sc_completed);
 1317         ASSIGN(tcps_sc_timed_out);
 1318         ASSIGN(tcps_sc_overflowed);
 1319         ASSIGN(tcps_sc_reset);
 1320         ASSIGN(tcps_sc_unreach);
 1321         ASSIGN(tcps_sc_bucketoverflow);
 1322         ASSIGN(tcps_sc_aborted);
 1323         ASSIGN(tcps_sc_dupesyn);
 1324         ASSIGN(tcps_sc_dropped);
 1325         ASSIGN(tcps_sc_collisions);
 1326         ASSIGN(tcps_sc_retransmitted);
 1327         ASSIGN(tcps_sc_seedrandom);
 1328         ASSIGN(tcps_sc_hash_size);
 1329         ASSIGN(tcps_sc_entry_count);
 1330         ASSIGN(tcps_sc_entry_limit);
 1331         ASSIGN(tcps_sc_bucket_maxlen);
 1332         ASSIGN(tcps_sc_bucket_limit);
 1333         ASSIGN(tcps_sc_uses_left);
 1334         ASSIGN(tcps_conndrained);
 1335         ASSIGN(tcps_sack_recovery_episode);
 1336         ASSIGN(tcps_sack_rexmits);
 1337         ASSIGN(tcps_sack_rexmit_bytes);
 1338         ASSIGN(tcps_sack_rcv_opts);
 1339         ASSIGN(tcps_sack_snd_opts);
 1340         ASSIGN(tcps_sack_drop_opts);
 1341 
 1342 #undef ASSIGN
 1343 
 1344         set = &tcp_syn_cache[tcp_syn_cache_active];
 1345         tcpstat.tcps_sc_hash_size = set->scs_size;
 1346         tcpstat.tcps_sc_entry_count = set->scs_count;
 1347         tcpstat.tcps_sc_entry_limit = tcp_syn_cache_limit;
 1348         tcpstat.tcps_sc_bucket_maxlen = 0;
 1349         for (i = 0; i < set->scs_size; i++) {
 1350                 if (tcpstat.tcps_sc_bucket_maxlen <
 1351                     set->scs_buckethead[i].sch_length)
 1352                         tcpstat.tcps_sc_bucket_maxlen =
 1353                                 set->scs_buckethead[i].sch_length;
 1354         }
 1355         tcpstat.tcps_sc_bucket_limit = tcp_syn_bucket_limit;
 1356         tcpstat.tcps_sc_uses_left = set->scs_use;
 1357 
 1358         return (sysctl_rdstruct(oldp, oldlenp, newp,
 1359             &tcpstat, sizeof(tcpstat)));
 1360 }
 1361 
 1362 /*
 1363  * Sysctl for tcp variables.
 1364  */
 1365 int
 1366 tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
 1367     size_t newlen)
 1368 {
 1369         int error, nval;
 1370 
 1371         /* All sysctl names at this level are terminal. */
 1372         if (namelen != 1)
 1373                 return (ENOTDIR);
 1374 
 1375         switch (name[0]) {
 1376         case TCPCTL_BADDYNAMIC:
 1377                 NET_LOCK();
 1378                 error = sysctl_struct(oldp, oldlenp, newp, newlen,
 1379                     baddynamicports.tcp, sizeof(baddynamicports.tcp));
 1380                 NET_UNLOCK();
 1381                 return (error);
 1382 
 1383         case TCPCTL_ROOTONLY:
 1384                 if (newp && securelevel > 0)
 1385                         return (EPERM);
 1386                 NET_LOCK();
 1387                 error = sysctl_struct(oldp, oldlenp, newp, newlen,
 1388                     rootonlyports.tcp, sizeof(rootonlyports.tcp));
 1389                 NET_UNLOCK();
 1390                 return (error);
 1391 
 1392         case TCPCTL_IDENT:
 1393                 NET_LOCK();
 1394                 error = tcp_ident(oldp, oldlenp, newp, newlen, 0);
 1395                 NET_UNLOCK();
 1396                 return (error);
 1397 
 1398         case TCPCTL_DROP:
 1399                 NET_LOCK();
 1400                 error = tcp_ident(oldp, oldlenp, newp, newlen, 1);
 1401                 NET_UNLOCK();
 1402                 return (error);
 1403 
 1404         case TCPCTL_REASS_LIMIT:
 1405                 NET_LOCK();
 1406                 nval = tcp_reass_limit;
 1407                 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
 1408                 if (!error && nval != tcp_reass_limit) {
 1409                         error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0);
 1410                         if (!error)
 1411                                 tcp_reass_limit = nval;
 1412                 }
 1413                 NET_UNLOCK();
 1414                 return (error);
 1415 
 1416         case TCPCTL_SACKHOLE_LIMIT:
 1417                 NET_LOCK();
 1418                 nval = tcp_sackhole_limit;
 1419                 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
 1420                 if (!error && nval != tcp_sackhole_limit) {
 1421                         error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0);
 1422                         if (!error)
 1423                                 tcp_sackhole_limit = nval;
 1424                 }
 1425                 NET_UNLOCK();
 1426                 return (error);
 1427 
 1428         case TCPCTL_STATS:
 1429                 return (tcp_sysctl_tcpstat(oldp, oldlenp, newp));
 1430 
 1431         case TCPCTL_SYN_USE_LIMIT:
 1432                 NET_LOCK();
 1433                 error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
 1434                     &tcp_syn_use_limit, 0, INT_MAX);
 1435                 if (!error && newp != NULL) {
 1436                         /*
 1437                          * Global tcp_syn_use_limit is used when reseeding a
 1438                          * new cache.  Also update the value in active cache.
 1439                          */
 1440                         if (tcp_syn_cache[0].scs_use > tcp_syn_use_limit)
 1441                                 tcp_syn_cache[0].scs_use = tcp_syn_use_limit;
 1442                         if (tcp_syn_cache[1].scs_use > tcp_syn_use_limit)
 1443                                 tcp_syn_cache[1].scs_use = tcp_syn_use_limit;
 1444                 }
 1445                 NET_UNLOCK();
 1446                 return (error);
 1447 
 1448         case TCPCTL_SYN_HASH_SIZE:
 1449                 NET_LOCK();
 1450                 nval = tcp_syn_hash_size;
 1451                 error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
 1452                     &nval, 1, 100000);
 1453                 if (!error && nval != tcp_syn_hash_size) {
 1454                         /*
 1455                          * If global hash size has been changed,
 1456                          * switch sets as soon as possible.  Then
 1457                          * the actual hash array will be reallocated.
 1458                          */
 1459                         if (tcp_syn_cache[0].scs_size != nval)
 1460                                 tcp_syn_cache[0].scs_use = 0;
 1461                         if (tcp_syn_cache[1].scs_size != nval)
 1462                                 tcp_syn_cache[1].scs_use = 0;
 1463                         tcp_syn_hash_size = nval;
 1464                 }
 1465                 NET_UNLOCK();
 1466                 return (error);
 1467 
 1468         default:
 1469                 NET_LOCK();
 1470                 error = sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars), name,
 1471                      namelen, oldp, oldlenp, newp, newlen);
 1472                 NET_UNLOCK();
 1473                 return (error);
 1474         }
 1475         /* NOTREACHED */
 1476 }
 1477 
 1478 /*
 1479  * Scale the send buffer so that inflight data is not accounted against
 1480  * the limit. The buffer will scale with the congestion window, if the
 1481  * the receiver stops acking data the window will shrink and therefore
 1482  * the buffer size will shrink as well.
 1483  * In low memory situation try to shrink the buffer to the initial size
 1484  * disabling the send buffer scaling as long as the situation persists.
 1485  */
 1486 void
 1487 tcp_update_sndspace(struct tcpcb *tp)
 1488 {
 1489         struct socket *so = tp->t_inpcb->inp_socket;
 1490         u_long nmax = so->so_snd.sb_hiwat;
 1491 
 1492         if (sbchecklowmem()) {
 1493                 /* low on memory try to get rid of some */
 1494                 if (tcp_sendspace < nmax)
 1495                         nmax = tcp_sendspace;
 1496         } else if (so->so_snd.sb_wat != tcp_sendspace)
 1497                 /* user requested buffer size, auto-scaling disabled */
 1498                 nmax = so->so_snd.sb_wat;
 1499         else
 1500                 /* automatic buffer scaling */
 1501                 nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max -
 1502                     tp->snd_una);
 1503 
 1504         /* a writable socket must be preserved because of poll(2) semantics */
 1505         if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) {
 1506                 if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat)
 1507                         nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat;
 1508                 /* keep in sync with sbreserve() calculation */
 1509                 if (nmax * 8 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat)
 1510                         nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+7) / 8;
 1511         }
 1512 
 1513         /* round to MSS boundary */
 1514         nmax = roundup(nmax, tp->t_maxseg);
 1515 
 1516         if (nmax != so->so_snd.sb_hiwat)
 1517                 sbreserve(so, &so->so_snd, nmax);
 1518 }
 1519 
 1520 /*
 1521  * Scale the recv buffer by looking at how much data was transferred in
 1522  * one approximated RTT. If more than a big part of the recv buffer was
 1523  * transferred during that time we increase the buffer by a constant.
 1524  * In low memory situation try to shrink the buffer to the initial size.
 1525  */
 1526 void
 1527 tcp_update_rcvspace(struct tcpcb *tp)
 1528 {
 1529         struct socket *so = tp->t_inpcb->inp_socket;
 1530         u_long nmax = so->so_rcv.sb_hiwat;
 1531 
 1532         if (sbchecklowmem()) {
 1533                 /* low on memory try to get rid of some */
 1534                 if (tcp_recvspace < nmax)
 1535                         nmax = tcp_recvspace;
 1536         } else if (so->so_rcv.sb_wat != tcp_recvspace)
 1537                 /* user requested buffer size, auto-scaling disabled */
 1538                 nmax = so->so_rcv.sb_wat;
 1539         else {
 1540                 /* automatic buffer scaling */
 1541                 if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7)
 1542                         nmax = MIN(sb_max, so->so_rcv.sb_hiwat +
 1543                             tcp_autorcvbuf_inc);
 1544         }
 1545 
 1546         /* a readable socket must be preserved because of poll(2) semantics */
 1547         if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat &&
 1548             nmax < so->so_snd.sb_lowat)
 1549                 nmax = so->so_snd.sb_lowat;
 1550 
 1551         if (nmax == so->so_rcv.sb_hiwat)
 1552                 return;
 1553 
 1554         /* round to MSS boundary */
 1555         nmax = roundup(nmax, tp->t_maxseg);
 1556         sbreserve(so, &so->so_rcv, nmax);
 1557 }

Cache object: 5523041c5f4e697d384ff49c08e3e858


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.