tcp_output.c

Version: - FREEBSD - FREEBSD-13-STABLE - FREEBSD-13-0 - FREEBSD-12-STABLE - FREEBSD-12-0 - FREEBSD-11-STABLE - FREEBSD-11-0 - FREEBSD-10-STABLE - FREEBSD-10-0 - FREEBSD-9-STABLE - FREEBSD-9-0 - FREEBSD-8-STABLE - FREEBSD-8-0 - FREEBSD-7-STABLE - FREEBSD-7-0 - FREEBSD-6-STABLE - FREEBSD-6-0 - FREEBSD-5-STABLE - FREEBSD-5-0 - FREEBSD-4-STABLE - FREEBSD-3-STABLE - FREEBSD22 - l41 - OPENBSD - linux-2.6 - MK84 - PLAN9 - xnu-8792
SearchContext: - none - 3 - 10
    1 /*      $NetBSD: tcp_output.c,v 1.153.2.2 2011/04/03 15:05:13 riz Exp $ */
    2 
    3 /*
    4  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
    5  * All rights reserved.
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  * 1. Redistributions of source code must retain the above copyright
   11  *    notice, this list of conditions and the following disclaimer.
   12  * 2. Redistributions in binary form must reproduce the above copyright
   13  *    notice, this list of conditions and the following disclaimer in the
   14  *    documentation and/or other materials provided with the distribution.
   15  * 3. Neither the name of the project nor the names of its contributors
   16  *    may be used to endorse or promote products derived from this software
   17  *    without specific prior written permission.
   18  *
   19  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
   20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
   23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   29  * SUCH DAMAGE.
   30  */
   31 
   32 /*
   33  *      @(#)COPYRIGHT   1.1 (NRL) 17 January 1995
   34  *
   35  * NRL grants permission for redistribution and use in source and binary
   36  * forms, with or without modification, of the software and documentation
   37  * created at NRL provided that the following conditions are met:
   38  *
   39  * 1. Redistributions of source code must retain the above copyright
   40  *    notice, this list of conditions and the following disclaimer.
   41  * 2. Redistributions in binary form must reproduce the above copyright
   42  *    notice, this list of conditions and the following disclaimer in the
   43  *    documentation and/or other materials provided with the distribution.
   44  * 3. All advertising materials mentioning features or use of this software
   45  *    must display the following acknowledgements:
   46  *      This product includes software developed by the University of
   47  *      California, Berkeley and its contributors.
   48  *      This product includes software developed at the Information
   49  *      Technology Division, US Naval Research Laboratory.
   50  * 4. Neither the name of the NRL nor the names of its contributors
   51  *    may be used to endorse or promote products derived from this software
   52  *    without specific prior written permission.
   53  *
   54  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
   55  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   56  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
   57  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
   58  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   59  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   60  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   61  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   62  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   63  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   64  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   65  *
   66  * The views and conclusions contained in the software and documentation
   67  * are those of the authors and should not be interpreted as representing
   68  * official policies, either expressed or implied, of the US Naval
   69  * Research Laboratory (NRL).
   70  */
   71 
   72 /*-
   73  * Copyright (c) 1997, 1998, 2001, 2005, 2006 The NetBSD Foundation, Inc.
   74  * All rights reserved.
   75  *
   76  * This code is derived from software contributed to The NetBSD Foundation
   77  * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
   78  * Facility, NASA Ames Research Center.
   79  * This code is derived from software contributed to The NetBSD Foundation
   80  * by Charles M. Hannum.
   81  * This code is derived from software contributed to The NetBSD Foundation
   82  * by Rui Paulo.
   83  *
   84  * Redistribution and use in source and binary forms, with or without
   85  * modification, are permitted provided that the following conditions
   86  * are met:
   87  * 1. Redistributions of source code must retain the above copyright
   88  *    notice, this list of conditions and the following disclaimer.
   89  * 2. Redistributions in binary form must reproduce the above copyright
   90  *    notice, this list of conditions and the following disclaimer in the
   91  *    documentation and/or other materials provided with the distribution.
   92  * 3. All advertising materials mentioning features or use of this software
   93  *    must display the following acknowledgement:
   94  *      This product includes software developed by the NetBSD
   95  *      Foundation, Inc. and its contributors.
   96  * 4. Neither the name of The NetBSD Foundation nor the names of its
   97  *    contributors may be used to endorse or promote products derived
   98  *    from this software without specific prior written permission.
   99  *
  100  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  101  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  102  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  103  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  104  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  105  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  106  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  107  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  108  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  109  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  110  * POSSIBILITY OF SUCH DAMAGE.
  111  */
  112 
  113 /*
  114  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  115  *      The Regents of the University of California.  All rights reserved.
  116  *
  117  * Redistribution and use in source and binary forms, with or without
  118  * modification, are permitted provided that the following conditions
  119  * are met:
  120  * 1. Redistributions of source code must retain the above copyright
  121  *    notice, this list of conditions and the following disclaimer.
  122  * 2. Redistributions in binary form must reproduce the above copyright
  123  *    notice, this list of conditions and the following disclaimer in the
  124  *    documentation and/or other materials provided with the distribution.
  125  * 3. Neither the name of the University nor the names of its contributors
  126  *    may be used to endorse or promote products derived from this software
  127  *    without specific prior written permission.
  128  *
  129  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  130  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  131  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  132  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  133  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  134  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  135  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  136  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  137  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  138  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  139  * SUCH DAMAGE.
  140  *
  141  *      @(#)tcp_output.c        8.4 (Berkeley) 5/24/95
  142  */
  143 
  144 #include <sys/cdefs.h>
  145 __KERNEL_RCSID(0, "$NetBSD: tcp_output.c,v 1.153.2.2 2011/04/03 15:05:13 riz Exp $");
  146 
  147 #include "opt_inet.h"
  148 #include "opt_ipsec.h"
  149 #include "opt_tcp_debug.h"
  150 
  151 #include <sys/param.h>
  152 #include <sys/systm.h>
  153 #include <sys/malloc.h>
  154 #include <sys/mbuf.h>
  155 #include <sys/protosw.h>
  156 #include <sys/socket.h>
  157 #include <sys/socketvar.h>
  158 #include <sys/errno.h>
  159 #include <sys/domain.h>
  160 #include <sys/kernel.h>
  161 #ifdef TCP_SIGNATURE
  162 #include <sys/md5.h>
  163 #endif
  164 
  165 #include <net/if.h>
  166 #include <net/route.h>
  167 
  168 #include <netinet/in.h>
  169 #include <netinet/in_systm.h>
  170 #include <netinet/ip.h>
  171 #include <netinet/in_pcb.h>
  172 #include <netinet/ip_var.h>
  173 
  174 #ifdef INET6
  175 #ifndef INET
  176 #include <netinet/in.h>
  177 #endif
  178 #include <netinet/ip6.h>
  179 #include <netinet6/in6_var.h>
  180 #include <netinet6/ip6_var.h>
  181 #include <netinet6/in6_pcb.h>
  182 #include <netinet6/nd6.h>
  183 #endif
  184 
  185 #ifdef FAST_IPSEC
  186 #include <netipsec/ipsec.h>
  187 #include <netipsec/key.h>
  188 #ifdef INET6
  189 #include <netipsec/ipsec6.h>
  190 #endif
  191 #endif  /* FAST_IPSEC*/
  192 #ifdef IPSEC
  193 #include <netinet6/ipsec.h>
  194 #endif
  195 
  196 #include <netinet/tcp.h>
  197 #define TCPOUTFLAGS
  198 #include <netinet/tcp_fsm.h>
  199 #include <netinet/tcp_seq.h>
  200 #include <netinet/tcp_timer.h>
  201 #include <netinet/tcp_var.h>
  202 #include <netinet/tcp_congctl.h>
  203 #include <netinet/tcpip.h>
  204 #include <netinet/tcp_debug.h>
  205 #include <netinet/in_offload.h>
  206 #include <netinet6/in6_offload.h>
  207 
  208 #ifdef IPSEC
  209 #include <netkey/key.h>
  210 #endif
  211 
  212 #ifdef notyet
  213 extern struct mbuf *m_copypack();       /* old-style declaration (no prototype); called by the "notyet" path of tcp_build_datapkt() */
  214 #endif
  215 
  216 /*
  217  * Knobs for Congestion Window Monitoring (CWM).  When tcp_cwm is
  218  * non-zero, tcp_output() limits snd_cwnd to tcp_cwm_burstsize segments
  219  * plus (snd_nxt - snd_una).  Default burst is 4 packets, per the Internet draft.
  220  */
  221 int     tcp_cwm = 0;                    /* zero: CWM is off by default */
  222 int     tcp_cwm_burstsize = 4;          /* multiplied by the tx segment size in tcp_output() */
  223 
  224 #ifdef TCP_OUTPUT_COUNTERS
  225 #include <sys/device.h>
  226 /* Event counters bumped by tcp_build_datapkt(); defined elsewhere. */
  227 extern struct evcnt tcp_output_bigheader;       /* max_linkhdr + hdrlen exceeded MHLEN */
  228 extern struct evcnt tcp_output_predict_hit;     /* send followed directly on the previous one */
  229 extern struct evcnt tcp_output_predict_miss;    /* restarted the scan from so_snd.sb_mb */
  230 extern struct evcnt tcp_output_copysmall;       /* payload copied into the header mbuf */
  231 extern struct evcnt tcp_output_copybig;         /* payload chained via m_copy(), first mbuf lacks M_EXT */
  232 extern struct evcnt tcp_output_refbig;          /* payload chained via m_copy(), first mbuf has M_EXT */
  233 /* Takes a pointer to a struct evcnt and increments its ev_count. */
  234 #define TCP_OUTPUT_COUNTER_INCR(ev)     (ev)->ev_count++
  235 #else
  236 /* Counters compiled out: the macro expands to nothing. */
  237 #define TCP_OUTPUT_COUNTER_INCR(ev)     /* nothing */
  238 
  239 #endif /* TCP_OUTPUT_COUNTERS */
  240 
  241 static
  242 #ifndef GPROF
  243 inline
  244 #endif
  245 int
  246 tcp_segsize(struct tcpcb *tp, int *txsegsizep, int *rxsegsizep,
  247     boolean_t *alwaysfragp)
  248 {
  249 #ifdef INET
  250         struct inpcb *inp = tp->t_inpcb;
  251 #endif
  252 #ifdef INET6
  253         struct in6pcb *in6p = tp->t_in6pcb;
  254 #endif
  255         struct socket *so = NULL;
  256         struct rtentry *rt;
  257         struct ifnet *ifp;
  258         int size;
  259         int hdrlen;
  260         int optlen;
  261 
  262         *alwaysfragp = FALSE;
  263 
  264 #ifdef DIAGNOSTIC
  265         if (tp->t_inpcb && tp->t_in6pcb)
  266                 panic("tcp_segsize: both t_inpcb and t_in6pcb are set");
  267 #endif
  268         switch (tp->t_family) {
  269 #ifdef INET
  270         case AF_INET:
  271                 hdrlen = sizeof(struct ip) + sizeof(struct tcphdr);
  272                 break;
  273 #endif
  274 #ifdef INET6
  275         case AF_INET6:
  276                 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
  277                 break;
  278 #endif
  279         default:
  280                 size = tcp_mssdflt;
  281                 goto out;
  282         }
  283 
  284         rt = NULL;
  285 #ifdef INET
  286         if (inp) {
  287                 rt = in_pcbrtentry(inp);
  288                 so = inp->inp_socket;
  289         }
  290 #endif
  291 #ifdef INET6
  292         if (in6p) {
  293                 rt = in6_pcbrtentry(in6p);
  294                 so = in6p->in6p_socket;
  295         }
  296 #endif
  297         if (rt == NULL) {
  298                 size = tcp_mssdflt;
  299                 goto out;
  300         }
  301 
  302         ifp = rt->rt_ifp;
  303 
  304         size = tcp_mssdflt;
  305         if (tp->t_mtudisc && rt->rt_rmx.rmx_mtu != 0) {
  306 #ifdef INET6
  307                 if (in6p && rt->rt_rmx.rmx_mtu < IPV6_MMTU) {
  308                         /*
  309                          * RFC2460 section 5, last paragraph: if path MTU is
  310                          * smaller than 1280, use 1280 as packet size and
  311                          * attach fragment header.
  312                          */
  313                         size = IPV6_MMTU - hdrlen - sizeof(struct ip6_frag);
  314                         *alwaysfragp = TRUE;
  315                 } else
  316                         size = rt->rt_rmx.rmx_mtu - hdrlen;
  317 #else
  318                 size = rt->rt_rmx.rmx_mtu - hdrlen;
  319 #endif
  320         } else if (ifp->if_flags & IFF_LOOPBACK)
  321                 size = ifp->if_mtu - hdrlen;
  322 #ifdef INET
  323         else if (inp && tp->t_mtudisc)
  324                 size = ifp->if_mtu - hdrlen;
  325         else if (inp && in_localaddr(inp->inp_faddr))
  326                 size = ifp->if_mtu - hdrlen;
  327 #endif
  328 #ifdef INET6
  329         else if (in6p) {
  330 #ifdef INET
  331                 if (IN6_IS_ADDR_V4MAPPED(&in6p->in6p_faddr)) {
  332                         /* mapped addr case */
  333                         struct in_addr d;
  334                         bcopy(&in6p->in6p_faddr.s6_addr32[3], &d, sizeof(d));
  335                         if (tp->t_mtudisc || in_localaddr(d))
  336                                 size = ifp->if_mtu - hdrlen;
  337                 } else
  338 #endif
  339                 {
  340                         /*
  341                          * for IPv6, path MTU discovery is always turned on,
  342                          * or the node must use packet size <= 1280.
  343                          */
  344                         size = tp->t_mtudisc ? IN6_LINKMTU(ifp) : IPV6_MMTU;
  345                         size -= hdrlen;
  346                 }
  347         }
  348 #endif
  349  out:
  350         /*
  351          * Now we must make room for whatever extra TCP/IP options are in
  352          * the packet.
  353          */
  354         optlen = tcp_optlen(tp);
  355 
  356         /*
  357          * XXX tp->t_ourmss should have the right size, but without this code
  358          * fragmentation will occur... need more investigation
  359          */
  360 #ifdef INET
  361         if (inp) {
  362 #if defined(IPSEC) || defined(FAST_IPSEC)
  363                 if (! IPSEC_PCB_SKIP_IPSEC(inp->inp_sp, IPSEC_DIR_OUTBOUND))
  364                         optlen += ipsec4_hdrsiz_tcp(tp);
  365 #endif
  366                 optlen += ip_optlen(inp);
  367         }
  368 #endif
  369 #ifdef INET6
  370 #ifdef INET
  371         if (in6p && tp->t_family == AF_INET) {
  372 #if defined(IPSEC) || defined(FAST_IPSEC)
  373                 if (! IPSEC_PCB_SKIP_IPSEC(in6p->in6p_sp, IPSEC_DIR_OUTBOUND))
  374                         optlen += ipsec4_hdrsiz_tcp(tp);
  375 #endif
  376                 /* XXX size -= ip_optlen(in6p); */
  377         } else
  378 #endif
  379         if (in6p && tp->t_family == AF_INET6) {
  380 #if defined(IPSEC) || defined(FAST_IPSEC)
  381                 if (! IPSEC_PCB_SKIP_IPSEC(in6p->in6p_sp, IPSEC_DIR_OUTBOUND))
  382                         optlen += ipsec6_hdrsiz_tcp(tp);
  383 #endif
  384                 optlen += ip6_optlen(in6p);
  385         }
  386 #endif
  387         size -= optlen;
  388 
  389         /* there may not be any room for data if mtu is too small */
  390         if (size < 0)
  391                 return (EMSGSIZE);
  392 
  393         /*
  394          * *rxsegsizep holds *estimated* inbound segment size (estimation
  395          * assumes that path MTU is the same for both ways).  this is only
  396          * for silly window avoidance, do not use the value for other purposes.
  397          *
  398          * ipseclen is subtracted from both sides, this may not be right.
  399          * I'm not quite sure about this (could someone comment).
  400          */
  401         *txsegsizep = min(tp->t_peermss - optlen, size);
  402         /*
  403          * Never send more than half a buffer full.  This insures that we can
  404          * always keep 2 packets on the wire, no matter what SO_SNDBUF is, and
  405          * therefore acks will never be delayed unless we run out of data to
  406          * transmit.
  407          */
  408         if (so)
  409                 *txsegsizep = min(so->so_snd.sb_hiwat >> 1, *txsegsizep);
  410         *rxsegsizep = min(tp->t_ourmss - optlen, size);
  411 
  412         if (*txsegsizep != tp->t_segsz) {
  413                 /*
  414                  * If the new segment size is larger, we don't want to
  415                  * mess up the congestion window, but if it is smaller
  416                  * we'll have to reduce the congestion window to ensure
  417                  * that we don't get into trouble with initial windows
  418                  * and the rest.  In any case, if the segment size
  419                  * has changed, chances are the path has, too, and
  420                  * our congestion window will be different.
  421                  */
  422                 if (*txsegsizep < tp->t_segsz) {
  423                         tp->snd_cwnd = max((tp->snd_cwnd / tp->t_segsz)
  424                                            * *txsegsizep, *txsegsizep);
  425                         tp->snd_ssthresh = max((tp->snd_ssthresh / tp->t_segsz)
  426                                                 * *txsegsizep, *txsegsizep);
  427                 }
  428                 tp->t_segsz = *txsegsizep;
  429         }
  430 
  431         return (0);
  432 }
  433 
  434 static
  435 #ifndef GPROF
  436 inline
  437 #endif
  438 int
  439 tcp_build_datapkt(struct tcpcb *tp, struct socket *so, int off,
  440     long len, int hdrlen, struct mbuf **mp)
  441 {
  442         struct mbuf *m, *m0;
  443 
  444         if (tp->t_force && len == 1)
  445                 tcpstat.tcps_sndprobe++;
  446         else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
  447                 tcpstat.tcps_sndrexmitpack++;
  448                 tcpstat.tcps_sndrexmitbyte += len;
  449         } else {
  450                 tcpstat.tcps_sndpack++;
  451                 tcpstat.tcps_sndbyte += len;
  452         }
  453 #ifdef notyet
  454         if ((m = m_copypack(so->so_snd.sb_mb, off,
  455             (int)len, max_linkhdr + hdrlen)) == 0)
  456                 return (ENOBUFS);
  457         /*
  458          * m_copypack left space for our hdr; use it.
  459          */
  460         m->m_len += hdrlen;
  461         m->m_data -= hdrlen;
  462 #else
  463         MGETHDR(m, M_DONTWAIT, MT_HEADER);
  464         if (__predict_false(m == NULL))
  465                 return (ENOBUFS);
  466         MCLAIM(m, &tcp_tx_mowner);
  467 
  468         /*
  469          * XXX Because other code assumes headers will fit in
  470          * XXX one header mbuf.
  471          *
  472          * (This code should almost *never* be run.)
  473          */
  474         if (__predict_false((max_linkhdr + hdrlen) > MHLEN)) {
  475                 TCP_OUTPUT_COUNTER_INCR(&tcp_output_bigheader);
  476                 MCLGET(m, M_DONTWAIT);
  477                 if ((m->m_flags & M_EXT) == 0) {
  478                         m_freem(m);
  479                         return (ENOBUFS);
  480                 }
  481         }
  482 
  483         m->m_data += max_linkhdr;
  484         m->m_len = hdrlen;
  485 
  486         /*
  487          * To avoid traversing the whole sb_mb chain for correct
  488          * data to send, remember last sent mbuf, its offset and
  489          * the sent size.  When called the next time, see if the
  490          * data to send is directly following the previous transfer.
  491          * This is important for large TCP windows.
  492          */
  493         if (off == 0 || tp->t_lastm == NULL ||
  494             (tp->t_lastoff + tp->t_lastlen) != off) {
  495                 TCP_OUTPUT_COUNTER_INCR(&tcp_output_predict_miss);
  496                 /*
  497                  * Either a new packet or a retransmit.
  498                  * Start from the beginning.
  499                  */
  500                 tp->t_lastm = so->so_snd.sb_mb;
  501                 tp->t_inoff = off;
  502         } else {
  503                 TCP_OUTPUT_COUNTER_INCR(&tcp_output_predict_hit);
  504                 tp->t_inoff += tp->t_lastlen;
  505         }
  506 
  507         /* Traverse forward to next packet */
  508         while (tp->t_inoff > 0) {
  509                 if (tp->t_lastm == NULL)
  510                         panic("tp->t_lastm == NULL");
  511                 if (tp->t_inoff < tp->t_lastm->m_len)
  512                         break;
  513                 tp->t_inoff -= tp->t_lastm->m_len;
  514                 tp->t_lastm = tp->t_lastm->m_next;
  515         }
  516 
  517         tp->t_lastoff = off;
  518         tp->t_lastlen = len;
  519         m0 = tp->t_lastm;
  520         off = tp->t_inoff;
  521 
  522         if (len <= M_TRAILINGSPACE(m)) {
  523                 m_copydata(m0, off, (int) len, mtod(m, caddr_t) + hdrlen);
  524                 m->m_len += len;
  525                 TCP_OUTPUT_COUNTER_INCR(&tcp_output_copysmall);
  526         } else {
  527                 m->m_next = m_copy(m0, off, (int) len);
  528                 if (m->m_next == NULL) {
  529                         m_freem(m);
  530                         return (ENOBUFS);
  531                 }
  532 #ifdef TCP_OUTPUT_COUNTERS
  533                 if (m->m_next->m_flags & M_EXT)
  534                         TCP_OUTPUT_COUNTER_INCR(&tcp_output_refbig);
  535                 else
  536                         TCP_OUTPUT_COUNTER_INCR(&tcp_output_copybig);
  537 #endif /* TCP_OUTPUT_COUNTERS */
  538         }
  539 #endif
  540 
  541         *mp = m;
  542         return (0);
  543 }
  544 
  545 /*
  546  * Tcp output routine: figure out what should be sent and send it.
  547  */
  548 int
  549 tcp_output(struct tcpcb *tp)
  550 {
  551         struct socket *so;
  552         struct route *ro;
  553         long len, win;
  554         int off, flags, error;
  555         struct mbuf *m;
  556         struct ip *ip;
  557 #ifdef INET6
  558         struct ip6_hdr *ip6;
  559 #endif
  560         struct tcphdr *th;
  561         u_char opt[MAX_TCPOPTLEN];
  562         unsigned optlen, hdrlen, packetlen;
  563         unsigned int sack_numblks;
  564         int idle, sendalot, txsegsize, rxsegsize;
  565         int txsegsize_nosack;
  566         int maxburst = TCP_MAXBURST;
  567         int af;         /* address family on the wire */
  568         int iphdrlen;
  569         int has_tso4, has_tso6;
  570         int has_tso, use_tso;
  571         boolean_t alwaysfrag;
  572         int sack_rxmit;
  573         int sack_bytes_rxmt;
  574         int ecn_tos;
  575         struct sackhole *p;
  576 #ifdef TCP_SIGNATURE
  577         int sigoff = 0;
  578 #endif
  579 
  580 #ifdef DIAGNOSTIC
  581         if (tp->t_inpcb && tp->t_in6pcb)
  582                 panic("tcp_output: both t_inpcb and t_in6pcb are set");
  583 #endif
  584         so = NULL;
  585         ro = NULL;
  586         if (tp->t_inpcb) {
  587                 so = tp->t_inpcb->inp_socket;
  588                 ro = &tp->t_inpcb->inp_route;
  589         }
  590 #ifdef INET6
  591         else if (tp->t_in6pcb) {
  592                 so = tp->t_in6pcb->in6p_socket;
  593                 ro = (struct route *)&tp->t_in6pcb->in6p_route;
  594         }
  595 #endif
  596 
  597         switch (af = tp->t_family) {
  598 #ifdef INET
  599         case AF_INET:
  600                 if (tp->t_inpcb)
  601                         break;
  602 #ifdef INET6
  603                 /* mapped addr case */
  604                 if (tp->t_in6pcb)
  605                         break;
  606 #endif
  607                 return (EINVAL);
  608 #endif
  609 #ifdef INET6
  610         case AF_INET6:
  611                 if (tp->t_in6pcb)
  612                         break;
  613                 return (EINVAL);
  614 #endif
  615         default:
  616                 return (EAFNOSUPPORT);
  617         }
  618 
  619         if (tcp_segsize(tp, &txsegsize, &rxsegsize, &alwaysfrag))
  620                 return (EMSGSIZE);
  621 
  622         idle = (tp->snd_max == tp->snd_una);
  623 
  624         /*
  625          * Determine if we can use TCP segmentation offload:
  626          * - If we're using IPv4
  627          * - If there is not an IPsec policy that prevents it
  628          * - If the interface can do it
  629          */
  630         has_tso4 = has_tso6 = FALSE;
  631 #if defined(INET)
  632         has_tso4 = tp->t_inpcb != NULL &&
  633 #if defined(IPSEC) || defined(FAST_IPSEC)
  634                   IPSEC_PCB_SKIP_IPSEC(tp->t_inpcb->inp_sp,
  635                                        IPSEC_DIR_OUTBOUND) &&
  636 #endif
  637                   tp->t_inpcb->inp_route.ro_rt != NULL &&
  638                   (tp->t_inpcb->inp_route.ro_rt->rt_ifp->if_capenable &
  639                    IFCAP_TSOv4) != 0;
  640 #endif /* defined(INET) */
  641 #if defined(INET6)
  642         has_tso6 = tp->t_in6pcb != NULL &&
  643 #if defined(IPSEC) || defined(FAST_IPSEC)
  644                   IPSEC_PCB_SKIP_IPSEC(tp->t_in6pcb->in6p_sp,
  645                                        IPSEC_DIR_OUTBOUND) &&
  646 #endif
  647                   tp->t_in6pcb->in6p_route.ro_rt != NULL &&
  648                   (tp->t_in6pcb->in6p_route.ro_rt->rt_ifp->if_capenable &
  649                    IFCAP_TSOv6) != 0;
  650 #endif /* defined(INET6) */
  651         has_tso = (has_tso4 || has_tso6) && !alwaysfrag;
  652 
  653         /*
  654          * Restart Window computation.  From draft-floyd-incr-init-win-03:
  655          *
  656          *      Optionally, a TCP MAY set the restart window to the
  657          *      minimum of the value used for the initial window and
  658          *      the current value of cwnd (in other words, using a
  659          *      larger value for the restart window should never increase
  660          *      the size of cwnd).
  661          */
  662         if (tcp_cwm) {
  663                 /*
  664                  * Hughes/Touch/Heidemann Congestion Window Monitoring.
  665                  * Count the number of packets currently pending
  666                  * acknowledgement, and limit our congestion window
  667                  * to a pre-determined allowed burst size plus that count.
  668                  * This prevents bursting once all pending packets have
  669                  * been acknowledged (i.e. transmission is idle).
  670                  *
  671                  * XXX Link this to Initial Window?
  672                  */
  673                 tp->snd_cwnd = min(tp->snd_cwnd,
  674                     (tcp_cwm_burstsize * txsegsize) +
  675                     (tp->snd_nxt - tp->snd_una));
  676         } else {
  677                 if (idle && (tcp_now - tp->t_rcvtime) >= tp->t_rxtcur) {
  678                         /*
  679                          * We have been idle for "a while" and no acks are
  680                          * expected to clock out any data we send --
  681                          * slow start to get ack "clock" running again.
  682                          */
  683                         int ss = tcp_init_win;
  684 #ifdef INET
  685                         if (tp->t_inpcb &&
  686                             in_localaddr(tp->t_inpcb->inp_faddr))
  687                                 ss = tcp_init_win_local;
  688 #endif
  689 #ifdef INET6
  690                         if (tp->t_in6pcb &&
  691                             in6_localaddr(&tp->t_in6pcb->in6p_faddr))
  692                                 ss = tcp_init_win_local;
  693 #endif
  694                         tp->snd_cwnd = min(tp->snd_cwnd,
  695                             TCP_INITIAL_WINDOW(ss, txsegsize));
  696                 }
  697         }
  698 
  699         txsegsize_nosack = txsegsize;
  700 again:
  701         ecn_tos = 0;
  702         use_tso = has_tso;
  703         if ((tp->t_flags & (TF_ECN_SND_CWR|TF_ECN_SND_ECE)) != 0) {
  704                 /* don't duplicate CWR/ECE. */
  705                 use_tso = 0;
  706         }
  707         TCP_REASS_LOCK(tp);
  708         sack_numblks = tcp_sack_numblks(tp);
  709         if (sack_numblks) {
  710                 int sackoptlen;
  711 
  712                 sackoptlen = TCP_SACK_OPTLEN(sack_numblks);
  713                 if (sackoptlen > txsegsize_nosack) {
  714                         sack_numblks = 0; /* give up SACK */
  715                         txsegsize = txsegsize_nosack;
  716                 } else {
  717                         if ((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) {
  718                                 /* don't duplicate D-SACK. */
  719                                 use_tso = 0;
  720                         }
  721                         txsegsize = txsegsize_nosack - sackoptlen;
  722                 }
  723         } else {
  724                 txsegsize = txsegsize_nosack;
  725         }
  726 
  727         /*
  728          * Determine length of data that should be transmitted, and
  729          * flags that should be used.  If there is some data or critical
  730          * controls (SYN, RST) to send, then transmit; otherwise,
  731          * investigate further.
  732          *
  733          * Readjust SACK information to avoid resending duplicate data.
  734          */
  735         if (TCP_SACK_ENABLED(tp) && SEQ_LT(tp->snd_nxt, tp->snd_max))
  736                 tcp_sack_adjust(tp);
  737         sendalot = 0;
  738         off = tp->snd_nxt - tp->snd_una;
  739         win = min(tp->snd_wnd, tp->snd_cwnd);
  740 
  741         flags = tcp_outflags[tp->t_state];
  742 
  743         /*
  744          * Send any SACK-generated retransmissions.  If we're explicitly trying
  745          * to send out new data (when sendalot is 1), bypass this function.
  746          * If we retransmit in fast recovery mode, decrement snd_cwnd, since
  747          * we're replacing a (future) new transmission with a retransmission
  748          * now, and we previously incremented snd_cwnd in tcp_input().
  749          */
  750         /*
  751          * Still in sack recovery, reset rxmit flag to zero.
  752          */
  753         sack_rxmit = 0;
  754         sack_bytes_rxmt = 0;
  755         len = 0;
  756         p = NULL;
  757         do {
  758                 long cwin;
  759                 if (!TCP_SACK_ENABLED(tp))
  760                         break;
  761                 if (tp->t_partialacks < 0) 
  762                         break;
  763                 p = tcp_sack_output(tp, &sack_bytes_rxmt);
  764                 if (p == NULL)
  765                         break;
  766                 
  767                 cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
  768                 if (cwin < 0)
  769                         cwin = 0;
  770                 /* Do not retransmit SACK segments beyond snd_recover */
  771                 if (SEQ_GT(p->end, tp->snd_recover)) {
  772                         /*
  773                          * (At least) part of sack hole extends beyond
  774                          * snd_recover. Check to see if we can rexmit data
  775                          * for this hole.
  776                          */
  777                         if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
  778                                 /*
  779                                  * Can't rexmit any more data for this hole.
  780                                  * That data will be rexmitted in the next
  781                                  * sack recovery episode, when snd_recover
  782                                  * moves past p->rxmit.
  783                                  */
  784                                 p = NULL;
  785                                 break;
  786                         }
  787                         /* Can rexmit part of the current hole */
  788                         len = ((long)ulmin(cwin, tp->snd_recover - p->rxmit));
  789                 } else
  790                         len = ((long)ulmin(cwin, p->end - p->rxmit));
  791                 off = p->rxmit - tp->snd_una;
  792                 if (off + len > so->so_snd.sb_cc) {
  793                         /* 1 for TH_FIN */
  794                         KASSERT(off + len == so->so_snd.sb_cc + 1);
  795                         KASSERT(p->rxmit + len == tp->snd_max);
  796                         len = so->so_snd.sb_cc - off;
  797                 }
  798                 if (len > 0) {
  799                         sack_rxmit = 1;
  800                         sendalot = 1;
  801                 }
  802         } while (/*CONSTCOND*/0);
  803 
  804         /*
  805          * If in persist timeout with window of 0, send 1 byte.
  806          * Otherwise, if window is small but nonzero
  807          * and timer expired, we will send what we can
  808          * and go to transmit state.
  809          */
  810         if (tp->t_force) {
  811                 if (win == 0) {
  812                         /*
  813                          * If we still have some data to send, then
  814                          * clear the FIN bit.  Usually this would
  815                          * happen below when it realizes that we
  816                          * aren't sending all the data.  However,
  817                          * if we have exactly 1 byte of unsent data,
  818                          * then it won't clear the FIN bit below,
  819                          * and if we are in persist state, we wind
  820                          * up sending the packet without recording
  821                          * that we sent the FIN bit.
  822                          *
  823                          * We can't just blindly clear the FIN bit,
  824                          * because if we don't have any more data
  825                          * to send then the probe will be the FIN
  826                          * itself.
  827                          */
  828                         if (off < so->so_snd.sb_cc)
  829                                 flags &= ~TH_FIN;
  830                         win = 1;
  831                 } else {
  832                         TCP_TIMER_DISARM(tp, TCPT_PERSIST);
  833                         tp->t_rxtshift = 0;
  834                 }
  835         }
  836 
  837         if (sack_rxmit == 0) {
  838                 if (TCP_SACK_ENABLED(tp) && tp->t_partialacks >= 0) {
  839                         long cwin;
  840 
  841                         /*
  842                          * We are inside of a SACK recovery episode and are
  843                          * sending new data, having retransmitted all the
  844                          * data possible in the scoreboard.
  845                          */
  846                         if (tp->snd_wnd < so->so_snd.sb_cc) {
  847                                 len = tp->snd_wnd - off;
  848                                 flags &= ~TH_FIN;
  849                         } else {
  850                                 len = so->so_snd.sb_cc - off;
  851                         }
  852 
  853                         /*
  854                          * From FreeBSD:
  855                          *  Don't remove this (len > 0) check !
  856                          *  We explicitly check for len > 0 here (although it 
  857                          *  isn't really necessary), to work around a gcc 
  858                          *  optimization issue - to force gcc to compute
  859                          *  len above. Without this check, the computation
  860                          *  of len is bungled by the optimizer.
  861                          */
  862                         if (len > 0) {
  863                                 cwin = tp->snd_cwnd - 
  864                                     (tp->snd_nxt - tp->sack_newdata) -
  865                                     sack_bytes_rxmt;
  866                                 if (cwin < 0)
  867                                         cwin = 0;
  868                                 if (cwin < len) {
  869                                         len = cwin;
  870                                         flags &= ~TH_FIN;
  871                                 }
  872                         }
  873                 } else if (win < so->so_snd.sb_cc) {
  874                         len = win - off;
  875                         flags &= ~TH_FIN;
  876                 } else {
  877                         len = so->so_snd.sb_cc - off;
  878                 }
  879         }
  880 
  881         if (len < 0) {
  882                 /*
  883                  * If FIN has been sent but not acked,
  884                  * but we haven't been called to retransmit,
  885                  * len will be -1.  Otherwise, window shrank
  886                  * after we sent into it.  If window shrank to 0,
  887                  * cancel pending retransmit, pull snd_nxt back
  888                  * to (closed) window, and set the persist timer
  889                  * if it isn't already going.  If the window didn't
  890                  * close completely, just wait for an ACK.
  891                  *
  892                  * If we have a pending FIN, either it has already been
  893                  * transmitted or it is outside the window, so drop it.
  894                  * If the FIN has been transmitted, but this is not a
  895                  * retransmission, then len must be -1.  Therefore we also
  896                  * prevent here the sending of `gratuitous FINs'.  This
  897                  * eliminates the need to check for that case below (e.g.
  898                  * to back up snd_nxt before the FIN so that the sequence
  899                  * number is correct).
  900                  */
  901                 len = 0;
  902                 flags &= ~TH_FIN;
  903                 if (win == 0) {
  904                         TCP_TIMER_DISARM(tp, TCPT_REXMT);
  905                         tp->t_rxtshift = 0;
  906                         tp->snd_nxt = tp->snd_una;
  907                         if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
  908                                 tcp_setpersist(tp);
  909                 }
  910         }
  911         if (len > txsegsize) {
  912                 if (use_tso) {
  913                         /*
  914                          * Truncate TSO transfers to IP_MAXPACKET, and make
  915                          * sure that we send equal size transfers down the
  916                          * stack (rather than big-small-big-small-...).
  917                          */
  918 #ifdef INET6
  919 #if IPV6_MAXPACKET != IP_MAXPACKET
  920 #error IPV6_MAXPACKET != IP_MAXPACKET
  921 #endif
  922 #endif
  923                         len = (min(len, IP_MAXPACKET) / txsegsize) * txsegsize;
  924                         if (len <= txsegsize) {
  925                                 use_tso = 0;
  926                         }
  927                 } else
  928                         len = txsegsize;
  929                 flags &= ~TH_FIN;
  930                 sendalot = 1;
  931         } else
  932                 use_tso = 0;
  933         if (sack_rxmit) {
  934                 if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
  935                         flags &= ~TH_FIN;
  936         }
  937 
  938         win = sbspace(&so->so_rcv);
  939 
  940         /*
  941          * Sender silly window avoidance.  If connection is idle
  942          * and can send all data, or can send a maximum segment
  943          * (or at least a maximum default-size segment),
  944          * or we are forced, do it; otherwise don't bother.
  945          * If peer's buffer is tiny, then send
  946          * when window is at least half open.
  947          * If retransmitting (possibly after persist timer forced us
  948          * to send into a small window), then must resend.
  949          */
  950         if (len) {
  951                 if (len >= txsegsize)
  952                         goto send;
  953                 if ((so->so_state & SS_MORETOCOME) == 0 &&
  954                     ((idle || tp->t_flags & TF_NODELAY) &&
  955                      len + off >= so->so_snd.sb_cc))
  956                         goto send;
  957                 if (tp->t_force)
  958                         goto send;
  959                 if (len >= tp->max_sndwnd / 2)
  960                         goto send;
  961                 if (SEQ_LT(tp->snd_nxt, tp->snd_max))
  962                         goto send;
  963                 if (sack_rxmit)
  964                         goto send;
  965         }
  966 
  967         /*
  968          * Compare available window to amount of window known to peer
  969          * (as advertised window less next expected input).  If the
  970          * difference is at least twice the size of the largest segment
  971          * we expect to receive (i.e. two segments) or at least 50% of
  972          * the maximum possible window, then want to send a window update
  973          * to peer.
  974          */
  975         if (win > 0) {
  976                 /*
  977                  * "adv" is the amount we can increase the window,
  978                  * taking into account that we are limited by
  979                  * TCP_MAXWIN << tp->rcv_scale.
  980                  */
  981                 long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) -
  982                         (tp->rcv_adv - tp->rcv_nxt);
  983 
  984                 if (adv >= (long) (2 * rxsegsize))
  985                         goto send;
  986                 if (2 * adv >= (long) so->so_rcv.sb_hiwat)
  987                         goto send;
  988         }
  989 
  990         /*
  991          * Send if we owe peer an ACK.
  992          */
  993         if (tp->t_flags & TF_ACKNOW)
  994                 goto send;
  995         if (flags & (TH_SYN|TH_FIN|TH_RST))
  996                 goto send;
  997         if (SEQ_GT(tp->snd_up, tp->snd_una))
  998                 goto send;
  999         /*
 1000          * In SACK, it is possible for tcp_output to fail to send a segment
 1001          * after the retransmission timer has been turned off.  Make sure
 1002          * that the retransmission timer is set.
 1003          */
 1004         if (TCP_SACK_ENABLED(tp) && SEQ_GT(tp->snd_max, tp->snd_una) &&
 1005             !TCP_TIMER_ISARMED(tp, TCPT_REXMT) &&
 1006             !TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) {
 1007                 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
 1008                 goto just_return;
 1009         }
 1010 
 1011         /*
 1012          * TCP window updates are not reliable, rather a polling protocol
 1013          * using ``persist'' packets is used to ensure receipt of window
 1014          * updates.  The three ``states'' for the output side are:
 1015          *      idle                    not doing retransmits or persists
 1016          *      persisting              to move a small or zero window
 1017          *      (re)transmitting        and thereby not persisting
 1018          *
 1019          * tp->t_timer[TCPT_PERSIST]
 1020          *      is set when we are in persist state.
 1021          * tp->t_force
 1022          *      is set when we are called to send a persist packet.
 1023          * tp->t_timer[TCPT_REXMT]
 1024          *      is set when we are retransmitting
 1025          * The output side is idle when both timers are zero.
 1026          *
 1027          * If send window is too small, there is data to transmit, and no
 1028          * retransmit or persist is pending, then go to persist state.
 1029          * If nothing happens soon, send when timer expires:
 1030          * if window is nonzero, transmit what we can,
 1031          * otherwise force out a byte.
 1032          */
 1033         if (so->so_snd.sb_cc && TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 &&
 1034             TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
 1035                 tp->t_rxtshift = 0;
 1036                 tcp_setpersist(tp);
 1037         }
 1038 
 1039         /*
 1040          * No reason to send a segment, just return.
 1041          */
 1042 just_return:
 1043         TCP_REASS_UNLOCK(tp);
 1044         return (0);
 1045 
 1046 send:
 1047         /*
 1048          * Before ESTABLISHED, force sending of initial options
 1049          * unless TCP set not to do any options.
 1050          * NOTE: we assume that the IP/TCP header plus TCP options
 1051          * always fit in a single mbuf, leaving room for a maximum
 1052          * link header, i.e.
 1053          *      max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
 1054          */
 1055         optlen = 0;
 1056         switch (af) {
 1057 #ifdef INET
 1058         case AF_INET:
 1059                 iphdrlen = sizeof(struct ip) + sizeof(struct tcphdr);
 1060                 break;
 1061 #endif
 1062 #ifdef INET6
 1063         case AF_INET6:
 1064                 iphdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 1065                 break;
 1066 #endif
 1067         default:        /*pacify gcc*/
 1068                 iphdrlen = 0;
 1069                 break;
 1070         }
 1071         hdrlen = iphdrlen;
 1072         if (flags & TH_SYN) {
 1073                 struct rtentry *rt;
 1074 
 1075                 rt = NULL;
 1076 #ifdef INET
 1077                 if (tp->t_inpcb)
 1078                         rt = in_pcbrtentry(tp->t_inpcb);
 1079 #endif
 1080 #ifdef INET6
 1081                 if (tp->t_in6pcb)
 1082                         rt = in6_pcbrtentry(tp->t_in6pcb);
 1083 #endif
 1084 
 1085                 tp->snd_nxt = tp->iss;
 1086                 tp->t_ourmss = tcp_mss_to_advertise(rt != NULL ?
 1087                                                     rt->rt_ifp : NULL, af);
 1088                 if ((tp->t_flags & TF_NOOPT) == 0) {
 1089                         opt[0] = TCPOPT_MAXSEG;
 1090                         opt[1] = 4;
 1091                         opt[2] = (tp->t_ourmss >> 8) & 0xff;
 1092                         opt[3] = tp->t_ourmss & 0xff;
 1093                         optlen = 4;
 1094 
 1095                         if ((tp->t_flags & TF_REQ_SCALE) &&
 1096                             ((flags & TH_ACK) == 0 ||
 1097                             (tp->t_flags & TF_RCVD_SCALE))) {
 1098                                 *((u_int32_t *) (opt + optlen)) = htonl(
 1099                                         TCPOPT_NOP << 24 |
 1100                                         TCPOPT_WINDOW << 16 |
 1101                                         TCPOLEN_WINDOW << 8 |
 1102                                         tp->request_r_scale);
 1103                                 optlen += 4;
 1104                         }
 1105                         if (tcp_do_sack) {
 1106                                 u_int8_t *cp = (u_int8_t *)(opt + optlen);
 1107 
 1108                                 cp[0] = TCPOPT_SACK_PERMITTED;
 1109                                 cp[1] = 2;
 1110                                 cp[2] = TCPOPT_NOP;
 1111                                 cp[3] = TCPOPT_NOP;
 1112                                 optlen += 4;
 1113                         }
 1114                 }
 1115         }
 1116 
 1117         /*
 1118          * Send a timestamp and echo-reply if this is a SYN and our side
 1119          * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
 1120          * and our peer have sent timestamps in our SYN's.
 1121          */
 1122         if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
 1123              (flags & TH_RST) == 0 &&
 1124             ((flags & (TH_SYN|TH_ACK)) == TH_SYN ||
 1125              (tp->t_flags & TF_RCVD_TSTMP))) {
 1126                 u_int32_t *lp = (u_int32_t *)(opt + optlen);
 1127 
 1128                 /* Form timestamp option as shown in appendix A of RFC 1323. */
 1129                 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
 1130                 *lp++ = htonl(TCP_TIMESTAMP(tp));
 1131                 *lp   = htonl(tp->ts_recent);
 1132                 optlen += TCPOLEN_TSTAMP_APPA;
 1133         }
 1134 
 1135         /*
 1136          * Tack on the SACK block if it is necessary.
 1137          */
 1138         if (sack_numblks) {
 1139                 int sack_len;
 1140                 u_char *bp = (u_char *)(opt + optlen);
 1141                 u_int32_t *lp = (u_int32_t *)(bp + 4);
 1142                 struct ipqent *tiqe;
 1143 
 1144                 sack_len = sack_numblks * 8 + 2;
 1145                 bp[0] = TCPOPT_NOP;
 1146                 bp[1] = TCPOPT_NOP;
 1147                 bp[2] = TCPOPT_SACK;
 1148                 bp[3] = sack_len;
 1149                 if ((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) {
 1150                         sack_numblks--;
 1151                         *lp++ = htonl(tp->rcv_dsack_block.left);
 1152                         *lp++ = htonl(tp->rcv_dsack_block.right);
 1153                         tp->rcv_sack_flags &= ~TCPSACK_HAVED;
 1154                 }
 1155                 for (tiqe = TAILQ_FIRST(&tp->timeq);
 1156                     sack_numblks > 0; tiqe = TAILQ_NEXT(tiqe, ipqe_timeq)) {
 1157                         KASSERT(tiqe != NULL);
 1158                         sack_numblks--;
 1159                         *lp++ = htonl(tiqe->ipqe_seq);
 1160                         *lp++ = htonl(tiqe->ipqe_seq + tiqe->ipqe_len +
 1161                             ((tiqe->ipqe_flags & TH_FIN) != 0 ? 1 : 0));
 1162                 }
 1163                 optlen += sack_len + 2;
 1164         }
 1165         TCP_REASS_UNLOCK(tp);
 1166 
 1167 #ifdef TCP_SIGNATURE
 1168         if (tp->t_flags & TF_SIGNATURE) {
 1169                 u_char *bp;
 1170                 /*
 1171                  * Initialize TCP-MD5 option (RFC2385)
 1172                  */
 1173                 bp = (u_char *)opt + optlen;
 1174                 *bp++ = TCPOPT_SIGNATURE;
 1175                 *bp++ = TCPOLEN_SIGNATURE;
 1176                 sigoff = optlen + 2;
 1177                 bzero(bp, TCP_SIGLEN);
 1178                 bp += TCP_SIGLEN;
 1179                 optlen += TCPOLEN_SIGNATURE;
 1180                 /*
 1181                  * Terminate options list and maintain 32-bit alignment.
 1182                  */
 1183                 *bp++ = TCPOPT_NOP;
 1184                 *bp++ = TCPOPT_EOL;
 1185                 optlen += 2;
 1186         }
 1187 #endif /* TCP_SIGNATURE */
 1188 
 1189         hdrlen += optlen;
 1190 
 1191 #ifdef DIAGNOSTIC
 1192         if (!use_tso && len > txsegsize)
 1193                 panic("tcp data to be sent is larger than segment");
 1194         else if (use_tso && len > IP_MAXPACKET)
 1195                 panic("tcp data to be sent is larger than max TSO size");
 1196         if (max_linkhdr + hdrlen > MCLBYTES)
 1197                 panic("tcphdr too big");
 1198 #endif
 1199 
 1200         /*
 1201          * Grab a header mbuf, attaching a copy of data to
 1202          * be transmitted, and initialize the header from
 1203          * the template for sends on this connection.
 1204          */
 1205         if (len) {
 1206                 error = tcp_build_datapkt(tp, so, off, len, hdrlen, &m);
 1207                 if (error)
 1208                         goto out;
 1209                 /*
 1210                  * If we're sending everything we've got, set PUSH.
 1211                  * (This will keep happy those implementations which only
 1212                  * give data to the user when a buffer fills or
 1213                  * a PUSH comes in.)
 1214                  */
 1215                 if (off + len == so->so_snd.sb_cc)
 1216                         flags |= TH_PUSH;
 1217         } else {
 1218                 if (tp->t_flags & TF_ACKNOW)
 1219                         tcpstat.tcps_sndacks++;
 1220                 else if (flags & (TH_SYN|TH_FIN|TH_RST))
 1221                         tcpstat.tcps_sndctrl++;
 1222                 else if (SEQ_GT(tp->snd_up, tp->snd_una))
 1223                         tcpstat.tcps_sndurg++;
 1224                 else
 1225                         tcpstat.tcps_sndwinup++;
 1226 
 1227                 MGETHDR(m, M_DONTWAIT, MT_HEADER);
 1228                 if (m != NULL && max_linkhdr + hdrlen > MHLEN) {
 1229                         MCLGET(m, M_DONTWAIT);
 1230                         if ((m->m_flags & M_EXT) == 0) {
 1231                                 m_freem(m);
 1232                                 m = NULL;
 1233                         }
 1234                 }
 1235                 if (m == NULL) {
 1236                         error = ENOBUFS;
 1237                         goto out;
 1238                 }
 1239                 MCLAIM(m, &tcp_tx_mowner);
 1240                 m->m_data += max_linkhdr;
 1241                 m->m_len = hdrlen;
 1242         }
 1243         m->m_pkthdr.rcvif = (struct ifnet *)0;
 1244         switch (af) {
 1245 #ifdef INET
 1246         case AF_INET:
 1247                 ip = mtod(m, struct ip *);
 1248 #ifdef INET6
 1249                 ip6 = NULL;
 1250 #endif
 1251                 th = (struct tcphdr *)(ip + 1);
 1252                 break;
 1253 #endif
 1254 #ifdef INET6
 1255         case AF_INET6:
 1256                 ip = NULL;
 1257                 ip6 = mtod(m, struct ip6_hdr *);
 1258                 th = (struct tcphdr *)(ip6 + 1);
 1259                 break;
 1260 #endif
 1261         default:        /*pacify gcc*/
 1262                 ip = NULL;
 1263 #ifdef INET6
 1264                 ip6 = NULL;
 1265 #endif
 1266                 th = NULL;
 1267                 break;
 1268         }
 1269         if (tp->t_template == 0)
 1270                 panic("tcp_output");
 1271         if (tp->t_template->m_len < iphdrlen)
 1272                 panic("tcp_output");
 1273         bcopy(mtod(tp->t_template, caddr_t), mtod(m, caddr_t), iphdrlen);
 1274 
 1275         /*
 1276          * If we are starting a connection, send ECN setup
 1277          * SYN packet. If we are on a retransmit, we may
 1278          * resend those bits a number of times as per
 1279          * RFC 3168.
 1280          */
 1281         if (tp->t_state == TCPS_SYN_SENT && tcp_do_ecn) {
 1282                 if (tp->t_flags & TF_SYN_REXMT) {
 1283                         if (tp->t_ecn_retries--)
 1284                                 flags |= TH_ECE|TH_CWR;
 1285                 } else {
 1286                         flags |= TH_ECE|TH_CWR;
 1287                         tp->t_ecn_retries = tcp_ecn_maxretries;
 1288                 }
 1289         }
 1290 
 1291         if (TCP_ECN_ALLOWED(tp)) {
 1292                 /*
 1293                  * If the peer has ECN, mark data packets
 1294                  * ECN capable. Ignore pure ack packets, retransmissions
 1295                  * and window probes.
 1296                  */
 1297                 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
 1298                     !(tp->t_force && len == 1)) {
 1299                         ecn_tos = IPTOS_ECN_ECT0;
 1300                         tcpstat.tcps_ecn_ect++;
 1301                 }
 1302 
 1303                 /*
 1304                  * Reply with proper ECN notifications.
 1305                  */
 1306                 if (tp->t_flags & TF_ECN_SND_CWR) {
 1307                         flags |= TH_CWR;
 1308                         tp->t_flags &= ~TF_ECN_SND_CWR;
 1309                 } 
 1310                 if (tp->t_flags & TF_ECN_SND_ECE) {
 1311                         flags |= TH_ECE;
 1312                 }
 1313         }
 1314 
 1315 
 1316         /*
 1317          * If we are doing retransmissions, then snd_nxt will
 1318          * not reflect the first unsent octet.  For ACK only
 1319          * packets, we do not want the sequence number of the
 1320          * retransmitted packet, we want the sequence number
 1321          * of the next unsent octet.  So, if there is no data
 1322          * (and no SYN or FIN), use snd_max instead of snd_nxt
 1323          * when filling in ti_seq.  But if we are in persist
 1324          * state, snd_max might reflect one byte beyond the
 1325          * right edge of the window, so use snd_nxt in that
 1326          * case, since we know we aren't doing a retransmission.
 1327          * (retransmit and persist are mutually exclusive...)
 1328          */
 1329         if (TCP_SACK_ENABLED(tp) && sack_rxmit) {
 1330                 th->th_seq = htonl(p->rxmit);
 1331                 p->rxmit += len;
 1332         } else {
 1333                 if (len || (flags & (TH_SYN|TH_FIN)) ||
 1334                     TCP_TIMER_ISARMED(tp, TCPT_PERSIST))
 1335                         th->th_seq = htonl(tp->snd_nxt);
 1336                 else
 1337                         th->th_seq = htonl(tp->snd_max);
 1338         }
 1339         th->th_ack = htonl(tp->rcv_nxt);
 1340         if (optlen) {
 1341                 bcopy((caddr_t)opt, (caddr_t)(th + 1), optlen);
 1342                 th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
 1343         }
 1344         th->th_flags = flags;
 1345         /*
 1346          * Calculate receive window.  Don't shrink window,
 1347          * but avoid silly window syndrome.
 1348          */
 1349         if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)rxsegsize)
 1350                 win = 0;
 1351         if (win > (long)TCP_MAXWIN << tp->rcv_scale)
 1352                 win = (long)TCP_MAXWIN << tp->rcv_scale;
 1353         if (win < (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt))
 1354                 win = (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt);
 1355         th->th_win = htons((u_int16_t) (win>>tp->rcv_scale));
 1356         if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
 1357                 u_int32_t urp = tp->snd_up - tp->snd_nxt;
 1358                 if (urp > IP_MAXPACKET)
 1359                         urp = IP_MAXPACKET;
 1360                 th->th_urp = htons((u_int16_t)urp);
 1361                 th->th_flags |= TH_URG;
 1362         } else
 1363                 /*
 1364                  * If no urgent pointer to send, then we pull
 1365                  * the urgent pointer to the left edge of the send window
 1366                  * so that it doesn't drift into the send window on sequence
 1367                  * number wraparound.
 1368                  */
 1369                 tp->snd_up = tp->snd_una;               /* drag it along */
 1370 
 1371 #ifdef TCP_SIGNATURE
 1372         if (sigoff && (tp->t_flags & TF_SIGNATURE)) {
 1373                 struct secasvar *sav;
 1374                 u_int8_t *sigp;
 1375 
 1376                 sav = tcp_signature_getsav(m, th);
 1377 
 1378                 if (sav == NULL) {
 1379                         if (m)
 1380                                 m_freem(m);
 1381                         return (EPERM);
 1382                 }
 1383 
 1384                 m->m_pkthdr.len = hdrlen + len;
 1385                 sigp = (caddr_t)th + sizeof(*th) + sigoff;
 1386                 tcp_signature(m, th, (caddr_t)th - mtod(m, caddr_t), sav, sigp);
 1387 
 1388                 key_sa_recordxfer(sav, m);
 1389 #ifdef FAST_IPSEC
 1390                 KEY_FREESAV(&sav);
 1391 #else
 1392                 key_freesav(sav);
 1393 #endif
 1394         }
 1395 #endif
 1396 
 1397         /*
 1398          * Set ourselves up to be checksummed just before the packet
 1399          * hits the wire.
 1400          */
 1401         switch (af) {
 1402 #ifdef INET
 1403         case AF_INET:
 1404                 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 1405                 if (use_tso) {
 1406                         m->m_pkthdr.segsz = txsegsize;
 1407                         m->m_pkthdr.csum_flags = M_CSUM_TSOv4;
 1408                 } else {
 1409                         m->m_pkthdr.csum_flags = M_CSUM_TCPv4;
 1410                         if (len + optlen) {
 1411                                 /* Fixup the pseudo-header checksum. */
 1412                                 /* XXXJRT Not IP Jumbogram safe. */
 1413                                 th->th_sum = in_cksum_addword(th->th_sum,
 1414                                     htons((u_int16_t) (len + optlen)));
 1415                         }
 1416                 }
 1417                 break;
 1418 #endif
 1419 #ifdef INET6
 1420         case AF_INET6:
 1421                 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 1422                 if (use_tso) {
 1423                         m->m_pkthdr.segsz = txsegsize;
 1424                         m->m_pkthdr.csum_flags = M_CSUM_TSOv6;
 1425                 } else {
 1426                         m->m_pkthdr.csum_flags = M_CSUM_TCPv6;
 1427                         if (len + optlen) {
 1428                                 /* Fixup the pseudo-header checksum. */
 1429                                 /* XXXJRT: Not IPv6 Jumbogram safe. */
 1430                                 th->th_sum = in_cksum_addword(th->th_sum,
 1431                                     htons((u_int16_t) (len + optlen)));
 1432                         }
 1433                 }
 1434                 break;
 1435 #endif
 1436         }
 1437 
 1438         /*
 1439          * In transmit state, time the transmission and arrange for
 1440          * the retransmit.  In persist state, just set snd_max.
 1441          */
 1442         if (tp->t_force == 0 || TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
 1443                 tcp_seq startseq = tp->snd_nxt;
 1444 
 1445                 /*
 1446                  * Advance snd_nxt over sequence space of this segment.
 1447                  * There are no states in which we send both a SYN and a FIN,
 1448                  * so we collapse the tests for these flags.
 1449                  */
 1450                 if (flags & (TH_SYN|TH_FIN))
 1451                         tp->snd_nxt++;
 1452                 if (sack_rxmit)
 1453                         goto timer;
 1454                 tp->snd_nxt += len;
 1455                 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
 1456                         tp->snd_max = tp->snd_nxt;
 1457                         /*
 1458                          * Time this transmission if not a retransmission and
 1459                          * not currently timing anything.
 1460                          */
 1461                         if (tp->t_rtttime == 0) {
 1462                                 tp->t_rtttime = tcp_now;
 1463                                 tp->t_rtseq = startseq;
 1464                                 tcpstat.tcps_segstimed++;
 1465                         }
 1466                 }
 1467 
 1468                 /*
 1469                  * Set retransmit timer if not currently set,
 1470                  * and not doing an ack or a keep-alive probe.
 1471                  * Initial value for retransmit timer is smoothed
 1472                  * round-trip time + 2 * round-trip time variance.
 1473                  * Initialize shift counter which is used for backoff
 1474                  * of retransmit time.
 1475                  */
 1476 timer:
 1477                 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 &&
 1478                         ((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
 1479                     tp->snd_nxt != tp->snd_una)) {
 1480                         if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) {
 1481                                 TCP_TIMER_DISARM(tp, TCPT_PERSIST);
 1482                                 tp->t_rxtshift = 0;
 1483                         }
 1484                         TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
 1485                 }
 1486         } else
 1487                 if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
 1488                         tp->snd_max = tp->snd_nxt + len;
 1489 
 1490 #ifdef TCP_DEBUG
 1491         /*
 1492          * Trace.
 1493          */
 1494         if (so->so_options & SO_DEBUG)
 1495                 tcp_trace(TA_OUTPUT, tp->t_state, tp, m, 0);
 1496 #endif
 1497 
 1498         /*
 1499          * Fill in IP length and desired time to live and
 1500          * send to IP level.  There should be a better way
 1501          * to handle ttl and tos; we could keep them in
 1502          * the template, but need a way to checksum without them.
 1503          */
 1504         m->m_pkthdr.len = hdrlen + len;
 1505 
 1506         switch (af) {
 1507 #ifdef INET
 1508         case AF_INET:
 1509                 ip->ip_len = htons(m->m_pkthdr.len);
 1510                 packetlen = m->m_pkthdr.len;
 1511                 if (tp->t_inpcb) {
 1512                         ip->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl;
 1513                         ip->ip_tos = tp->t_inpcb->inp_ip.ip_tos | ecn_tos;
 1514                 }
 1515 #ifdef INET6
 1516                 else if (tp->t_in6pcb) {
 1517                         ip->ip_ttl = in6_selecthlim(tp->t_in6pcb, NULL); /*XXX*/
 1518                         ip->ip_tos = ecn_tos;   /*XXX*/
 1519                 }
 1520 #endif
 1521                 break;
 1522 #endif
 1523 #ifdef INET6
 1524         case AF_INET6:
 1525                 packetlen = m->m_pkthdr.len;
 1526                 ip6->ip6_nxt = IPPROTO_TCP;
 1527                 if (tp->t_in6pcb) {
 1528                         /*
 1529                          * we separately set hoplimit for every segment, since
 1530                          * the user might want to change the value via
 1531                          * setsockopt. Also, desired default hop limit might
 1532                          * be changed via Neighbor Discovery.
 1533                          */
 1534                         ip6->ip6_hlim = in6_selecthlim(tp->t_in6pcb,
 1535                                 ro->ro_rt ? ro->ro_rt->rt_ifp : NULL);
 1536                 }
 1537                 ip6->ip6_flow |= htonl(ecn_tos << 20);
 1538                 /* ip6->ip6_flow = ??? (from template) */
 1539                 /* ip6_plen will be filled in ip6_output(). */
 1540                 break;
 1541 #endif
 1542         default:        /*pacify gcc*/
 1543                 packetlen = 0;
 1544                 break;
 1545         }
 1546 
 1547         switch (af) {
 1548 #ifdef INET
 1549         case AF_INET:
 1550             {
 1551                 struct mbuf *opts;
 1552 
 1553                 if (tp->t_inpcb)
 1554                         opts = tp->t_inpcb->inp_options;
 1555                 else
 1556                         opts = NULL;
 1557                 error = ip_output(m, opts, ro,
 1558                         (tp->t_mtudisc ? IP_MTUDISC : 0) |
 1559                         (so->so_options & SO_DONTROUTE),
 1560                         (struct ip_moptions *)0, so);
 1561                 break;
 1562             }
 1563 #endif
 1564 #ifdef INET6
 1565         case AF_INET6:
 1566             {
 1567                 struct ip6_pktopts *opts;
 1568 
 1569                 if (tp->t_in6pcb)
 1570                         opts = tp->t_in6pcb->in6p_outputopts;
 1571                 else
 1572                         opts = NULL;
 1573                 error = ip6_output(m, opts, (struct route_in6 *)ro,
 1574                         so->so_options & SO_DONTROUTE,
 1575                         (struct ip6_moptions *)0, so, NULL);
 1576                 break;
 1577             }
 1578 #endif
 1579         default:
 1580                 error = EAFNOSUPPORT;
 1581                 break;
 1582         }
 1583         if (error) {
 1584 out:
 1585                 if (error == ENOBUFS) {
 1586                         tcpstat.tcps_selfquench++;
 1587 #ifdef INET
 1588                         if (tp->t_inpcb)
 1589                                 tcp_quench(tp->t_inpcb, 0);
 1590 #endif
 1591 #ifdef INET6
 1592                         if (tp->t_in6pcb)
 1593                                 tcp6_quench(tp->t_in6pcb, 0);
 1594 #endif
 1595                         error = 0;
 1596                 } else if ((error == EHOSTUNREACH || error == ENETDOWN) &&
 1597                     TCPS_HAVERCVDSYN(tp->t_state)) {
 1598                         tp->t_softerror = error;
 1599                         error = 0;
 1600                 }
 1601 
 1602                 /* Back out the seqence number advance. */
 1603                 if (sack_rxmit)
 1604                         p->rxmit -= len;
 1605 
 1606                 /* Restart the delayed ACK timer, if necessary. */
 1607                 if (tp->t_flags & TF_DELACK)
 1608                         TCP_RESTART_DELACK(tp);
 1609 
 1610                 return (error);
 1611         }
 1612 
 1613         if (packetlen > tp->t_pmtud_mtu_sent)
 1614                 tp->t_pmtud_mtu_sent = packetlen;
 1615         
 1616         tcpstat.tcps_sndtotal++;
 1617         if (tp->t_flags & TF_DELACK)
 1618                 tcpstat.tcps_delack++;
 1619 
 1620         /*
 1621          * Data sent (as far as we can tell).
 1622          * If this advertises a larger window than any other segment,
 1623          * then remember the size of the advertised window.
 1624          * Any pending ACK has now been sent.
 1625          */
 1626         if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
 1627                 tp->rcv_adv = tp->rcv_nxt + win;
 1628         tp->last_ack_sent = tp->rcv_nxt;
 1629         tp->t_flags &= ~TF_ACKNOW;
 1630         TCP_CLEAR_DELACK(tp);
 1631 #ifdef DIAGNOSTIC
 1632         if (maxburst < 0)
 1633                 printf("tcp_output: maxburst exceeded by %d\n", -maxburst);
 1634 #endif
 1635         if (sendalot && (tp->t_congctl == &tcp_reno_ctl || --maxburst))
 1636                 goto again;
 1637         return (0);
 1638 }
 1639 
 1640 void
 1641 tcp_setpersist(struct tcpcb *tp)
 1642 {
 1643         int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + 2);
 1644         int nticks;
 1645 
 1646         if (TCP_TIMER_ISARMED(tp, TCPT_REXMT))
 1647                 panic("tcp_output REXMT");
 1648         /*
 1649          * Start/restart persistance timer.
 1650          */
 1651         if (t < tp->t_rttmin)
 1652                 t = tp->t_rttmin;
 1653         TCPT_RANGESET(nticks, t * tcp_backoff[tp->t_rxtshift],
 1654             TCPTV_PERSMIN, TCPTV_PERSMAX);
 1655         TCP_TIMER_ARM(tp, TCPT_PERSIST, nticks);
 1656         if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
 1657                 tp->t_rxtshift++;
 1658 }
Cache object: 1f537d6a529ac49f76c6fd5c7df5491c
[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]
This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.
FreeBSD/Linux Kernel Cross Reference sys/netinet/tcp_output.c

FreeBSD/Linux Kernel Cross Reference
sys/netinet/tcp_output.c