FreeBSD/Linux Kernel Cross Reference
sys/net/ipv4/tcp_output.c


    1 /*
    2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
    3  *              operating system.  INET is implemented using the  BSD Socket
    4  *              interface as the means of communication with the user level.
    5  *
    6  *              Implementation of the Transmission Control Protocol(TCP).
    7  *
    8  * Version:     $Id: tcp_output.c,v 1.144 2001/11/06 22:21:08 davem Exp $
    9  *
   10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
   11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
   12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
   13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
   14  *              Florian La Roche, <flla@stud.uni-sb.de>
   15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
   16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
   17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
   18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
   19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
   20  *              Jorge Cwik, <jorge@laser.satlink.net>
   21  */
   22 
   23 /*
   24  * Changes:     Pedro Roque     :       Retransmit queue handled by TCP.
   25  *                              :       Fragmentation on mtu decrease
   26  *                              :       Segment collapse on retransmit
   27  *                              :       AF independence
   28  *
   29  *              Linus Torvalds  :       send_delayed_ack
   30  *              David S. Miller :       Charge memory using the right skb
   31  *                                      during syn/ack processing.
   32  *              David S. Miller :       Output engine completely rewritten.
   33  *              Andrea Arcangeli:       SYNACK carry ts_recent in tsecr.
   34  *              Cacophonix Gaul :       draft-minshall-nagle-01
   35  *              J Hadi Salim    :       ECN support
   36  *
   37  */
   38 
   39 #include <net/tcp.h>
   40 
   41 #include <linux/compiler.h>
   42 #include <linux/smp_lock.h>
   43 
    44 /* People can turn this off for buggy TCPs found in printers etc. */
   45 int sysctl_tcp_retrans_collapse = 1;
   46 
   47 static __inline__
   48 void update_send_head(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
   49 {
   50         tp->send_head = skb->next;
   51         if (tp->send_head == (struct sk_buff *) &sk->write_queue)
   52                 tp->send_head = NULL;
   53         tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
   54         if (tp->packets_out++ == 0)
   55                 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
   56 }
   57 
    58 /* Return SND.NXT, if the window was not shrunk.
    59  * If the window has been shrunk, what should we send? It is not clear at all.
    60  * Using SND.UNA we will fail to open the window; SND.NXT is out of window. :-(
    61  * Anything in between SND.UNA...SND.UNA+SND.WND can also already be
    62  * invalid. OK, let's settle on this for now:
    63  */
   64 static __inline__ __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_opt *tp)
   65 {
   66         if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
   67                 return tp->snd_nxt;
   68         else
   69                 return tp->snd_una+tp->snd_wnd;
   70 }
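
/* [Editor's example] A minimal userspace sketch of the wrap-safe sequence
 * comparison that before() above relies on; seq_before() and acceptable_seq()
 * are local names for this sketch, not kernel API.  With snd_una=1000 and
 * snd_wnd=500: snd_nxt=1400 lies inside the window, so snd_nxt is returned;
 * snd_nxt=1600 lies beyond it, so the right edge 1500 is returned instead.
 */
#include <stdint.h>

static int seq_before(uint32_t s1, uint32_t s2)
{
        return (int32_t)(s1 - s2) < 0;  /* true iff s1 precedes s2 mod 2^32 */
}

static uint32_t acceptable_seq(uint32_t snd_una, uint32_t snd_wnd, uint32_t snd_nxt)
{
        if (!seq_before(snd_una + snd_wnd, snd_nxt))
                return snd_nxt;                 /* window was not shrunk */
        return snd_una + snd_wnd;               /* clamp to the right edge */
}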
   71 
    72 /* Calculate the mss to advertise in a SYN segment.
    73  * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
    74  *
    75  * 1. It is independent of path mtu.
    76  * 2. Ideally, it is the maximal possible segment size, i.e. 65535-40.
    77  * 3. For IPv4 it is reasonable to calculate it from the maximal MTU of
    78  *    attached devices, because some buggy hosts are confused by
    79  *    large MSS.
    80  * 4. We do not implement 3; we advertise an MSS calculated from the
    81  *    first hop device mtu, but allow it to be raised to ip_rt_min_advmss.
    82  *    This may be overridden via information stored in the routing table.
    83  * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
    84  *    probably even Jumbo".
    85  */
   86 static __u16 tcp_advertise_mss(struct sock *sk)
   87 {
   88         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
   89         struct dst_entry *dst = __sk_dst_get(sk);
   90         int mss = tp->advmss;
   91 
   92         if (dst && dst->advmss < mss) {
   93                 mss = dst->advmss;
   94                 tp->advmss = mss;
   95         }
   96 
   97         return (__u16)mss;
   98 }
   99 
   100 /* RFC2861. Reset CWND after an idle period longer than RTO to the "restart window".
   101  * This is the first part of the cwnd validation mechanism. */
  102 static void tcp_cwnd_restart(struct tcp_opt *tp)
  103 {
  104         s32 delta = tcp_time_stamp - tp->lsndtime;
  105         u32 restart_cwnd = tcp_init_cwnd(tp);
  106         u32 cwnd = tp->snd_cwnd;
  107 
  108         tp->snd_ssthresh = tcp_current_ssthresh(tp);
  109         restart_cwnd = min(restart_cwnd, cwnd);
  110 
  111         while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
  112                 cwnd >>= 1;
  113         tp->snd_cwnd = max(cwnd, restart_cwnd);
  114         tp->snd_cwnd_stamp = tcp_time_stamp;
  115         tp->snd_cwnd_used = 0;
  116 }
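
/* [Editor's example] A hedged userspace model of the RFC2861 restart logic
 * above; cwnd_after_idle() is invented for this sketch, not kernel API.
 * With cwnd=10, restart_cwnd=2, rto=200ms and 700ms of idle time, delta
 * loses 200ms per halving: cwnd goes 10 -> 5 -> 2 and stops at restart_cwnd.
 */
#include <stdint.h>

static uint32_t cwnd_after_idle(int32_t delta, uint32_t rto,
                                uint32_t cwnd, uint32_t restart_cwnd)
{
        if (restart_cwnd > cwnd)
                restart_cwnd = cwnd;
        while ((delta -= rto) > 0 && cwnd > restart_cwnd)
                cwnd >>= 1;             /* halve once per full RTO of idleness */
        return cwnd > restart_cwnd ? cwnd : restart_cwnd;
}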
  117 
  118 static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb)
  119 {
  120         u32 now = tcp_time_stamp;
  121 
  122         if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
  123                 tcp_cwnd_restart(tp);
  124 
  125         tp->lsndtime = now;
  126 
   127         /* If this is a reply sent within ato of the last
   128          * received packet, enter pingpong mode.
   129          */
  130         if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
  131                 tp->ack.pingpong = 1;
  132 }
  133 
  134 static __inline__ void tcp_event_ack_sent(struct sock *sk)
  135 {
  136         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
  137 
  138         tcp_dec_quickack_mode(tp);
  139         tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
  140 }
  141 
   142 /* Choose a new window to advertise, update state in tcp_opt for the
  143  * socket, and return result with RFC1323 scaling applied.  The return
  144  * value can be stuffed directly into th->window for an outgoing
  145  * frame.
  146  */
  147 static __inline__ u16 tcp_select_window(struct sock *sk)
  148 {
  149         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
  150         u32 cur_win = tcp_receive_window(tp);
  151         u32 new_win = __tcp_select_window(sk);
  152 
  153         /* Never shrink the offered window */
  154         if(new_win < cur_win) {
  155                 /* Danger Will Robinson!
  156                  * Don't update rcv_wup/rcv_wnd here or else
  157                  * we will not be able to advertise a zero
  158                  * window in time.  --DaveM
  159                  *
  160                  * Relax Will Robinson.
  161                  */
  162                 new_win = cur_win;
  163         }
  164         tp->rcv_wnd = new_win;
  165         tp->rcv_wup = tp->rcv_nxt;
  166 
  167         /* RFC1323 scaling applied */
  168         new_win >>= tp->rcv_wscale;
  169 
  170         /* If we advertise zero window, disable fast path. */
  171         if (new_win == 0)
  172                 tp->pred_flags = 0;
  173 
  174         return new_win;
  175 }
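
/* [Editor's example] What the final >>= rcv_wscale above does, assuming the
 * peer negotiated window scaling (RFC1323): the 16-bit th->window field
 * carries the offered window in units of 2^rcv_wscale bytes.  An offered
 * window of 140000 bytes with rcv_wscale=2 goes on the wire as 35000, and
 * the peer reconstructs 35000 << 2 = 140000.  wire_window() is invented
 * for this sketch.
 */
#include <stdint.h>

static uint16_t wire_window(uint32_t new_win, uint8_t rcv_wscale)
{
        /* Assumes new_win >> rcv_wscale fits in 16 bits, as the
         * window clamp guarantees in the kernel proper. */
        return (uint16_t)(new_win >> rcv_wscale);
}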
  176 
  177 
   178 /* This routine actually transmits TCP packets queued up by
   179  * tcp_do_sendmsg().  This is used by both the initial
  180  * transmission and possible later retransmissions.
  181  * All SKB's seen here are completely headerless.  It is our
  182  * job to build the TCP header, and pass the packet down to
  183  * IP so it can do the same plus pass the packet off to the
  184  * device.
  185  *
  186  * We are working here with either a clone of the original
  187  * SKB, or a fresh unique copy made by the retransmit engine.
  188  */
  189 int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
  190 {
  191         if(skb != NULL) {
  192                 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
  193                 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
  194                 int tcp_header_size = tp->tcp_header_len;
  195                 struct tcphdr *th;
  196                 int sysctl_flags;
  197                 int err;
  198 
  199 #define SYSCTL_FLAG_TSTAMPS     0x1
  200 #define SYSCTL_FLAG_WSCALE      0x2
  201 #define SYSCTL_FLAG_SACK        0x4
  202 
  203                 sysctl_flags = 0;
  204                 if (tcb->flags & TCPCB_FLAG_SYN) {
  205                         tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
  206                         if(sysctl_tcp_timestamps) {
  207                                 tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
  208                                 sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
  209                         }
  210                         if(sysctl_tcp_window_scaling) {
  211                                 tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
  212                                 sysctl_flags |= SYSCTL_FLAG_WSCALE;
  213                         }
  214                         if(sysctl_tcp_sack) {
  215                                 sysctl_flags |= SYSCTL_FLAG_SACK;
  216                                 if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
  217                                         tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
  218                         }
  219                 } else if (tp->eff_sacks) {
   220                         /* A SACK option costs 2 pad bytes and a 2-byte header,
   221                          * plus two 32-bit sequence numbers for each SACK block.
   222                          */
  223                         tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
  224                                             (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
  225                 }
  226                 th = (struct tcphdr *) skb_push(skb, tcp_header_size);
  227                 skb->h.th = th;
  228                 skb_set_owner_w(skb, sk);
  229 
  230                 /* Build TCP header and checksum it. */
  231                 th->source              = sk->sport;
  232                 th->dest                = sk->dport;
  233                 th->seq                 = htonl(tcb->seq);
  234                 th->ack_seq             = htonl(tp->rcv_nxt);
  235                 *(((__u16 *)th) + 6)    = htons(((tcp_header_size >> 2) << 12) | tcb->flags);
  236                 if (tcb->flags & TCPCB_FLAG_SYN) {
  237                         /* RFC1323: The window in SYN & SYN/ACK segments
  238                          * is never scaled.
  239                          */
  240                         th->window      = htons(tp->rcv_wnd);
  241                 } else {
  242                         th->window      = htons(tcp_select_window(sk));
  243                 }
  244                 th->check               = 0;
  245                 th->urg_ptr             = 0;
  246 
  247                 if (tp->urg_mode &&
  248                     between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
  249                         th->urg_ptr             = htons(tp->snd_up-tcb->seq);
  250                         th->urg                 = 1;
  251                 }
  252 
  253                 if (tcb->flags & TCPCB_FLAG_SYN) {
  254                         tcp_syn_build_options((__u32 *)(th + 1),
  255                                               tcp_advertise_mss(sk),
  256                                               (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
  257                                               (sysctl_flags & SYSCTL_FLAG_SACK),
  258                                               (sysctl_flags & SYSCTL_FLAG_WSCALE),
  259                                               tp->rcv_wscale,
  260                                               tcb->when,
  261                                               tp->ts_recent);
  262                 } else {
  263                         tcp_build_and_update_options((__u32 *)(th + 1),
  264                                                      tp, tcb->when);
  265 
  266                         TCP_ECN_send(sk, tp, skb, tcp_header_size);
  267                 }
  268                 tp->af_specific->send_check(sk, th, skb->len, skb);
  269 
  270                 if (tcb->flags & TCPCB_FLAG_ACK)
  271                         tcp_event_ack_sent(sk);
  272 
  273                 if (skb->len != tcp_header_size)
  274                         tcp_event_data_sent(tp, skb);
  275 
  276                 TCP_INC_STATS(TcpOutSegs);
  277 
  278                 err = tp->af_specific->queue_xmit(skb);
  279                 if (err <= 0)
  280                         return err;
  281 
  282                 tcp_enter_cwr(tp);
  283 
   284                 /* NET_XMIT_CN is special. It does not guarantee
   285                  * that this packet was lost. It tells us that the
   286                  * device is about to start dropping packets, or
   287                  * already drops some packets of the same priority,
   288                  * and asks us to send less aggressively.
   289                  */
  290                 return err == NET_XMIT_CN ? 0 : err;
  291         }
  292         return -ENOBUFS;
  293 #undef SYSCTL_FLAG_TSTAMPS
  294 #undef SYSCTL_FLAG_WSCALE
  295 #undef SYSCTL_FLAG_SACK
  296 }
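
/* [Editor's example] Worked header-size arithmetic for the SYN path above,
 * assuming the usual aligned option sizes (TCPOLEN_MSS=4,
 * TCPOLEN_TSTAMP_ALIGNED=12, TCPOLEN_WSCALE_ALIGNED=4,
 * TCPOLEN_SACKPERM_ALIGNED=4).  With timestamps, window scaling and SACK
 * all enabled, a SYN carries 20 + 4 + 12 + 4 = 40 header bytes (SACKPERM
 * rides in the timestamp option's padding), so the doff/flags word built
 * above is htons((40 >> 2) << 12 | flags), i.e. doff = 10.  The helper
 * below is a userspace restatement, not kernel code.
 */
enum { OLEN_MSS = 4, OLEN_TSTAMP = 12, OLEN_WSCALE = 4, OLEN_SACKPERM = 4 };

static int syn_header_size(int tstamps, int wscale, int sack)
{
        int size = 20 + OLEN_MSS;       /* sizeof(struct tcphdr) + MSS option */
        if (tstamps)
                size += OLEN_TSTAMP;
        if (wscale)
                size += OLEN_WSCALE;
        if (sack && !tstamps)
                size += OLEN_SACKPERM;  /* otherwise carried in TS padding */
        return size;                    /* 40 with all three enabled */
}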
  297 
  298 
   299 /* This is the main buffer sending routine. We queue the buffer
   300  * and decide whether to leave it queued or to transmit it now.
   301  *
   302  * NOTE: the probe0 timer is not checked; do not forget tcp_push_pending_frames,
   303  * otherwise the socket can stall.
  304  */
  305 void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue, unsigned cur_mss)
  306 {
  307         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
  308 
  309         /* Advance write_seq and place onto the write_queue. */
  310         tp->write_seq = TCP_SKB_CB(skb)->end_seq;
  311         __skb_queue_tail(&sk->write_queue, skb);
  312         tcp_charge_skb(sk, skb);
  313 
  314         if (!force_queue && tp->send_head == NULL && tcp_snd_test(tp, skb, cur_mss, tp->nonagle)) {
  315                 /* Send it out now. */
  316                 TCP_SKB_CB(skb)->when = tcp_time_stamp;
  317                 if (tcp_transmit_skb(sk, skb_clone(skb, sk->allocation)) == 0) {
  318                         tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
  319                         tcp_minshall_update(tp, cur_mss, skb);
  320                         if (tp->packets_out++ == 0)
  321                                 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
  322                         return;
  323                 }
  324         }
  325         /* Queue it, remembering where we must start sending. */
  326         if (tp->send_head == NULL)
  327                 tp->send_head = skb;
  328 }
  329 
   330 /* Send the _single_ skb sitting at the send head. A true push of
   331  * pending frames is still required afterwards to set up the probe timer etc.
   332  */
  333 void tcp_push_one(struct sock *sk, unsigned cur_mss)
  334 {
  335         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
  336         struct sk_buff *skb = tp->send_head;
  337 
  338         if (tcp_snd_test(tp, skb, cur_mss, 1)) {
  339                 /* Send it out now. */
  340                 TCP_SKB_CB(skb)->when = tcp_time_stamp;
  341                 if (tcp_transmit_skb(sk, skb_clone(skb, sk->allocation)) == 0) {
  342                         tp->send_head = NULL;
  343                         tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
  344                         if (tp->packets_out++ == 0)
  345                                 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
  346                         return;
  347                 }
  348         }
  349 }
  350 
   351 /* Split a fragmented skb into two parts at offset len. */
  352 
  353 static void skb_split(struct sk_buff *skb, struct sk_buff *skb1, u32 len)
  354 {
  355         int i;
  356         int pos = skb->len - skb->data_len;
  357 
  358         if (len < pos) {
  359                 /* Split line is inside header. */
  360                 memcpy(skb_put(skb1, pos-len), skb->data + len, pos-len);
  361 
   362                 /* And move the paged data (frags) over as is. */
  363                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
  364                         skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
  365 
  366                 skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
  367                 skb_shinfo(skb)->nr_frags = 0;
  368 
  369                 skb1->data_len = skb->data_len;
  370                 skb1->len += skb1->data_len;
  371                 skb->data_len = 0;
  372                 skb->len = len;
  373                 skb->tail = skb->data+len;
  374         } else {
  375                 int k = 0;
  376                 int nfrags = skb_shinfo(skb)->nr_frags;
  377 
  378                 /* Second chunk has no header, nothing to copy. */
  379 
  380                 skb_shinfo(skb)->nr_frags = 0;
  381                 skb1->len = skb1->data_len = skb->len - len;
  382                 skb->len = len;
  383                 skb->data_len = len - pos;
  384 
  385                 for (i=0; i<nfrags; i++) {
  386                         int size = skb_shinfo(skb)->frags[i].size;
  387                         if (pos + size > len) {
  388                                 skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
  389 
  390                                 if (pos < len) {
   391                                         /* Split the frag.
   392                                          * We have two variants in this case:
   393                                          * 1. Move the whole frag to the second
   394                                          *    part, if possible. E.g. this
   395                                          *    approach is mandatory for TUX,
   396                                          *    where splitting is expensive.
   397                                          * 2. Split accurately. We do the latter.
   398                                          */
  399                                         get_page(skb_shinfo(skb)->frags[i].page);
  400                                         skb_shinfo(skb1)->frags[0].page_offset += (len-pos);
  401                                         skb_shinfo(skb1)->frags[0].size -= (len-pos);
  402                                         skb_shinfo(skb)->frags[i].size = len-pos;
  403                                         skb_shinfo(skb)->nr_frags++;
  404                                 }
  405                                 k++;
  406                         } else {
  407                                 skb_shinfo(skb)->nr_frags++;
  408                         }
  409                         pos += size;
  410                 }
  411                 skb_shinfo(skb1)->nr_frags = k;
  412         }
  413 }
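
/* [Editor's example] Concrete skb_split() cases, numbers invented for
 * illustration: take skb->len=1000 with data_len=400, i.e. 600 linear bytes
 * (pos=600).  Splitting at len=500 takes the first branch: the last 100
 * linear bytes are copied into skb1 and every frag moves over untouched.
 * Splitting at len=800 takes the second branch: the linear part stays put,
 * and the frag straddling offset 800 is shared via get_page() and carved at
 * the boundary, each half keeping its own page reference.
 */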
  414 
  415 /* Function to create two new TCP segments.  Shrinks the given segment
  416  * to the specified size and appends a new segment with the rest of the
  417  * packet to the list.  This won't be called frequently, I hope. 
  418  * Remember, these are still headerless SKBs at this point.
  419  */
  420 static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
  421 {
  422         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
  423         struct sk_buff *buff;
  424         int nsize = skb->len - len;
  425         u16 flags;
  426 
  427         if (skb_cloned(skb) &&
  428             skb_is_nonlinear(skb) &&
  429             pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
  430                 return -ENOMEM;
  431 
  432         /* Get a new skb... force flag on. */
  433         buff = tcp_alloc_skb(sk, nsize, GFP_ATOMIC);
  434         if (buff == NULL)
  435                 return -ENOMEM; /* We'll just try again later. */
  436         tcp_charge_skb(sk, buff);
  437 
  438         /* Correct the sequence numbers. */
  439         TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
  440         TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
  441         TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
  442 
  443         /* PSH and FIN should only be set in the second packet. */
  444         flags = TCP_SKB_CB(skb)->flags;
  445         TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
  446         TCP_SKB_CB(buff)->flags = flags;
  447         TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
  448         if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
  449                 tp->lost_out++;
  450                 tp->left_out++;
  451         }
  452         TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;
  453 
  454         if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) {
  455                 /* Copy and checksum data tail into the new buffer. */
  456                 buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
  457                                                        nsize, 0);
  458 
  459                 skb_trim(skb, len);
  460 
  461                 skb->csum = csum_block_sub(skb->csum, buff->csum, len);
  462         } else {
  463                 skb->ip_summed = CHECKSUM_HW;
  464                 skb_split(skb, buff, len);
  465         }
  466 
  467         buff->ip_summed = skb->ip_summed;
  468 
   469         /* Looks stupid, but our code really uses the when field of
   470          * skbs which it has never sent before. --ANK
   471          */
  472         TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
  473 
  474         /* Link BUFF into the send queue. */
  475         __skb_append(skb, buff);
  476 
  477         return 0;
  478 }
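
/* [Editor's example] Sequence bookkeeping in tcp_fragment(), concretely:
 * splitting an skb covering [1000,2000) at len=600 leaves skb as
 * [1000,1600) and buff as [1600,2000).  PSH and FIN migrate to buff, so
 * they still apply to the final byte of the original range.
 */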
  479 
   480 /* This function synchronizes snd mss to the current pmtu/exthdr set.
   481 
   482    tp->user_mss is the mss set by the user via TCP_MAXSEG. It does NOT
   483    account for TCP options, but includes only the bare TCP header.
   484 
   485    tp->mss_clamp is the mss negotiated at connection setup.
   486    It is the minimum of user_mss and the mss received with the SYN.
   487    It also does not include TCP options.
  488 
  489    tp->pmtu_cookie is last pmtu, seen by this function.
  490 
  491    tp->mss_cache is current effective sending mss, including
  492    all tcp options except for SACKs. It is evaluated,
  493    taking into account current pmtu, but never exceeds
  494    tp->mss_clamp.
  495 
  496    NOTE1. rfc1122 clearly states that advertised MSS
  497    DOES NOT include either tcp or ip options.
  498 
  499    NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
  500    this function.                       --ANK (980731)
  501  */
  502 
  503 int tcp_sync_mss(struct sock *sk, u32 pmtu)
  504 {
  505         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
  506         int mss_now;
  507 
  508         /* Calculate base mss without TCP options:
  509            It is MMS_S - sizeof(tcphdr) of rfc1122
  510          */
  511 
  512         mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
  513 
  514         /* Clamp it (mss_clamp does not include tcp options) */
  515         if (mss_now > tp->mss_clamp)
  516                 mss_now = tp->mss_clamp;
  517 
  518         /* Now subtract optional transport overhead */
  519         mss_now -= tp->ext_header_len;
  520 
  521         /* Then reserve room for full set of TCP options and 8 bytes of data */
  522         if (mss_now < 48)
  523                 mss_now = 48;
  524 
  525         /* Now subtract TCP options size, not including SACKs */
  526         mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
  527 
  528         /* Bound mss with half of window */
  529         if (tp->max_window && mss_now > (tp->max_window>>1))
  530                 mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);
  531 
  532         /* And store cached results */
  533         tp->pmtu_cookie = pmtu;
  534         tp->mss_cache = mss_now;
  535         return mss_now;
  536 }
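
/* [Editor's example] Worked numbers for tcp_sync_mss(), assuming IPv4
 * (net_header_len=20), no extension headers, a generous mss_clamp, and
 * timestamps enabled (tcp_header_len = 20 + 12 = 32).  sync_mss() below is
 * a userspace restatement of the arithmetic (the max_window bound is
 * omitted), not kernel API: pmtu=1500 yields 1500 - 20 - 20 = 1460, minus
 * 12 option bytes -> 1448 data bytes per segment.
 */
static int sync_mss(int pmtu, int net_hlen, int tcp_hlen,
                    int mss_clamp, int ext_hlen)
{
        int mss = pmtu - net_hlen - 20; /* bare TCP header is 20 bytes */
        if (mss > mss_clamp)
                mss = mss_clamp;
        mss -= ext_hlen;
        if (mss < 48)
                mss = 48;               /* room for options + 8 data bytes */
        mss -= tcp_hlen - 20;           /* all options except SACKs */
        return mss;                     /* 1448 for the example above */
}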
  537 
  538 
  539 /* This routine writes packets to the network.  It advances the
  540  * send_head.  This happens as incoming acks open up the remote
  541  * window for us.
  542  *
  543  * Returns 1, if no segments are in flight and we have queued segments, but
  544  * cannot send anything now because of SWS or another problem.
  545  */
  546 int tcp_write_xmit(struct sock *sk, int nonagle)
  547 {
  548         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
  549         unsigned int mss_now;
  550 
  551         /* If we are closed, the bytes will have to remain here.
  552          * In time closedown will finish, we empty the write queue and all
  553          * will be happy.
  554          */
  555         if(sk->state != TCP_CLOSE) {
  556                 struct sk_buff *skb;
  557                 int sent_pkts = 0;
  558 
  559                 /* Account for SACKS, we may need to fragment due to this.
  560                  * It is just like the real MSS changing on us midstream.
  561                  * We also handle things correctly when the user adds some
  562                  * IP options mid-stream.  Silly to do, but cover it.
  563                  */
  564                 mss_now = tcp_current_mss(sk); 
  565 
  566                 while((skb = tp->send_head) &&
  567                       tcp_snd_test(tp, skb, mss_now, tcp_skb_is_last(sk, skb) ? nonagle : 1)) {
  568                         if (skb->len > mss_now) {
  569                                 if (tcp_fragment(sk, skb, mss_now))
  570                                         break;
  571                         }
  572 
  573                         TCP_SKB_CB(skb)->when = tcp_time_stamp;
  574                         if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
  575                                 break;
  576                         /* Advance the send_head.  This one is sent out. */
  577                         update_send_head(sk, tp, skb);
  578                         tcp_minshall_update(tp, mss_now, skb);
  579                         sent_pkts = 1;
  580                 }
  581 
  582                 if (sent_pkts) {
  583                         tcp_cwnd_validate(sk, tp);
  584                         return 0;
  585                 }
  586 
  587                 return !tp->packets_out && tp->send_head;
  588         }
  589         return 0;
  590 }
  591 
  592 /* This function returns the amount that we can raise the
  593  * usable window based on the following constraints
  594  *  
  595  * 1. The window can never be shrunk once it is offered (RFC 793)
  596  * 2. We limit memory per socket
  597  *
  598  * RFC 1122:
  599  * "the suggested [SWS] avoidance algorithm for the receiver is to keep
   600  *  RCV.NXT + RCV.WND fixed until:
   601  *  RCV.BUFF - RCV.USER - RCV.WND >= min(1/2 RCV.BUFF, MSS)"
  602  *
  603  * i.e. don't raise the right edge of the window until you can raise
  604  * it at least MSS bytes.
  605  *
  606  * Unfortunately, the recommended algorithm breaks header prediction,
  607  * since header prediction assumes th->window stays fixed.
  608  *
  609  * Strictly speaking, keeping th->window fixed violates the receiver
  610  * side SWS prevention criteria. The problem is that under this rule
  611  * a stream of single byte packets will cause the right side of the
  612  * window to always advance by a single byte.
  613  * 
  614  * Of course, if the sender implements sender side SWS prevention
  615  * then this will not be a problem.
  616  * 
  617  * BSD seems to make the following compromise:
  618  * 
  619  *      If the free space is less than the 1/4 of the maximum
  620  *      space available and the free space is less than 1/2 mss,
  621  *      then set the window to 0.
  622  *      [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
  623  *      Otherwise, just prevent the window from shrinking
  624  *      and from being larger than the largest representable value.
  625  *
  626  * This prevents incremental opening of the window in the regime
  627  * where TCP is limited by the speed of the reader side taking
  628  * data out of the TCP receive queue. It does nothing about
  629  * those cases where the window is constrained on the sender side
  630  * because the pipeline is full.
  631  *
  632  * BSD also seems to "accidentally" limit itself to windows that are a
  633  * multiple of MSS, at least until the free space gets quite small.
  634  * This would appear to be a side effect of the mbuf implementation.
  635  * Combining these two algorithms results in the observed behavior
  636  * of having a fixed window size at almost all times.
  637  *
  638  * Below we obtain similar behavior by forcing the offered window to
  639  * a multiple of the mss when it is feasible to do so.
  640  *
  641  * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
  642  * Regular options like TIMESTAMP are taken into account.
  643  */
  644 u32 __tcp_select_window(struct sock *sk)
  645 {
  646         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
   647         /* MSS for the peer's data.  Previous versions used mss_clamp
   648          * here.  I don't know if the value based on our guesses
   649          * of the peer's MSS is better for performance.  It's more correct
   650          * but may be worse for performance because of rcv_mss
   651          * fluctuations.  --SAW  1998/11/1
  652          */
  653         int mss = tp->ack.rcv_mss;
  654         int free_space = tcp_space(sk);
  655         int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
  656         int window;
  657 
  658         if (mss > full_space)
  659                 mss = full_space; 
  660 
  661         if (free_space < full_space/2) {
  662                 tp->ack.quick = 0;
  663 
  664                 if (tcp_memory_pressure)
  665                         tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);
  666 
  667                 if (free_space < mss)
  668                         return 0;
  669         }
  670 
  671         if (free_space > tp->rcv_ssthresh)
  672                 free_space = tp->rcv_ssthresh;
  673 
  674         /* Get the largest window that is a nice multiple of mss.
  675          * Window clamp already applied above.
  676          * If our current window offering is within 1 mss of the
  677          * free space we just keep it. This prevents the divide
  678          * and multiply from happening most of the time.
  679          * We also don't do any window rounding when the free space
  680          * is too small.
  681          */
  682         window = tp->rcv_wnd;
  683         if (window <= free_space - mss || window > free_space)
  684                 window = (free_space/mss)*mss;
  685 
  686         return window;
  687 }
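
/* [Editor's example] The final rounding above, in isolation (round_window()
 * is invented for this sketch): with free_space=10000 and mss=1460, a
 * current window of 8000 satisfies window <= free_space - mss (8540), so it
 * is recomputed as (10000/1460)*1460 = 8760; a current window of 9000 is
 * within one mss of free space and is kept, skipping the divide.
 */
static int round_window(int window, int free_space, int mss)
{
        if (window <= free_space - mss || window > free_space)
                window = (free_space / mss) * mss;  /* largest mss multiple */
        return window;
}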
  688 
  689 /* Attempt to collapse two adjacent SKB's during retransmission. */
  690 static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
  691 {
  692         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
  693         struct sk_buff *next_skb = skb->next;
  694 
   695         /* The first test we must make is that neither of these two
   696          * SKB's is still referenced by someone else.
   697          */
  698         if(!skb_cloned(skb) && !skb_cloned(next_skb)) {
  699                 int skb_size = skb->len, next_skb_size = next_skb->len;
  700                 u16 flags = TCP_SKB_CB(skb)->flags;
  701 
  702                 /* Also punt if next skb has been SACK'd. */
  703                 if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
  704                         return;
  705 
  706                 /* Next skb is out of window. */
  707                 if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd))
  708                         return;
  709 
  710                 /* Punt if not enough space exists in the first SKB for
  711                  * the data in the second, or the total combined payload
  712                  * would exceed the MSS.
  713                  */
  714                 if ((next_skb_size > skb_tailroom(skb)) ||
  715                     ((skb_size + next_skb_size) > mss_now))
  716                         return;
  717 
  718                 /* Ok.  We will be able to collapse the packet. */
  719                 __skb_unlink(next_skb, next_skb->list);
  720 
  721                 memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
  722 
  723                 if (next_skb->ip_summed == CHECKSUM_HW)
  724                         skb->ip_summed = CHECKSUM_HW;
  725 
  726                 if (skb->ip_summed != CHECKSUM_HW)
  727                         skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
  728 
  729                 /* Update sequence range on original skb. */
  730                 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
  731 
  732                 /* Merge over control information. */
  733                 flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
  734                 TCP_SKB_CB(skb)->flags = flags;
  735 
  736                 /* All done, get rid of second SKB and account for it so
  737                  * packet counting does not break.
  738                  */
  739                 TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
  740                 if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
  741                         tp->retrans_out--;
  742                 if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) {
  743                         tp->lost_out--;
  744                         tp->left_out--;
  745                 }
  746                 /* Reno case is special. Sigh... */
  747                 if (!tp->sack_ok && tp->sacked_out) {
  748                         tp->sacked_out--;
  749                         tp->left_out--;
  750                 }
  751 
  752                 /* Not quite right: it can be > snd.fack, but
  753                  * it is better to underestimate fackets.
  754                  */
  755                 if (tp->fackets_out)
  756                         tp->fackets_out--;
  757                 tcp_free_skb(sk, next_skb);
  758                 tp->packets_out--;
  759         }
  760 }
  761 
  762 /* Do a simple retransmit without using the backoff mechanisms in
  763  * tcp_timer. This is used for path mtu discovery. 
  764  * The socket is already locked here.
  765  */ 
  766 void tcp_simple_retransmit(struct sock *sk)
  767 {
  768         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
  769         struct sk_buff *skb;
  770         unsigned int mss = tcp_current_mss(sk);
  771         int lost = 0;
  772 
  773         for_retrans_queue(skb, sk, tp) {
  774                 if (skb->len > mss && 
  775                     !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
  776                         if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
  777                                 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
  778                                 tp->retrans_out--;
  779                         }
  780                         if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) {
  781                                 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
  782                                 tp->lost_out++;
  783                                 lost = 1;
  784                         }
  785                 }
  786         }
  787 
  788         if (!lost)
  789                 return;
  790 
  791         tcp_sync_left_out(tp);
  792 
  793         /* Don't muck with the congestion window here.
  794          * Reason is that we do not increase amount of _data_
  795          * in network, but units changed and effective
  796          * cwnd/ssthresh really reduced now.
  797          */
  798         if (tp->ca_state != TCP_CA_Loss) {
  799                 tp->high_seq = tp->snd_nxt;
  800                 tp->snd_ssthresh = tcp_current_ssthresh(tp);
  801                 tp->prior_ssthresh = 0;
  802                 tp->undo_marker = 0;
  803                 tp->ca_state = TCP_CA_Loss;
  804         }
  805         tcp_xmit_retransmit_queue(sk);
  806 }
  807 
  808 /* This retransmits one SKB.  Policy decisions and retransmit queue
  809  * state updates are done by the caller.  Returns non-zero if an
  810  * error occurred which prevented the send.
  811  */
  812 int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
  813 {
  814         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
  815         unsigned int cur_mss = tcp_current_mss(sk);
  816         int err;
  817 
   818         /* Do not send more than we have queued. 1/4 is reserved for possible
   819          * copying overhead: fragmentation, tunneling, mangling etc.
  820          */
  821         if (atomic_read(&sk->wmem_alloc) > min(sk->wmem_queued+(sk->wmem_queued>>2),sk->sndbuf))
  822                 return -EAGAIN;
  823 
   824         /* If the receiver has shrunk its window, and this skb is out of
   825          * the new window, do not retransmit it. The exception is the
   826          * case when the window is shrunk to zero, in which case
   827          * our retransmit serves as a zero window probe.
  828          */
  829         if (!before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)
  830             && TCP_SKB_CB(skb)->seq != tp->snd_una)
  831                 return -EAGAIN;
  832 
  833         if(skb->len > cur_mss) {
  834                 if(tcp_fragment(sk, skb, cur_mss))
  835                         return -ENOMEM; /* We'll try again later. */
  836 
  837                 /* New SKB created, account for it. */
  838                 tp->packets_out++;
  839         }
  840 
  841         /* Collapse two adjacent packets if worthwhile and we can. */
  842         if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
  843            (skb->len < (cur_mss >> 1)) &&
  844            (skb->next != tp->send_head) &&
  845            (skb->next != (struct sk_buff *)&sk->write_queue) &&
  846            (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(skb->next)->nr_frags == 0) &&
  847            (sysctl_tcp_retrans_collapse != 0))
  848                 tcp_retrans_try_collapse(sk, skb, cur_mss);
  849 
  850         if(tp->af_specific->rebuild_header(sk))
  851                 return -EHOSTUNREACH; /* Routing failure or similar. */
  852 
  853         /* Some Solaris stacks overoptimize and ignore the FIN on a
  854          * retransmit when old data is attached.  So strip it off
  855          * since it is cheap to do so and saves bytes on the network.
  856          */
  857         if(skb->len > 0 &&
  858            (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
  859            tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
  860                 if (!pskb_trim(skb, 0)) {
  861                         TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
  862                         skb->ip_summed = CHECKSUM_NONE;
  863                         skb->csum = 0;
  864                 }
  865         }
  866 
  867         /* Make a copy, if the first transmission SKB clone we made
  868          * is still in somebody's hands, else make a clone.
  869          */
  870         TCP_SKB_CB(skb)->when = tcp_time_stamp;
  871 
  872         err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
  873                                     pskb_copy(skb, GFP_ATOMIC):
  874                                     skb_clone(skb, GFP_ATOMIC)));
  875 
  876         if (err == 0) {
  877                 /* Update global TCP statistics. */
  878                 TCP_INC_STATS(TcpRetransSegs);
  879 
  880 #if FASTRETRANS_DEBUG > 0
  881                 if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
  882                         if (net_ratelimit())
  883                                 printk(KERN_DEBUG "retrans_out leaked.\n");
  884                 }
  885 #endif
  886                 TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
  887                 tp->retrans_out++;
  888 
  889                 /* Save stamp of the first retransmit. */
  890                 if (!tp->retrans_stamp)
  891                         tp->retrans_stamp = TCP_SKB_CB(skb)->when;
  892 
  893                 tp->undo_retrans++;
  894 
  895                 /* snd_nxt is stored to detect loss of retransmitted segment,
  896                  * see tcp_input.c tcp_sacktag_write_queue().
  897                  */
  898                 TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
  899         }
  900         return err;
  901 }
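
/* [Editor's example] The queue guard at the top of tcp_retransmit_skb(),
 * with invented numbers: if wmem_queued=40000 and sndbuf=65536, the bound is
 * min(40000 + 40000/4, 65536) = 50000, so a retransmit attempt while
 * wmem_alloc exceeds 50000 returns -EAGAIN.  The extra quarter is the slack
 * reserved for fragmentation/tunneling/mangling copies.
 */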
  902 
  903 /* This gets called after a retransmit timeout, and the initially
  904  * retransmitted data is acknowledged.  It tries to continue
  905  * resending the rest of the retransmit queue, until either
  906  * we've sent it all or the congestion window limit is reached.
  907  * If doing SACK, the first ACK which comes back for a timeout
  908  * based retransmit packet might feed us FACK information again.
   909  * If so, we use it to avoid unnecessary retransmissions.
  910  */
  911 void tcp_xmit_retransmit_queue(struct sock *sk)
  912 {
  913         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
  914         struct sk_buff *skb;
  915         int packet_cnt = tp->lost_out;
  916 
  917         /* First pass: retransmit lost packets. */
  918         if (packet_cnt) {
  919                 for_retrans_queue(skb, sk, tp) {
  920                         __u8 sacked = TCP_SKB_CB(skb)->sacked;
  921 
  922                         if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
  923                                 return;
  924 
  925                         if (sacked&TCPCB_LOST) {
  926                                 if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
  927                                         if (tcp_retransmit_skb(sk, skb))
  928                                                 return;
  929                                         if (tp->ca_state != TCP_CA_Loss)
  930                                                 NET_INC_STATS_BH(TCPFastRetrans);
  931                                         else
  932                                                 NET_INC_STATS_BH(TCPSlowStartRetrans);
  933 
  934                                         if (skb == skb_peek(&sk->write_queue))
  935                                                 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
  936                                 }
  937 
  938                                 if (--packet_cnt <= 0)
  939                                         break;
  940                         }
  941                 }
  942         }
  943 
  944         /* OK, demanded retransmission is finished. */
  945 
  946         /* Forward retransmissions are possible only during Recovery. */
  947         if (tp->ca_state != TCP_CA_Recovery)
  948                 return;
  949 
  950         /* No forward retransmissions in Reno are possible. */
  951         if (!tp->sack_ok)
  952                 return;
  953 
   954         /* Yeah, we have to make a difficult choice between forward transmission
   955          * and retransmission... Both ways have their merits...
   956          *
   957          * For now we do not retransmit anything, while we have some new
   958          * segments to send.
  959          */
  960 
  961         if (tcp_may_send_now(sk, tp))
  962                 return;
  963 
  964         packet_cnt = 0;
  965 
  966         for_retrans_queue(skb, sk, tp) {
  967                 if(++packet_cnt > tp->fackets_out)
  968                         break;
  969 
  970                 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
  971                         break;
  972 
  973                 if(TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
  974                         continue;
  975 
  976                 /* Ok, retransmit it. */
  977                 if(tcp_retransmit_skb(sk, skb))
  978                         break;
  979 
  980                 if (skb == skb_peek(&sk->write_queue))
  981                         tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
  982 
  983                 NET_INC_STATS_BH(TCPForwardRetrans);
  984         }
  985 }
  986 
  987 
   988 /* Send a FIN.  The caller locks the socket for us.  This cannot be
  989  * allowed to fail queueing a FIN frame under any circumstances.
  990  */
  991 void tcp_send_fin(struct sock *sk)
  992 {
  993         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);    
  994         struct sk_buff *skb = skb_peek_tail(&sk->write_queue);
  995         unsigned int mss_now;
  996         
  997         /* Optimization, tack on the FIN if we have a queue of
  998          * unsent frames.  But be careful about outgoing SACKS
  999          * and IP options.
 1000          */
 1001         mss_now = tcp_current_mss(sk); 
 1002 
 1003         if(tp->send_head != NULL) {
 1004                 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
 1005                 TCP_SKB_CB(skb)->end_seq++;
 1006                 tp->write_seq++;
 1007         } else {
 1008                 /* Socket is locked, keep trying until memory is available. */
 1009                 for (;;) {
 1010                         skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
 1011                         if (skb)
 1012                                 break;
 1013                         yield();
 1014                 }
 1015 
 1016                 /* Reserve space for headers and prepare control bits. */
 1017                 skb_reserve(skb, MAX_TCP_HEADER);
 1018                 skb->csum = 0;
 1019                 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
 1020                 TCP_SKB_CB(skb)->sacked = 0;
 1021 
 1022                 /* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */
 1023                 TCP_SKB_CB(skb)->seq = tp->write_seq;
 1024                 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
 1025                 tcp_send_skb(sk, skb, 1, mss_now);
 1026         }
 1027         __tcp_push_pending_frames(sk, tp, mss_now, 1);
 1028 }
 1029 
 1030 /* We get here when a process closes a file descriptor (either due to
 1031  * an explicit close() or as a byproduct of exit()'ing) and there
 1032  * was unread data in the receive queue.  This behavior is recommended
 1033  * by draft-ietf-tcpimpl-prob-03.txt section 3.10.  -DaveM
 1034  */
 1035 void tcp_send_active_reset(struct sock *sk, int priority)
 1036 {
 1037         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 1038         struct sk_buff *skb;
 1039 
 1040         /* NOTE: No TCP options attached and we never retransmit this. */
 1041         skb = alloc_skb(MAX_TCP_HEADER, priority);
 1042         if (!skb) {
 1043                 NET_INC_STATS(TCPAbortFailed);
 1044                 return;
 1045         }
 1046 
 1047         /* Reserve space for headers and prepare control bits. */
 1048         skb_reserve(skb, MAX_TCP_HEADER);
 1049         skb->csum = 0;
 1050         TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
 1051         TCP_SKB_CB(skb)->sacked = 0;
 1052 
 1053         /* Send it off. */
 1054         TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
 1055         TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
 1056         TCP_SKB_CB(skb)->when = tcp_time_stamp;
 1057         if (tcp_transmit_skb(sk, skb))
 1058                 NET_INC_STATS(TCPAbortFailed);
 1059 }
 1060 
 1061 /* WARNING: This routine must only be called when we have already sent
 1062  * a SYN packet that crossed the incoming SYN that caused this routine
 1063  * to get called. If this assumption fails then the initial rcv_wnd
 1064  * and rcv_wscale values will not be correct.
 1065  */
 1066 int tcp_send_synack(struct sock *sk)
 1067 {
 1068         struct sk_buff* skb;
 1069 
 1070         skb = skb_peek(&sk->write_queue);
 1071         if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) {
 1072                 printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
 1073                 return -EFAULT;
 1074         }
 1075         if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) {
 1076                 if (skb_cloned(skb)) {
 1077                         struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
 1078                         if (nskb == NULL)
 1079                                 return -ENOMEM;
 1080                         __skb_unlink(skb, &sk->write_queue);
 1081                         __skb_queue_head(&sk->write_queue, nskb);
 1082                         tcp_free_skb(sk, skb);
 1083                         tcp_charge_skb(sk, nskb);
 1084                         skb = nskb;
 1085                 }
 1086 
 1087                 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
 1088                 TCP_ECN_send_synack(&sk->tp_pinfo.af_tcp, skb);
 1089         }
 1090         TCP_SKB_CB(skb)->when = tcp_time_stamp;
 1091         return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
 1092 }
 1093 
 1094 /*
 1095  * Prepare a SYN-ACK.
 1096  */
 1097 struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 1098                                  struct open_request *req)
 1099 {
 1100         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 1101         struct tcphdr *th;
 1102         int tcp_header_size;
 1103         struct sk_buff *skb;
 1104 
 1105         skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
 1106         if (skb == NULL)
 1107                 return NULL;
 1108 
 1109         /* Reserve space for headers. */
 1110         skb_reserve(skb, MAX_TCP_HEADER);
 1111 
 1112         skb->dst = dst_clone(dst);
 1113 
 1114         tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
 1115                            (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
 1116                            (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
 1117                            /* SACK_PERM is in the place of NOP NOP of TS */
 1118                            ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
 1119         skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);
 1120 
 1121         memset(th, 0, sizeof(struct tcphdr));
 1122         th->syn = 1;
 1123         th->ack = 1;
 1124         TCP_ECN_make_synack(req, th);
 1125         th->source = sk->sport;
 1126         th->dest = req->rmt_port;
 1127         TCP_SKB_CB(skb)->seq = req->snt_isn;
 1128         TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
 1129         th->seq = htonl(TCP_SKB_CB(skb)->seq);
 1130         th->ack_seq = htonl(req->rcv_isn + 1);
 1131         if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
 1132                 __u8 rcv_wscale; 
 1133                 /* Set this up on the first call only */
 1134                 req->window_clamp = tp->window_clamp ? : dst->window;
 1135                 /* tcp_full_space because it is guaranteed to be the first packet */
 1136                 tcp_select_initial_window(tcp_full_space(sk), 
 1137                         dst->advmss - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
 1138                         &req->rcv_wnd,
 1139                         &req->window_clamp,
 1140                         req->wscale_ok,
 1141                         &rcv_wscale);
 1142                 req->rcv_wscale = rcv_wscale; 
 1143         }
 1144 
 1145         /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
 1146         th->window = htons(req->rcv_wnd);
 1147 
 1148         TCP_SKB_CB(skb)->when = tcp_time_stamp;
 1149         tcp_syn_build_options((__u32 *)(th + 1), dst->advmss, req->tstamp_ok,
 1150                               req->sack_ok, req->wscale_ok, req->rcv_wscale,
 1151                               TCP_SKB_CB(skb)->when,
 1152                               req->ts_recent);
 1153 
 1154         skb->csum = 0;
 1155         th->doff = (tcp_header_size >> 2);
 1156         TCP_INC_STATS(TcpOutSegs);
 1157         return skb;
 1158 }
 1159 
 1160 /* 
 1161  * Do all connect socket setups that can be done AF independent.
 1162  */ 
 1163 static inline void tcp_connect_init(struct sock *sk)
 1164 {
 1165         struct dst_entry *dst = __sk_dst_get(sk);
 1166         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 1167 
 1168         /* We'll fix this up when we get a response from the other end.
 1169          * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
 1170          */
 1171         tp->tcp_header_len = sizeof(struct tcphdr) +
 1172                 (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
 1173 
  1174         /* If the user gave us TCP_MAXSEG, record it as the clamp */
 1175         if (tp->user_mss)
 1176                 tp->mss_clamp = tp->user_mss;
 1177         tp->max_window = 0;
 1178         tcp_sync_mss(sk, dst->pmtu);
 1179 
 1180         if (!tp->window_clamp)
 1181                 tp->window_clamp = dst->window;
 1182         tp->advmss = dst->advmss;
 1183         tcp_initialize_rcv_mss(sk);
 1184 
 1185         tcp_select_initial_window(tcp_full_space(sk),
 1186                                   tp->advmss - (tp->ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
 1187                                   &tp->rcv_wnd,
 1188                                   &tp->window_clamp,
 1189                                   sysctl_tcp_window_scaling,
 1190                                   &tp->rcv_wscale);
 1191 
 1192         tp->rcv_ssthresh = tp->rcv_wnd;
 1193 
 1194         sk->err = 0;
 1195         sk->done = 0;
 1196         tp->snd_wnd = 0;
 1197         tcp_init_wl(tp, tp->write_seq, 0);
 1198         tp->snd_una = tp->write_seq;
 1199         tp->snd_sml = tp->write_seq;
 1200         tp->rcv_nxt = 0;
 1201         tp->rcv_wup = 0;
 1202         tp->copied_seq = 0;
 1203 
 1204         tp->rto = TCP_TIMEOUT_INIT;
 1205         tp->retransmits = 0;
 1206         tcp_clear_retrans(tp);
 1207 }
 1208 
 1209 /*
 1210  * Build a SYN and send it off.
 1211  */ 
 1212 int tcp_connect(struct sock *sk)
 1213 {
 1214         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 1215         struct sk_buff *buff;
 1216 
 1217         tcp_connect_init(sk);
 1218 
 1219         buff = alloc_skb(MAX_TCP_HEADER + 15, sk->allocation);
 1220         if (unlikely(buff == NULL))
 1221                 return -ENOBUFS;
 1222 
 1223         /* Reserve space for headers. */
 1224         skb_reserve(buff, MAX_TCP_HEADER);
 1225 
 1226         TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
 1227         TCP_ECN_send_syn(tp, buff);
 1228         TCP_SKB_CB(buff)->sacked = 0;
 1229         buff->csum = 0;
 1230         TCP_SKB_CB(buff)->seq = tp->write_seq++;
 1231         TCP_SKB_CB(buff)->end_seq = tp->write_seq;
 1232         tp->snd_nxt = tp->write_seq;
 1233         tp->pushed_seq = tp->write_seq;
 1234 
 1235         /* Send it off. */
 1236         TCP_SKB_CB(buff)->when = tcp_time_stamp;
 1237         tp->retrans_stamp = TCP_SKB_CB(buff)->when;
 1238         __skb_queue_tail(&sk->write_queue, buff);
 1239         tcp_charge_skb(sk, buff);
 1240         tp->packets_out++;
 1241         tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
 1242         TCP_INC_STATS(TcpActiveOpens);
 1243 
 1244         /* Timer for repeating the SYN until an answer. */
 1245         tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
 1246         return 0;
 1247 }
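/*
 * [Editor's example, not part of the original file] The SYN queued above
 * carries no data, yet seq and end_seq differ by one: the SYN flag itself
 * consumes a sequence number (RFC 793). A minimal standalone sketch of
 * that bookkeeping; struct and function names are hypothetical:
 */
#if 0
#include <assert.h>

struct seq_cb { unsigned int seq, end_seq; };

static unsigned int queue_syn(struct seq_cb *cb, unsigned int write_seq)
{
        cb->seq = write_seq++;          /* SYN occupies one sequence number */
        cb->end_seq = write_seq;        /* ...despite carrying zero data    */
        return write_seq;               /* becomes snd_nxt                  */
}

int main(void)
{
        struct seq_cb cb;
        unsigned int snd_nxt = queue_syn(&cb, 1000);

        assert(cb.end_seq - cb.seq == 1);
        assert(snd_nxt == 1001);
        return 0;
}
#endif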
 1248 
 1249 /* Send out a delayed ack; the caller does the policy checking
 1250  * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
 1251  * for details.
 1252  */
 1253 void tcp_send_delayed_ack(struct sock *sk)
 1254 {
 1255         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
 1256         int ato = tp->ack.ato;
 1257         unsigned long timeout;
 1258 
 1259         if (ato > TCP_DELACK_MIN) {
 1260                 int max_ato = HZ/2;
 1261 
 1262                 if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
 1263                         max_ato = TCP_DELACK_MAX;
 1264 
 1265                 /* Slow path, intersegment interval is "high". */
 1266 
 1267                 /* If some rtt estimate is known, use it to bound delayed ack.
 1268                  * Do not use tp->rto here, use results of rtt measurements
 1269                  * directly.
 1270                  */
 1271                 if (tp->srtt) {
 1272                         int rtt = max(tp->srtt>>3, TCP_DELACK_MIN);
 1273 
 1274                         if (rtt < max_ato)
 1275                                 max_ato = rtt;
 1276                 }
 1277 
 1278                 ato = min(ato, max_ato);
 1279         }
 1280 
 1281         /* Stay within the limit we were given */
 1282         timeout = jiffies + ato;
 1283 
 1284         /* Use the new timeout only if there wasn't an older one already set. */
 1285         if (tp->ack.pending&TCP_ACK_TIMER) {
 1286                 /* If delack timer was blocked or is about to expire,
 1287                  * send ACK now.
 1288                  */
 1289                 if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
 1290                         tcp_send_ack(sk);
 1291                         return;
 1292                 }
 1293 
 1294                 if (!time_before(timeout, tp->ack.timeout))
 1295                         timeout = tp->ack.timeout;
 1296         }
 1297         tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
 1298         tp->ack.timeout = timeout;
 1299         if (!mod_timer(&tp->delack_timer, timeout))
 1300                 sock_hold(sk);
 1301 }
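/*
 * [Editor's example, not part of the original file] The clamping above,
 * standalone: the delayed-ACK interval is bounded by HZ/2 (or by
 * TCP_DELACK_MAX in pingpong mode) and, when an RTT estimate exists, by
 * the smoothed RTT itself -- note that tp->srtt is stored left-shifted
 * by 3. Constant values are assumptions for the example (HZ=100):
 */
#if 0
#define HZ              100
#define DELACK_MIN      (HZ / 25)       /* 40 ms  */
#define DELACK_MAX      (HZ / 5)        /* 200 ms */

static int imax(int a, int b) { return a > b ? a : b; }
static int imin(int a, int b) { return a < b ? a : b; }

static int clamp_ato(int ato, int srtt, int pingpong)
{
        if (ato > DELACK_MIN) {
                int max_ato = pingpong ? DELACK_MAX : HZ / 2;

                if (srtt) {
                        /* srtt is <<3; use the measurement, not rto. */
                        int rtt = imax(srtt >> 3, DELACK_MIN);

                        if (rtt < max_ato)
                                max_ato = rtt;
                }
                ato = imin(ato, max_ato);
        }
        return ato;
}
#endif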
 1302 
 1303 /* This routine sends an ack and also updates the window. */
 1304 void tcp_send_ack(struct sock *sk)
 1305 {
 1306         /* If we have been reset, we may not send again. */
 1307         if(sk->state != TCP_CLOSE) {
 1308                 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 1309                 struct sk_buff *buff;
 1310 
 1311                 /* We are not putting this on the write queue, so
 1312                  * tcp_transmit_skb() will set the ownership to this
 1313                  * sock.
 1314                  */
 1315                 buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
 1316                 if (buff == NULL) {
 1317                         tcp_schedule_ack(tp);
 1318                         tp->ack.ato = TCP_ATO_MIN;
 1319                         tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
 1320                         return;
 1321                 }
 1322 
 1323                 /* Reserve space for headers and prepare control bits. */
 1324                 skb_reserve(buff, MAX_TCP_HEADER);
 1325                 buff->csum = 0;
 1326                 TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
 1327                 TCP_SKB_CB(buff)->sacked = 0;
 1328 
 1329                 /* Send it off; this clears delayed acks for us. */
 1330                 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
 1331                 TCP_SKB_CB(buff)->when = tcp_time_stamp;
 1332                 tcp_transmit_skb(sk, buff);
 1333         }
 1334 }
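/*
 * [Editor's note, not part of the original file] On allocation failure
 * the ACK above is not lost: it is re-flagged as pending and the
 * delayed-ACK timer is armed so the send is retried once memory pressure
 * eases. A sketch of the pattern with hypothetical names:
 */
#if 0
/* try_alloc returns NULL under memory pressure; arm_timer retries later. */
static void send_ack_or_reschedule(void *(*try_alloc)(void),
                                   void (*arm_timer)(unsigned long),
                                   void (*send)(void *))
{
        void *buf = try_alloc();

        if (buf == NULL) {
                arm_timer(200);         /* ~TCP_DELACK_MAX, example value */
                return;
        }
        send(buf);
}
#endif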
 1335 
 1336 /* This routine sends a packet with an out-of-date sequence
 1337  * number. It assumes the other end will try to ack it.
 1338  *
 1339  * Question: what should we do in urgent mode?
 1340  * 4.4BSD forces sending a single byte of data. We cannot send
 1341  * out-of-window data, because we have SND.NXT==SND.MAX...
 1342  *
 1343  * Current solution: send TWO zero-length segments in urgent mode:
 1344  * one with SEG.SEQ=SND.UNA to deliver the urgent pointer, and another,
 1345  * out-of-date one with SND.UNA-1 to probe the window.
 1346  */
 1347 static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
 1348 {
 1349         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 1350         struct sk_buff *skb;
 1351 
 1352         /* We don't queue it; tcp_transmit_skb() sets ownership. */
 1353         skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
 1354         if (skb == NULL) 
 1355                 return -1;
 1356 
 1357         /* Reserve space for headers and set control bits. */
 1358         skb_reserve(skb, MAX_TCP_HEADER);
 1359         skb->csum = 0;
 1360         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
 1361         TCP_SKB_CB(skb)->sacked = urgent;
 1362 
 1363         /* Use a previous sequence.  This should cause the other
 1364          * end to send an ack.  Don't queue or clone SKB, just
 1365          * send it.
 1366          */
 1367         TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
 1368         TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
 1369         TCP_SKB_CB(skb)->when = tcp_time_stamp;
 1370         return tcp_transmit_skb(sk, skb);
 1371 }
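/*
 * [Editor's example, not part of the original file] A zero-length segment
 * with SEG.SEQ = SND.UNA-1 lies below the receiver's rcv_nxt, so the
 * receiver drops it but (per RFC 793) must answer with an ACK carrying
 * its current window -- exactly what a window probe needs. A standalone,
 * wrap-safe check of the receiver-side condition; the helper name is
 * hypothetical:
 */
#if 0
#include <assert.h>

/* A segment is old if it ends before what the receiver expects next. */
static int is_old_segment(unsigned int seg_seq, unsigned int seg_len,
                          unsigned int rcv_nxt)
{
        return (int)(seg_seq + seg_len - rcv_nxt) < 0;
}

int main(void)
{
        unsigned int rcv_nxt = 5000;

        assert(is_old_segment(rcv_nxt - 1, 0, rcv_nxt)); /* probe: ACK only */
        assert(!is_old_segment(rcv_nxt, 0, rcv_nxt));    /* in sequence     */
        return 0;
}
#endif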
 1372 
 1373 int tcp_write_wakeup(struct sock *sk)
 1374 {
 1375         if (sk->state != TCP_CLOSE) {
 1376                 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 1377                 struct sk_buff *skb;
 1378 
 1379                 if ((skb = tp->send_head) != NULL &&
 1380                     before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) {
 1381                         int err;
 1382                         int mss = tcp_current_mss(sk);
 1383                         int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq;
 1384 
 1385                         if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
 1386                                 tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
 1387 
 1388                         /* We are probing the opening of a window,
 1389                          * but the window size is != 0; this must be
 1390                          * a result of sender-side SWS avoidance.
 1391                          */
 1392                         if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
 1393                             skb->len > mss) {
 1394                                 seg_size = min(seg_size, mss);
 1395                                 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
 1396                                 if (tcp_fragment(sk, skb, seg_size))
 1397                                         return -1;
 1398                         }
 1399                         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
 1400                         TCP_SKB_CB(skb)->when = tcp_time_stamp;
 1401                         err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
 1402                         if (!err) {
 1403                                 update_send_head(sk, tp, skb);
 1404                         }
 1405                         return err;
 1406                 } else {
 1407                         if (tp->urg_mode &&
 1408                             between(tp->snd_up, tp->snd_una+1, tp->snd_una+0xFFFF))
 1409                                 tcp_xmit_probe_skb(sk, TCPCB_URG);
 1410                         return tcp_xmit_probe_skb(sk, 0);
 1411                 }
 1412         }
 1413         return -1;
 1414 }
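/*
 * [Editor's example, not part of the original file] The seg_size
 * computation above, standalone: it measures how much of the usable
 * window remains from this segment's seq onward, and the head segment is
 * fragmented when it does not fit. Unsigned arithmetic keeps the math
 * wrap-safe modulo 2^32; the values are hypothetical:
 */
#if 0
#include <assert.h>

static unsigned int window_room(unsigned int snd_una, unsigned int snd_wnd,
                                unsigned int seq)
{
        return snd_una + snd_wnd - seq;
}

int main(void)
{
        /* una=1000, wnd=500: a segment at seq=1200 has 300 bytes of room,
         * so a 400-byte skb would be fragmented before transmission. */
        assert(window_room(1000, 500, 1200) == 300);
        return 0;
}
#endif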
 1415 
 1416 /* A window probe timeout has occurred.  If the window is not closed,
 1417  * send a partial packet; otherwise send a zero-window probe.
 1418  */
 1419 void tcp_send_probe0(struct sock *sk)
 1420 {
 1421         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 1422         int err;
 1423 
 1424         err = tcp_write_wakeup(sk);
 1425 
 1426         if (tp->packets_out || !tp->send_head) {
 1427                 /* Cancel probe timer, if it is not required. */
 1428                 tp->probes_out = 0;
 1429                 tp->backoff = 0;
 1430                 return;
 1431         }
 1432 
 1433         if (err <= 0) {
 1434                 if (tp->backoff < sysctl_tcp_retries2)
 1435                         tp->backoff++;
 1436                 tp->probes_out++;
 1437                 tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, 
 1438                                       min(tp->rto << tp->backoff, TCP_RTO_MAX));
 1439         } else {
 1440                 /* If the packet was not sent due to local congestion,
 1441                  * do not back off and do not count it in probes_out.
 1442                  * Let local senders fight for local resources.
 1443                  *
 1444                  * Still use the accumulated backoff, though.
 1445                  */
 1446                 if (!tp->probes_out)
 1447                         tp->probes_out=1;
 1448                 tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, 
 1449                                       min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
 1450         }
 1451 }
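/*
 * [Editor's example, not part of the original file] The probe timer above
 * backs off exponentially and is capped: by TCP_RTO_MAX in the normal
 * case, or by the shorter TCP_RESOURCE_PROBE_INTERVAL when the send
 * failed only due to local congestion. A standalone sketch of the
 * arithmetic; constants are assumptions (HZ=100, TCP_RTO_MAX=120*HZ):
 */
#if 0
#define HZ              100
#define RTO_MAX         (120UL * HZ)

static unsigned long ulmin(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

/* Next probe0 delay in jiffies after `backoff` unanswered probes. */
static unsigned long probe0_timeout(unsigned long rto, int backoff)
{
        return ulmin(rto << backoff, RTO_MAX);
}
#endif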
