/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_output.c	8.4 (Berkeley) 5/24/95
 * $FreeBSD: releng/5.0/sys/netinet/tcp_output.c 105291 2002-10-16 19:16:33Z dillon $
 */

#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_mac.h"
#include "opt_tcpdebug.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>

#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#endif
#include <netinet/tcp.h>
#define	TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif

#ifdef IPSEC
#include <netinet6/ipsec.h>
#endif /*IPSEC*/

#ifdef FAST_IPSEC
#include <netipsec/ipsec.h>
#define	IPSEC
#endif /*FAST_IPSEC*/

#include <machine/in_cksum.h>

#ifdef notyet
extern struct mbuf *m_copypack();
#endif

int path_mtu_discovery = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW,
	&path_mtu_discovery, 1, "Enable Path MTU Discovery");

int ss_fltsz = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW,
	&ss_fltsz, 1, "Slow start flight size");

int ss_fltsz_local = 4;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW,
	&ss_fltsz_local, 1, "Slow start flight size for local networks");

int tcp_do_newreno = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, &tcp_do_newreno,
	0, "Enable NewReno Algorithms");
/*
 * TCP output routine: figure out what should be sent and send it.
 */
int
tcp_output(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	long len, win;
	int off, flags, error;
	struct mbuf *m;
	struct ip *ip = NULL;
	struct ipovly *ipov = NULL;
	struct tcphdr *th;
	u_char opt[TCP_MAXOLEN];
	unsigned ipoptlen, optlen, hdrlen;
	int idle, sendalot;
#if 0
	int maxburst = TCP_MAXBURST;
#endif
	struct rmxp_tao *taop;
	struct rmxp_tao tao_noncached;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
	int isipv6;

	isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
#endif

#ifndef INET6
	mtx_assert(&tp->t_inpcb->inp_mtx, MA_OWNED);
#endif

	/*
	 * Determine length of data that should be transmitted,
	 * and flags that will be used.
	 * If there is some data or critical controls (SYN, RST)
	 * to send, then transmit; otherwise, investigate further.
	 */
	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
	if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
		/*
		 * We have been idle for "a while" and no acks are
		 * expected to clock out any data we send --
		 * slow start to get ack "clock" running again.
		 *
		 * Set the slow-start flight size depending on whether
		 * this is a local network or not.
		 */
		int ss = ss_fltsz;
#ifdef INET6
		if (isipv6) {
			if (in6_localaddr(&tp->t_inpcb->in6p_faddr))
				ss = ss_fltsz_local;
		} else
#endif /* INET6 */
		if (in_localaddr(tp->t_inpcb->inp_faddr))
			ss = ss_fltsz_local;
		tp->snd_cwnd = tp->t_maxseg * ss;
	}
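	/*
	 * Worked example (illustrative, assuming the defaults above and
	 * a typical Ethernet MSS): with ss_fltsz = 1 and t_maxseg = 1460,
	 * the restart window is one segment (1460 bytes); for a peer on
	 * a local network, ss_fltsz_local = 4 allows roughly 5840 bytes
	 * before the ACK clock is re-established.
	 */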
	tp->t_flags &= ~TF_LASTIDLE;
	if (idle) {
		if (tp->t_flags & TF_MORETOCOME) {
			tp->t_flags |= TF_LASTIDLE;
			idle = 0;
		}
	}
again:
	sendalot = 0;
	off = tp->snd_nxt - tp->snd_una;
	win = min(tp->snd_wnd, tp->snd_cwnd);
	win = min(win, tp->snd_bwnd);

	flags = tcp_outflags[tp->t_state];
	/*
	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
	 * state flags.
	 */
	if (tp->t_flags & TF_NEEDFIN)
		flags |= TH_FIN;
	if (tp->t_flags & TF_NEEDSYN)
		flags |= TH_SYN;

	/*
	 * If in persist timeout with window of 0, send 1 byte.
	 * Otherwise, if window is small but nonzero
	 * and timer expired, we will send what we can
	 * and go to transmit state.
	 */
	if (tp->t_force) {
		if (win == 0) {
			/*
			 * If we still have some data to send, then
			 * clear the FIN bit.  Usually this would
			 * happen below when it realizes that we
			 * aren't sending all the data.  However,
			 * if we have exactly 1 byte of unsent data,
			 * then it won't clear the FIN bit below,
			 * and if we are in persist state, we wind
			 * up sending the packet without recording
			 * that we sent the FIN bit.
			 *
			 * We can't just blindly clear the FIN bit,
			 * because if we don't have any more data
			 * to send then the probe will be the FIN
			 * itself.
			 */
			if (off < so->so_snd.sb_cc)
				flags &= ~TH_FIN;
			win = 1;
		} else {
			callout_stop(tp->tt_persist);
			tp->t_rxtshift = 0;
		}
	}

	/*
	 * If snd_nxt == snd_max and we have transmitted a FIN, the
	 * offset will be > 0 even if so_snd.sb_cc is 0, resulting in
	 * a negative length.  This can also occur when TCP opens up
	 * its congestion window while receiving additional duplicate
	 * acks after fast-retransmit because TCP will reset snd_nxt
	 * to snd_max after the fast-retransmit.
	 *
	 * In the normal retransmit-FIN-only case, however, snd_nxt will
	 * be set to snd_una, the offset will be 0, and the length may
	 * wind up 0.
	 */
	len = (long)ulmin(so->so_snd.sb_cc, win) - off;

	if ((taop = tcp_gettaocache(&tp->t_inpcb->inp_inc)) == NULL) {
		taop = &tao_noncached;
		bzero(taop, sizeof(*taop));
	}

	/*
	 * Lop off the SYN bit if it has already been sent.  However, if
	 * this is the SYN-SENT state, the segment contains data, and we
	 * don't know that the foreign host supports TAO, suppress sending
	 * the segment.
	 */
	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
		flags &= ~TH_SYN;
		off--, len++;
		if (len > 0 && tp->t_state == TCPS_SYN_SENT &&
		    taop->tao_ccsent == 0)
			return 0;
	}
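	/*
	 * The off--, len++ adjustment above accounts for the fact that
	 * the SYN consumes one sequence number but has no counterpart
	 * in the socket buffer, so once snd_nxt has advanced past
	 * snd_una the data offset is pulled back by one to stay aligned
	 * with so_snd.
	 */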

	/*
	 * Be careful not to send data and/or FIN on SYN segments
	 * in cases when no CC option will be sent.
	 * This measure is needed to prevent interoperability problems
	 * with TCP implementations that are not fully conformant.
	 */
	if ((flags & TH_SYN) &&
	    ((tp->t_flags & TF_NOOPT) || !(tp->t_flags & TF_REQ_CC) ||
	     ((flags & TH_ACK) && !(tp->t_flags & TF_RCVD_CC)))) {
		len = 0;
		flags &= ~TH_FIN;
	}

	if (len < 0) {
		/*
		 * If FIN has been sent but not acked,
		 * but we haven't been called to retransmit,
		 * len will be < 0.  Otherwise, window shrank
		 * after we sent into it.  If window shrank to 0,
		 * cancel pending retransmit, pull snd_nxt back
		 * to (closed) window, and set the persist timer
		 * if it isn't already going.  If the window didn't
		 * close completely, just wait for an ACK.
		 */
		len = 0;
		if (win == 0) {
			callout_stop(tp->tt_rexmt);
			tp->t_rxtshift = 0;
			tp->snd_nxt = tp->snd_una;
			if (!callout_active(tp->tt_persist))
				tcp_setpersist(tp);
		}
	}

	/*
	 * len will be >= 0 after this point.  Truncate to the maximum
	 * segment length and ensure that FIN is removed if the length
	 * no longer contains the last data byte.
	 */
	if (len > tp->t_maxseg) {
		len = tp->t_maxseg;
		sendalot = 1;
	}
	if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
		flags &= ~TH_FIN;

	win = sbspace(&so->so_rcv);

	/*
	 * Sender silly window avoidance.  We transmit under the following
	 * conditions when len is non-zero:
	 *
	 *	- We have a full segment
	 *	- This is the last buffer in a write()/send() and we are
	 *	  either idle or running NODELAY
	 *	- we've timed out (e.g. persist timer)
	 *	- we have more than 1/2 the maximum send window's worth of
	 *	  data (the receiver may be limiting the window size)
	 *	- we need to retransmit
	 */
	if (len) {
		if (len == tp->t_maxseg)
			goto send;
		/*
		 * NOTE! on localhost connections an 'ack' from the remote
		 * end may occur synchronously with the output and cause
		 * us to flush a buffer queued with moretocome.  XXX
		 *
		 * note: the len + off check is almost certainly unnecessary.
		 */
		if (!(tp->t_flags & TF_MORETOCOME) &&	/* normal case */
		    (idle || (tp->t_flags & TF_NODELAY)) &&
		    len + off >= so->so_snd.sb_cc &&
		    (tp->t_flags & TF_NOPUSH) == 0) {
			goto send;
		}
		if (tp->t_force)			/* typ. timeout case */
			goto send;
		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
			goto send;
		if (SEQ_LT(tp->snd_nxt, tp->snd_max))	/* retransmit case */
			goto send;
	}
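	/*
	 * Reaching this point with len != 0 means none of the checks
	 * above fired.  Illustrative example (assuming an MSS of 1460):
	 * 800 bytes are pending, earlier data is still unacknowledged,
	 * TF_NODELAY is clear, and no timeout or retransmit is in
	 * progress -- the sub-MSS segment is held back (Nagle) until an
	 * ACK arrives or more data accumulates.
	 */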

	/*
	 * Compare available window to amount of window
	 * known to peer (as advertised window less
	 * next expected input).  If the difference is at least two
	 * max size segments, or at least 50% of the maximum possible
	 * window, then we want to send a window update to the peer.
	 */
	if (win > 0) {
		/*
		 * "adv" is the amount we can increase the window,
		 * taking into account that we are limited by
		 * TCP_MAXWIN << tp->rcv_scale.
		 */
		long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) -
			(tp->rcv_adv - tp->rcv_nxt);

		if (adv >= (long) (2 * tp->t_maxseg))
			goto send;
		if (2 * adv >= (long) so->so_rcv.sb_hiwat)
			goto send;
	}
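	/*
	 * Example (illustrative, with t_maxseg = 1460): once the receive
	 * buffer has drained enough that the window can grow by at least
	 * 2920 bytes beyond what the peer already knows, or by at least
	 * half of so_rcv.sb_hiwat, a pure window update is sent; smaller
	 * increases wait so that updates are batched.
	 */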

	/*
	 * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
	 * is also a catch-all for the retransmit timer timeout case.
	 */
	if (tp->t_flags & TF_ACKNOW)
		goto send;
	if ((flags & TH_RST) ||
	    ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
		goto send;
	if (SEQ_GT(tp->snd_up, tp->snd_una))
		goto send;
	/*
	 * If our state indicates that FIN should be sent
	 * and we have not yet done so, then we need to send.
	 */
	if (flags & TH_FIN &&
	    ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
		goto send;

	/*
	 * TCP window updates are not reliable, rather a polling protocol
	 * using ``persist'' packets is used to ensure receipt of window
	 * updates.  The three ``states'' for the output side are:
	 *	idle			not doing retransmits or persists
	 *	persisting		to move a small or zero window
	 *	(re)transmitting	and thereby not persisting
	 *
	 * callout_active(tp->tt_persist)
	 *	is true when we are in persist state.
	 * tp->t_force
	 *	is set when we are called to send a persist packet.
	 * callout_active(tp->tt_rexmt)
	 *	is set when we are retransmitting
	 * The output side is idle when both timers are zero.
	 *
	 * If send window is too small, there is data to transmit, and no
	 * retransmit or persist is pending, then go to persist state.
	 * If nothing happens soon, send when timer expires:
	 *	if window is nonzero, transmit what we can,
	 *	otherwise force out a byte.
	 */
	if (so->so_snd.sb_cc && !callout_active(tp->tt_rexmt) &&
	    !callout_active(tp->tt_persist)) {
		tp->t_rxtshift = 0;
		tcp_setpersist(tp);
	}

	/*
	 * No reason to send a segment, just return.
	 */
	return (0);

send:
	/*
	 * Before ESTABLISHED, force sending of initial options
	 * unless TCP set not to do any options.
	 * NOTE: we assume that the IP/TCP header plus TCP options
	 * always fit in a single mbuf, leaving room for a maximum
	 * link header, i.e.
	 *	max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
	 */
	optlen = 0;
#ifdef INET6
	if (isipv6)
		hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
	else
#endif
	hdrlen = sizeof (struct tcpiphdr);
	if (flags & TH_SYN) {
		tp->snd_nxt = tp->iss;
		if ((tp->t_flags & TF_NOOPT) == 0) {
			u_short mss;

			opt[0] = TCPOPT_MAXSEG;
			opt[1] = TCPOLEN_MAXSEG;
			mss = htons((u_short) tcp_mssopt(tp));
			(void)memcpy(opt + 2, &mss, sizeof(mss));
			optlen = TCPOLEN_MAXSEG;

			if ((tp->t_flags & TF_REQ_SCALE) &&
			    ((flags & TH_ACK) == 0 ||
			    (tp->t_flags & TF_RCVD_SCALE))) {
				*((u_int32_t *)(opt + optlen)) = htonl(
					TCPOPT_NOP << 24 |
					TCPOPT_WINDOW << 16 |
					TCPOLEN_WINDOW << 8 |
					tp->request_r_scale);
				optlen += 4;
			}
		}
	}
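	/*
	 * Resulting option bytes on the wire, for reference: the MSS
	 * option is kind 2, length 4, followed by the 16-bit MSS in
	 * network order; the window-scale word built above packs to
	 * 0x01 0x03 0x03 <shift>, i.e. a NOP pad plus kind 3, length 3,
	 * and the requested shift count.
	 */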

	/*
	 * Send a timestamp and echo-reply if this is a SYN and our side
	 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
	 * and our peer have sent timestamps in our SYN's.
	 */
	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
	    (flags & TH_RST) == 0 &&
	    ((flags & TH_ACK) == 0 ||
	     (tp->t_flags & TF_RCVD_TSTMP))) {
		u_int32_t *lp = (u_int32_t *)(opt + optlen);

		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(ticks);
		*lp   = htonl(tp->ts_recent);
		optlen += TCPOLEN_TSTAMP_APPA;
	}
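	/*
	 * TCPOPT_TSTAMP_HDR is the RFC 1323 appendix A layout packed
	 * into one 32-bit word: NOP, NOP, kind 8, length 10 (0x0101080a),
	 * followed by the 4-byte TSval (ticks) and the 4-byte TSecr
	 * (ts_recent), for TCPOLEN_TSTAMP_APPA = 12 bytes in total.
	 */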

	/*
	 * Send `CC-family' options if our side wants to use them (TF_REQ_CC),
	 * options are allowed (!TF_NOOPT) and it's not a RST.
	 */
	if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
	    (flags & TH_RST) == 0) {
		switch (flags & (TH_SYN|TH_ACK)) {
		/*
		 * This is a normal ACK, send CC if we received CC before
		 * from our peer.
		 */
		case TH_ACK:
			if (!(tp->t_flags & TF_RCVD_CC))
				break;
			/*FALLTHROUGH*/

		/*
		 * We can only get here in T/TCP's SYN_SENT* state, when
		 * we're sending a non-SYN segment without waiting for
		 * the ACK of our SYN.  A check above assures that we only
		 * do this if our peer understands T/TCP.
		 */
		case 0:
			opt[optlen++] = TCPOPT_NOP;
			opt[optlen++] = TCPOPT_NOP;
			opt[optlen++] = TCPOPT_CC;
			opt[optlen++] = TCPOLEN_CC;
			*(u_int32_t *)&opt[optlen] = htonl(tp->cc_send);

			optlen += 4;
			break;

		/*
		 * This is our initial SYN, check whether we have to use
		 * CC or CC.new.
		 */
		case TH_SYN:
			opt[optlen++] = TCPOPT_NOP;
			opt[optlen++] = TCPOPT_NOP;
			opt[optlen++] = tp->t_flags & TF_SENDCCNEW ?
						TCPOPT_CCNEW : TCPOPT_CC;
			opt[optlen++] = TCPOLEN_CC;
			*(u_int32_t *)&opt[optlen] = htonl(tp->cc_send);
			optlen += 4;
			break;

		/*
		 * This is a SYN,ACK; send CC and CC.echo if we received
		 * CC from our peer.
		 */
		case (TH_SYN|TH_ACK):
			if (tp->t_flags & TF_RCVD_CC) {
				opt[optlen++] = TCPOPT_NOP;
				opt[optlen++] = TCPOPT_NOP;
				opt[optlen++] = TCPOPT_CC;
				opt[optlen++] = TCPOLEN_CC;
				*(u_int32_t *)&opt[optlen] =
					htonl(tp->cc_send);
				optlen += 4;
				opt[optlen++] = TCPOPT_NOP;
				opt[optlen++] = TCPOPT_NOP;
				opt[optlen++] = TCPOPT_CCECHO;
				opt[optlen++] = TCPOLEN_CC;
				*(u_int32_t *)&opt[optlen] =
					htonl(tp->cc_recv);
				optlen += 4;
			}
			break;
		}
	}

	hdrlen += optlen;

#ifdef INET6
	if (isipv6)
		ipoptlen = ip6_optlen(tp->t_inpcb);
	else
#endif
	if (tp->t_inpcb->inp_options)
		ipoptlen = tp->t_inpcb->inp_options->m_len -
				offsetof(struct ipoption, ipopt_list);
	else
		ipoptlen = 0;
#ifdef IPSEC
	ipoptlen += ipsec_hdrsiz_tcp(tp);
#endif

	/*
	 * Adjust data length if insertion of options will
	 * bump the packet length beyond the t_maxopd length.
	 * Clear the FIN bit because we cut off the tail of
	 * the segment.
	 */
	if (len + optlen + ipoptlen > tp->t_maxopd) {
		/*
		 * If there is still more to send, don't close the connection.
		 */
		flags &= ~TH_FIN;
		len = tp->t_maxopd - optlen - ipoptlen;
		sendalot = 1;
	}
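	/*
	 * Example (illustrative): with t_maxopd = 1460 and 12 bytes of
	 * timestamp options, the payload for this segment is trimmed to
	 * 1448 bytes and sendalot forces another pass through the
	 * "again" loop for the remainder.
	 */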

	/*#ifdef DIAGNOSTIC*/
#ifdef INET6
	if (max_linkhdr + hdrlen > MCLBYTES)
#else
	if (max_linkhdr + hdrlen > MHLEN)
#endif
		panic("tcphdr too big");
	/*#endif*/

	/*
	 * Grab a header mbuf, attaching a copy of data to
	 * be transmitted, and initialize the header from
	 * the template for sends on this connection.
	 */
	if (len) {
		if (tp->t_force && len == 1)
			tcpstat.tcps_sndprobe++;
		else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
			tcpstat.tcps_sndrexmitpack++;
			tcpstat.tcps_sndrexmitbyte += len;
		} else {
			tcpstat.tcps_sndpack++;
			tcpstat.tcps_sndbyte += len;
		}
#ifdef notyet
		if ((m = m_copypack(so->so_snd.sb_mb, off,
		    (int)len, max_linkhdr + hdrlen)) == 0) {
			error = ENOBUFS;
			goto out;
		}
		/*
		 * m_copypack left space for our hdr; use it.
		 */
		m->m_len += hdrlen;
		m->m_data -= hdrlen;
#else
		MGETHDR(m, M_DONTWAIT, MT_HEADER);
		if (m == NULL) {
			error = ENOBUFS;
			goto out;
		}
#ifdef INET6
		if (MHLEN < hdrlen + max_linkhdr) {
			MCLGET(m, M_DONTWAIT);
			if ((m->m_flags & M_EXT) == 0) {
				m_freem(m);
				error = ENOBUFS;
				goto out;
			}
		}
#endif
		m->m_data += max_linkhdr;
		m->m_len = hdrlen;
		if (len <= MHLEN - hdrlen - max_linkhdr) {
			m_copydata(so->so_snd.sb_mb, off, (int) len,
			    mtod(m, caddr_t) + hdrlen);
			m->m_len += len;
		} else {
			m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
			if (m->m_next == 0) {
				(void) m_free(m);
				error = ENOBUFS;
				goto out;
			}
		}
#endif
		/*
		 * If we're sending everything we've got, set PUSH.
		 * (This will keep happy those implementations which only
		 * give data to the user when a buffer fills or
		 * a PUSH comes in.)
		 */
		if (off + len == so->so_snd.sb_cc)
			flags |= TH_PUSH;
	} else {
		if (tp->t_flags & TF_ACKNOW)
			tcpstat.tcps_sndacks++;
		else if (flags & (TH_SYN|TH_FIN|TH_RST))
			tcpstat.tcps_sndctrl++;
		else if (SEQ_GT(tp->snd_up, tp->snd_una))
			tcpstat.tcps_sndurg++;
		else
			tcpstat.tcps_sndwinup++;

		MGETHDR(m, M_DONTWAIT, MT_HEADER);
		if (m == NULL) {
			error = ENOBUFS;
			goto out;
		}
#ifdef INET6
		if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
		    MHLEN >= hdrlen) {
			MH_ALIGN(m, hdrlen);
		} else
#endif
		m->m_data += max_linkhdr;
		m->m_len = hdrlen;
	}
	m->m_pkthdr.rcvif = (struct ifnet *)0;
#ifdef MAC
	mac_create_mbuf_from_socket(so, m);
#endif
#ifdef INET6
	if (isipv6) {
		ip6 = mtod(m, struct ip6_hdr *);
		th = (struct tcphdr *)(ip6 + 1);
		tcp_fillheaders(tp, ip6, th);
	} else
#endif /* INET6 */
	{
		ip = mtod(m, struct ip *);
		ipov = (struct ipovly *)ip;
		th = (struct tcphdr *)(ip + 1);
		/* this picks up the pseudo header (w/o the length) */
		tcp_fillheaders(tp, ip, th);
	}

	/*
	 * Fill in fields, remembering maximum advertised
	 * window for use in delaying messages about window sizes.
	 * If resending a FIN, be sure not to use a new sequence number.
	 */
	if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
	    tp->snd_nxt == tp->snd_max)
		tp->snd_nxt--;
	/*
	 * If we are doing retransmissions, then snd_nxt will
	 * not reflect the first unsent octet.  For ACK only
	 * packets, we do not want the sequence number of the
	 * retransmitted packet, we want the sequence number
	 * of the next unsent octet.  So, if there is no data
	 * (and no SYN or FIN), use snd_max instead of snd_nxt
	 * when filling in ti_seq.  But if we are in persist
	 * state, snd_max might reflect one byte beyond the
	 * right edge of the window, so use snd_nxt in that
	 * case, since we know we aren't doing a retransmission.
	 * (retransmit and persist are mutually exclusive...)
	 */
	if (len || (flags & (TH_SYN|TH_FIN))
	    || callout_active(tp->tt_persist))
		th->th_seq = htonl(tp->snd_nxt);
	else
		th->th_seq = htonl(tp->snd_max);
	th->th_ack = htonl(tp->rcv_nxt);
	if (optlen) {
		bcopy(opt, th + 1, optlen);
		th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
	}
	th->th_flags = flags;
	/*
	 * Calculate receive window.  Don't shrink window,
	 * but avoid silly window syndrome.
	 */
	if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg)
		win = 0;
	if (win < (long)(tp->rcv_adv - tp->rcv_nxt))
		win = (long)(tp->rcv_adv - tp->rcv_nxt);
	if (win > (long)TCP_MAXWIN << tp->rcv_scale)
		win = (long)TCP_MAXWIN << tp->rcv_scale;
	th->th_win = htons((u_short) (win>>tp->rcv_scale));
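	/*
	 * The header carries the window right-shifted by rcv_scale.
	 * Example (illustrative): with rcv_scale = 2 and win = 233472,
	 * th_win is 58368 and the peer reconstructs 58368 << 2 = 233472
	 * bytes; the unscaled field is capped at TCP_MAXWIN (65535).
	 */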


	/*
	 * Adjust the RXWIN0SENT flag - indicate that we have advertised
	 * a 0 window.  This may cause the remote transmitter to stall.  This
	 * flag tells soreceive() to disable delayed acknowledgements when
	 * draining the buffer.  This can occur if the receiver is attempting
	 * to read more data than can be buffered prior to transmitting on
	 * the connection.
	 */
	if (win == 0)
		tp->t_flags |= TF_RXWIN0SENT;
	else
		tp->t_flags &= ~TF_RXWIN0SENT;
	if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
		th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
		th->th_flags |= TH_URG;
	} else
		/*
		 * If no urgent pointer to send, then we pull
		 * the urgent pointer to the left edge of the send window
		 * so that it doesn't drift into the send window on sequence
		 * number wraparound.
		 */
		tp->snd_up = tp->snd_una;		/* drag it along */

	/*
	 * Put TCP length in extended header, and then
	 * checksum extended header and data.
	 */
	m->m_pkthdr.len = hdrlen + len;	/* in6_cksum() needs this */
#ifdef INET6
	if (isipv6)
		/*
		 * ip6_plen does not need to be filled in now; it will be
		 * filled in by ip6_output().
		 */
		th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
				       sizeof(struct tcphdr) + optlen + len);
	else
#endif /* INET6 */
	{
		m->m_pkthdr.csum_flags = CSUM_TCP;
		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
		if (len + optlen)
			th->th_sum = in_addword(th->th_sum,
			    htons((u_short)(optlen + len)));

		/* IP version must be set here for ipv4/ipv6 checking later */
		KASSERT(ip->ip_v == IPVERSION,
		    ("%s: IP version incorrect: %d", __func__, ip->ip_v));
	}
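	/*
	 * For IPv4 the checksum is deferred: th_sum holds only the
	 * pseudo-header sum (with the length folded in above), and the
	 * CSUM_TCP flag asks the interface, or the delayed-checksum
	 * path in ip_output(), to finish it using the offset stored in
	 * csum_data.
	 */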

	/*
	 * In transmit state, time the transmission and arrange for
	 * the retransmit.  In persist state, just set snd_max.
	 */
	if (tp->t_force == 0 || !callout_active(tp->tt_persist)) {
		tcp_seq startseq = tp->snd_nxt;

		/*
		 * Advance snd_nxt over sequence space of this segment.
		 */
		if (flags & (TH_SYN|TH_FIN)) {
			if (flags & TH_SYN)
				tp->snd_nxt++;
			if (flags & TH_FIN) {
				tp->snd_nxt++;
				tp->t_flags |= TF_SENTFIN;
			}
		}
		tp->snd_nxt += len;
		if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
			tp->snd_max = tp->snd_nxt;
			/*
			 * Time this transmission if not a retransmission and
			 * not currently timing anything.
			 */
			if (tp->t_rtttime == 0) {
				tp->t_rtttime = ticks;
				tp->t_rtseq = startseq;
				tcpstat.tcps_segstimed++;
			}
		}

		/*
		 * Set retransmit timer if not currently set,
		 * and not doing a pure ack or a keep-alive probe.
		 * Initial value for retransmit timer is smoothed
		 * round-trip time + 2 * round-trip time variance.
		 * Initialize shift counter which is used for backoff
		 * of retransmit time.
		 */
		if (!callout_active(tp->tt_rexmt) &&
		    tp->snd_nxt != tp->snd_una) {
			if (callout_active(tp->tt_persist)) {
				callout_stop(tp->tt_persist);
				tp->t_rxtshift = 0;
			}
			callout_reset(tp->tt_rexmt, tp->t_rxtcur,
				      tcp_timer_rexmt, tp);
		}
	} else {
		/*
		 * Persist case: update snd_max, but since we are in
		 * persist mode (no window) we do not update snd_nxt.
		 */
		int xlen = len;
		if (flags & TH_SYN)
			++xlen;
		if (flags & TH_FIN) {
			++xlen;
			tp->t_flags |= TF_SENTFIN;
		}
		if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
			tp->snd_max = tp->snd_nxt + len;
	}

#ifdef TCPDEBUG
	/*
	 * Trace.
	 */
	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
#endif

	/*
	 * Fill in IP length and desired time to live and
	 * send to IP level.  There should be a better way
	 * to handle ttl and tos; we could keep them in
	 * the template, but need a way to checksum without them.
	 */
	/*
	 * m->m_pkthdr.len should have been set before checksum
	 * calculation, because in6_cksum() needs it.
	 */
#ifdef INET6
	if (isipv6) {
		/*
		 * we separately set hoplimit for every segment, since the
		 * user might want to change the value via setsockopt.
		 * Also, the desired default hop limit might be changed via
		 * Neighbor Discovery.
		 */
		ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb,
					       tp->t_inpcb->in6p_route.ro_rt ?
					       tp->t_inpcb->in6p_route.ro_rt->rt_ifp
					       : NULL);

		/* TODO: IPv6 IP6TOS_ECT bit on */
		error = ip6_output(m,
			    tp->t_inpcb->in6p_outputopts,
			    &tp->t_inpcb->in6p_route,
			    (so->so_options & SO_DONTROUTE), NULL, NULL,
			    tp->t_inpcb);
	} else
#endif /* INET6 */
	{
		struct rtentry *rt;
		ip->ip_len = m->m_pkthdr.len;
#ifdef INET6
		if (INP_CHECK_SOCKAF(so, AF_INET6))
			ip->ip_ttl = in6_selecthlim(tp->t_inpcb,
						    tp->t_inpcb->in6p_route.ro_rt ?
						    tp->t_inpcb->in6p_route.ro_rt->rt_ifp
						    : NULL);
		else
#endif /* INET6 */
		ip->ip_ttl = tp->t_inpcb->inp_ip_ttl;	/* XXX */
		ip->ip_tos = tp->t_inpcb->inp_ip_tos;	/* XXX */
		/*
		 * See if we should do MTU discovery.  We do it only if the
		 * following are true:
		 *	1) we have a valid route to the destination
		 *	2) the MTU is not locked (if it is, then discovery
		 *	   has been disabled)
		 */
		if (path_mtu_discovery
		    && (rt = tp->t_inpcb->inp_route.ro_rt)
		    && rt->rt_flags & RTF_UP
		    && !(rt->rt_rmx.rmx_locks & RTV_MTU)) {
			ip->ip_off |= IP_DF;
		}
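		/*
		 * With IP_DF set, a router that cannot forward the
		 * segment drops it and returns an ICMP "fragmentation
		 * needed" message; that (or a local EMSGSIZE from
		 * ip_output() below) eventually funnels into
		 * tcp_mtudisc(), which lowers the MSS and retransmits.
		 */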
		error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
		    (so->so_options & SO_DONTROUTE), 0, tp->t_inpcb);
	}
	if (error) {

		/*
		 * We know that the packet was lost, so back out the
		 * sequence number advance, if any.
		 */
		if (tp->t_force == 0 || !callout_active(tp->tt_persist)) {
			/*
			 * No need to check for TH_FIN here because
			 * the TF_SENTFIN flag handles that case.
			 */
			if ((flags & TH_SYN) == 0)
				tp->snd_nxt -= len;
		}

out:
		if (error == ENOBUFS) {
			if (!callout_active(tp->tt_rexmt) &&
			    !callout_active(tp->tt_persist))
				callout_reset(tp->tt_rexmt, tp->t_rxtcur,
				    tcp_timer_rexmt, tp);
			tcp_quench(tp->t_inpcb, 0);
			return (0);
		}
		if (error == EMSGSIZE) {
			/*
			 * ip_output() will have already fixed the route
			 * for us.  tcp_mtudisc() will, as its last action,
			 * initiate retransmission, so it is important to
			 * not do so here.
			 */
			tcp_mtudisc(tp->t_inpcb, 0);
			return 0;
		}
		if ((error == EHOSTUNREACH || error == ENETDOWN)
		    && TCPS_HAVERCVDSYN(tp->t_state)) {
			tp->t_softerror = error;
			return (0);
		}
		return (error);
	}
	tcpstat.tcps_sndtotal++;

	/*
	 * Data sent (as far as we can tell).
	 * If this advertises a larger window than any other segment,
	 * then remember the size of the advertised window.
	 * Any pending ACK has now been sent.
	 */
	if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
		tp->rcv_adv = tp->rcv_nxt + win;
	tp->last_ack_sent = tp->rcv_nxt;
	tp->t_flags &= ~TF_ACKNOW;
	if (tcp_delack_enabled)
		callout_stop(tp->tt_delack);
#if 0
	/*
	 * This completely breaks TCP if newreno is turned on.  What happens
	 * is that if delayed-acks are turned on on the receiver, this code
	 * on the transmitter effectively destroys the TCP window, forcing
	 * it to four packets (1.5Kx4 = 6K window).
	 */
	if (sendalot && (!tcp_do_newreno || --maxburst))
		goto again;
#endif
	if (sendalot)
		goto again;
	return (0);
}

void
tcp_setpersist(struct tcpcb *tp)
{
	int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
	int tt;

	if (callout_active(tp->tt_rexmt))
		panic("tcp_setpersist: retransmit pending");
	/*
	 * Start/restart persistence timer.
	 */
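	/*
	 * The base interval is derived from the smoothed RTT and RTT
	 * variance, scaled by the exponential backoff table indexed by
	 * t_rxtshift, and clamped to [TCPTV_PERSMIN, TCPTV_PERSMAX]
	 * (nominally 5 and 60 seconds), so repeated zero-window probes
	 * back off but never stop entirely.
	 */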
	TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
		      TCPTV_PERSMIN, TCPTV_PERSMAX);
	callout_reset(tp->tt_persist, tt, tcp_timer_persist, tp);
	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
		tp->t_rxtshift++;
}