FreeBSD/Linux Kernel Cross Reference
sys/ip/tcp.c
1 #include "u.h"
2 #include "../port/lib.h"
3 #include "mem.h"
4 #include "dat.h"
5 #include "fns.h"
6 #include "../port/error.h"
7
8 #include "ip.h"
9
/* Protocol constants, timer values, header flag bits, option codes and connection states. */
10 enum
11 {
12 QMAX = 64*1024-1, /* default receive window / read queue limit */
13 IP_TCPPROTO = 6,
14
15 TCP4_IPLEN = 8,
16 TCP4_PHDRSIZE = 12,
17 TCP4_HDRSIZE = 20,
18 TCP4_TCBPHDRSZ = 40,
19 TCP4_PKT = TCP4_IPLEN+TCP4_PHDRSIZE,
20
21 TCP6_IPLEN = 0,
22 TCP6_PHDRSIZE = 40,
23 TCP6_HDRSIZE = 20,
24 TCP6_TCBPHDRSZ = 60,
25 TCP6_PKT = TCP6_IPLEN+TCP6_PHDRSIZE,
26
27 TcptimerOFF = 0,
28 TcptimerON = 1,
29 TcptimerDONE = 2,
30 MAX_TIME = (1<<20), /* Forever */
31 TCP_ACK = 50, /* Timed ack sequence in ms */
32 MAXBACKMS = 9*60*1000, /* longest backoff time (ms) before hangup */
33
34 URG = 0x20, /* Data marked urgent */
35 ACK = 0x10, /* Acknowledge is valid */
36 PSH = 0x08, /* Whole data pipe is pushed */
37 RST = 0x04, /* Reset connection */
38 SYN = 0x02, /* Pkt. is synchronise */
39 FIN = 0x01, /* Start close down */
40
41 EOLOPT = 0,
42 NOOPOPT = 1,
43 MSSOPT = 2,
44 MSS_LENGTH = 4, /* Length in bytes of the maximum segment size option */
45 WSOPT = 3,
46 WS_LENGTH = 3, /* Length in bytes of the window scale option */
47 MSL2 = 10,
48 MSPTICK = 50, /* Milliseconds per timer tick */
49 DEF_MSS = 1460, /* Default maximum segment size */
50 DEF_MSS6 = 1280, /* Default maximum segment size (min) for v6 */
51 DEF_RTT = 500, /* Default round trip */
52 DEF_KAT = 120000, /* Default time (ms) between keep alives */
53 TCP_LISTEN = 0, /* Listen connection */
54 TCP_CONNECT = 1, /* Outgoing connection */
55 SYNACK_RXTIMER = 250, /* ms between SYNACK retransmits */
56
57 TCPREXMTTHRESH = 3, /* dupack threshold for rxt */
58
59 FORCE = 1, /* FORCE, CLONE and ACTIVE are or'd into Tcpctl.flags; the rest presumably are too */
60 CLONE = 2,
61 RETRAN = 4,
62 ACTIVE = 8,
63 SYNACK = 16,
64
65 LOGAGAIN = 3,
66 LOGDGAIN = 2,
67
68 Closed = 0, /* Connection states */
69 Listen,
70 Syn_sent,
71 Syn_received,
72 Established,
73 Finwait1,
74 Finwait2,
75 Close_wait,
76 Closing,
77 Last_ack,
78 Time_wait,
79
80 Maxlimbo = 1000, /* maximum procs waiting for response to SYN ACK */
81 NLHT = 256, /* hash table size, must be a power of 2 */
82 LHTMASK = NLHT-1,
83
84 HaveWS = 1<<8, /* or'd into a window scale value to mark that the option is present */
85 };
86
87 /* Printable state names, indexed by connection state; must match the order of Closed..Time_wait above */
88 char *tcpstates[] =
89 {
90 "Closed", "Listen", "Syn_sent", "Syn_received",
91 "Established", "Finwait1", "Finwait2", "Close_wait",
92 "Closing", "Last_ack", "Time_wait"
93 };
94
/* One countdown timer, ticked every MSPTICK ms by tcpackproc(). */
95 typedef struct Tcptimer Tcptimer;
96 struct Tcptimer
97 {
98 Tcptimer *next; /* next/prev link the timer into Tcppriv.timers while it is TcptimerON */
99 Tcptimer *prev;
100 Tcptimer *readynext; /* links timers that expired in the same tick (built in tcpackproc) */
101 int state; /* TcptimerOFF, TcptimerON or TcptimerDONE */
102 int start; /* tick count loaded into count by tcpgo(); 0 means the timer is never started */
103 int count; /* ticks left before expiry */
104 void (*func)(void*); /* called with arg when the timer expires; may be nil */
105 void *arg;
106 };
107
108 /*
109 * v4 and v6 pseudo headers used for
110 * checksumming tcp
111 */
112 typedef struct Tcp4hdr Tcp4hdr;
113 struct Tcp4hdr
114 {
115 uchar vihl; /* Version and header length */
116 uchar tos; /* Type of service */
117 uchar length[2]; /* packet length */
118 uchar id[2]; /* Identification */
119 uchar frag[2]; /* Fragment information */
120 uchar Unused;
121 uchar proto;
122 uchar tcplen[2];
123 uchar tcpsrc[4];
124 uchar tcpdst[4];
125 uchar tcpsport[2];
126 uchar tcpdport[2];
127 uchar tcpseq[4];
128 uchar tcpack[4];
129 uchar tcpflag[2]; /* header length in the top bits of byte 0, flag bits in byte 1 */
130 uchar tcpwin[2];
131 uchar tcpcksum[2];
132 uchar tcpurg[2];
133 /* Options segment */
134 uchar tcpopt[1];
135 };
136
/* IPv6 header followed by the TCP header; htontcp6() temporarily reuses the first 8 bytes as a checksum pseudo header. */
137 typedef struct Tcp6hdr Tcp6hdr;
138 struct Tcp6hdr
139 {
140 uchar vcf[4];
141 uchar ploadlen[2];
142 uchar proto;
143 uchar ttl;
144 uchar tcpsrc[IPaddrlen];
145 uchar tcpdst[IPaddrlen];
146 uchar tcpsport[2];
147 uchar tcpdport[2];
148 uchar tcpseq[4];
149 uchar tcpack[4];
150 uchar tcpflag[2]; /* header length in the top bits of byte 0, flag bits in byte 1 */
151 uchar tcpwin[2];
152 uchar tcpcksum[2];
153 uchar tcpurg[2];
154 /* Options segment */
155 uchar tcpopt[1];
156 };
157
158 /*
159 * this represents the control info
160 * for a single packet. It is derived from
161 * a packet in ntohtcp{4,6}() and stuck into
162 * a packet in htontcp{4,6}().
163 */
164 typedef struct Tcp Tcp;
165 struct Tcp
166 {
167 ushort source; /* source port */
168 ushort dest; /* destination port */
169 ulong seq;
170 ulong ack;
171 uchar flags; /* URG, ACK, PSH, RST, SYN, FIN bits */
172 ushort ws; /* window scale option (if not zero) */
173 ulong wnd;
174 ushort urg;
175 ushort mss; /* max segment size option (if not zero) */
176 ushort len; /* size of data */
177 };
178
179 /*
180 * this header is malloc'd to thread together fragments
181 * waiting to be coalesced
182 */
183 typedef struct Reseq Reseq;
184 struct Reseq
185 {
186 Reseq *next;
187 Tcp seg;
188 Block *bp; /* block list freed with freeblist() when the queue is flushed */
189 ushort length;
190 };
191
192 /*
193 * the qlock in the Conv locks this structure
194 */
195 typedef struct Tcpctl Tcpctl;
196 struct Tcpctl
197 {
198 uchar state; /* Connection state */
199 uchar type; /* Listening or active connection */
200 uchar code; /* Icmp code */
201 struct {
202 ulong una; /* Unacked data pointer */
203 ulong nxt; /* Next sequence expected */
204 ulong ptr; /* Data pointer */
205 ulong wnd; /* Tcp send window */
206 ulong urg; /* Urgent data pointer */
207 ulong wl2;
208 int scale; /* how much to right shift window in xmitted packets */
209 /* to implement tahoe and reno TCP */
210 ulong dupacks; /* number of duplicate acks rcvd */
211 int recovery; /* loss recovery flag */
212 ulong rxt; /* right window marker for recovery */
213 } snd;
214 struct {
215 ulong nxt; /* Receive pointer to next uchar slot */
216 ulong wnd; /* Receive window incoming */
217 ulong urg; /* Urgent pointer */
218 int blocked; /* set by tcprcvwin() when the receive window drops to 0 */
219 int una; /* unacked data segs */
220 int scale; /* how much to left shift window in rcved packets */
221 } rcv;
222 ulong iss; /* Initial sequence number */
223 int sawwsopt; /* true if we saw a wsopt on the incoming SYN */
224 ulong cwind; /* Congestion window */
225 int scale; /* desired snd.scale */
226 ushort ssthresh; /* Slow start threshold */
227 int resent; /* Bytes just resent */
228 int irs; /* Initial received sequence */
229 ushort mss; /* Maximum segment size */
230 int rerecv; /* Overlap of data re-received */
231 ulong window; /* Receive window */
232 uchar backoff; /* Exponential backoff counter */
233 int backedoff; /* ms we've backed off for rexmits */
234 uchar flags; /* State flags */
235 Reseq *reseq; /* Resequencing queue */
236 Tcptimer timer; /* Activity timer */
237 Tcptimer acktimer; /* Acknowledge timer */
238 Tcptimer rtt_timer; /* Round trip timer */
239 Tcptimer katimer; /* keep alive timer */
240 ulong rttseq; /* Round trip sequence */
241 int srtt; /* Shortened round trip */
242 int mdev; /* Mean deviation of round trip */
243 int kacounter; /* count down for keep alive */
244 uint sndsyntime; /* time syn sent */
245 ulong time; /* time Finwait2 or Syn_received was sent */
246 int nochecksum; /* non-zero means don't send checksums */
247 int flgcnt; /* number of flags in the sequence (FIN,SYN) */
248
249 union {
250 Tcp4hdr tcp4hdr;
251 Tcp6hdr tcp6hdr;
252 } protohdr; /* prototype header */
253 };
254
255 /*
256 * New calls are put in limbo rather than having a conversation structure
257 * allocated. Thus, a SYN attack results in lots of limbo'd calls but not
258 * any real Conv structures mucking things up. Calls in limbo rexmit their
259 * SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
260 *
261 * In particular they aren't on a listener's queue so that they don't figure
262 * in the input queue limit.
263 *
264 * If 1/2 of a T3 was attacking SYN packets, we'd have a permanent queue
265 * of 70000 limbo'd calls. Not great for a linear list but doable. Therefore
266 * there is no hashing of this list.
267 */
268 typedef struct Limbo Limbo;
269 struct Limbo
270 {
271 Limbo *next;
272
273 uchar laddr[IPaddrlen];
274 uchar raddr[IPaddrlen];
275 ushort lport;
276 ushort rport;
277 ulong irs; /* initial received sequence */
278 ulong iss; /* initial sent sequence */
279 ushort mss; /* mss from the other end */
280 ushort rcvscale; /* how much to scale rcvd windows */
281 ushort sndscale; /* how much to scale sent windows */
282 ulong lastsend; /* last time we sent a synack */
283 uchar version; /* v4 or v6 */
284 uchar rexmits; /* number of retransmissions */
285 };
286
287 int tcp_irtt = DEF_RTT; /* Initial guess at round trip time (ms; divided by MSPTICK to get timer ticks) */
288 ushort tcp_mss = DEF_MSS; /* Maximum segment size to be sent */
289
/* Indices into Tcppriv.stats[]; statnames[] holds the matching printable names. */
290 enum {
291 /* MIB stats */
292 MaxConn,
293 ActiveOpens,
294 PassiveOpens,
295 EstabResets,
296 CurrEstab,
297 InSegs,
298 OutSegs,
299 RetransSegs,
300 RetransTimeouts,
301 InErrs,
302 OutRsts,
303
304 /* non-MIB stats */
305 CsumErrs,
306 HlenErrs,
307 LenErrs,
308 OutOfOrder,
309
310 Nstats /* number of counters; sizes Tcppriv.stats[] */
311 };
312
/* Printable name of each statistic, indexed by the stats enum values. */
313 static char *statnames[] =
314 {
315 [MaxConn] "MaxConn",
316 [ActiveOpens] "ActiveOpens",
317 [PassiveOpens] "PassiveOpens",
318 [EstabResets] "EstabResets",
319 [CurrEstab] "CurrEstab",
320 [InSegs] "InSegs",
321 [OutSegs] "OutSegs",
322 [RetransSegs] "RetransSegs",
323 [RetransTimeouts] "RetransTimeouts",
324 [InErrs] "InErrs",
325 [OutRsts] "OutRsts",
326 [CsumErrs] "CsumErrs",
327 [HlenErrs] "HlenErrs",
328 [LenErrs] "LenErrs",
329 [OutOfOrder] "OutOfOrder",
330 };
331
/* Per-protocol private state for TCP, reached through Proto.priv. */
332 typedef struct Tcppriv Tcppriv;
333 struct Tcppriv
334 {
335 /* List of active timers */
336 QLock tl; /* held while the timers list is walked or changed */
337 Tcptimer *timers;
338
339 /* hash table for matching conversations */
340 Ipht ht;
341
342 /* calls in limbo waiting for an ACK to our SYN ACK */
343 int nlimbo;
344 Limbo *lht[NLHT];
345
346 /* for keeping track of tcpackproc */
347 QLock apl; /* serializes the one-time start of tcpackproc in tcpstart() */
348 int ackprocstarted;
349
350 ulong stats[Nstats]; /* counters indexed by the stats enum */
351 };
352
353 /*
354 * Setting tcpporthogdefense to non-zero enables Dong Lin's
355 * solution to hijacked systems staking out ports as a form
356 * of DoS attack.
357 *
358 * To avoid stateless Conv hogs, we pick a sequence number at random. If
359 * that number gets acked by the other end, we shut down the connection.
360 * Look for tcpporthogdefense in the code.
361 */
362 int tcpporthogdefense = 0;
363
/* Forward declarations for functions used before their definitions in this file. */
364 int addreseq(Tcpctl*, Tcppriv*, Tcp*, Block*, ushort);
365 void getreseq(Tcpctl*, Tcp*, Block**, ushort*);
366 void localclose(Conv*, char*);
367 void procsyn(Conv*, Tcp*);
368 void tcpiput(Proto*, Ipifc*, Block*);
369 void tcpoutput(Conv*);
370 int tcptrim(Tcpctl*, Tcp*, Block**, ushort*);
371 void tcpstart(Conv*, int);
372 void tcptimeout(void*);
373 void tcpsndsyn(Conv*, Tcpctl*);
374 void tcprcvwin(Conv*);
375 void tcpacktimer(void*);
376 void tcpkeepalive(void*);
377 void tcpsetkacounter(Tcpctl*);
378 void tcprxmit(Conv*);
379 void tcpsettimer(Tcpctl*);
380 void tcpsynackrtt(Conv*);
381 void tcpsetscale(Conv*, Tcpctl*, ushort, ushort);
382
383 static void limborexmit(Proto*);
384 static void limbo(Conv*, uchar*, uchar*, Tcp*, int);
385
386 void
387 tcpsetstate(Conv *s, uchar newstate)
388 {
389 Tcpctl *tcb;
390 uchar oldstate;
391 Tcppriv *tpriv;
392
393 tpriv = s->p->priv;
394
395 tcb = (Tcpctl*)s->ptcl;
396
397 oldstate = tcb->state;
398 if(oldstate == newstate)
399 return;
400
401 if(oldstate == Established)
402 tpriv->stats[CurrEstab]--;
403 if(newstate == Established)
404 tpriv->stats[CurrEstab]++;
405
406 /**
407 print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
408 tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
409 **/
410
411 switch(newstate) {
412 case Closed:
413 qclose(s->rq);
414 qclose(s->wq);
415 qclose(s->eq);
416 break;
417
418 case Close_wait: /* Remote closes */
419 qhangup(s->rq, nil);
420 break;
421 }
422
423 tcb->state = newstate;
424
425 if(oldstate == Syn_sent && newstate != Closed)
426 Fsconnected(s, nil);
427 }
428
429 static char*
430 tcpconnect(Conv *c, char **argv, int argc)
431 {
432 char *e;
433 Tcpctl *tcb;
434
435 tcb = (Tcpctl*)(c->ptcl);
436 if(tcb->state != Closed)
437 return Econinuse;
438
439 e = Fsstdconnect(c, argv, argc);
440 if(e != nil)
441 return e;
442 tcpstart(c, TCP_CONNECT);
443
444 return nil;
445 }
446
447 static int
448 tcpstate(Conv *c, char *state, int n)
449 {
450 Tcpctl *s;
451
452 s = (Tcpctl*)(c->ptcl);
453
454 return snprint(state, n,
455 "%s qin %d qout %d srtt %d mdev %d cwin %lud swin %lud>>%d rwin %lud>>%d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
456 tcpstates[s->state],
457 c->rq ? qlen(c->rq) : 0,
458 c->wq ? qlen(c->wq) : 0,
459 s->srtt, s->mdev,
460 s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale,
461 s->timer.start, s->timer.count, s->rerecv,
462 s->katimer.start, s->katimer.count);
463 }
464
465 static int
466 tcpinuse(Conv *c)
467 {
468 Tcpctl *s;
469
470 s = (Tcpctl*)(c->ptcl);
471 return s->state != Closed;
472 }
473
474 static char*
475 tcpannounce(Conv *c, char **argv, int argc)
476 {
477 char *e;
478 Tcpctl *tcb;
479
480 tcb = (Tcpctl*)(c->ptcl);
481 if(tcb->state != Closed)
482 return Econinuse;
483
484 e = Fsstdannounce(c, argv, argc);
485 if(e != nil)
486 return e;
487 tcpstart(c, TCP_LISTEN);
488 Fsconnected(c, nil);
489
490 return nil;
491 }
492
493 /*
494 * tcpclose is always called with the q locked
495 */
496 static void
497 tcpclose(Conv *c)
498 {
499 Tcpctl *tcb;
500
501 tcb = (Tcpctl*)c->ptcl;
502
503 qhangup(c->rq, nil);
504 qhangup(c->wq, nil);
505 qhangup(c->eq, nil);
506 qflush(c->rq);
507
508 switch(tcb->state) {
509 case Listen:
510 /*
511 * reset any incoming calls to this listener
512 */
513 Fsconnected(c, "Hangup");
514
515 localclose(c, nil);
516 break;
517 case Closed:
518 case Syn_sent:
519 localclose(c, nil);
520 break;
521 case Syn_received:
522 case Established:
523 tcb->flgcnt++;
524 tcb->snd.nxt++;
525 tcpsetstate(c, Finwait1);
526 tcpoutput(c);
527 break;
528 case Close_wait:
529 tcb->flgcnt++;
530 tcb->snd.nxt++;
531 tcpsetstate(c, Last_ack);
532 tcpoutput(c);
533 break;
534 }
535 }
536
537 void
538 tcpkick(void *x)
539 {
540 Conv *s = x;
541 Tcpctl *tcb;
542
543 tcb = (Tcpctl*)s->ptcl;
544
545 if(waserror()){
546 qunlock(s);
547 nexterror();
548 }
549 qlock(s);
550
551 switch(tcb->state) {
552 case Syn_sent:
553 case Syn_received:
554 case Established:
555 case Close_wait:
556 /*
557 * Push data
558 */
559 tcprcvwin(s);
560 tcpoutput(s);
561 break;
562 default:
563 localclose(s, "Hangup");
564 break;
565 }
566
567 qunlock(s);
568 poperror();
569 }
570
571 void
572 tcprcvwin(Conv *s) /* Call with tcb locked */
573 {
574 int w;
575 Tcpctl *tcb;
576
577 tcb = (Tcpctl*)s->ptcl;
578 w = tcb->window - qlen(s->rq);
579 if(w < 0)
580 w = 0;
581 tcb->rcv.wnd = w;
582 if(w == 0)
583 tcb->rcv.blocked = 1;
584 }
585
586 void
587 tcpacktimer(void *v)
588 {
589 Tcpctl *tcb;
590 Conv *s;
591
592 s = v;
593 tcb = (Tcpctl*)s->ptcl;
594
595 if(waserror()){
596 qunlock(s);
597 nexterror();
598 }
599 qlock(s);
600 if(tcb->state != Closed){
601 tcb->flags |= FORCE;
602 tcprcvwin(s);
603 tcpoutput(s);
604 }
605 qunlock(s);
606 poperror();
607 }
608
609 static void
610 tcpcreate(Conv *c)
611 {
612 c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
613 c->wq = qopen((3*QMAX)/2, Qkick, tcpkick, c);
614 }
615
616 static void
617 timerstate(Tcppriv *priv, Tcptimer *t, int newstate)
618 {
619 if(newstate != TcptimerON){
620 if(t->state == TcptimerON){
621 /* unchain */
622 if(priv->timers == t){
623 priv->timers = t->next;
624 if(t->prev != nil)
625 panic("timerstate1");
626 }
627 if(t->next)
628 t->next->prev = t->prev;
629 if(t->prev)
630 t->prev->next = t->next;
631 t->next = t->prev = nil;
632 }
633 } else {
634 if(t->state != TcptimerON){
635 /* chain */
636 if(t->prev != nil || t->next != nil)
637 panic("timerstate2");
638 t->prev = nil;
639 t->next = priv->timers;
640 if(t->next)
641 t->next->prev = t;
642 priv->timers = t;
643 }
644 }
645 t->state = newstate;
646 }
647
/*
 * Timer kernel process; a is the TCP Proto.  Loops forever: sleeps for
 * MSPTICK ms, counts down every TcptimerON timer on priv->timers under
 * priv->tl, then, with the lock released, calls the callback of each
 * timer that reached zero, and finally calls limborexmit().
 */
648 void
649 tcpackproc(void *a)
650 {
651 Tcptimer *t, *tp, *timeo;
652 Proto *tcp;
653 Tcppriv *priv;
654 int loop;
655
656 tcp = a;
657 priv = tcp->priv;
658
659 for(;;) {
660 tsleep(&up->sleep, return0, 0, MSPTICK);
661
662 qlock(&priv->tl);
663 timeo = nil; /* head of the list of timers that expire this tick */
664 loop = 0;
665 for(t = priv->timers; t != nil; t = tp) {
666 if(loop++ > 10000) /* presumably a guard against a looping list */
667 panic("tcpackproc1");
668 tp = t->next; /* saved first: timerstate() below clears t->next */
669 if(t->state == TcptimerON) {
670 t->count--;
671 if(t->count == 0) {
672 timerstate(priv, t, TcptimerDONE);
673 t->readynext = timeo;
674 timeo = t;
675 }
676 }
677 }
678 qunlock(&priv->tl);
679
680 loop = 0;
681 for(t = timeo; t != nil; t = t->readynext) {
682 if(loop++ > 10000)
683 panic("tcpackproc2");
/* skip timers whose state changed since they were collected; waserror() is only armed when the callback will run */
684 if(t->state == TcptimerDONE && t->func != nil && !waserror()){
685 (*t->func)(t->arg);
686 poperror();
687 }
688 }
689
690 limborexmit(tcp);
691 }
692 }
693
694 void
695 tcpgo(Tcppriv *priv, Tcptimer *t)
696 {
697 if(t == nil || t->start == 0)
698 return;
699
700 qlock(&priv->tl);
701 t->count = t->start;
702 timerstate(priv, t, TcptimerON);
703 qunlock(&priv->tl);
704 }
705
706 void
707 tcphalt(Tcppriv *priv, Tcptimer *t)
708 {
709 if(t == nil)
710 return;
711
712 qlock(&priv->tl);
713 timerstate(priv, t, TcptimerOFF);
714 qunlock(&priv->tl);
715 }
716
/*
 * Exponential backoff multiplier: 2 to the power n.
 *
 * The exponent is clamped to the range [0, bits in int - 2] so that
 * the shift is always defined and the result stays positive; inputs
 * outside that range return the nearest representable power of two.
 */
int
backoff(int n)
{
	int maxshift;

	maxshift = (int)(sizeof(int)*8) - 2;
	if(n < 0)
		n = 0;
	else if(n > maxshift)
		n = maxshift;
	return 1 << n;
}
722
723 void
724 localclose(Conv *s, char *reason) /* called with tcb locked */
725 {
726 Tcpctl *tcb;
727 Reseq *rp,*rp1;
728 Tcppriv *tpriv;
729
730 tpriv = s->p->priv;
731 tcb = (Tcpctl*)s->ptcl;
732
733 iphtrem(&tpriv->ht, s);
734
735 tcphalt(tpriv, &tcb->timer);
736 tcphalt(tpriv, &tcb->rtt_timer);
737 tcphalt(tpriv, &tcb->acktimer);
738 tcphalt(tpriv, &tcb->katimer);
739
740 /* Flush reassembly queue; nothing more can arrive */
741 for(rp = tcb->reseq; rp != nil; rp = rp1) {
742 rp1 = rp->next;
743 freeblist(rp->bp);
744 free(rp);
745 }
746 tcb->reseq = nil;
747
748 if(tcb->state == Syn_sent)
749 Fsconnected(s, reason);
750 if(s->state == Announced)
751 wakeup(&s->listenr);
752
753 qhangup(s->rq, reason);
754 qhangup(s->wq, reason);
755
756 tcpsetstate(s, Closed);
757 }
758
759 /* mtu (- TCP + IP hdr len) of 1st hop */
760 int
761 tcpmtu(Proto *tcp, uchar *addr, int version, int *scale)
762 {
763 Ipifc *ifc;
764 int mtu;
765
766 ifc = findipifc(tcp->f, addr, 0);
767 switch(version){
768 default:
769 case V4:
770 mtu = DEF_MSS;
771 if(ifc != nil)
772 mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
773 break;
774 case V6:
775 mtu = DEF_MSS6;
776 if(ifc != nil)
777 mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
778 break;
779 }
780 if(ifc != nil){
781 if(ifc->mbps > 1000)
782 *scale = HaveWS | 4;
783 else if(ifc->mbps > 100)
784 *scale = HaveWS | 3;
785 else if(ifc->mbps > 10)
786 *scale = HaveWS | 1;
787 else
788 *scale = HaveWS | 0;
789 } else
790 *scale = HaveWS | 0;
791
792 return mtu;
793 }
794
795 void
796 inittcpctl(Conv *s, int mode)
797 {
798 Tcpctl *tcb;
799 Tcp4hdr* h4;
800 Tcp6hdr* h6;
801 int mss;
802
803 tcb = (Tcpctl*)s->ptcl;
804
805 memset(tcb, 0, sizeof(Tcpctl));
806
807 tcb->ssthresh = 65535;
808 tcb->srtt = tcp_irtt<<LOGAGAIN;
809 tcb->mdev = 0;
810
811 /* setup timers */
812 tcb->timer.start = tcp_irtt / MSPTICK;
813 tcb->timer.func = tcptimeout;
814 tcb->timer.arg = s;
815 tcb->rtt_timer.start = MAX_TIME;
816 tcb->acktimer.start = TCP_ACK / MSPTICK;
817 tcb->acktimer.func = tcpacktimer;
818 tcb->acktimer.arg = s;
819 tcb->katimer.start = DEF_KAT / MSPTICK;
820 tcb->katimer.func = tcpkeepalive;
821 tcb->katimer.arg = s;
822
823 mss = DEF_MSS;
824
825 /* create a prototype(pseudo) header */
826 if(mode != TCP_LISTEN){
827 if(ipcmp(s->laddr, IPnoaddr) == 0)
828 findlocalip(s->p->f, s->laddr, s->raddr);
829
830 switch(s->ipversion){
831 case V4:
832 h4 = &tcb->protohdr.tcp4hdr;
833 memset(h4, 0, sizeof(*h4));
834 h4->proto = IP_TCPPROTO;
835 hnputs(h4->tcpsport, s->lport);
836 hnputs(h4->tcpdport, s->rport);
837 v6tov4(h4->tcpsrc, s->laddr);
838 v6tov4(h4->tcpdst, s->raddr);
839 break;
840 case V6:
841 h6 = &tcb->protohdr.tcp6hdr;
842 memset(h6, 0, sizeof(*h6));
843 h6->proto = IP_TCPPROTO;
844 hnputs(h6->tcpsport, s->lport);
845 hnputs(h6->tcpdport, s->rport);
846 ipmove(h6->tcpsrc, s->laddr);
847 ipmove(h6->tcpdst, s->raddr);
848 mss = DEF_MSS6;
849 break;
850 default:
851 panic("inittcpctl: version %d", s->ipversion);
852 }
853 }
854
855 tcb->mss = tcb->cwind = mss;
856
857 /* default is no window scaling */
858 tcb->window = QMAX;
859 tcb->rcv.wnd = QMAX;
860 tcb->rcv.scale = 0;
861 tcb->snd.scale = 0;
862 qsetlimit(s->rq, QMAX);
863 }
864
865 /*
866 * called with s qlocked
867 */
868 void
869 tcpstart(Conv *s, int mode)
870 {
871 Tcpctl *tcb;
872 Tcppriv *tpriv;
873 char kpname[KNAMELEN];
874
875 tpriv = s->p->priv;
876
877 if(tpriv->ackprocstarted == 0){
878 qlock(&tpriv->apl);
879 if(tpriv->ackprocstarted == 0){
880 sprint(kpname, "#I%dtcpack", s->p->f->dev);
881 kproc(kpname, tcpackproc, s->p);
882 tpriv->ackprocstarted = 1;
883 }
884 qunlock(&tpriv->apl);
885 }
886
887 tcb = (Tcpctl*)s->ptcl;
888
889 inittcpctl(s, mode);
890
891 iphtadd(&tpriv->ht, s);
892 switch(mode) {
893 case TCP_LISTEN:
894 tpriv->stats[PassiveOpens]++;
895 tcb->flags |= CLONE;
896 tcpsetstate(s, Listen);
897 break;
898
899 case TCP_CONNECT:
900 tpriv->stats[ActiveOpens]++;
901 tcb->flags |= ACTIVE;
902 tcpsndsyn(s, tcb);
903 tcpsetstate(s, Syn_sent);
904 tcpoutput(s);
905 break;
906 }
907 }
908
909 static char*
910 tcpflag(ushort flag)
911 {
912 static char buf[128];
913
914 sprint(buf, "%d", flag>>10); /* Head len */
915 if(flag & URG)
916 strcat(buf, " URG");
917 if(flag & ACK)
918 strcat(buf, " ACK");
919 if(flag & PSH)
920 strcat(buf, " PSH");
921 if(flag & RST)
922 strcat(buf, " RST");
923 if(flag & SYN)
924 strcat(buf, " SYN");
925 if(flag & FIN)
926 strcat(buf, " FIN");
927
928 return buf;
929 }
930
/*
 * Build an IPv6 TCP packet from the control info in tcph.
 * data is the payload block (header room is made in front of it with
 * padblock) or nil for a header-only segment (a fresh block is allocated).
 * ph is the prototype header copied in first; tcb, if non-nil, supplies
 * the window shift (snd.scale) and the nochecksum setting.
 * MSS and window scale options are emitted only on SYN segments.
 * Returns the block holding the packet, or nil if padblock/allocb returned nil.
 */
931 Block *
932 htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb)
933 {
934 int dlen;
935 Tcp6hdr *h;
936 ushort csum;
937 ushort hdrlen, optpad = 0;
938 uchar *opt;
939
940 hdrlen = TCP6_HDRSIZE;
941 if(tcph->flags & SYN){
942 if(tcph->mss)
943 hdrlen += MSS_LENGTH;
944 if(tcph->ws)
945 hdrlen += WS_LENGTH;
946 optpad = hdrlen & 3; /* pad the options so hdrlen is a multiple of 4 */
947 if(optpad)
948 optpad = 4 - optpad;
949 hdrlen += optpad;
950 }
951
952 if(data) {
953 dlen = blocklen(data);
954 data = padblock(data, hdrlen + TCP6_PKT);
955 if(data == nil)
956 return nil;
957 }
958 else {
959 dlen = 0;
960 data = allocb(hdrlen + TCP6_PKT + 64); /* the 64 pad is to meet mintu's */
961 if(data == nil)
962 return nil;
963 data->wp += hdrlen + TCP6_PKT;
964 }
965
966 /* copy in pseudo ip header plus port numbers */
967 h = (Tcp6hdr *)(data->rp);
968 memmove(h, ph, TCP6_TCBPHDRSZ);
969
970 /* compose pseudo tcp header, do cksum calculation */
971 hnputl(h->vcf, hdrlen + dlen); /* pseudo header: 32-bit TCP length in place of vcf */
972 h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
973 h->ttl = ph->proto; /* pseudo header: protocol number in the ttl slot */
974
975 /* copy in variable bits */
976 hnputl(h->tcpseq, tcph->seq);
977 hnputl(h->tcpack, tcph->ack);
978 hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags); /* hdrlen is a multiple of 4, so <<10 puts hdrlen/4 in the top 4 bits */
979 hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
980 hnputs(h->tcpurg, tcph->urg);
981
982 if(tcph->flags & SYN){
983 opt = h->tcpopt;
984 if(tcph->mss != 0){
985 *opt++ = MSSOPT;
986 *opt++ = MSS_LENGTH;
987 hnputs(opt, tcph->mss);
988 opt += 2;
989 }
990 if(tcph->ws != 0){
991 *opt++ = WSOPT;
992 *opt++ = WS_LENGTH;
993 *opt++ = tcph->ws; /* only the low byte is stored, so the HaveWS bit is dropped */
994 }
995 while(optpad-- > 0)
996 *opt++ = NOOPOPT;
997 }
998
999 if(tcb != nil && tcb->nochecksum){
1000 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1001 } else {
1002 csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE);
1003 hnputs(h->tcpcksum, csum);
1004 }
1005
1006 /* move from pseudo header back to normal ip header */
1007 memset(h->vcf, 0, 4);
1008 h->vcf[0] = IP_VER6;
1009 hnputs(h->ploadlen, hdrlen+dlen);
1010 h->proto = ph->proto;
1011
1012 return data;
1013 }
1014
/*
 * Build an IPv4 TCP packet from the control info in tcph.
 * data is the payload block (header room is made in front of it with
 * padblock) or nil for a header-only segment (a fresh block is allocated).
 * ph is the prototype header copied in first; tcb, if non-nil, supplies
 * the window shift (snd.scale) and the nochecksum setting.
 * MSS and window scale options are emitted only on SYN segments.
 * Returns the block holding the packet, or nil if padblock/allocb returned nil.
 */
1015 Block *
1016 htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb)
1017 {
1018 int dlen;
1019 Tcp4hdr *h;
1020 ushort csum;
1021 ushort hdrlen, optpad = 0;
1022 uchar *opt;
1023
1024 hdrlen = TCP4_HDRSIZE;
1025 if(tcph->flags & SYN){
1026 if(tcph->mss)
1027 hdrlen += MSS_LENGTH;
1028 if(tcph->ws)
1029 hdrlen += WS_LENGTH;
1030 optpad = hdrlen & 3; /* pad the options so hdrlen is a multiple of 4 */
1031 if(optpad)
1032 optpad = 4 - optpad;
1033 hdrlen += optpad;
1034 }
1035
1036 if(data) {
1037 dlen = blocklen(data);
1038 data = padblock(data, hdrlen + TCP4_PKT);
1039 if(data == nil)
1040 return nil;
1041 }
1042 else {
1043 dlen = 0;
1044 data = allocb(hdrlen + TCP4_PKT + 64); /* the 64 pad is to meet mintu's */
1045 if(data == nil)
1046 return nil;
1047 data->wp += hdrlen + TCP4_PKT;
1048 }
1049
1050 /* copy in pseudo ip header plus port numbers */
1051 h = (Tcp4hdr *)(data->rp);
1052 memmove(h, ph, TCP4_TCBPHDRSZ);
1053
1054 /* copy in variable bits */
1055 hnputs(h->tcplen, hdrlen + dlen);
1056 hnputl(h->tcpseq, tcph->seq);
1057 hnputl(h->tcpack, tcph->ack);
1058 hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags); /* hdrlen is a multiple of 4, so <<10 puts hdrlen/4 in the top 4 bits */
1059 hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
1060 hnputs(h->tcpurg, tcph->urg);
1061
1062 if(tcph->flags & SYN){
1063 opt = h->tcpopt;
1064 if(tcph->mss != 0){
1065 *opt++ = MSSOPT;
1066 *opt++ = MSS_LENGTH;
1067 hnputs(opt, tcph->mss);
1068 opt += 2;
1069 }
1070 if(tcph->ws != 0){
1071 *opt++ = WSOPT;
1072 *opt++ = WS_LENGTH;
1073 *opt++ = tcph->ws; /* only the low byte is stored, so the HaveWS bit is dropped */
1074 }
1075 while(optpad-- > 0)
1076 *opt++ = NOOPOPT;
1077 }
1078
1079 if(tcb != nil && tcb->nochecksum){
1080 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1081 } else {
1082 csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE);
1083 hnputs(h->tcpcksum, csum);
1084 }
1085
1086 return data;
1087 }
1088
1089 int
1090 ntohtcp6(Tcp *tcph, Block **bpp)
1091 {
1092 Tcp6hdr *h;
1093 uchar *optr;
1094 ushort hdrlen;
1095 ushort optlen;
1096 int n;
1097
1098 *bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE);
1099 if(*bpp == nil)
1100 return -1;
1101
1102 h = (Tcp6hdr *)((*bpp)->rp);
1103 tcph->source = nhgets(h->tcpsport);
1104 tcph->dest = nhgets(h->tcpdport);
1105 tcph->seq = nhgetl(h->tcpseq);
1106 tcph->ack = nhgetl(h->tcpack);
1107 hdrlen = (h->tcpflag[0]>>2) & ~3;
1108 if(hdrlen < TCP6_HDRSIZE) {
1109 freeblist(*bpp);
1110 return -1;
1111 }
1112
1113 tcph->flags = h->tcpflag[1];
1114 tcph->wnd = nhgets(h->tcpwin);
1115 tcph->urg = nhgets(h->tcpurg);
1116 tcph->mss = 0;
1117 tcph->ws = 0;
1118 tcph->len = nhgets(h->ploadlen) - hdrlen;
1119
1120 *bpp = pullupblock(*bpp, hdrlen+TCP6_PKT);
1121 if(*bpp == nil)
1122 return -1;
1123
1124 optr = h->tcpopt;
1125 n = hdrlen - TCP6_HDRSIZE;
1126 while(n > 0 && *optr != EOLOPT) {
1127 if(*optr == NOOPOPT) {
1128 n--;
1129 optr++;
1130 continue;
1131 }
1132 optlen = optr[1];
1133 if(optlen < 2 || optlen > n)
1134 break;
1135 switch(*optr) {
1136 case MSSOPT:
1137 if(optlen == MSS_LENGTH)
1138 tcph->mss = nhgets(optr+2);
1139 break;
1140 case WSOPT:
1141 if(optlen == WS_LENGTH && *(optr+2) <= 14)
1142 tcph->ws = HaveWS | *(optr+2);
1143 break;
1144 }
1145 n -= optlen;
1146 optr += optlen;
1147 }
1148 return hdrlen;
1149 }
1150
1151 int
1152 ntohtcp4(Tcp *tcph, Block **bpp)
1153 {
1154 Tcp4hdr *h;
1155 uchar *optr;
1156 ushort hdrlen;
1157 ushort optlen;
1158 int n;
1159
1160 *bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE);
1161 if(*bpp == nil)
1162 return -1;
1163
1164 h = (Tcp4hdr *)((*bpp)->rp);
1165 tcph->source = nhgets(h->tcpsport);
1166 tcph->dest = nhgets(h->tcpdport);
1167 tcph->seq = nhgetl(h->tcpseq);
1168 tcph->ack = nhgetl(h->tcpack);
1169
1170 hdrlen = (h->tcpflag[0]>>2) & ~3;
1171 if(hdrlen < TCP4_HDRSIZE) {
1172 freeblist(*bpp);
1173 return -1;
1174 }
1175
1176 tcph->flags = h->tcpflag[1];
1177 tcph->wnd = nhgets(h->tcpwin);
1178 tcph->urg = nhgets(h->tcpurg);
1179 tcph->mss = 0;
1180 tcph->ws = 0;
1181 tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1182
1183 *bpp = pullupblock(*bpp, hdrlen+TCP4_PKT);
1184 if(*bpp == nil)
1185 return -1;
1186
1187 optr = h->tcpopt;
1188 n = hdrlen - TCP4_HDRSIZE;
1189 while(n > 0 && *optr != EOLOPT) {
1190 if(*optr == NOOPOPT) {
1191 n--;
1192 optr++;
1193 continue;
1194 }
1195 optlen = optr[1];
1196 if(optlen < 2 || optlen > n)
1197 break;
1198 switch(*optr) {
1199 case MSSOPT:
1200 if(optlen == MSS_LENGTH)
1201 tcph->mss = nhgets(optr+2);
1202 break;
1203 case WSOPT:
1204 if(optlen == WS_LENGTH && *(optr+2) <= 14)
1205 tcph->ws = HaveWS | *(optr+2);
1206 break;
1207 }
1208 n -= optlen;
1209 optr += optlen;
1210 }
1211 return hdrlen;
1212 }
1213
1214 /*
1215 * For outgiing calls, generate an initial sequence
1216 * number and put a SYN on the send queue
1217 */
1218 void
1219 tcpsndsyn(Conv *s, Tcpctl *tcb)
1220 {
1221 tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1222 tcb->rttseq = tcb->iss;
1223 tcb->snd.wl2 = tcb->iss;
1224 tcb->snd.una = tcb->iss;
1225 tcb->snd.ptr = tcb->rttseq;
1226 tcb->snd.nxt = tcb->rttseq;
1227 tcb->flgcnt++;
1228 tcb->flags |= FORCE;
1229 tcb->sndsyntime = NOW;
1230
1231 /* set desired mss and scale */
1232 tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale);
1233 }
1234
1235 void
1236 sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason)
1237 {
1238 Block *hbp;
1239 uchar rflags;
1240 Tcppriv *tpriv;
1241 Tcp4hdr ph4;
1242 Tcp6hdr ph6;
1243
1244 netlog(tcp->f, Logtcp, "sndrst: %s\n", reason);
1245
1246 tpriv = tcp->priv;
1247
1248 if(seg->flags & RST)
1249 return;
1250
1251 /* make pseudo header */
1252 switch(version) {
1253 case V4:
1254 memset(&ph4, 0, sizeof(ph4));
1255 ph4.vihl = IP_VER4;
1256 v6tov4(ph4.tcpsrc, dest);
1257 v6tov4(ph4.tcpdst, source);
1258 ph4.proto = IP_TCPPROTO;
1259 hnputs(ph4.tcplen, TCP4_HDRSIZE);
1260 hnputs(ph4.tcpsport, seg->dest);
1261 hnputs(ph4.tcpdport, seg->source);
1262 break;
1263 case V6:
1264 memset(&ph6, 0, sizeof(ph6));
1265 ph6.vcf[0] = IP_VER6;
1266 ipmove(ph6.tcpsrc, dest);
1267 ipmove(ph6.tcpdst, source);
1268 ph6.proto = IP_TCPPROTO;
1269 hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1270 hnputs(ph6.tcpsport, seg->dest);
1271 hnputs(ph6.tcpdport, seg->source);
1272 break;
1273 default:
1274 panic("sndrst: version %d", version);
1275 }
1276
1277 tpriv->stats[OutRsts]++;
1278 rflags = RST;
1279
1280 /* convince the other end that this reset is in band */
1281 if(seg->flags & ACK) {
1282 seg->seq = seg->ack;
1283 seg->ack = 0;
1284 }
1285 else {
1286 rflags |= ACK;
1287 seg->ack = seg->seq;
1288 seg->seq = 0;
1289 if(seg->flags & SYN)
1290 seg->ack++;
1291 seg->ack += length;
1292 if(seg->flags & FIN)
1293 seg->ack++;
1294 }
1295 seg->flags = rflags;
1296 seg->wnd = 0;
1297 seg->urg = 0;
1298 seg->mss = 0;
1299 seg->ws = 0;
1300 switch(version) {
1301 case V4:
1302 hbp = htontcp4(seg, nil, &ph4, nil);
1303 if(hbp == nil)
1304 return;
1305 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1306 break;
1307 case V6:
1308 hbp = htontcp6(seg, nil, &ph6, nil);
1309 if(hbp == nil)
1310 return;
1311 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1312 break;
1313 default:
1314 panic("sndrst2: version %d", version);
1315 }
1316 }
1317
1318 /*
1319 * send a reset to the remote side and close the conversation
1320 * called with s qlocked
1321 */
1322 char*
1323 tcphangup(Conv *s)
1324 {
1325 Tcp seg;
1326 Tcpctl *tcb;
1327 Block *hbp;
1328
1329 tcb = (Tcpctl*)s->ptcl;
1330 if(waserror())
1331 return commonerror();
1332 if(ipcmp(s->raddr, IPnoaddr) != 0) {
1333 if(!waserror()){
1334 seg.flags = RST | ACK;
1335 seg.ack = tcb->rcv.nxt;
1336 tcb->rcv.una = 0;
1337 seg.seq = tcb->snd.ptr;
1338 seg.wnd = 0;
1339 seg.urg = 0;
1340 seg.mss = 0;
1341 seg.ws = 0;
1342 switch(s->ipversion) {
1343 case V4:
1344 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1345 hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb);
1346 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1347 break;
1348 case V6:
1349 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1350 hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb);
1351 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1352 break;
1353 default:
1354 panic("tcphangup: version %d", s->ipversion);
1355 }
1356 poperror();
1357 }
1358 }
1359 localclose(s, nil);
1360 poperror();
1361 return nil;
1362 }
1363
1364 /*
1365 * (re)send a SYN ACK
1366 */
1367 int
1368 sndsynack(Proto *tcp, Limbo *lp)
1369 {
1370 Block *hbp;
1371 Tcp4hdr ph4;
1372 Tcp6hdr ph6;
1373 Tcp seg;
1374 int scale;
1375
1376 /* make pseudo header */
1377 switch(lp->version) {
1378 case V4:
1379 memset(&ph4, 0, sizeof(ph4));
1380 ph4.vihl = IP_VER4;
1381 v6tov4(ph4.tcpsrc, lp->laddr);
1382 v6tov4(ph4.tcpdst, lp->raddr);
1383 ph4.proto = IP_TCPPROTO;
1384 hnputs(ph4.tcplen, TCP4_HDRSIZE);
1385 hnputs(ph4.tcpsport, lp->lport);
1386 hnputs(ph4.tcpdport, lp->rport);
1387 break;
1388 case V6:
1389 memset(&ph6, 0, sizeof(ph6));
1390 ph6.vcf[0] = IP_VER6;
1391 ipmove(ph6.tcpsrc, lp->laddr);
1392 ipmove(ph6.tcpdst, lp->raddr);
1393 ph6.proto = IP_TCPPROTO;
1394 hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1395 hnputs(ph6.tcpsport, lp->lport);
1396 hnputs(ph6.tcpdport, lp->rport);
1397 break;
1398 default:
1399 panic("sndrst: version %d", lp->version);
1400 }
1401
1402 seg.seq = lp->iss;
1403 seg.ack = lp->irs+1;
1404 seg.flags = SYN|ACK;
1405 seg.urg = 0;
1406 seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale);
1407 seg.wnd = QMAX;
1408
1409 /* if the other side set scale, we should too */
1410 if(lp->rcvscale){
1411 seg.ws = scale;
1412 lp->sndscale = scale;
1413 } else {
1414 seg.ws = 0;
1415 lp->sndscale = 0;
1416 }
1417
1418 switch(lp->version) {
1419 case V4:
1420 hbp = htontcp4(&seg, nil, &ph4, nil);
1421 if(hbp == nil)
1422 return -1;
1423 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1424 break;
1425 case V6:
1426 hbp = htontcp6(&seg, nil, &ph6, nil);
1427 if(hbp == nil)
1428 return -1;
1429 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1430 break;
1431 default:
1432 panic("sndsnack: version %d", lp->version);
1433 }
1434 lp->lastsend = NOW;
1435 return 0;
1436 }
1437
/* hash the last two address bytes and a port into a limbo table index */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + (p) )&LHTMASK )
1439
1440 /*
1441 * put a call into limbo and respond with a SYN ACK
1442 *
1443 * called with proto locked
1444 */
1445 static void
1446 limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version)
1447 {
1448 Limbo *lp, **l;
1449 Tcppriv *tpriv;
1450 int h;
1451
1452 tpriv = s->p->priv;
1453 h = hashipa(source, seg->source);
1454
1455 for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1456 lp = *l;
1457 if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version)
1458 continue;
1459 if(ipcmp(lp->raddr, source) != 0)
1460 continue;
1461 if(ipcmp(lp->laddr, dest) != 0)
1462 continue;
1463
1464 /* each new SYN restarts the retransmits */
1465 lp->irs = seg->seq;
1466 break;
1467 }
1468 lp = *l;
1469 if(lp == nil){
1470 if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){
1471 lp = tpriv->lht[h];
1472 tpriv->lht[h] = lp->next;
1473 lp->next = nil;
1474 } else {
1475 lp = malloc(sizeof(*lp));
1476 if(lp == nil)
1477 return;
1478 tpriv->nlimbo++;
1479 }
1480 *l = lp;
1481 lp->version = version;
1482 ipmove(lp->laddr, dest);
1483 ipmove(lp->raddr, source);
1484 lp->lport = seg->dest;
1485 lp->rport = seg->source;
1486 lp->mss = seg->mss;
1487 lp->rcvscale = seg->ws;
1488 lp->irs = seg->seq;
1489 lp->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1490 }
1491
1492 if(sndsynack(s->p, lp) < 0){
1493 *l = lp->next;
1494 tpriv->nlimbo--;
1495 free(lp);
1496 }
1497 }
1498
1499 /*
1500 * resend SYN ACK's once every SYNACK_RXTIMER ms.
1501 */
1502 static void
1503 limborexmit(Proto *tcp)
1504 {
1505 Tcppriv *tpriv;
1506 Limbo **l, *lp;
1507 int h;
1508 int seen;
1509 ulong now;
1510
1511 tpriv = tcp->priv;
1512
1513 if(!canqlock(tcp))
1514 return;
1515 seen = 0;
1516 now = NOW;
1517 for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){
1518 for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){
1519 lp = *l;
1520 seen++;
1521 if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER)
1522 continue;
1523
1524 /* time it out after 1 second */
1525 if(++(lp->rexmits) > 5){
1526 tpriv->nlimbo--;
1527 *l = lp->next;
1528 free(lp);
1529 continue;
1530 }
1531
1532 /* if we're being attacked, don't bother resending SYN ACK's */
1533 if(tpriv->nlimbo > 100)
1534 continue;
1535
1536 if(sndsynack(tcp, lp) < 0){
1537 tpriv->nlimbo--;
1538 *l = lp->next;
1539 free(lp);
1540 continue;
1541 }
1542
1543 l = &lp->next;
1544 }
1545 }
1546 qunlock(tcp);
1547 }
1548
1549 /*
1550 * lookup call in limbo. if found, throw it out.
1551 *
1552 * called with proto locked
1553 */
1554 static void
1555 limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1556 {
1557 Limbo *lp, **l;
1558 int h;
1559 Tcppriv *tpriv;
1560
1561 tpriv = s->p->priv;
1562
1563 /* find a call in limbo */
1564 h = hashipa(src, segp->source);
1565 for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1566 lp = *l;
1567 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1568 continue;
1569 if(ipcmp(lp->laddr, dst) != 0)
1570 continue;
1571 if(ipcmp(lp->raddr, src) != 0)
1572 continue;
1573
1574 /* RST can only follow the SYN */
1575 if(segp->seq == lp->irs+1){
1576 tpriv->nlimbo--;
1577 *l = lp->next;
1578 free(lp);
1579 }
1580 break;
1581 }
1582 }
1583
1584 /*
1585 * come here when we finally get an ACK to our SYN-ACK.
1586 * lookup call in limbo. if found, create a new conversation
1587 *
1588 * called with proto locked
1589 */
1590 static Conv*
1591 tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1592 {
1593 Conv *new;
1594 Tcpctl *tcb;
1595 Tcppriv *tpriv;
1596 Tcp4hdr *h4;
1597 Tcp6hdr *h6;
1598 Limbo *lp, **l;
1599 int h;
1600
1601 /* unless it's just an ack, it can't be someone coming out of limbo */
1602 if((segp->flags & SYN) || (segp->flags & ACK) == 0)
1603 return nil;
1604
1605 tpriv = s->p->priv;
1606
1607 /* find a call in limbo */
1608 h = hashipa(src, segp->source);
1609 for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){
1610 netlog(s->p->f, Logtcp, "tcpincoming s %I,%ux/%I,%ux d %I,%ux/%I,%ux v %d/%d\n",
1611 src, segp->source, lp->raddr, lp->rport,
1612 dst, segp->dest, lp->laddr, lp->lport,
1613 version, lp->version
1614 );
1615
1616 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1617 continue;
1618 if(ipcmp(lp->laddr, dst) != 0)
1619 continue;
1620 if(ipcmp(lp->raddr, src) != 0)
1621 continue;
1622
1623 /* we're assuming no data with the initial SYN */
1624 if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){
1625 netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux\n",
1626 segp->seq, lp->irs+1, segp->ack, lp->iss+1);
1627 lp = nil;
1628 } else {
1629 tpriv->nlimbo--;
1630 *l = lp->next;
1631 }
1632 break;
1633 }
1634 if(lp == nil)
1635 return nil;
1636
1637 new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1638 if(new == nil)
1639 return nil;
1640
1641 memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1642 tcb = (Tcpctl*)new->ptcl;
1643 tcb->flags &= ~CLONE;
1644 tcb->timer.arg = new;
1645 tcb->timer.state = TcptimerOFF;
1646 tcb->acktimer.arg = new;
1647 tcb->acktimer.state = TcptimerOFF;
1648 tcb->katimer.arg = new;
1649 tcb->katimer.state = TcptimerOFF;
1650 tcb->rtt_timer.arg = new;
1651 tcb->rtt_timer.state = TcptimerOFF;
1652
1653 tcb->irs = lp->irs;
1654 tcb->rcv.nxt = tcb->irs+1;
1655 tcb->rcv.urg = tcb->rcv.nxt;
1656
1657 tcb->iss = lp->iss;
1658 tcb->rttseq = tcb->iss;
1659 tcb->snd.wl2 = tcb->iss;
1660 tcb->snd.una = tcb->iss+1;
1661 tcb->snd.ptr = tcb->iss+1;
1662 tcb->snd.nxt = tcb->iss+1;
1663 tcb->flgcnt = 0;
1664 tcb->flags |= SYNACK;
1665
1666 /* our sending max segment size cannot be bigger than what he asked for */
1667 if(lp->mss != 0 && lp->mss < tcb->mss)
1668 tcb->mss = lp->mss;
1669
1670 /* window scaling */
1671 tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1672
1673 /* the congestion window always starts out as a single segment */
1674 tcb->snd.wnd = segp->wnd;
1675 tcb->cwind = tcb->mss;
1676
1677 /* set initial round trip time */
1678 tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER;
1679 tcpsynackrtt(new);
1680
1681 free(lp);
1682
1683 /* set up proto header */
1684 switch(version){
1685 case V4:
1686 h4 = &tcb->protohdr.tcp4hdr;
1687 memset(h4, 0, sizeof(*h4));
1688 h4->proto = IP_TCPPROTO;
1689 hnputs(h4->tcpsport, new->lport);
1690 hnputs(h4->tcpdport, new->rport);
1691 v6tov4(h4->tcpsrc, dst);
1692 v6tov4(h4->tcpdst, src);
1693 break;
1694 case V6:
1695 h6 = &tcb->protohdr.tcp6hdr;
1696 memset(h6, 0, sizeof(*h6));
1697 h6->proto = IP_TCPPROTO;
1698 hnputs(h6->tcpsport, new->lport);
1699 hnputs(h6->tcpdport, new->rport);
1700 ipmove(h6->tcpsrc, dst);
1701 ipmove(h6->tcpdst, src);
1702 break;
1703 default:
1704 panic("tcpincoming: version %d", new->ipversion);
1705 }
1706
1707 tcpsetstate(new, Established);
1708
1709 iphtadd(&tpriv->ht, new);
1710
1711 return new;
1712 }
1713
/*
 * return 1 if x lies in the closed interval [low, high], where the
 * interval wraps around when low > high; 0 otherwise.
 */
int
seq_within(ulong x, ulong low, ulong high)
{
	/* wrapped interval: x is inside if it is on either side of the wrap */
	if(low > high)
		return x >= low || x <= high;

	return low <= x && x <= high;
}
1727
/* sequence-space x < y: the sign of the difference taken as an int */
int
seq_lt(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff < 0;
}
1733
/* sequence-space x <= y: the sign of the difference taken as an int */
int
seq_le(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff <= 0;
}
1739
/* sequence-space x > y: the sign of the difference taken as an int */
int
seq_gt(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff > 0;
}
1745
/* sequence-space x >= y: the sign of the difference taken as an int */
int
seq_ge(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff >= 0;
}
1751
1752 /*
1753 * use the time between the first SYN and it's ack as the
1754 * initial round trip time
1755 */
1756 void
1757 tcpsynackrtt(Conv *s)
1758 {
1759 Tcpctl *tcb;
1760 int delta;
1761 Tcppriv *tpriv;
1762
1763 tcb = (Tcpctl*)s->ptcl;
1764 tpriv = s->p->priv;
1765
1766 delta = NOW - tcb->sndsyntime;
1767 tcb->srtt = delta<<LOGAGAIN;
1768 tcb->mdev = delta<<LOGDGAIN;
1769
1770 /* halt round trip timer */
1771 tcphalt(tpriv, &tcb->rtt_timer);
1772 }
1773
/*
 * Process the acknowledgement carried by seg for conversation s:
 * count duplicate acks and start a fast retransmit after
 * TCPREXMTTHRESH of them, track the peer's advertised window, open
 * the congestion window, take a round trip sample, discard acked
 * bytes from the write queue and restart or halt the retransmit
 * timer.
 */
1774 void
1775 update(Conv *s, Tcp *seg)
1776 {
1777 int rtt, delta;
1778 Tcpctl *tcb;
1779 ulong acked;
1780 ulong expand;
1781 Tcppriv *tpriv;
1782
1783 tpriv = s->p->priv;
1784 tcb = (Tcpctl*)s->ptcl;
1785
1786 /* the ack is past snd.nxt: don't process it, just force output */
1787 if(seq_gt(seg->ack, tcb->snd.nxt)) {
1788 tcb->flags |= FORCE;
1789 return;
1790 }
1791
1792 /* added by Dong Lin for fast retransmission */
/* a dup ack: same ack as before, data outstanding, no payload, same window */
1793 if(seg->ack == tcb->snd.una
1794 && tcb->snd.una != tcb->snd.nxt
1795 && seg->len == 0
1796 && seg->wnd == tcb->snd.wnd) {
1797
1798 /* this is a pure ack w/o window update */
1799 netlog(s->p->f, Logtcprxmt, "dupack %lud ack %lud sndwnd %d advwin %d\n",
1800 tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd);
1801
1802 if(++tcb->snd.dupacks == TCPREXMTTHRESH) {
1803 /*
1804 * tahoe tcp rxt the packet, half sshthresh,
1805 * and set cwnd to one packet
1806 */
/* recovery lasts until snd.rxt (the current snd.nxt) has been acked */
1807 tcb->snd.recovery = 1;
1808 tcb->snd.rxt = tcb->snd.nxt;
1809 netlog(s->p->f, Logtcprxmt, "fast rxt %lud, nxt %lud\n", tcb->snd.una, tcb->snd.nxt);
1810 tcprxmit(s);
1811 } else {
1812 /* do reno tcp here. */
1813 }
1814 }
1815
1816 /*
1817 * update window
1818 */
/* take the window from a newer ack, or from the same ack if it is larger */
1819 if(seq_gt(seg->ack, tcb->snd.wl2)
1820 || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){
1821 tcb->snd.wnd = seg->wnd;
1822 tcb->snd.wl2 = seg->ack;
1823 }
1824
/* nothing new is acknowledged: done, apart from the closed-window case */
1825 if(!seq_gt(seg->ack, tcb->snd.una)){
1826 /*
1827 * don't let us hangup if sending into a closed window and
1828 * we're still getting acks
1829 */
1830 if((tcb->flags&RETRAN) && tcb->snd.wnd == 0){
1831 tcb->backedoff = MAXBACKMS/4;
1832 }
1833 return;
1834 }
1835
1836 /*
1837 * any positive ack turns off fast rxt,
1838 * (should we do new-reno on partial acks?)
1839 */
1840 if(!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) {
1841 tcb->snd.dupacks = 0;
1842 tcb->snd.recovery = 0;
1843 } else
1844 netlog(s->p->f, Logtcp, "rxt next %lud, cwin %ud\n", seg->ack, tcb->cwind);
1845
1846 /* Compute the new send window size */
1847 acked = seg->ack - tcb->snd.una;
1848
1849 /* avoid slow start and timers for SYN acks */
/*
 * first ack since SYNACK was clear: one of the acked sequence numbers
 * is taken off both acked and flgcnt (presumably the SYN itself, which
 * occupies no byte in the write queue)
 */
1850 if((tcb->flags & SYNACK) == 0) {
1851 tcb->flags |= SYNACK;
1852 acked--;
1853 tcb->flgcnt--;
1854 goto done;
1855 }
1856
1857 /* slow start as long as we're not recovering from lost packets */
1858 if(tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
/* below ssthresh grow by min(mss, acked); above it by mss*mss/cwind */
1859 if(tcb->cwind < tcb->ssthresh) {
1860 expand = tcb->mss;
1861 if(acked < expand)
1862 expand = acked;
1863 }
1864 else
1865 expand = ((int)tcb->mss * tcb->mss) / tcb->cwind;
1866
/* never let cwind wrap around or exceed the peer's window */
1867 if(tcb->cwind + expand < tcb->cwind)
1868 expand = tcb->snd.wnd - tcb->cwind;
1869 if(tcb->cwind + expand > tcb->snd.wnd)
1870 expand = tcb->snd.wnd - tcb->cwind;
1871 tcb->cwind += expand;
1872 }
1873
1874 /* Adjust the timers according to the round trip time */
1875 if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
1876 tcphalt(tpriv, &tcb->rtt_timer);
/* only take a sample if nothing was retransmitted meanwhile */
1877 if((tcb->flags&RETRAN) == 0) {
1878 tcb->backoff = 0;
1879 tcb->backedoff = 0;
/* elapsed timer ticks, converted to ms below */
1880 rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
1881 if(rtt == 0)
1882 rtt = 1; /* otherwise all close systems will rexmit in 0 time */
1883 rtt *= MSPTICK;
1884 if(tcb->srtt == 0) {
1885 tcb->srtt = rtt << LOGAGAIN;
1886 tcb->mdev = rtt << LOGDGAIN;
1887 } else {
/* srtt and mdev are kept shifted left by LOGAGAIN and LOGDGAIN */
1888 delta = rtt - (tcb->srtt>>LOGAGAIN);
1889 tcb->srtt += delta;
1890 if(tcb->srtt <= 0)
1891 tcb->srtt = 1;
1892
1893 delta = abs(delta) - (tcb->mdev>>LOGDGAIN);
1894 tcb->mdev += delta;
1895 if(tcb->mdev <= 0)
1896 tcb->mdev = 1;
1897 }
1898 tcpsettimer(tcb);
1899 }
1900 }
1901
1902 done:
/*
 * drop the acked bytes from the write queue; if the queue held fewer
 * bytes than were acked, the remainder is charged to flgcnt
 * (presumably an acked flag such as FIN)
 */
1903 if(qdiscard(s->wq, acked) < acked)
1904 tcb->flgcnt--;
1905
1906 tcb->snd.una = seg->ack;
1907 if(seq_gt(seg->ack, tcb->snd.urg))
1908 tcb->snd.urg = seg->ack;
1909
/* keep the retransmit timer running only while data is unacknowledged */
1910 if(tcb->snd.una != tcb->snd.nxt)
1911 tcpgo(tpriv, &tcb->timer);
1912 else
1913 tcphalt(tpriv, &tcb->timer);
1914
/* the send pointer must never trail what has been acknowledged */
1915 if(seq_lt(tcb->snd.ptr, tcb->snd.una))
1916 tcb->snd.ptr = tcb->snd.una;
1917
1918 tcb->flags &= ~RETRAN;
1919 tcb->backoff = 0;
1920 tcb->backedoff = 0;
1921 }
1922
/*
 * Input routine for one TCP segment held in block bp.
 *
 * Checks the checksum over the v4 or v6 pseudo header, parses the TCP
 * header into seg, trims the block to the payload and looks up the
 * conversation.  For a listener it handles RST and SYN against the
 * limbo table or creates the new conversation via tcpincoming().
 * The rest runs with the conversation locked: per-state handling,
 * trimming to the receive window, resequencing, ack processing,
 * data delivery and FIN handling, normally ending in tcpoutput().
 * Segments that match nothing are answered with a reset.
 */
1923 void
1924 tcpiput(Proto *tcp, Ipifc*, Block *bp)
1925 {
1926 Tcp seg;
1927 Tcp4hdr *h4;
1928 Tcp6hdr *h6;
1929 int hdrlen;
1930 Tcpctl *tcb;
1931 ushort length, csum;
1932 uchar source[IPaddrlen], dest[IPaddrlen];
1933 Conv *s;
1934 Fs *f;
1935 Tcppriv *tpriv;
1936 uchar version;
1937
1938 f = tcp->f;
1939 tpriv = tcp->priv;
1940
1941 tpriv->stats[InSegs]++;
1942
/* both views alias the start of the block; the version nibble picks one */
1943 h4 = (Tcp4hdr*)(bp->rp);
1944 h6 = (Tcp6hdr*)(bp->rp);
1945
1946 if((h4->vihl&0xF0)==IP_VER4) {
1947 version = V4;
1948 length = nhgets(h4->length);
1949 v4tov6(dest, h4->tcpdst);
1950 v4tov6(source, h4->tcpsrc);
1951
/*
 * fill in the pseudo header fields, then verify the checksum unless
 * the block is flagged Btcpck or the checksum field is zero
 */
1952 h4->Unused = 0;
1953 hnputs(h4->tcplen, length-TCP4_PKT);
1954 if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
1955 ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) {
1956 tpriv->stats[CsumErrs]++;
1957 tpriv->stats[InErrs]++;
1958 netlog(f, Logtcp, "bad tcp proto cksum\n");
1959 freeblist(bp);
1960 return;
1961 }
1962
/* NOTE(review): bp is not freed here; presumably ntohtcp4 disposes of it on error -- confirm */
1963 hdrlen = ntohtcp4(&seg, &bp);
1964 if(hdrlen < 0){
1965 tpriv->stats[HlenErrs]++;
1966 tpriv->stats[InErrs]++;
1967 netlog(f, Logtcp, "bad tcp hdr len\n");
1968 return;
1969 }
1970
1971 /* trim the packet to the size claimed by the datagram */
1972 length -= hdrlen+TCP4_PKT;
1973 bp = trimblock(bp, hdrlen+TCP4_PKT, length);
1974 if(bp == nil){
1975 tpriv->stats[LenErrs]++;
1976 tpriv->stats[InErrs]++;
1977 netlog(f, Logtcp, "tcp len < 0 after trim\n");
1978 return;
1979 }
1980 }
1981 else {
1982 int ttl = h6->ttl;
1983 int proto = h6->proto;
1984
1985 version = V6;
1986 length = nhgets(h6->ploadlen);
1987 ipmove(dest, h6->tcpdst);
1988 ipmove(source, h6->tcpsrc);
1989
/*
 * temporarily rewrite header fields in place for the checksum;
 * ttl, proto and ploadlen are restored right after the check
 */
1990 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
1991 h6->ttl = proto;
1992 hnputl(h6->vcf, length);
1993 if((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
1994 (csum = ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) != 0) {
1995 tpriv->stats[CsumErrs]++;
1996 tpriv->stats[InErrs]++;
1997 netlog(f, Logtcp,
1998 "bad tcpv6 proto cksum: got %#ux, computed %#ux\n",
1999 h6->tcpcksum[0]<<8 | h6->tcpcksum[1], csum);
2000 freeblist(bp);
2001 return;
2002 }
2003 h6->ttl = ttl;
2004 h6->proto = proto;
2005 hnputs(h6->ploadlen, length);
2006
/* NOTE(review): as in the v4 path, bp is not freed here on error -- confirm ntohtcp6 does it */
2007 hdrlen = ntohtcp6(&seg, &bp);
2008 if(hdrlen < 0){
2009 tpriv->stats[HlenErrs]++;
2010 tpriv->stats[InErrs]++;
2011 netlog(f, Logtcp, "bad tcpv6 hdr len\n");
2012 return;
2013 }
2014
2015 /* trim the packet to the size claimed by the datagram */
2016 length -= hdrlen;
2017 bp = trimblock(bp, hdrlen+TCP6_PKT, length);
2018 if(bp == nil){
2019 tpriv->stats[LenErrs]++;
2020 tpriv->stats[InErrs]++;
2021 netlog(f, Logtcp, "tcpv6 len < 0 after trim\n");
2022 return;
2023 }
2024 }
2025
2026 /* lock protocol while searching for a conversation */
2027 qlock(tcp);
2028
2029 /* Look for a matching conversation */
2030 s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2031 if(s == nil){
2032 netlog(f, Logtcp, "iphtlook failed\n");
/* reset: is also the target when a listener has no matching call in limbo */
2033 reset:
2034 qunlock(tcp);
2035 sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2036 freeblist(bp);
2037 return;
2038 }
2039
2040 /* if it's a listener, look for the right flags and get a new conv */
2041 tcb = (Tcpctl*)s->ptcl;
2042 if(tcb->state == Listen){
2043 if(seg.flags & RST){
2044 limborst(s, &seg, source, dest, version);
2045 qunlock(tcp);
2046 freeblist(bp);
2047 return;
2048 }
2049
2050 /* if this is a new SYN, put the call into limbo */
2051 if((seg.flags & SYN) && (seg.flags & ACK) == 0){
2052 limbo(s, source, dest, &seg, version);
2053 qunlock(tcp);
2054 freeblist(bp);
2055 return;
2056 }
2057
2058 /*
2059 * if there's a matching call in limbo, tcpincoming will
2060 * return a new conversation for it, already set to Established
2061 */
2062 s = tcpincoming(s, &seg, source, dest, version);
2063 if(s == nil)
2064 goto reset;
2065 }
2066
2067 /* The rest of the input state machine is run with the control block
2068 * locked and implements the state machine directly out of the RFC.
2069 * Out-of-band data is ignored - it was always a bad idea.
2070 */
2071 tcb = (Tcpctl*)s->ptcl;
2072 if(waserror()){
2073 qunlock(s);
2074 nexterror();
2075 }
/* take the conversation lock before letting go of the protocol lock */
2076 qlock(s);
2077 qunlock(tcp);
2078
2079 /* fix up window */
2080 seg.wnd <<= tcb->rcv.scale;
2081
2082 /* every input packet in puts off the keep alive time out */
2083 tcpsetkacounter(tcb);
2084
2085 switch(tcb->state) {
2086 case Closed:
2087 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2088 goto raise;
2089 case Syn_sent:
/* an ack must cover our SYN and nothing beyond what we sent */
2090 if(seg.flags & ACK) {
2091 if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) {
2092 sndrst(tcp, source, dest, length, &seg, version,
2093 "bad seq in Syn_sent");
2094 goto raise;
2095 }
2096 }
/* a reset closes the connection only if it also carries an ack */
2097 if(seg.flags & RST) {
2098 if(seg.flags & ACK)
2099 localclose(s, Econrefused);
2100 goto raise;
2101 }
2102
2103 if(seg.flags & SYN) {
2104 procsyn(s, &seg);
2105 if(seg.flags & ACK){
2106 update(s, &seg);
2107 tcpsynackrtt(s);
2108 tcpsetstate(s, Established);
2109 tcpsetscale(s, tcb, seg.ws, tcb->scale);
2110 }
2111 else {
2112 tcb->time = NOW;
2113 tcpsetstate(s, Syn_received); /* DLP - shouldn't this be a reset? */
2114 }
2115
/* data or a FIN came with the SYN: go on to the general processing below */
2116 if(length != 0 || (seg.flags & FIN))
2117 break;
2118
2119 freeblist(bp);
2120 goto output;
2121 }
2122 else
2123 freeblist(bp);
2124
2125 qunlock(s);
2126 poperror();
2127 return;
2128 case Syn_received:
2129 /* doesn't matter if it's the correct ack, we're just trying to set timing */
2130 if(seg.flags & ACK)
2131 tcpsynackrtt(s);
2132 break;
2133 }
2134
2135 /*
2136 * One DOS attack is to open connections to us and then forget about them,
2137 * thereby tying up a conv at no long term cost to the attacker.
2138 * This is an attempt to defeat these stateless DOS attacks. See
2139 * corresponding code in tcpsendka().
2140 */
/* NOTE(review): processing continues below even after localclose() here */
2141 if(tcb->state != Syn_received && (seg.flags & RST) == 0){
2142 if(tcpporthogdefense
2143 && seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){
2144 print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n",
2145 source, seg.source, dest, seg.dest, seg.flags,
2146 tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29));
2147 localclose(s, "stateless hog");
2148 }
2149 }
2150
2151 /* Cut the data to fit the receive window */
/* on -1 nothing is accepted: still process the ack, then answer unless it was a RST */
2152 if(tcptrim(tcb, &seg, &bp, &length) == -1) {
2153 netlog(f, Logtcp, "tcp len < 0, %lud %d\n", seg.seq, length);
2154 update(s, &seg);
2155 if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) {
2156 tcphalt(tpriv, &tcb->rtt_timer);
2157 tcphalt(tpriv, &tcb->acktimer);
2158 tcphalt(tpriv, &tcb->katimer);
2159 tcpsetstate(s, Time_wait);
2160 tcb->timer.start = MSL2*(1000 / MSPTICK);
2161 tcpgo(tpriv, &tcb->timer);
2162 }
2163 if(!(seg.flags & RST)) {
2164 tcb->flags |= FORCE;
2165 goto output;
2166 }
2167 qunlock(s);
2168 poperror();
2169 return;
2170 }
2171
2172 /* Cannot accept so answer with a rst */
2173 if(length && tcb->state == Closed) {
2174 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2175 goto raise;
2176 }
2177
2178 /* The segment is beyond the current receive pointer so
2179 * queue the data in the resequence queue
2180 */
2181 if(seg.seq != tcb->rcv.nxt)
2182 if(length != 0 || (seg.flags & (SYN|FIN))) {
2183 update(s, &seg);
2184 if(addreseq(tcb, tpriv, &seg, bp, length) < 0)
2185 print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport);
2186 tcb->flags |= FORCE;
2187 goto output;
2188 }
2189
2190 /*
2191 * keep looping till we've processed this packet plus any
2192 * adjacent packets in the resequence queue
2193 */
2194 for(;;) {
2195 if(seg.flags & RST) {
2196 if(tcb->state == Established) {
2197 tpriv->stats[EstabResets]++;
2198 if(tcb->rcv.nxt != seg.seq)
2199 print("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt %lux seq %lux\n", s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt, seg.seq);
2200 }
2201 localclose(s, Econrefused);
2202 goto raise;
2203 }
2204
/* from here on every segment must carry an ack */
2205 if((seg.flags&ACK) == 0)
2206 goto raise;
2207
2208 switch(tcb->state) {
2209 case Syn_received:
2210 if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){
2211 sndrst(tcp, source, dest, length, &seg, version,
2212 "bad seq in Syn_received");
2213 goto raise;
2214 }
2215 update(s, &seg);
2216 tcpsetstate(s, Established);
/* no break: falls through into the Established case, so update() runs again */
2217 case Established:
2218 case Close_wait:
2219 update(s, &seg);
2220 break;
2221 case Finwait1:
2222 update(s, &seg);
/* everything we sent, flags included, is acked: move to Finwait2 */
2223 if(qlen(s->wq)+tcb->flgcnt == 0){
2224 tcphalt(tpriv, &tcb->rtt_timer);
2225 tcphalt(tpriv, &tcb->acktimer);
2226 tcpsetkacounter(tcb);
2227 tcb->time = NOW;
2228 tcpsetstate(s, Finwait2);
2229 tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2230 tcpgo(tpriv, &tcb->katimer);
2231 }
2232 break;
2233 case Finwait2:
2234 update(s, &seg);
2235 break;
2236 case Closing:
2237 update(s, &seg);
2238 if(qlen(s->wq)+tcb->flgcnt == 0) {
2239 tcphalt(tpriv, &tcb->rtt_timer);
2240 tcphalt(tpriv, &tcb->acktimer);
2241 tcphalt(tpriv, &tcb->katimer);
2242 tcpsetstate(s, Time_wait);
2243 tcb->timer.start = MSL2*(1000 / MSPTICK);
2244 tcpgo(tpriv, &tcb->timer);
2245 }
2246 break;
2247 case Last_ack:
2248 update(s, &seg);
2249 if(qlen(s->wq)+tcb->flgcnt == 0) {
2250 localclose(s, nil);
2251 goto raise;
2252 }
/* no break: falls through into the Time_wait case */
2253 case Time_wait:
2254 tcb->flags |= FORCE;
2255 if(tcb->timer.state != TcptimerON)
2256 tcpgo(tpriv, &tcb->timer);
2257 }
2258
/* urgent data is not delivered: advance rcv.urg and pull seg.urg bytes off bp */
2259 if((seg.flags&URG) && seg.urg) {
2260 if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2261 tcb->rcv.urg = seg.urg + seg.seq;
2262 pullblock(&bp, seg.urg);
2263 }
2264 }
2265 else
2266 if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2267 tcb->rcv.urg = tcb->rcv.nxt;
2268
2269 if(length == 0) {
2270 if(bp != nil)
2271 freeblist(bp);
2272 }
2273 else {
2274 switch(tcb->state){
2275 default:
2276 /* Ignore segment text */
2277 if(bp != nil)
2278 freeblist(bp);
2279 break;
2280
2281 case Syn_received:
2282 case Established:
2283 case Finwait1:
2284 /* If we still have some data place on
2285 * receive queue
2286 */
2287 if(bp) {
2288 bp = packblock(bp);
2289 if(bp == nil)
2290 panic("tcp packblock");
2291 qpassnolim(s->rq, bp);
2292 bp = nil;
2293
2294 /*
2295 * Force an ack every 2 data messages. This is
2296 * a hack for rob to make his home system run
2297 * faster.
2298 *
2299 * this also keeps the standard TCP congestion
2300 * control working since it needs an ack every
2301 * 2 max segs worth. This is not quite that,
2302 * but under a real stream is equivalent since
2303 * every packet has a max seg in it.
2304 */
2305 if(++(tcb->rcv.una) >= 2)
2306 tcb->flags |= FORCE;
2307 }
2308 tcb->rcv.nxt += length;
2309
2310 /*
2311 * update our rcv window
2312 */
2313 tcprcvwin(s);
2314
2315 /*
2316 * turn on the acktimer if there's something
2317 * to ack
2318 */
2319 if(tcb->acktimer.state != TcptimerON)
2320 tcpgo(tpriv, &tcb->acktimer);
2321
2322 break;
2323 case Finwait2:
2324 /* no process to read the data, send a reset */
2325 if(bp != nil)
2326 freeblist(bp);
2327 sndrst(tcp, source, dest, length, &seg, version,
2328 "send to Finwait2");
2329 qunlock(s);
2330 poperror();
2331 return;
2332 }
2333 }
2334
/* a FIN takes one sequence number and moves the close handshake along */
2335 if(seg.flags & FIN) {
2336 tcb->flags |= FORCE;
2337
2338 switch(tcb->state) {
2339 case Syn_received:
2340 case Established:
2341 tcb->rcv.nxt++;
2342 tcpsetstate(s, Close_wait);
2343 break;
2344 case Finwait1:
2345 tcb->rcv.nxt++;
/* nothing of ours left unacked: go to Time_wait, otherwise Closing */
2346 if(qlen(s->wq)+tcb->flgcnt == 0) {
2347 tcphalt(tpriv, &tcb->rtt_timer);
2348 tcphalt(tpriv, &tcb->acktimer);
2349 tcphalt(tpriv, &tcb->katimer);
2350 tcpsetstate(s, Time_wait);
2351 tcb->timer.start = MSL2*(1000/MSPTICK);
2352 tcpgo(tpriv, &tcb->timer);
2353 }
2354 else
2355 tcpsetstate(s, Closing);
2356 break;
2357 case Finwait2:
2358 tcb->rcv.nxt++;
2359 tcphalt(tpriv, &tcb->rtt_timer);
2360 tcphalt(tpriv, &tcb->acktimer);
2361 tcphalt(tpriv, &tcb->katimer);
2362 tcpsetstate(s, Time_wait);
2363 tcb->timer.start = MSL2 * (1000/MSPTICK);
2364 tcpgo(tpriv, &tcb->timer);
2365 break;
2366 case Close_wait:
2367 case Closing:
2368 case Last_ack:
2369 break;
2370 case Time_wait:
2371 tcpgo(tpriv, &tcb->timer);
2372 break;
2373 }
2374 }
2375
2376 /*
2377 * get next adjacent segment from the resequence queue.
2378 * dump/trim any overlapping segments
2379 */
2380 for(;;) {
2381 if(tcb->reseq == nil)
2382 goto output;
2383
/* the queued segment starts past rcv.nxt: still a gap, stop here */
2384 if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2385 goto output;
2386
2387 getreseq(tcb, &seg, &bp, &length);
2388
2389 if(tcptrim(tcb, &seg, &bp, &length) == 0)
2390 break;
2391 }
2392 }
/* output: the normal exit, sends whatever is due with the conversation still locked */
2393 output:
2394 tcpoutput(s);
2395 qunlock(s);
2396 poperror();
2397 return;
/* raise: drop the segment, release the lock and kick the conversation */
2398 raise:
2399 qunlock(s);
2400 poperror();
2401 freeblist(bp);
2402 tcpkick(s);
2403 }
2404
2405 /*
2406 * always enters and exits with the s locked. We drop
2407 * the lock to ipoput the packet so some care has to be
2408 * taken by callers.
2409 *
2409 * Sends segments for conversation s until there is nothing
2409 * to send and no ack is forced, the window is used up, or
2409 * 100 segments have gone out in this call.
2409 */
2410 void
2411 tcpoutput(Conv *s)
2412 {
2413 Tcp seg;
2414 int msgs;
2415 Tcpctl *tcb;
2416 Block *hbp, *bp;
2417 int sndcnt, n;
2418 ulong ssize, dsize, usable, sent;
2419 Fs *f;
2420 Tcppriv *tpriv;
2421 uchar version;
2422
2423 f = s->p->f;
2424 tpriv = s->p->priv;
2425 version = s->ipversion;
2426
2427 for(msgs = 0; msgs < 100; msgs++) {
2428 tcb = (Tcpctl*)s->ptcl;
2429
/* nothing is ever sent from these states */
2430 switch(tcb->state) {
2431 case Listen:
2432 case Closed:
2433 case Finwait2:
2434 return;
2435 }
2436
2437 /* force an ack when a window has opened up */
2438 if(tcb->rcv.blocked && tcb->rcv.wnd > 0){
2439 tcb->rcv.blocked = 0;
2440 tcb->flags |= FORCE;
2441 }
2442
/* sndcnt: queued bytes plus pending flags; sent: sequence space already sent past snd.una */
2443 sndcnt = qlen(s->wq)+tcb->flgcnt;
2444 sent = tcb->snd.ptr - tcb->snd.una;
2445
2446 /* Don't send anything else until our SYN has been acked */
2447 if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
2448 break;
2449
2450 /* Compute usable segment based on offered window and limit
2451 * window probes to one
2452 */
2453 if(tcb->snd.wnd == 0){
2454 if(sent != 0) {
2455 if((tcb->flags&FORCE) == 0)
2456 break;
2457 // tcb->snd.ptr = tcb->snd.una;
2458 }
2459 usable = 1;
2460 }
2461 else {
/* the smaller of congestion window and peer window, less what is in flight */
2462 usable = tcb->cwind;
2463 if(tcb->snd.wnd < usable)
2464 usable = tcb->snd.wnd;
2465 usable -= sent;
2466 }
/* ssize: sequence space this segment will use, capped by usable and mss */
2467 ssize = sndcnt-sent;
2468 if(ssize && usable < 2)
2469 netlog(s->p->f, Logtcp, "throttled snd.wnd %lud cwind %lud\n",
2470 tcb->snd.wnd, tcb->cwind);
2471 if(usable < ssize)
2472 ssize = usable;
2473 if(tcb->mss < ssize)
2474 ssize = tcb->mss;
2475 dsize = ssize;
2476 seg.urg = 0;
2477
/* nothing to send: go on only if an ack has been forced */
2478 if(ssize == 0)
2479 if((tcb->flags&FORCE) == 0)
2480 break;
2481
2482 tcb->flags &= ~FORCE;
2483 tcprcvwin(s);
2484
2485 /* By default we will generate an ack */
2486 tcphalt(tpriv, &tcb->acktimer);
2487 tcb->rcv.una = 0;
2488 seg.source = s->lport;
2489 seg.dest = s->rport;
2490 seg.flags = ACK;
2491 seg.mss = 0;
2492 seg.ws = 0;
2493 switch(tcb->state){
2494 case Syn_sent:
2495 seg.flags = 0;
/* the SYN takes one sequence number out of ssize, so one byte less of data */
2496 if(tcb->snd.ptr == tcb->iss){
2497 seg.flags |= SYN;
2498 dsize--;
2499 seg.mss = tcb->mss;
2500 seg.ws = tcb->scale;
2501 }
2502 break;
2503 case Syn_received:
2504 /*
2505 * don't send any data with a SYN/ACK packet
2506 * because Linux rejects the packet in its
2507 * attempt to solve the SYN attack problem
2508 */
2509 if(tcb->snd.ptr == tcb->iss){
2510 seg.flags |= SYN;
2511 dsize = 0;
2512 ssize = 1;
2513 seg.mss = tcb->mss;
2514 seg.ws = tcb->scale;
2515 }
2516 break;
2517 }
2518 seg.seq = tcb->snd.ptr;
2519 seg.ack = tcb->rcv.nxt;
2520 seg.wnd = tcb->rcv.wnd;
2521
2522 /* Pull out data to send */
2523 bp = nil;
2524 if(dsize != 0) {
2525 bp = qcopy(s->wq, dsize, sent);
/* the queue held less than dsize: the missing sequence number is sent as a FIN */
2526 if(BLEN(bp) != dsize) {
2527 seg.flags |= FIN;
2528 dsize--;
2529 }
2530 }
2531
/* this segment empties what is queued: set PSH */
2532 if(sent+dsize == sndcnt)
2533 seg.flags |= PSH;
2534
2535 /* keep track of balance of resent data */
2536 if(seq_lt(tcb->snd.ptr, tcb->snd.nxt)) {
2537 n = tcb->snd.nxt - tcb->snd.ptr;
2538 if(ssize < n)
2539 n = ssize;
2540 tcb->resent += n;
2541 netlog(f, Logtcp, "rexmit: %I.%d -> %I.%d ptr %lux nxt %lux\n",
2542 s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr, tcb->snd.nxt);
2543 tpriv->stats[RetransSegs]++;
2544 }
2545
2546 tcb->snd.ptr += ssize;
2547
2548 /* Pull up the send pointer so we can accept acks
2549 * for this window
2550 */
2551 if(seq_gt(tcb->snd.ptr,tcb->snd.nxt))
2552 tcb->snd.nxt = tcb->snd.ptr;
2553
2554 /* Build header, link data and compute cksum */
/* NOTE(review): on a nil header we return with snd.ptr already advanced above */
2555 switch(version){
2556 case V4:
2557 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2558 hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
2559 if(hbp == nil) {
2560 freeblist(bp);
2561 return;
2562 }
2563 break;
2564 case V6:
2565 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2566 hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
2567 if(hbp == nil) {
2568 freeblist(bp);
2569 return;
2570 }
2571 break;
2572 default:
2573 hbp = nil; /* to suppress a warning */
2574 panic("tcpoutput: version %d", version);
2575 }
2576
2577 /* Start the transmission timers if there is new data and we
2578 * expect acknowledges
2579 */
2580 if(ssize != 0){
2581 if(tcb->timer.state != TcptimerON)
2582 tcpgo(tpriv, &tcb->timer);
2583
2584 /* If round trip timer isn't running, start it.
2585 * measure the longest packet only in case the
2586 * transmission time dominates RTT
2587 */
2588 if(tcb->rtt_timer.state != TcptimerON)
2589 if(ssize == tcb->mss) {
2590 tcpgo(tpriv, &tcb->rtt_timer);
2591 tcb->rttseq = tcb->snd.ptr;
2592 }
2593 }
2594
2595 tpriv->stats[OutSegs]++;
2596
2597 /* put off the next keep alive */
2598 tcpgo(tpriv, &tcb->katimer);
2599
2600 switch(version){
2601 case V4:
2602 if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){
2603 /* a negative return means no route */
2604 localclose(s, "no route");
2605 }
2606 break;
2607 case V6:
2608 if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){
2609 /* a negative return means no route */
2610 localclose(s, "no route");
2611 }
2612 break;
2613 default:
2614 panic("tcpoutput2: version %d", version);
2615 }
/* periodically let go of the lock and yield; state may have changed when we return */
2616 if((msgs%4) == 1){
2617 qunlock(s);
2618 sched();
2619 qlock(s);
2620 }
2621 }
2622 }
2623
2624 /*
2625 * the BSD convention (hack?) for keep alives. resend last uchar acked.
2626 */
2627 void
2628 tcpsendka(Conv *s)
2629 {
2630 Tcp seg;
2631 Tcpctl *tcb;
2632 Block *hbp,*dbp;
2633
2634 tcb = (Tcpctl*)s->ptcl;
2635
2636 dbp = nil;
2637 seg.urg = 0;
2638 seg.source = s->lport;
2639 seg.dest = s->rport;
2640 seg.flags = ACK|PSH;
2641 seg.mss = 0;
2642 seg.ws = 0;
2643 if(tcpporthogdefense)
2644 seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20);
2645 else
2646 seg.seq = tcb->snd.una-1;
2647 seg.ack = tcb->rcv.nxt;
2648 tcb->rcv.una = 0;
2649 seg.wnd = tcb->rcv.wnd;
2650 if(tcb->state == Finwait2){
2651 seg.flags |= FIN;
2652 } else {
2653 dbp = allocb(1);
2654 dbp->wp++;
2655 }
2656
2657 if(isv4(s->raddr)) {
2658 /* Build header, link data and compute cksum */
2659 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2660 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2661 if(hbp == nil) {
2662 freeblist(dbp);
2663 return;
2664 }
2665 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2666 }
2667 else {
2668 /* Build header, link data and compute cksum */
2669 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2670 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2671 if(hbp == nil) {
2672 freeblist(dbp);
2673 return;
2674 }
2675 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
2676 }
2677 }
2678
2679 /*
2680 * set connection to time out after 12 minutes
2681 */
2682 void
2683 tcpsetkacounter(Tcpctl *tcb)
2684 {
2685 tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK);
2686 if(tcb->kacounter < 3)
2687 tcb->kacounter = 3;
2688 }
2689
2690 /*
2691 * if we've timed out, close the connection
2692 * otherwise, send a keepalive and restart the timer
2693 */
2694 void
2695 tcpkeepalive(void *v)
2696 {
2697 Tcpctl *tcb;
2698 Conv *s;
2699
2700 s = v;
2701 tcb = (Tcpctl*)s->ptcl;
2702 if(waserror()){
2703 qunlock(s);
2704 nexterror();
2705 }
2706 qlock(s);
2707 if(tcb->state != Closed){
2708 if(--(tcb->kacounter) <= 0) {
2709 localclose(s, Etimedout);
2710 } else {
2711 tcpsendka(s);
2712 tcpgo(s->p->priv, &tcb->katimer);
2713 }
2714 }
2715 qunlock(s);
2716 poperror();
2717 }
2718
2719 /*
2720 * start keepalive timer
2721 */
2722 char*
2723 tcpstartka(Conv *s, char **f, int n)
2724 {
2725 Tcpctl *tcb;
2726 int x;
2727
2728 tcb = (Tcpctl*)s->ptcl;
2729 if(tcb->state != Established)
2730 return "connection must be in Establised state";
2731 if(n > 1){
2732 x = atoi(f[1]);
2733 if(x >= MSPTICK)
2734 tcb->katimer.start = x/MSPTICK;
2735 }
2736 tcpsetkacounter(tcb);
2737 tcpgo(s->p->priv, &tcb->katimer);
2738
2739 return nil;
2740 }
2741
2742 /*
2743 * turn checksums on/off
2744 */
2745 char*
2746 tcpsetchecksum(Conv *s, char **f, int)
2747 {
2748 Tcpctl *tcb;
2749
2750 tcb = (Tcpctl*)s->ptcl;
2751 tcb->nochecksum = !atoi(f[1]);
2752
2753 return nil;
2754 }
2755
2756 void
2757 tcprxmit(Conv *s)
2758 {
2759 Tcpctl *tcb;
2760
2761 tcb = (Tcpctl*)s->ptcl;
2762
2763 tcb->flags |= RETRAN|FORCE;
2764 tcb->snd.ptr = tcb->snd.una;
2765
2766 /*
2767 * We should be halving the slow start threshhold (down to one
2768 * mss) but leaving it at mss seems to work well enough
2769 */
2770 tcb->ssthresh = tcb->mss;
2771
2772 /*
2773 * pull window down to a single packet
2774 */
2775 tcb->cwind = tcb->mss;
2776 tcpoutput(s);
2777 }
2778
2779 void
2780 tcptimeout(void *arg)
2781 {
2782 Conv *s;
2783 Tcpctl *tcb;
2784 int maxback;
2785 Tcppriv *tpriv;
2786
2787 s = (Conv*)arg;
2788 tpriv = s->p->priv;
2789 tcb = (Tcpctl*)s->ptcl;
2790
2791 if(waserror()){
2792 qunlock(s);
2793 nexterror();
2794 }
2795 qlock(s);
2796 switch(tcb->state){
2797 default:
2798 tcb->backoff++;
2799 if(tcb->state == Syn_sent)
2800 maxback = MAXBACKMS/2;
2801 else
2802 maxback = MAXBACKMS;
2803 tcb->backedoff += tcb->timer.start * MSPTICK;
2804 if(tcb->backedoff >= maxback) {
2805 localclose(s, Etimedout);
2806 break;
2807 }
2808 netlog(s->p->f, Logtcprxmt, "timeout rexmit 0x%lux %d/%d\n", tcb->snd.una, tcb->timer.start, NOW);
2809 tcpsettimer(tcb);
2810 tcprxmit(s);
2811 tpriv->stats[RetransTimeouts]++;
2812 tcb->snd.dupacks = 0;
2813 break;
2814 case Time_wait:
2815 localclose(s, nil);
2816 break;
2817 case Closed:
2818 break;
2819 }
2820 qunlock(s);
2821 poperror();
2822 }
2823
2824 int
2825 inwindow(Tcpctl *tcb, int seq)
2826 {
2827 return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1);
2828 }
2829
2830 /*
2831 * set up state for a received SYN (or SYN ACK) packet
2832 */
2833 void
2834 procsyn(Conv *s, Tcp *seg)
2835 {
2836 Tcpctl *tcb;
2837
2838 tcb = (Tcpctl*)s->ptcl;
2839 tcb->flags |= FORCE;
2840
2841 tcb->rcv.nxt = seg->seq + 1;
2842 tcb->rcv.urg = tcb->rcv.nxt;
2843 tcb->irs = seg->seq;
2844
2845 /* our sending max segment size cannot be bigger than what he asked for */
2846 if(seg->mss != 0 && seg->mss < tcb->mss)
2847 tcb->mss = seg->mss;
2848
2849 /* the congestion window always starts out as a single segment */
2850 tcb->snd.wnd = seg->wnd;
2851 tcb->cwind = tcb->mss;
2852 }
2853
2854 int
2855 addreseq(Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length)
2856 {
2857 Reseq *rp, *rp1;
2858 int i, rqlen, qmax;
2859
2860 rp = malloc(sizeof(Reseq));
2861 if(rp == nil){
2862 freeblist(bp); /* bp always consumed by add_reseq */
2863 return 0;
2864 }
2865
2866 rp->seg = *seg;
2867 rp->bp = bp;
2868 rp->length = length;
2869
2870 /* Place on reassembly list sorting by starting seq number */
2871 rp1 = tcb->reseq;
2872 if(rp1 == nil || seq_lt(seg->seq, rp1->seg.seq)) {
2873 rp->next = rp1;
2874 tcb->reseq = rp;
2875 if(rp->next != nil)
2876 tpriv->stats[OutOfOrder]++;
2877 return 0;
2878 }
2879
2880 rqlen = 0;
2881 for(i = 0;; i++) {
2882 rqlen += rp1->length;
2883 if(rp1->next == nil || seq_lt(seg->seq, rp1->next->seg.seq)) {
2884 rp->next = rp1->next;
2885 rp1->next = rp;
2886 if(rp->next != nil)
2887 tpriv->stats[OutOfOrder]++;
2888 break;
2889 }
2890 rp1 = rp1->next;
2891 }
2892 qmax = QMAX<<tcb->rcv.scale;
2893 if(rqlen > qmax){
2894 print("resequence queue > window: %d > %d\n", rqlen, qmax);
2895 i = 0;
2896 for(rp1 = tcb->reseq; rp1 != nil; rp1 = rp1->next){
2897 print("%#lux %#lux %#ux\n", rp1->seg.seq,
2898 rp1->seg.ack, rp1->seg.flags);
2899 if(i++ > 10){
2900 print("...\n");
2901 break;
2902 }
2903 }
2904
2905 /*
2906 * delete entire reassembly queue; wait for retransmit.
2907 * - should we be smarter and only delete the tail?
2908 */
2909 for(rp = tcb->reseq; rp != nil; rp = rp1){
2910 rp1 = rp->next;
2911 freeblist(rp->bp);
2912 free(rp);
2913 }
2914 tcb->reseq = nil;
2915
2916 return -1;
2917 }
2918 return 0;
2919 }
2920
2921 void
2922 getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
2923 {
2924 Reseq *rp;
2925
2926 rp = tcb->reseq;
2927 if(rp == nil)
2928 return;
2929
2930 tcb->reseq = rp->next;
2931
2932 *seg = rp->seg;
2933 *bp = rp->bp;
2934 *length = rp->length;
2935
2936 free(rp);
2937 }
2938
2939 int
2940 tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
2941 {
2942 ushort len;
2943 uchar accept;
2944 int dupcnt, excess;
2945
2946 accept = 0;
2947 len = *length;
2948 if(seg->flags & SYN)
2949 len++;
2950 if(seg->flags & FIN)
2951 len++;
2952
2953 if(tcb->rcv.wnd == 0) {
2954 if(len == 0 && seg->seq == tcb->rcv.nxt)
2955 return 0;
2956 }
2957 else {
2958 /* Some part of the segment should be in the window */
2959 if(inwindow(tcb,seg->seq))
2960 accept++;
2961 else
2962 if(len != 0) {
2963 if(inwindow(tcb, seg->seq+len-1) ||
2964 seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1))
2965 accept++;
2966 }
2967 }
2968 if(!accept) {
2969 freeblist(*bp);
2970 return -1;
2971 }
2972 dupcnt = tcb->rcv.nxt - seg->seq;
2973 if(dupcnt > 0){
2974 tcb->rerecv += dupcnt;
2975 if(seg->flags & SYN){
2976 seg->flags &= ~SYN;
2977 seg->seq++;
2978
2979 if(seg->urg > 1)
2980 seg->urg--;
2981 else
2982 seg->flags &= ~URG;
2983 dupcnt--;
2984 }
2985 if(dupcnt > 0){
2986 pullblock(bp, (ushort)dupcnt);
2987 seg->seq += dupcnt;
2988 *length -= dupcnt;
2989
2990 if(seg->urg > dupcnt)
2991 seg->urg -= dupcnt;
2992 else {
2993 seg->flags &= ~URG;
2994 seg->urg = 0;
2995 }
2996 }
2997 }
2998 excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
2999 if(excess > 0) {
3000 tcb->rerecv += excess;
3001 *length -= excess;
3002 *bp = trimblock(*bp, 0, *length);
3003 if(*bp == nil)
3004 panic("presotto is a boofhead");
3005 seg->flags &= ~FIN;
3006 }
3007 return 0;
3008 }
3009
3010 void
3011 tcpadvise(Proto *tcp, Block *bp, char *msg)
3012 {
3013 Tcp4hdr *h4;
3014 Tcp6hdr *h6;
3015 Tcpctl *tcb;
3016 uchar source[IPaddrlen];
3017 uchar dest[IPaddrlen];
3018 ushort psource, pdest;
3019 Conv *s, **p;
3020
3021 h4 = (Tcp4hdr*)(bp->rp);
3022 h6 = (Tcp6hdr*)(bp->rp);
3023
3024 if((h4->vihl&0xF0)==IP_VER4) {
3025 v4tov6(dest, h4->tcpdst);
3026 v4tov6(source, h4->tcpsrc);
3027 psource = nhgets(h4->tcpsport);
3028 pdest = nhgets(h4->tcpdport);
3029 }
3030 else {
3031 ipmove(dest, h6->tcpdst);
3032 ipmove(source, h6->tcpsrc);
3033 psource = nhgets(h6->tcpsport);
3034 pdest = nhgets(h6->tcpdport);
3035 }
3036
3037 /* Look for a connection */
3038 qlock(tcp);
3039 for(p = tcp->conv; *p; p++) {
3040 s = *p;
3041 tcb = (Tcpctl*)s->ptcl;
3042 if(s->rport == pdest)
3043 if(s->lport == psource)
3044 if(tcb->state != Closed)
3045 if(ipcmp(s->raddr, dest) == 0)
3046 if(ipcmp(s->laddr, source) == 0){
3047 qlock(s);
3048 qunlock(tcp);
3049 switch(tcb->state){
3050 case Syn_sent:
3051 localclose(s, msg);
3052 break;
3053 }
3054 qunlock(s);
3055 freeblist(bp);
3056 return;
3057 }
3058 }
3059 qunlock(tcp);
3060 freeblist(bp);
3061 }
3062
3063 static char*
3064 tcpporthogdefensectl(char *val)
3065 {
3066 if(strcmp(val, "on") == 0)
3067 tcpporthogdefense = 1;
3068 else if(strcmp(val, "off") == 0)
3069 tcpporthogdefense = 0;
3070 else
3071 return "unknown value for tcpporthogdefense";
3072 return nil;
3073 }
3074
3075 /* called with c qlocked */
3076 char*
3077 tcpctl(Conv* c, char** f, int n)
3078 {
3079 if(n == 1 && strcmp(f[0], "hangup") == 0)
3080 return tcphangup(c);
3081 if(n >= 1 && strcmp(f[0], "keepalive") == 0)
3082 return tcpstartka(c, f, n);
3083 if(n >= 1 && strcmp(f[0], "checksum") == 0)
3084 return tcpsetchecksum(c, f, n);
3085 if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3086 return tcpporthogdefensectl(f[1]);
3087 return "unknown control request";
3088 }
3089
3090 int
3091 tcpstats(Proto *tcp, char *buf, int len)
3092 {
3093 Tcppriv *priv;
3094 char *p, *e;
3095 int i;
3096
3097 priv = tcp->priv;
3098 p = buf;
3099 e = p+len;
3100 for(i = 0; i < Nstats; i++)
3101 p = seprint(p, e, "%s: %lud\n", statnames[i], priv->stats[i]);
3102 return p - buf;
3103 }
3104
3105 /*
3106 * garbage collect any stale conversations:
3107 * - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3108 * - Finwait2 after 5 minutes
3109 *
3110 * this is called whenever we run out of channels. Both checks are
3111 * of questionable validity so we try to use them only when we're
3112 * up against the wall.
3113 */
3114 int
3115 tcpgc(Proto *tcp)
3116 {
3117 Conv *c, **pp, **ep;
3118 int n;
3119 Tcpctl *tcb;
3120
3121
3122 n = 0;
3123 ep = &tcp->conv[tcp->nc];
3124 for(pp = tcp->conv; pp < ep; pp++) {
3125 c = *pp;
3126 if(c == nil)
3127 break;
3128 if(!canqlock(c))
3129 continue;
3130 tcb = (Tcpctl*)c->ptcl;
3131 switch(tcb->state){
3132 case Syn_received:
3133 if(NOW - tcb->time > 5000){
3134 localclose(c, "timed out");
3135 n++;
3136 }
3137 break;
3138 case Finwait2:
3139 if(NOW - tcb->time > 5*60*1000){
3140 localclose(c, "timed out");
3141 n++;
3142 }
3143 break;
3144 }
3145 qunlock(c);
3146 }
3147 return n;
3148 }
3149
3150 void
3151 tcpsettimer(Tcpctl *tcb)
3152 {
3153 int x;
3154
3155 /* round trip dependency */
3156 x = backoff(tcb->backoff) *
3157 (tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK;
3158
3159 /* bounded twixt 1/2 and 64 seconds */
3160 if(x < 500/MSPTICK)
3161 x = 500/MSPTICK;
3162 else if(x > (64000/MSPTICK))
3163 x = 64000/MSPTICK;
3164 tcb->timer.start = x;
3165 }
3166
3167 void
3168 tcpinit(Fs *fs)
3169 {
3170 Proto *tcp;
3171 Tcppriv *tpriv;
3172
3173 tcp = smalloc(sizeof(Proto));
3174 tpriv = tcp->priv = smalloc(sizeof(Tcppriv));
3175 tcp->name = "tcp";
3176 tcp->connect = tcpconnect;
3177 tcp->announce = tcpannounce;
3178 tcp->ctl = tcpctl;
3179 tcp->state = tcpstate;
3180 tcp->create = tcpcreate;
3181 tcp->close = tcpclose;
3182 tcp->rcv = tcpiput;
3183 tcp->advise = tcpadvise;
3184 tcp->stats = tcpstats;
3185 tcp->inuse = tcpinuse;
3186 tcp->gc = tcpgc;
3187 tcp->ipproto = IP_TCPPROTO;
3188 tcp->nc = scalednconv();
3189 tcp->ptclsize = sizeof(Tcpctl);
3190 tpriv->stats[MaxConn] = tcp->nc;
3191
3192 Fsproto(fs, tcp);
3193 }
3194
3195 void
3196 tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale)
3197 {
3198 if(rcvscale){
3199 tcb->rcv.scale = rcvscale & 0xff;
3200 tcb->snd.scale = sndscale & 0xff;
3201 tcb->window = QMAX<<tcb->snd.scale;
3202 qsetlimit(s->rq, tcb->window);
3203 } else {
3204 tcb->rcv.scale = 0;
3205 tcb->snd.scale = 0;
3206 tcb->window = QMAX;
3207 qsetlimit(s->rq, tcb->window);
3208 }
3209 }
Cache object: a6dedd06375163a3508093f5163171b6
|