The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/ip/tcp.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 #include        "u.h"
    2 #include        "../port/lib.h"
    3 #include        "mem.h"
    4 #include        "dat.h"
    5 #include        "fns.h"
    6 #include        "../port/error.h"
    7 
    8 #include        "ip.h"
    9 
   10 enum
   11 {
   12         QMAX            = 64*1024-1,
   13         IP_TCPPROTO     = 6,
   14 
   15         TCP4_IPLEN      = 8,
   16         TCP4_PHDRSIZE   = 12,
   17         TCP4_HDRSIZE    = 20,
   18         TCP4_TCBPHDRSZ  = 40,
   19         TCP4_PKT        = TCP4_IPLEN+TCP4_PHDRSIZE,
   20 
   21         TCP6_IPLEN      = 0,
   22         TCP6_PHDRSIZE   = 40,
   23         TCP6_HDRSIZE    = 20,
   24         TCP6_TCBPHDRSZ  = 60,
   25         TCP6_PKT        = TCP6_IPLEN+TCP6_PHDRSIZE,
   26 
   27         TcptimerOFF     = 0,
   28         TcptimerON      = 1,
   29         TcptimerDONE    = 2,
   30         MAX_TIME        = (1<<20),      /* Forever */
   31         TCP_ACK         = 50,           /* Timed ack sequence in ms */
   32         MAXBACKMS       = 9*60*1000,    /* longest backoff time (ms) before hangup */
   33 
   34         URG             = 0x20,         /* Data marked urgent */
   35         ACK             = 0x10,         /* Acknowledge is valid */
   36         PSH             = 0x08,         /* Whole data pipe is pushed */
   37         RST             = 0x04,         /* Reset connection */
   38         SYN             = 0x02,         /* Pkt. is synchronise */
   39         FIN             = 0x01,         /* Start close down */
   40 
   41         EOLOPT          = 0,
   42         NOOPOPT         = 1,
   43         MSSOPT          = 2,
   44         MSS_LENGTH      = 4,            /* Mean segment size */
   45         WSOPT           = 3,
   46         WS_LENGTH       = 3,            /* Bits to scale window size by */
   47         MSL2            = 10,
   48         MSPTICK         = 50,           /* Milliseconds per timer tick */
   49         DEF_MSS         = 1460,         /* Default mean segment */
   50         DEF_MSS6        = 1280,         /* Default mean segment (min) for v6 */
   51         DEF_RTT         = 500,          /* Default round trip */
   52         DEF_KAT         = 120000,       /* Default time (ms) between keep alives */
   53         TCP_LISTEN      = 0,            /* Listen connection */
   54         TCP_CONNECT     = 1,            /* Outgoing connection */
   55         SYNACK_RXTIMER  = 250,          /* ms between SYNACK retransmits */
   56 
   57         TCPREXMTTHRESH  = 3,            /* dupack threshhold for rxt */
   58 
   59         FORCE           = 1,
   60         CLONE           = 2,
   61         RETRAN          = 4,
   62         ACTIVE          = 8,
   63         SYNACK          = 16,
   64 
   65         LOGAGAIN        = 3,
   66         LOGDGAIN        = 2,
   67 
   68         Closed          = 0,            /* Connection states */
   69         Listen,
   70         Syn_sent,
   71         Syn_received,
   72         Established,
   73         Finwait1,
   74         Finwait2,
   75         Close_wait,
   76         Closing,
   77         Last_ack,
   78         Time_wait,
   79 
   80         Maxlimbo        = 1000,         /* maximum procs waiting for response to SYN ACK */
   81         NLHT            = 256,          /* hash table size, must be a power of 2 */
   82         LHTMASK         = NLHT-1,
   83 
   84         HaveWS          = 1<<8,
   85 };
   86 
   87 /* Must correspond to the enumeration above */
   88 char *tcpstates[] =
   89 {
   90         "Closed",       "Listen",       "Syn_sent", "Syn_received",
   91         "Established",  "Finwait1",     "Finwait2", "Close_wait",
   92         "Closing",      "Last_ack",     "Time_wait"
   93 };
   94 
   95 typedef struct Tcptimer Tcptimer;
   96 struct Tcptimer
   97 {
   98         Tcptimer        *next;
   99         Tcptimer        *prev;
  100         Tcptimer        *readynext;
  101         int     state;
  102         int     start;
  103         int     count;
  104         void    (*func)(void*);
  105         void    *arg;
  106 };
  107 
  108 /*
  109  *  v4 and v6 pseudo headers used for
  110  *  checksuming tcp
  111  */
  112 typedef struct Tcp4hdr Tcp4hdr;
  113 struct Tcp4hdr
  114 {
  115         uchar   vihl;           /* Version and header length */
  116         uchar   tos;            /* Type of service */
  117         uchar   length[2];      /* packet length */
  118         uchar   id[2];          /* Identification */
  119         uchar   frag[2];        /* Fragment information */
  120         uchar   Unused;
  121         uchar   proto;
  122         uchar   tcplen[2];
  123         uchar   tcpsrc[4];
  124         uchar   tcpdst[4];
  125         uchar   tcpsport[2];
  126         uchar   tcpdport[2];
  127         uchar   tcpseq[4];
  128         uchar   tcpack[4];
  129         uchar   tcpflag[2];
  130         uchar   tcpwin[2];
  131         uchar   tcpcksum[2];
  132         uchar   tcpurg[2];
  133         /* Options segment */
  134         uchar   tcpopt[1];
  135 };
  136 
  137 typedef struct Tcp6hdr Tcp6hdr;
  138 struct Tcp6hdr
  139 {
  140         uchar   vcf[4];
  141         uchar   ploadlen[2];
  142         uchar   proto;
  143         uchar   ttl;
  144         uchar   tcpsrc[IPaddrlen];
  145         uchar   tcpdst[IPaddrlen];
  146         uchar   tcpsport[2];
  147         uchar   tcpdport[2];
  148         uchar   tcpseq[4];
  149         uchar   tcpack[4];
  150         uchar   tcpflag[2];
  151         uchar   tcpwin[2];
  152         uchar   tcpcksum[2];
  153         uchar   tcpurg[2];
  154         /* Options segment */
  155         uchar   tcpopt[1];
  156 };
  157 
  158 /*
  159  *  this represents the control info
  160  *  for a single packet.  It is derived from
  161  *  a packet in ntohtcp{4,6}() and stuck into
  162  *  a packet in htontcp{4,6}().
  163  */
  164 typedef struct Tcp Tcp;
  165 struct  Tcp
  166 {
  167         ushort  source;
  168         ushort  dest;
  169         ulong   seq;
  170         ulong   ack;
  171         uchar   flags;
  172         ushort  ws;     /* window scale option (if not zero) */
  173         ulong   wnd;
  174         ushort  urg;
  175         ushort  mss;    /* max segment size option (if not zero) */
  176         ushort  len;    /* size of data */
  177 };
  178 
  179 /*
  180  *  this header is malloc'd to thread together fragments
  181  *  waiting to be coalesced
  182  */
  183 typedef struct Reseq Reseq;
  184 struct Reseq
  185 {
  186         Reseq   *next;
  187         Tcp     seg;
  188         Block   *bp;
  189         ushort  length;
  190 };
  191 
  192 /*
  193  *  the qlock in the Conv locks this structure
  194  */
  195 typedef struct Tcpctl Tcpctl;
  196 struct Tcpctl
  197 {
  198         uchar   state;                  /* Connection state */
  199         uchar   type;                   /* Listening or active connection */
  200         uchar   code;                   /* Icmp code */
  201         struct {
  202                 ulong   una;            /* Unacked data pointer */
  203                 ulong   nxt;            /* Next sequence expected */
  204                 ulong   ptr;            /* Data pointer */
  205                 ulong   wnd;            /* Tcp send window */
  206                 ulong   urg;            /* Urgent data pointer */
  207                 ulong   wl2;
  208                 int     scale;          /* how much to right shift window in xmitted packets */
  209                 /* to implement tahoe and reno TCP */
  210                 ulong   dupacks;        /* number of duplicate acks rcvd */
  211                 int     recovery;       /* loss recovery flag */
  212                 ulong   rxt;            /* right window marker for recovery */
  213         } snd;
  214         struct {
  215                 ulong   nxt;            /* Receive pointer to next uchar slot */
  216                 ulong   wnd;            /* Receive window incoming */
  217                 ulong   urg;            /* Urgent pointer */
  218                 int     blocked;
  219                 int     una;            /* unacked data segs */
  220                 int     scale;          /* how much to left shift window in rcved packets */
  221         } rcv;
  222         ulong   iss;                    /* Initial sequence number */
  223         int     sawwsopt;               /* true if we saw a wsopt on the incoming SYN */
  224         ulong   cwind;                  /* Congestion window */
  225         int     scale;                  /* desired snd.scale */
  226         ushort  ssthresh;               /* Slow start threshold */
  227         int     resent;                 /* Bytes just resent */
  228         int     irs;                    /* Initial received squence */
  229         ushort  mss;                    /* Mean segment size */
  230         int     rerecv;                 /* Overlap of data rerecevived */
  231         ulong   window;                 /* Recevive window */
  232         uchar   backoff;                /* Exponential backoff counter */
  233         int     backedoff;              /* ms we've backed off for rexmits */
  234         uchar   flags;                  /* State flags */
  235         Reseq   *reseq;                 /* Resequencing queue */
  236         Tcptimer        timer;                  /* Activity timer */
  237         Tcptimer        acktimer;               /* Acknowledge timer */
  238         Tcptimer        rtt_timer;              /* Round trip timer */
  239         Tcptimer        katimer;                /* keep alive timer */
  240         ulong   rttseq;                 /* Round trip sequence */
  241         int     srtt;                   /* Shortened round trip */
  242         int     mdev;                   /* Mean deviation of round trip */
  243         int     kacounter;              /* count down for keep alive */
  244         uint    sndsyntime;             /* time syn sent */
  245         ulong   time;                   /* time Finwait2 or Syn_received was sent */
  246         int     nochecksum;             /* non-zero means don't send checksums */
  247         int     flgcnt;                 /* number of flags in the sequence (FIN,SEQ) */
  248 
  249         union {
  250                 Tcp4hdr tcp4hdr;
  251                 Tcp6hdr tcp6hdr;
  252         } protohdr;             /* prototype header */
  253 };
  254 
  255 /*
  256  *  New calls are put in limbo rather than having a conversation structure
  257  *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
  258  *  any real Conv structures mucking things up.  Calls in limbo rexmit their
  259  *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
  260  *
  261  *  In particular they aren't on a listener's queue so that they don't figure
  262  *  in the input queue limit.
  263  *
  264  *  If 1/2 of a T3 was attacking SYN packets, we'ld have a permanent queue
  265  *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
  266  *  there is no hashing of this list.
  267  */
  268 typedef struct Limbo Limbo;
  269 struct Limbo
  270 {
  271         Limbo   *next;
  272 
  273         uchar   laddr[IPaddrlen];
  274         uchar   raddr[IPaddrlen];
  275         ushort  lport;
  276         ushort  rport;
  277         ulong   irs;            /* initial received sequence */
  278         ulong   iss;            /* initial sent sequence */
  279         ushort  mss;            /* mss from the other end */
  280         ushort  rcvscale;       /* how much to scale rcvd windows */
  281         ushort  sndscale;       /* how much to scale sent windows */
  282         ulong   lastsend;       /* last time we sent a synack */
  283         uchar   version;        /* v4 or v6 */
  284         uchar   rexmits;        /* number of retransmissions */
  285 };
  286 
  287 int     tcp_irtt = DEF_RTT;     /* Initial guess at round trip time */
  288 ushort  tcp_mss = DEF_MSS;      /* Maximum segment size to be sent */
  289 
  290 enum {
  291         /* MIB stats */
  292         MaxConn,
  293         ActiveOpens,
  294         PassiveOpens,
  295         EstabResets,
  296         CurrEstab,
  297         InSegs,
  298         OutSegs,
  299         RetransSegs,
  300         RetransTimeouts,
  301         InErrs,
  302         OutRsts,
  303 
  304         /* non-MIB stats */
  305         CsumErrs,
  306         HlenErrs,
  307         LenErrs,
  308         OutOfOrder,
  309 
  310         Nstats
  311 };
  312 
  313 static char *statnames[] =
  314 {
  315 [MaxConn]       "MaxConn",
  316 [ActiveOpens]   "ActiveOpens",
  317 [PassiveOpens]  "PassiveOpens",
  318 [EstabResets]   "EstabResets",
  319 [CurrEstab]     "CurrEstab",
  320 [InSegs]        "InSegs",
  321 [OutSegs]       "OutSegs",
  322 [RetransSegs]   "RetransSegs",
  323 [RetransTimeouts]       "RetransTimeouts",
  324 [InErrs]        "InErrs",
  325 [OutRsts]       "OutRsts",
  326 [CsumErrs]      "CsumErrs",
  327 [HlenErrs]      "HlenErrs",
  328 [LenErrs]       "LenErrs",
  329 [OutOfOrder]    "OutOfOrder",
  330 };
  331 
  332 typedef struct Tcppriv Tcppriv;
  333 struct Tcppriv
  334 {
  335         /* List of active timers */
  336         QLock   tl;
  337         Tcptimer *timers;
  338 
  339         /* hash table for matching conversations */
  340         Ipht    ht;
  341 
  342         /* calls in limbo waiting for an ACK to our SYN ACK */
  343         int     nlimbo;
  344         Limbo   *lht[NLHT];
  345 
  346         /* for keeping track of tcpackproc */
  347         QLock   apl;
  348         int     ackprocstarted;
  349 
  350         ulong   stats[Nstats];
  351 };
  352 
  353 /*
  354  *  Setting tcpporthogdefense to non-zero enables Dong Lin's
  355  *  solution to hijacked systems staking out port's as a form
  356  *  of DoS attack.
  357  *
  358  *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
  359  *  that number gets acked by the other end, we shut down the connection.
  360  *  Look for tcpporthogdefense in the code.
  361  */
  362 int tcpporthogdefense = 0;
  363 
  364 int     addreseq(Tcpctl*, Tcppriv*, Tcp*, Block*, ushort);
  365 void    getreseq(Tcpctl*, Tcp*, Block**, ushort*);
  366 void    localclose(Conv*, char*);
  367 void    procsyn(Conv*, Tcp*);
  368 void    tcpiput(Proto*, Ipifc*, Block*);
  369 void    tcpoutput(Conv*);
  370 int     tcptrim(Tcpctl*, Tcp*, Block**, ushort*);
  371 void    tcpstart(Conv*, int);
  372 void    tcptimeout(void*);
  373 void    tcpsndsyn(Conv*, Tcpctl*);
  374 void    tcprcvwin(Conv*);
  375 void    tcpacktimer(void*);
  376 void    tcpkeepalive(void*);
  377 void    tcpsetkacounter(Tcpctl*);
  378 void    tcprxmit(Conv*);
  379 void    tcpsettimer(Tcpctl*);
  380 void    tcpsynackrtt(Conv*);
  381 void    tcpsetscale(Conv*, Tcpctl*, ushort, ushort);
  382 
  383 static void limborexmit(Proto*);
  384 static void limbo(Conv*, uchar*, uchar*, Tcp*, int);
  385 
  386 void
  387 tcpsetstate(Conv *s, uchar newstate)
  388 {
  389         Tcpctl *tcb;
  390         uchar oldstate;
  391         Tcppriv *tpriv;
  392 
  393         tpriv = s->p->priv;
  394 
  395         tcb = (Tcpctl*)s->ptcl;
  396 
  397         oldstate = tcb->state;
  398         if(oldstate == newstate)
  399                 return;
  400 
  401         if(oldstate == Established)
  402                 tpriv->stats[CurrEstab]--;
  403         if(newstate == Established)
  404                 tpriv->stats[CurrEstab]++;
  405 
  406         /**
  407         print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
  408                 tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
  409         **/
  410 
  411         switch(newstate) {
  412         case Closed:
  413                 qclose(s->rq);
  414                 qclose(s->wq);
  415                 qclose(s->eq);
  416                 break;
  417 
  418         case Close_wait:                /* Remote closes */
  419                 qhangup(s->rq, nil);
  420                 break;
  421         }
  422 
  423         tcb->state = newstate;
  424 
  425         if(oldstate == Syn_sent && newstate != Closed)
  426                 Fsconnected(s, nil);
  427 }
  428 
  429 static char*
  430 tcpconnect(Conv *c, char **argv, int argc)
  431 {
  432         char *e;
  433         Tcpctl *tcb;
  434 
  435         tcb = (Tcpctl*)(c->ptcl);
  436         if(tcb->state != Closed)
  437                 return Econinuse;
  438 
  439         e = Fsstdconnect(c, argv, argc);
  440         if(e != nil)
  441                 return e;
  442         tcpstart(c, TCP_CONNECT);
  443 
  444         return nil;
  445 }
  446 
  447 static int
  448 tcpstate(Conv *c, char *state, int n)
  449 {
  450         Tcpctl *s;
  451 
  452         s = (Tcpctl*)(c->ptcl);
  453 
  454         return snprint(state, n,
  455                 "%s qin %d qout %d srtt %d mdev %d cwin %lud swin %lud>>%d rwin %lud>>%d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
  456                 tcpstates[s->state],
  457                 c->rq ? qlen(c->rq) : 0,
  458                 c->wq ? qlen(c->wq) : 0,
  459                 s->srtt, s->mdev,
  460                 s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale,
  461                 s->timer.start, s->timer.count, s->rerecv,
  462                 s->katimer.start, s->katimer.count);
  463 }
  464 
  465 static int
  466 tcpinuse(Conv *c)
  467 {
  468         Tcpctl *s;
  469 
  470         s = (Tcpctl*)(c->ptcl);
  471         return s->state != Closed;
  472 }
  473 
  474 static char*
  475 tcpannounce(Conv *c, char **argv, int argc)
  476 {
  477         char *e;
  478         Tcpctl *tcb;
  479 
  480         tcb = (Tcpctl*)(c->ptcl);
  481         if(tcb->state != Closed)
  482                 return Econinuse;
  483 
  484         e = Fsstdannounce(c, argv, argc);
  485         if(e != nil)
  486                 return e;
  487         tcpstart(c, TCP_LISTEN);
  488         Fsconnected(c, nil);
  489 
  490         return nil;
  491 }
  492 
  493 /*
  494  *  tcpclose is always called with the q locked
  495  */
  496 static void
  497 tcpclose(Conv *c)
  498 {
  499         Tcpctl *tcb;
  500 
  501         tcb = (Tcpctl*)c->ptcl;
  502 
  503         qhangup(c->rq, nil);
  504         qhangup(c->wq, nil);
  505         qhangup(c->eq, nil);
  506         qflush(c->rq);
  507 
  508         switch(tcb->state) {
  509         case Listen:
  510                 /*
  511                  *  reset any incoming calls to this listener
  512                  */
  513                 Fsconnected(c, "Hangup");
  514 
  515                 localclose(c, nil);
  516                 break;
  517         case Closed:
  518         case Syn_sent:
  519                 localclose(c, nil);
  520                 break;
  521         case Syn_received:
  522         case Established:
  523                 tcb->flgcnt++;
  524                 tcb->snd.nxt++;
  525                 tcpsetstate(c, Finwait1);
  526                 tcpoutput(c);
  527                 break;
  528         case Close_wait:
  529                 tcb->flgcnt++;
  530                 tcb->snd.nxt++;
  531                 tcpsetstate(c, Last_ack);
  532                 tcpoutput(c);
  533                 break;
  534         }
  535 }
  536 
  537 void
  538 tcpkick(void *x)
  539 {
  540         Conv *s = x;
  541         Tcpctl *tcb;
  542 
  543         tcb = (Tcpctl*)s->ptcl;
  544 
  545         if(waserror()){
  546                 qunlock(s);
  547                 nexterror();
  548         }
  549         qlock(s);
  550 
  551         switch(tcb->state) {
  552         case Syn_sent:
  553         case Syn_received:
  554         case Established:
  555         case Close_wait:
  556                 /*
  557                  * Push data
  558                  */
  559                 tcprcvwin(s);
  560                 tcpoutput(s);
  561                 break;
  562         default:
  563                 localclose(s, "Hangup");
  564                 break;
  565         }
  566 
  567         qunlock(s);
  568         poperror();
  569 }
  570 
  571 void
  572 tcprcvwin(Conv *s)                              /* Call with tcb locked */
  573 {
  574         int w;
  575         Tcpctl *tcb;
  576 
  577         tcb = (Tcpctl*)s->ptcl;
  578         w = tcb->window - qlen(s->rq);
  579         if(w < 0)
  580                 w = 0;
  581         tcb->rcv.wnd = w;
  582         if(w == 0)
  583                 tcb->rcv.blocked = 1;
  584 }
  585 
  586 void
  587 tcpacktimer(void *v)
  588 {
  589         Tcpctl *tcb;
  590         Conv *s;
  591 
  592         s = v;
  593         tcb = (Tcpctl*)s->ptcl;
  594 
  595         if(waserror()){
  596                 qunlock(s);
  597                 nexterror();
  598         }
  599         qlock(s);
  600         if(tcb->state != Closed){
  601                 tcb->flags |= FORCE;
  602                 tcprcvwin(s);
  603                 tcpoutput(s);
  604         }
  605         qunlock(s);
  606         poperror();
  607 }
  608 
  609 static void
  610 tcpcreate(Conv *c)
  611 {
  612         c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
  613         c->wq = qopen((3*QMAX)/2, Qkick, tcpkick, c);
  614 }
  615 
  616 static void
  617 timerstate(Tcppriv *priv, Tcptimer *t, int newstate)
  618 {
  619         if(newstate != TcptimerON){
  620                 if(t->state == TcptimerON){
  621                         /* unchain */
  622                         if(priv->timers == t){
  623                                 priv->timers = t->next;
  624                                 if(t->prev != nil)
  625                                         panic("timerstate1");
  626                         }
  627                         if(t->next)
  628                                 t->next->prev = t->prev;
  629                         if(t->prev)
  630                                 t->prev->next = t->next;
  631                         t->next = t->prev = nil;
  632                 }
  633         } else {
  634                 if(t->state != TcptimerON){
  635                         /* chain */
  636                         if(t->prev != nil || t->next != nil)
  637                                 panic("timerstate2");
  638                         t->prev = nil;
  639                         t->next = priv->timers;
  640                         if(t->next)
  641                                 t->next->prev = t;
  642                         priv->timers = t;
  643                 }
  644         }
  645         t->state = newstate;
  646 }
  647 
  648 void
  649 tcpackproc(void *a)
  650 {
  651         Tcptimer *t, *tp, *timeo;
  652         Proto *tcp;
  653         Tcppriv *priv;
  654         int loop;
  655 
  656         tcp = a;
  657         priv = tcp->priv;
  658 
  659         for(;;) {
  660                 tsleep(&up->sleep, return0, 0, MSPTICK);
  661 
  662                 qlock(&priv->tl);
  663                 timeo = nil;
  664                 loop = 0;
  665                 for(t = priv->timers; t != nil; t = tp) {
  666                         if(loop++ > 10000)
  667                                 panic("tcpackproc1");
  668                         tp = t->next;
  669                         if(t->state == TcptimerON) {
  670                                 t->count--;
  671                                 if(t->count == 0) {
  672                                         timerstate(priv, t, TcptimerDONE);
  673                                         t->readynext = timeo;
  674                                         timeo = t;
  675                                 }
  676                         }
  677                 }
  678                 qunlock(&priv->tl);
  679 
  680                 loop = 0;
  681                 for(t = timeo; t != nil; t = t->readynext) {
  682                         if(loop++ > 10000)
  683                                 panic("tcpackproc2");
  684                         if(t->state == TcptimerDONE && t->func != nil && !waserror()){
  685                                 (*t->func)(t->arg);
  686                                 poperror();
  687                         }
  688                 }
  689 
  690                 limborexmit(tcp);
  691         }
  692 }
  693 
  694 void
  695 tcpgo(Tcppriv *priv, Tcptimer *t)
  696 {
  697         if(t == nil || t->start == 0)
  698                 return;
  699 
  700         qlock(&priv->tl);
  701         t->count = t->start;
  702         timerstate(priv, t, TcptimerON);
  703         qunlock(&priv->tl);
  704 }
  705 
  706 void
  707 tcphalt(Tcppriv *priv, Tcptimer *t)
  708 {
  709         if(t == nil)
  710                 return;
  711 
  712         qlock(&priv->tl);
  713         timerstate(priv, t, TcptimerOFF);
  714         qunlock(&priv->tl);
  715 }
  716 
  717 int
  718 backoff(int n)
  719 {
  720         return 1 << n;
  721 }
  722 
  723 void
  724 localclose(Conv *s, char *reason)       /* called with tcb locked */
  725 {
  726         Tcpctl *tcb;
  727         Reseq *rp,*rp1;
  728         Tcppriv *tpriv;
  729 
  730         tpriv = s->p->priv;
  731         tcb = (Tcpctl*)s->ptcl;
  732 
  733         iphtrem(&tpriv->ht, s);
  734 
  735         tcphalt(tpriv, &tcb->timer);
  736         tcphalt(tpriv, &tcb->rtt_timer);
  737         tcphalt(tpriv, &tcb->acktimer);
  738         tcphalt(tpriv, &tcb->katimer);
  739 
  740         /* Flush reassembly queue; nothing more can arrive */
  741         for(rp = tcb->reseq; rp != nil; rp = rp1) {
  742                 rp1 = rp->next;
  743                 freeblist(rp->bp);
  744                 free(rp);
  745         }
  746         tcb->reseq = nil;
  747 
  748         if(tcb->state == Syn_sent)
  749                 Fsconnected(s, reason);
  750         if(s->state == Announced)
  751                 wakeup(&s->listenr);
  752 
  753         qhangup(s->rq, reason);
  754         qhangup(s->wq, reason);
  755 
  756         tcpsetstate(s, Closed);
  757 }
  758 
  759 /* mtu (- TCP + IP hdr len) of 1st hop */
  760 int
  761 tcpmtu(Proto *tcp, uchar *addr, int version, int *scale)
  762 {
  763         Ipifc *ifc;
  764         int mtu;
  765 
  766         ifc = findipifc(tcp->f, addr, 0);
  767         switch(version){
  768         default:
  769         case V4:
  770                 mtu = DEF_MSS;
  771                 if(ifc != nil)
  772                         mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
  773                 break;
  774         case V6:
  775                 mtu = DEF_MSS6;
  776                 if(ifc != nil)
  777                         mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
  778                 break;
  779         }
  780         if(ifc != nil){
  781                 if(ifc->mbps > 1000)
  782                         *scale = HaveWS | 4;
  783                 else if(ifc->mbps > 100)
  784                         *scale = HaveWS | 3;
  785                 else if(ifc->mbps > 10)
  786                         *scale = HaveWS | 1;
  787                 else
  788                         *scale = HaveWS | 0;
  789         } else
  790                 *scale = HaveWS | 0;
  791 
  792         return mtu;
  793 }
  794 
  795 void
  796 inittcpctl(Conv *s, int mode)
  797 {
  798         Tcpctl *tcb;
  799         Tcp4hdr* h4;
  800         Tcp6hdr* h6;
  801         int mss;
  802 
  803         tcb = (Tcpctl*)s->ptcl;
  804 
  805         memset(tcb, 0, sizeof(Tcpctl));
  806 
  807         tcb->ssthresh = 65535;
  808         tcb->srtt = tcp_irtt<<LOGAGAIN;
  809         tcb->mdev = 0;
  810 
  811         /* setup timers */
  812         tcb->timer.start = tcp_irtt / MSPTICK;
  813         tcb->timer.func = tcptimeout;
  814         tcb->timer.arg = s;
  815         tcb->rtt_timer.start = MAX_TIME;
  816         tcb->acktimer.start = TCP_ACK / MSPTICK;
  817         tcb->acktimer.func = tcpacktimer;
  818         tcb->acktimer.arg = s;
  819         tcb->katimer.start = DEF_KAT / MSPTICK;
  820         tcb->katimer.func = tcpkeepalive;
  821         tcb->katimer.arg = s;
  822 
  823         mss = DEF_MSS;
  824 
  825         /* create a prototype(pseudo) header */
  826         if(mode != TCP_LISTEN){
  827                 if(ipcmp(s->laddr, IPnoaddr) == 0)
  828                         findlocalip(s->p->f, s->laddr, s->raddr);
  829 
  830                 switch(s->ipversion){
  831                 case V4:
  832                         h4 = &tcb->protohdr.tcp4hdr;
  833                         memset(h4, 0, sizeof(*h4));
  834                         h4->proto = IP_TCPPROTO;
  835                         hnputs(h4->tcpsport, s->lport);
  836                         hnputs(h4->tcpdport, s->rport);
  837                         v6tov4(h4->tcpsrc, s->laddr);
  838                         v6tov4(h4->tcpdst, s->raddr);
  839                         break;
  840                 case V6:
  841                         h6 = &tcb->protohdr.tcp6hdr;
  842                         memset(h6, 0, sizeof(*h6));
  843                         h6->proto = IP_TCPPROTO;
  844                         hnputs(h6->tcpsport, s->lport);
  845                         hnputs(h6->tcpdport, s->rport);
  846                         ipmove(h6->tcpsrc, s->laddr);
  847                         ipmove(h6->tcpdst, s->raddr);
  848                         mss = DEF_MSS6;
  849                         break;
  850                 default:
  851                         panic("inittcpctl: version %d", s->ipversion);
  852                 }
  853         }
  854 
  855         tcb->mss = tcb->cwind = mss;
  856 
  857         /* default is no window scaling */
  858         tcb->window = QMAX;
  859         tcb->rcv.wnd = QMAX;
  860         tcb->rcv.scale = 0;
  861         tcb->snd.scale = 0;
  862         qsetlimit(s->rq, QMAX);
  863 }
  864 
  865 /*
  866  *  called with s qlocked
  867  */
  868 void
  869 tcpstart(Conv *s, int mode)
  870 {
  871         Tcpctl *tcb;
  872         Tcppriv *tpriv;
  873         char kpname[KNAMELEN];
  874 
  875         tpriv = s->p->priv;
  876 
  877         if(tpriv->ackprocstarted == 0){
  878                 qlock(&tpriv->apl);
  879                 if(tpriv->ackprocstarted == 0){
  880                         sprint(kpname, "#I%dtcpack", s->p->f->dev);
  881                         kproc(kpname, tcpackproc, s->p);
  882                         tpriv->ackprocstarted = 1;
  883                 }
  884                 qunlock(&tpriv->apl);
  885         }
  886 
  887         tcb = (Tcpctl*)s->ptcl;
  888 
  889         inittcpctl(s, mode);
  890 
  891         iphtadd(&tpriv->ht, s);
  892         switch(mode) {
  893         case TCP_LISTEN:
  894                 tpriv->stats[PassiveOpens]++;
  895                 tcb->flags |= CLONE;
  896                 tcpsetstate(s, Listen);
  897                 break;
  898 
  899         case TCP_CONNECT:
  900                 tpriv->stats[ActiveOpens]++;
  901                 tcb->flags |= ACTIVE;
  902                 tcpsndsyn(s, tcb);
  903                 tcpsetstate(s, Syn_sent);
  904                 tcpoutput(s);
  905                 break;
  906         }
  907 }
  908 
  909 static char*
  910 tcpflag(ushort flag)
  911 {
  912         static char buf[128];
  913 
  914         sprint(buf, "%d", flag>>10);    /* Head len */
  915         if(flag & URG)
  916                 strcat(buf, " URG");
  917         if(flag & ACK)
  918                 strcat(buf, " ACK");
  919         if(flag & PSH)
  920                 strcat(buf, " PSH");
  921         if(flag & RST)
  922                 strcat(buf, " RST");
  923         if(flag & SYN)
  924                 strcat(buf, " SYN");
  925         if(flag & FIN)
  926                 strcat(buf, " FIN");
  927 
  928         return buf;
  929 }
  930 
  931 Block *
  932 htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb)
  933 {
  934         int dlen;
  935         Tcp6hdr *h;
  936         ushort csum;
  937         ushort hdrlen, optpad = 0;
  938         uchar *opt;
  939 
  940         hdrlen = TCP6_HDRSIZE;
  941         if(tcph->flags & SYN){
  942                 if(tcph->mss)
  943                         hdrlen += MSS_LENGTH;
  944                 if(tcph->ws)
  945                         hdrlen += WS_LENGTH;
  946                 optpad = hdrlen & 3;
  947                 if(optpad)
  948                         optpad = 4 - optpad;
  949                 hdrlen += optpad;
  950         }
  951 
  952         if(data) {
  953                 dlen = blocklen(data);
  954                 data = padblock(data, hdrlen + TCP6_PKT);
  955                 if(data == nil)
  956                         return nil;
  957         }
  958         else {
  959                 dlen = 0;
  960                 data = allocb(hdrlen + TCP6_PKT + 64);  /* the 64 pad is to meet mintu's */
  961                 if(data == nil)
  962                         return nil;
  963                 data->wp += hdrlen + TCP6_PKT;
  964         }
  965 
  966         /* copy in pseudo ip header plus port numbers */
  967         h = (Tcp6hdr *)(data->rp);
  968         memmove(h, ph, TCP6_TCBPHDRSZ);
  969 
  970         /* compose pseudo tcp header, do cksum calculation */
  971         hnputl(h->vcf, hdrlen + dlen);
  972         h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
  973         h->ttl = ph->proto;
  974 
  975         /* copy in variable bits */
  976         hnputl(h->tcpseq, tcph->seq);
  977         hnputl(h->tcpack, tcph->ack);
  978         hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
  979         hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
  980         hnputs(h->tcpurg, tcph->urg);
  981 
  982         if(tcph->flags & SYN){
  983                 opt = h->tcpopt;
  984                 if(tcph->mss != 0){
  985                         *opt++ = MSSOPT;
  986                         *opt++ = MSS_LENGTH;
  987                         hnputs(opt, tcph->mss);
  988                         opt += 2;
  989                 }
  990                 if(tcph->ws != 0){
  991                         *opt++ = WSOPT;
  992                         *opt++ = WS_LENGTH;
  993                         *opt++ = tcph->ws;
  994                 }
  995                 while(optpad-- > 0)
  996                         *opt++ = NOOPOPT;
  997         }
  998 
  999         if(tcb != nil && tcb->nochecksum){
 1000                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
 1001         } else {
 1002                 csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE);
 1003                 hnputs(h->tcpcksum, csum);
 1004         }
 1005 
 1006         /* move from pseudo header back to normal ip header */
 1007         memset(h->vcf, 0, 4);
 1008         h->vcf[0] = IP_VER6;
 1009         hnputs(h->ploadlen, hdrlen+dlen);
 1010         h->proto = ph->proto;
 1011 
 1012         return data;
 1013 }
 1014 
 1015 Block *
 1016 htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb)
 1017 {
 1018         int dlen;
 1019         Tcp4hdr *h;
 1020         ushort csum;
 1021         ushort hdrlen, optpad = 0;
 1022         uchar *opt;
 1023 
 1024         hdrlen = TCP4_HDRSIZE;
 1025         if(tcph->flags & SYN){
 1026                 if(tcph->mss)
 1027                         hdrlen += MSS_LENGTH;
 1028                 if(tcph->ws)
 1029                         hdrlen += WS_LENGTH;
 1030                 optpad = hdrlen & 3;
 1031                 if(optpad)
 1032                         optpad = 4 - optpad;
 1033                 hdrlen += optpad;
 1034         }
 1035 
 1036         if(data) {
 1037                 dlen = blocklen(data);
 1038                 data = padblock(data, hdrlen + TCP4_PKT);
 1039                 if(data == nil)
 1040                         return nil;
 1041         }
 1042         else {
 1043                 dlen = 0;
 1044                 data = allocb(hdrlen + TCP4_PKT + 64);  /* the 64 pad is to meet mintu's */
 1045                 if(data == nil)
 1046                         return nil;
 1047                 data->wp += hdrlen + TCP4_PKT;
 1048         }
 1049 
 1050         /* copy in pseudo ip header plus port numbers */
 1051         h = (Tcp4hdr *)(data->rp);
 1052         memmove(h, ph, TCP4_TCBPHDRSZ);
 1053 
 1054         /* copy in variable bits */
 1055         hnputs(h->tcplen, hdrlen + dlen);
 1056         hnputl(h->tcpseq, tcph->seq);
 1057         hnputl(h->tcpack, tcph->ack);
 1058         hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
 1059         hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
 1060         hnputs(h->tcpurg, tcph->urg);
 1061 
 1062         if(tcph->flags & SYN){
 1063                 opt = h->tcpopt;
 1064                 if(tcph->mss != 0){
 1065                         *opt++ = MSSOPT;
 1066                         *opt++ = MSS_LENGTH;
 1067                         hnputs(opt, tcph->mss);
 1068                         opt += 2;
 1069                 }
 1070                 if(tcph->ws != 0){
 1071                         *opt++ = WSOPT;
 1072                         *opt++ = WS_LENGTH;
 1073                         *opt++ = tcph->ws;
 1074                 }
 1075                 while(optpad-- > 0)
 1076                         *opt++ = NOOPOPT;
 1077         }
 1078 
 1079         if(tcb != nil && tcb->nochecksum){
 1080                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
 1081         } else {
 1082                 csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE);
 1083                 hnputs(h->tcpcksum, csum);
 1084         }
 1085 
 1086         return data;
 1087 }
 1088 
 1089 int
 1090 ntohtcp6(Tcp *tcph, Block **bpp)
 1091 {
 1092         Tcp6hdr *h;
 1093         uchar *optr;
 1094         ushort hdrlen;
 1095         ushort optlen;
 1096         int n;
 1097 
 1098         *bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE);
 1099         if(*bpp == nil)
 1100                 return -1;
 1101 
 1102         h = (Tcp6hdr *)((*bpp)->rp);
 1103         tcph->source = nhgets(h->tcpsport);
 1104         tcph->dest = nhgets(h->tcpdport);
 1105         tcph->seq = nhgetl(h->tcpseq);
 1106         tcph->ack = nhgetl(h->tcpack);
 1107         hdrlen = (h->tcpflag[0]>>2) & ~3;
 1108         if(hdrlen < TCP6_HDRSIZE) {
 1109                 freeblist(*bpp);
 1110                 return -1;
 1111         }
 1112 
 1113         tcph->flags = h->tcpflag[1];
 1114         tcph->wnd = nhgets(h->tcpwin);
 1115         tcph->urg = nhgets(h->tcpurg);
 1116         tcph->mss = 0;
 1117         tcph->ws = 0;
 1118         tcph->len = nhgets(h->ploadlen) - hdrlen;
 1119 
 1120         *bpp = pullupblock(*bpp, hdrlen+TCP6_PKT);
 1121         if(*bpp == nil)
 1122                 return -1;
 1123 
 1124         optr = h->tcpopt;
 1125         n = hdrlen - TCP6_HDRSIZE;
 1126         while(n > 0 && *optr != EOLOPT) {
 1127                 if(*optr == NOOPOPT) {
 1128                         n--;
 1129                         optr++;
 1130                         continue;
 1131                 }
 1132                 optlen = optr[1];
 1133                 if(optlen < 2 || optlen > n)
 1134                         break;
 1135                 switch(*optr) {
 1136                 case MSSOPT:
 1137                         if(optlen == MSS_LENGTH)
 1138                                 tcph->mss = nhgets(optr+2);
 1139                         break;
 1140                 case WSOPT:
 1141                         if(optlen == WS_LENGTH && *(optr+2) <= 14)
 1142                                 tcph->ws = HaveWS | *(optr+2);
 1143                         break;
 1144                 }
 1145                 n -= optlen;
 1146                 optr += optlen;
 1147         }
 1148         return hdrlen;
 1149 }
 1150 
 1151 int
 1152 ntohtcp4(Tcp *tcph, Block **bpp)
 1153 {
 1154         Tcp4hdr *h;
 1155         uchar *optr;
 1156         ushort hdrlen;
 1157         ushort optlen;
 1158         int n;
 1159 
 1160         *bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE);
 1161         if(*bpp == nil)
 1162                 return -1;
 1163 
 1164         h = (Tcp4hdr *)((*bpp)->rp);
 1165         tcph->source = nhgets(h->tcpsport);
 1166         tcph->dest = nhgets(h->tcpdport);
 1167         tcph->seq = nhgetl(h->tcpseq);
 1168         tcph->ack = nhgetl(h->tcpack);
 1169 
 1170         hdrlen = (h->tcpflag[0]>>2) & ~3;
 1171         if(hdrlen < TCP4_HDRSIZE) {
 1172                 freeblist(*bpp);
 1173                 return -1;
 1174         }
 1175 
 1176         tcph->flags = h->tcpflag[1];
 1177         tcph->wnd = nhgets(h->tcpwin);
 1178         tcph->urg = nhgets(h->tcpurg);
 1179         tcph->mss = 0;
 1180         tcph->ws = 0;
 1181         tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
 1182 
 1183         *bpp = pullupblock(*bpp, hdrlen+TCP4_PKT);
 1184         if(*bpp == nil)
 1185                 return -1;
 1186 
 1187         optr = h->tcpopt;
 1188         n = hdrlen - TCP4_HDRSIZE;
 1189         while(n > 0 && *optr != EOLOPT) {
 1190                 if(*optr == NOOPOPT) {
 1191                         n--;
 1192                         optr++;
 1193                         continue;
 1194                 }
 1195                 optlen = optr[1];
 1196                 if(optlen < 2 || optlen > n)
 1197                         break;
 1198                 switch(*optr) {
 1199                 case MSSOPT:
 1200                         if(optlen == MSS_LENGTH)
 1201                                 tcph->mss = nhgets(optr+2);
 1202                         break;
 1203                 case WSOPT:
 1204                         if(optlen == WS_LENGTH && *(optr+2) <= 14)
 1205                                 tcph->ws = HaveWS | *(optr+2);
 1206                         break;
 1207                 }
 1208                 n -= optlen;
 1209                 optr += optlen;
 1210         }
 1211         return hdrlen;
 1212 }
 1213 
 1214 /*
 1215  *  For outgiing calls, generate an initial sequence
 1216  *  number and put a SYN on the send queue
 1217  */
 1218 void
 1219 tcpsndsyn(Conv *s, Tcpctl *tcb)
 1220 {
 1221         tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16);
 1222         tcb->rttseq = tcb->iss;
 1223         tcb->snd.wl2 = tcb->iss;
 1224         tcb->snd.una = tcb->iss;
 1225         tcb->snd.ptr = tcb->rttseq;
 1226         tcb->snd.nxt = tcb->rttseq;
 1227         tcb->flgcnt++;
 1228         tcb->flags |= FORCE;
 1229         tcb->sndsyntime = NOW;
 1230 
 1231         /* set desired mss and scale */
 1232         tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale);
 1233 }
 1234 
 1235 void
 1236 sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason)
 1237 {
 1238         Block *hbp;
 1239         uchar rflags;
 1240         Tcppriv *tpriv;
 1241         Tcp4hdr ph4;
 1242         Tcp6hdr ph6;
 1243 
 1244         netlog(tcp->f, Logtcp, "sndrst: %s\n", reason);
 1245 
 1246         tpriv = tcp->priv;
 1247 
 1248         if(seg->flags & RST)
 1249                 return;
 1250 
 1251         /* make pseudo header */
 1252         switch(version) {
 1253         case V4:
 1254                 memset(&ph4, 0, sizeof(ph4));
 1255                 ph4.vihl = IP_VER4;
 1256                 v6tov4(ph4.tcpsrc, dest);
 1257                 v6tov4(ph4.tcpdst, source);
 1258                 ph4.proto = IP_TCPPROTO;
 1259                 hnputs(ph4.tcplen, TCP4_HDRSIZE);
 1260                 hnputs(ph4.tcpsport, seg->dest);
 1261                 hnputs(ph4.tcpdport, seg->source);
 1262                 break;
 1263         case V6:
 1264                 memset(&ph6, 0, sizeof(ph6));
 1265                 ph6.vcf[0] = IP_VER6;
 1266                 ipmove(ph6.tcpsrc, dest);
 1267                 ipmove(ph6.tcpdst, source);
 1268                 ph6.proto = IP_TCPPROTO;
 1269                 hnputs(ph6.ploadlen, TCP6_HDRSIZE);
 1270                 hnputs(ph6.tcpsport, seg->dest);
 1271                 hnputs(ph6.tcpdport, seg->source);
 1272                 break;
 1273         default:
 1274                 panic("sndrst: version %d", version);
 1275         }
 1276 
 1277         tpriv->stats[OutRsts]++;
 1278         rflags = RST;
 1279 
 1280         /* convince the other end that this reset is in band */
 1281         if(seg->flags & ACK) {
 1282                 seg->seq = seg->ack;
 1283                 seg->ack = 0;
 1284         }
 1285         else {
 1286                 rflags |= ACK;
 1287                 seg->ack = seg->seq;
 1288                 seg->seq = 0;
 1289                 if(seg->flags & SYN)
 1290                         seg->ack++;
 1291                 seg->ack += length;
 1292                 if(seg->flags & FIN)
 1293                         seg->ack++;
 1294         }
 1295         seg->flags = rflags;
 1296         seg->wnd = 0;
 1297         seg->urg = 0;
 1298         seg->mss = 0;
 1299         seg->ws = 0;
 1300         switch(version) {
 1301         case V4:
 1302                 hbp = htontcp4(seg, nil, &ph4, nil);
 1303                 if(hbp == nil)
 1304                         return;
 1305                 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
 1306                 break;
 1307         case V6:
 1308                 hbp = htontcp6(seg, nil, &ph6, nil);
 1309                 if(hbp == nil)
 1310                         return;
 1311                 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
 1312                 break;
 1313         default:
 1314                 panic("sndrst2: version %d", version);
 1315         }
 1316 }
 1317 
 1318 /*
 1319  *  send a reset to the remote side and close the conversation
 1320  *  called with s qlocked
 1321  */
 1322 char*
 1323 tcphangup(Conv *s)
 1324 {
 1325         Tcp seg;
 1326         Tcpctl *tcb;
 1327         Block *hbp;
 1328 
 1329         tcb = (Tcpctl*)s->ptcl;
 1330         if(waserror())
 1331                 return commonerror();
 1332         if(ipcmp(s->raddr, IPnoaddr) != 0) {
 1333                 if(!waserror()){
 1334                         seg.flags = RST | ACK;
 1335                         seg.ack = tcb->rcv.nxt;
 1336                         tcb->rcv.una = 0;
 1337                         seg.seq = tcb->snd.ptr;
 1338                         seg.wnd = 0;
 1339                         seg.urg = 0;
 1340                         seg.mss = 0;
 1341                         seg.ws = 0;
 1342                         switch(s->ipversion) {
 1343                         case V4:
 1344                                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
 1345                                 hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb);
 1346                                 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
 1347                                 break;
 1348                         case V6:
 1349                                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
 1350                                 hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb);
 1351                                 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
 1352                                 break;
 1353                         default:
 1354                                 panic("tcphangup: version %d", s->ipversion);
 1355                         }
 1356                         poperror();
 1357                 }
 1358         }
 1359         localclose(s, nil);
 1360         poperror();
 1361         return nil;
 1362 }
 1363 
 1364 /*
 1365  *  (re)send a SYN ACK
 1366  */
 1367 int
 1368 sndsynack(Proto *tcp, Limbo *lp)
 1369 {
 1370         Block *hbp;
 1371         Tcp4hdr ph4;
 1372         Tcp6hdr ph6;
 1373         Tcp seg;
 1374         int scale;
 1375 
 1376         /* make pseudo header */
 1377         switch(lp->version) {
 1378         case V4:
 1379                 memset(&ph4, 0, sizeof(ph4));
 1380                 ph4.vihl = IP_VER4;
 1381                 v6tov4(ph4.tcpsrc, lp->laddr);
 1382                 v6tov4(ph4.tcpdst, lp->raddr);
 1383                 ph4.proto = IP_TCPPROTO;
 1384                 hnputs(ph4.tcplen, TCP4_HDRSIZE);
 1385                 hnputs(ph4.tcpsport, lp->lport);
 1386                 hnputs(ph4.tcpdport, lp->rport);
 1387                 break;
 1388         case V6:
 1389                 memset(&ph6, 0, sizeof(ph6));
 1390                 ph6.vcf[0] = IP_VER6;
 1391                 ipmove(ph6.tcpsrc, lp->laddr);
 1392                 ipmove(ph6.tcpdst, lp->raddr);
 1393                 ph6.proto = IP_TCPPROTO;
 1394                 hnputs(ph6.ploadlen, TCP6_HDRSIZE);
 1395                 hnputs(ph6.tcpsport, lp->lport);
 1396                 hnputs(ph6.tcpdport, lp->rport);
 1397                 break;
 1398         default:
 1399                 panic("sndrst: version %d", lp->version);
 1400         }
 1401 
 1402         seg.seq = lp->iss;
 1403         seg.ack = lp->irs+1;
 1404         seg.flags = SYN|ACK;
 1405         seg.urg = 0;
 1406         seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale);
 1407         seg.wnd = QMAX;
 1408 
 1409         /* if the other side set scale, we should too */
 1410         if(lp->rcvscale){
 1411                 seg.ws = scale;
 1412                 lp->sndscale = scale;
 1413         } else {
 1414                 seg.ws = 0;
 1415                 lp->sndscale = 0;
 1416         }
 1417 
 1418         switch(lp->version) {
 1419         case V4:
 1420                 hbp = htontcp4(&seg, nil, &ph4, nil);
 1421                 if(hbp == nil)
 1422                         return -1;
 1423                 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
 1424                 break;
 1425         case V6:
 1426                 hbp = htontcp6(&seg, nil, &ph6, nil);
 1427                 if(hbp == nil)
 1428                         return -1;
 1429                 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
 1430                 break;
 1431         default:
 1432                 panic("sndsnack: version %d", lp->version);
 1433         }
 1434         lp->lastsend = NOW;
 1435         return 0;
 1436 }
 1437 
 1438 #define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + p )&LHTMASK )
 1439 
 1440 /*
 1441  *  put a call into limbo and respond with a SYN ACK
 1442  *
 1443  *  called with proto locked
 1444  */
 1445 static void
 1446 limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version)
 1447 {
 1448         Limbo *lp, **l;
 1449         Tcppriv *tpriv;
 1450         int h;
 1451 
 1452         tpriv = s->p->priv;
 1453         h = hashipa(source, seg->source);
 1454 
 1455         for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
 1456                 lp = *l;
 1457                 if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version)
 1458                         continue;
 1459                 if(ipcmp(lp->raddr, source) != 0)
 1460                         continue;
 1461                 if(ipcmp(lp->laddr, dest) != 0)
 1462                         continue;
 1463 
 1464                 /* each new SYN restarts the retransmits */
 1465                 lp->irs = seg->seq;
 1466                 break;
 1467         }
 1468         lp = *l;
 1469         if(lp == nil){
 1470                 if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){
 1471                         lp = tpriv->lht[h];
 1472                         tpriv->lht[h] = lp->next;
 1473                         lp->next = nil;
 1474                 } else {
 1475                         lp = malloc(sizeof(*lp));
 1476                         if(lp == nil)
 1477                                 return;
 1478                         tpriv->nlimbo++;
 1479                 }
 1480                 *l = lp;
 1481                 lp->version = version;
 1482                 ipmove(lp->laddr, dest);
 1483                 ipmove(lp->raddr, source);
 1484                 lp->lport = seg->dest;
 1485                 lp->rport = seg->source;
 1486                 lp->mss = seg->mss;
 1487                 lp->rcvscale = seg->ws;
 1488                 lp->irs = seg->seq;
 1489                 lp->iss = (nrand(1<<16)<<16)|nrand(1<<16);
 1490         }
 1491 
 1492         if(sndsynack(s->p, lp) < 0){
 1493                 *l = lp->next;
 1494                 tpriv->nlimbo--;
 1495                 free(lp);
 1496         }
 1497 }
 1498 
 1499 /*
 1500  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
 1501  */
 1502 static void
 1503 limborexmit(Proto *tcp)
 1504 {
 1505         Tcppriv *tpriv;
 1506         Limbo **l, *lp;
 1507         int h;
 1508         int seen;
 1509         ulong now;
 1510 
 1511         tpriv = tcp->priv;
 1512 
 1513         if(!canqlock(tcp))
 1514                 return;
 1515         seen = 0;
 1516         now = NOW;
 1517         for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){
 1518                 for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){
 1519                         lp = *l;
 1520                         seen++;
 1521                         if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER)
 1522                                 continue;
 1523 
 1524                         /* time it out after 1 second */
 1525                         if(++(lp->rexmits) > 5){
 1526                                 tpriv->nlimbo--;
 1527                                 *l = lp->next;
 1528                                 free(lp);
 1529                                 continue;
 1530                         }
 1531 
 1532                         /* if we're being attacked, don't bother resending SYN ACK's */
 1533                         if(tpriv->nlimbo > 100)
 1534                                 continue;
 1535 
 1536                         if(sndsynack(tcp, lp) < 0){
 1537                                 tpriv->nlimbo--;
 1538                                 *l = lp->next;
 1539                                 free(lp);
 1540                                 continue;
 1541                         }
 1542 
 1543                         l = &lp->next;
 1544                 }
 1545         }
 1546         qunlock(tcp);
 1547 }
 1548 
 1549 /*
 1550  *  lookup call in limbo.  if found, throw it out.
 1551  *
 1552  *  called with proto locked
 1553  */
 1554 static void
 1555 limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
 1556 {
 1557         Limbo *lp, **l;
 1558         int h;
 1559         Tcppriv *tpriv;
 1560 
 1561         tpriv = s->p->priv;
 1562 
 1563         /* find a call in limbo */
 1564         h = hashipa(src, segp->source);
 1565         for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
 1566                 lp = *l;
 1567                 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
 1568                         continue;
 1569                 if(ipcmp(lp->laddr, dst) != 0)
 1570                         continue;
 1571                 if(ipcmp(lp->raddr, src) != 0)
 1572                         continue;
 1573 
 1574                 /* RST can only follow the SYN */
 1575                 if(segp->seq == lp->irs+1){
 1576                         tpriv->nlimbo--;
 1577                         *l = lp->next;
 1578                         free(lp);
 1579                 }
 1580                 break;
 1581         }
 1582 }
 1583 
 1584 /*
 1585  *  come here when we finally get an ACK to our SYN-ACK.
 1586  *  lookup call in limbo.  if found, create a new conversation
 1587  *
 1588  *  called with proto locked
 1589  */
 1590 static Conv*
 1591 tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
 1592 {
 1593         Conv *new;
 1594         Tcpctl *tcb;
 1595         Tcppriv *tpriv;
 1596         Tcp4hdr *h4;
 1597         Tcp6hdr *h6;
 1598         Limbo *lp, **l;
 1599         int h;
 1600 
 1601         /* unless it's just an ack, it can't be someone coming out of limbo */
 1602         if((segp->flags & SYN) || (segp->flags & ACK) == 0)
 1603                 return nil;
 1604 
 1605         tpriv = s->p->priv;
 1606 
 1607         /* find a call in limbo */
 1608         h = hashipa(src, segp->source);
 1609         for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){
 1610                 netlog(s->p->f, Logtcp, "tcpincoming s %I,%ux/%I,%ux d %I,%ux/%I,%ux v %d/%d\n",
 1611                         src, segp->source, lp->raddr, lp->rport,
 1612                         dst, segp->dest, lp->laddr, lp->lport,
 1613                         version, lp->version
 1614                 );
 1615 
 1616                 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
 1617                         continue;
 1618                 if(ipcmp(lp->laddr, dst) != 0)
 1619                         continue;
 1620                 if(ipcmp(lp->raddr, src) != 0)
 1621                         continue;
 1622 
 1623                 /* we're assuming no data with the initial SYN */
 1624                 if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){
 1625                         netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux\n",
 1626                                 segp->seq, lp->irs+1, segp->ack, lp->iss+1);
 1627                         lp = nil;
 1628                 } else {
 1629                         tpriv->nlimbo--;
 1630                         *l = lp->next;
 1631                 }
 1632                 break;
 1633         }
 1634         if(lp == nil)
 1635                 return nil;
 1636 
 1637         new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
 1638         if(new == nil)
 1639                 return nil;
 1640 
 1641         memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
 1642         tcb = (Tcpctl*)new->ptcl;
 1643         tcb->flags &= ~CLONE;
 1644         tcb->timer.arg = new;
 1645         tcb->timer.state = TcptimerOFF;
 1646         tcb->acktimer.arg = new;
 1647         tcb->acktimer.state = TcptimerOFF;
 1648         tcb->katimer.arg = new;
 1649         tcb->katimer.state = TcptimerOFF;
 1650         tcb->rtt_timer.arg = new;
 1651         tcb->rtt_timer.state = TcptimerOFF;
 1652 
 1653         tcb->irs = lp->irs;
 1654         tcb->rcv.nxt = tcb->irs+1;
 1655         tcb->rcv.urg = tcb->rcv.nxt;
 1656 
 1657         tcb->iss = lp->iss;
 1658         tcb->rttseq = tcb->iss;
 1659         tcb->snd.wl2 = tcb->iss;
 1660         tcb->snd.una = tcb->iss+1;
 1661         tcb->snd.ptr = tcb->iss+1;
 1662         tcb->snd.nxt = tcb->iss+1;
 1663         tcb->flgcnt = 0;
 1664         tcb->flags |= SYNACK;
 1665 
 1666         /* our sending max segment size cannot be bigger than what he asked for */
 1667         if(lp->mss != 0 && lp->mss < tcb->mss)
 1668                 tcb->mss = lp->mss;
 1669 
 1670         /* window scaling */
 1671         tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
 1672 
 1673         /* the congestion window always starts out as a single segment */
 1674         tcb->snd.wnd = segp->wnd;
 1675         tcb->cwind = tcb->mss;
 1676 
 1677         /* set initial round trip time */
 1678         tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER;
 1679         tcpsynackrtt(new);
 1680 
 1681         free(lp);
 1682 
 1683         /* set up proto header */
 1684         switch(version){
 1685         case V4:
 1686                 h4 = &tcb->protohdr.tcp4hdr;
 1687                 memset(h4, 0, sizeof(*h4));
 1688                 h4->proto = IP_TCPPROTO;
 1689                 hnputs(h4->tcpsport, new->lport);
 1690                 hnputs(h4->tcpdport, new->rport);
 1691                 v6tov4(h4->tcpsrc, dst);
 1692                 v6tov4(h4->tcpdst, src);
 1693                 break;
 1694         case V6:
 1695                 h6 = &tcb->protohdr.tcp6hdr;
 1696                 memset(h6, 0, sizeof(*h6));
 1697                 h6->proto = IP_TCPPROTO;
 1698                 hnputs(h6->tcpsport, new->lport);
 1699                 hnputs(h6->tcpdport, new->rport);
 1700                 ipmove(h6->tcpsrc, dst);
 1701                 ipmove(h6->tcpdst, src);
 1702                 break;
 1703         default:
 1704                 panic("tcpincoming: version %d", new->ipversion);
 1705         }
 1706 
 1707         tcpsetstate(new, Established);
 1708 
 1709         iphtadd(&tpriv->ht, new);
 1710 
 1711         return new;
 1712 }
 1713 
 1714 int
 1715 seq_within(ulong x, ulong low, ulong high)
 1716 {
 1717         if(low <= high){
 1718                 if(low <= x && x <= high)
 1719                         return 1;
 1720         }
 1721         else {
 1722                 if(x >= low || x <= high)
 1723                         return 1;
 1724         }
 1725         return 0;
 1726 }
 1727 
 1728 int
 1729 seq_lt(ulong x, ulong y)
 1730 {
 1731         return (int)(x-y) < 0;
 1732 }
 1733 
 1734 int
 1735 seq_le(ulong x, ulong y)
 1736 {
 1737         return (int)(x-y) <= 0;
 1738 }
 1739 
 1740 int
 1741 seq_gt(ulong x, ulong y)
 1742 {
 1743         return (int)(x-y) > 0;
 1744 }
 1745 
 1746 int
 1747 seq_ge(ulong x, ulong y)
 1748 {
 1749         return (int)(x-y) >= 0;
 1750 }
 1751 
 1752 /*
 1753  *  use the time between the first SYN and it's ack as the
 1754  *  initial round trip time
 1755  */
 1756 void
 1757 tcpsynackrtt(Conv *s)
 1758 {
 1759         Tcpctl *tcb;
 1760         int delta;
 1761         Tcppriv *tpriv;
 1762 
 1763         tcb = (Tcpctl*)s->ptcl;
 1764         tpriv = s->p->priv;
 1765 
 1766         delta = NOW - tcb->sndsyntime;
 1767         tcb->srtt = delta<<LOGAGAIN;
 1768         tcb->mdev = delta<<LOGDGAIN;
 1769 
 1770         /* halt round trip timer */
 1771         tcphalt(tpriv, &tcb->rtt_timer);
 1772 }
 1773 
      /*
       *  update - apply the ack and window fields of the incoming segment
       *  seg to the send side of conversation s.
       *
       *  In order, it:
       *    - ignores (but sets FORCE for) an ack beyond snd.nxt;
       *    - counts duplicate acks and calls tcprxmit() when the count
       *      reaches TCPREXMTTHRESH;
       *    - records the peer's advertised window;
       *    - returns early if the ack does not advance snd.una;
       *    - otherwise grows cwind, feeds the srtt/mdev estimator from the
       *      rtt timer, discards the acked bytes from s->wq, advances
       *      snd.una/snd.urg/snd.ptr, starts or halts the retransmit timer
       *      and clears the RETRAN/backoff state.
       *
       *  tcpiput calls this with s qlocked.
       */
 1774 void
 1775 update(Conv *s, Tcp *seg)
 1776 {
 1777         int rtt, delta;
 1778         Tcpctl *tcb;
 1779         ulong acked;
 1780         ulong expand;
 1781         Tcppriv *tpriv;
 1782 
 1783         tpriv = s->p->priv;
 1784         tcb = (Tcpctl*)s->ptcl;
 1785 
 1786         /* ack is beyond snd.nxt, i.e. for data not sent yet: set FORCE and ignore it */
 1787         if(seq_gt(seg->ack, tcb->snd.nxt)) {
 1788                 tcb->flags |= FORCE;
 1789                 return;
 1790         }
 1791 
 1792         /* added by Dong Lin for fast retransmission */
      /*
       *  a duplicate ack: it repeats snd.una while data is still
       *  outstanding, carries no data and does not change the window
       */
 1793         if(seg->ack == tcb->snd.una
 1794         && tcb->snd.una != tcb->snd.nxt
 1795         && seg->len == 0
 1796         && seg->wnd == tcb->snd.wnd) {
 1797 
 1798                 /* this is a pure ack w/o window update */
 1799                 netlog(s->p->f, Logtcprxmt, "dupack %lud ack %lud sndwnd %d advwin %d\n",
 1800                         tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd);
 1801 
 1802                 if(++tcb->snd.dupacks == TCPREXMTTHRESH) {
 1803                         /*
 1804                          *  tahoe tcp rxt the packet, halve ssthresh,
 1805                          *  and set cwnd to one packet
                               *  (only the recovery bookkeeping is visible here;
                               *  presumably tcprxmit() does the rest - confirm).
                               *  snd.rxt remembers snd.nxt so the test further
                               *  down can tell when recovery is complete.
 1806                          */
 1807                         tcb->snd.recovery = 1;
 1808                         tcb->snd.rxt = tcb->snd.nxt;
 1809                         netlog(s->p->f, Logtcprxmt, "fast rxt %lud, nxt %lud\n", tcb->snd.una, tcb->snd.nxt);
 1810                         tcprxmit(s);
 1811                 } else {
 1812                         /* do reno tcp here. */
 1813                 }
 1814         }
 1815 
 1816         /*
 1817          *  update window
               *  (taken from a newer ack than the one last used, snd.wl2,
               *  or from the same ack when it offers a larger window)
 1818          */
 1819         if(seq_gt(seg->ack, tcb->snd.wl2)
 1820         || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){
 1821                 tcb->snd.wnd = seg->wnd;
 1822                 tcb->snd.wl2 = seg->ack;
 1823         }
 1824 
              /* nothing new was acked: no congestion or timer processing */
 1825         if(!seq_gt(seg->ack, tcb->snd.una)){
 1826                 /*
 1827                  *  don't let us hangup if sending into a closed window and
 1828                  *  we're still getting acks
 1829                  */
 1830                 if((tcb->flags&RETRAN) && tcb->snd.wnd == 0){
 1831                         tcb->backedoff = MAXBACKMS/4;
 1832                 }
 1833                 return;
 1834         }
 1835 
 1836         /*
 1837          *  any positive ack turns off fast rxt,
 1838          *  (should we do new-reno on partial acks?)
               *  While in recovery, only an ack at or past snd.rxt ends it;
               *  an earlier (partial) ack is just logged.
 1839          */
 1840         if(!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) {
 1841                 tcb->snd.dupacks = 0;
 1842                 tcb->snd.recovery = 0;
 1843         } else
 1844                 netlog(s->p->f, Logtcp, "rxt next %lud, cwin %ud\n", seg->ack, tcb->cwind);
 1845 
 1846         /* Compute the new send window size */
 1847         acked = seg->ack - tcb->snd.una;
 1848 
 1849         /* avoid slow start and timers for SYN acks */
      /*
       *  SYNACK clear means this is the first advancing ack seen.  One of
       *  the acked sequence numbers is accounted to flgcnt rather than to
       *  queued data (presumably our SYN), so both counts drop by one.
       */
 1850         if((tcb->flags & SYNACK) == 0) {
 1851                 tcb->flags |= SYNACK;
 1852                 acked--;
 1853                 tcb->flgcnt--;
 1854                 goto done;
 1855         }
 1856 
 1857         /* slow start as long as we're not recovering from lost packets */
 1858         if(tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
                      /* below ssthresh: grow by the bytes acked, at most one mss */
 1859                 if(tcb->cwind < tcb->ssthresh) {
 1860                         expand = tcb->mss;
 1861                         if(acked < expand)
 1862                                 expand = acked;
 1863                 }
                      /* at or above ssthresh: grow by mss*mss/cwind */
 1864                 else
 1865                         expand = ((int)tcb->mss * tcb->mss) / tcb->cwind;
 1866 
                      /* clamp to snd.wnd, both on unsigned wraparound and on overshoot */
 1867                 if(tcb->cwind + expand < tcb->cwind)
 1868                         expand = tcb->snd.wnd - tcb->cwind;
 1869                 if(tcb->cwind + expand > tcb->snd.wnd)
 1870                         expand = tcb->snd.wnd - tcb->cwind;
 1871                 tcb->cwind += expand;
 1872         }
 1873 
 1874         /* Adjust the timers according to the round trip time */
 1875         if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
 1876                 tcphalt(tpriv, &tcb->rtt_timer);
                      /* a sample taken across a retransmission is not used */
 1877                 if((tcb->flags&RETRAN) == 0) {
 1878                         tcb->backoff = 0;
 1879                         tcb->backedoff = 0;
                              /* start - count: presumably the ticks elapsed on the timer - confirm */
 1880                         rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
 1881                         if(rtt == 0)
 1882                                 rtt = 1;        /* otherwise all close systems will rexmit in 0 time */
 1883                         rtt *= MSPTICK;
                              /*
                               *  srtt and mdev are kept scaled by LOGAGAIN and
                               *  LOGDGAIN bits.  The first sample seeds both;
                               *  later samples add the error against the
                               *  unscaled value to the scaled accumulator.
                               *  Neither is allowed to drop to zero or below.
                               */
 1884                         if(tcb->srtt == 0) {
 1885                                 tcb->srtt = rtt << LOGAGAIN;
 1886                                 tcb->mdev = rtt << LOGDGAIN;
 1887                         } else {
 1888                                 delta = rtt - (tcb->srtt>>LOGAGAIN);
 1889                                 tcb->srtt += delta;
 1890                                 if(tcb->srtt <= 0)
 1891                                         tcb->srtt = 1;
 1892 
 1893                                 delta = abs(delta) - (tcb->mdev>>LOGDGAIN);
 1894                                 tcb->mdev += delta;
 1895                                 if(tcb->mdev <= 0)
 1896                                         tcb->mdev = 1;
 1897                         }
 1898                         tcpsettimer(tcb);
 1899                 }
 1900         }
 1901 
 1902 done:
              /*
               *  drop the acked bytes from the write queue.  If fewer bytes were
               *  discarded than acked, the remainder is charged to flgcnt
               *  (presumably an acked FIN - confirm).
               */
 1903         if(qdiscard(s->wq, acked) < acked)
 1904                 tcb->flgcnt--;
 1905 
 1906         tcb->snd.una = seg->ack;
 1907         if(seq_gt(seg->ack, tcb->snd.urg))
 1908                 tcb->snd.urg = seg->ack;
 1909 
              /* run the retransmit timer only while something is still unacked */
 1910         if(tcb->snd.una != tcb->snd.nxt)
 1911                 tcpgo(tpriv, &tcb->timer);
 1912         else
 1913                 tcphalt(tpriv, &tcb->timer);
 1914 
              /* never leave the send pointer behind the acked edge */
 1915         if(seq_lt(tcb->snd.ptr, tcb->snd.una))
 1916                 tcb->snd.ptr = tcb->snd.una;
 1917 
 1918         tcb->flags &= ~RETRAN;
 1919         tcb->backoff = 0;
 1920         tcb->backedoff = 0;
 1921 }
 1922 
      /*
       *  tcpiput - input routine for one received TCP segment held in the
       *  block list bp, for protocol instance tcp.
       *
       *  Steps, as visible below:
       *    1. choose IPv4 or IPv6 from the version nibble, verify the
       *       checksum, convert the header into seg (ntohtcp4/ntohtcp6)
       *       and trim the block to the claimed length;
       *    2. with the protocol qlocked, find the conversation with
       *       iphtlook(); a listener hands new SYNs to limbo() and
       *       resolves other segments through tcpincoming();
       *    3. qlock the conversation, drop the protocol lock and run the
       *       receive state machine, looping over any adjacent segments
       *       waiting in the resequence queue;
       *    4. leave through `output' (calls tcpoutput), through `raise'
       *       (frees bp and calls tcpkick) or through a direct return.
       *
       *  Every exit taken after qlock(s) pairs qunlock(s) with poperror().
       *  The unnamed Ipifc* parameter is not used.
       */
 1923 void
 1924 tcpiput(Proto *tcp, Ipifc*, Block *bp)
 1925 {
 1926         Tcp seg;
 1927         Tcp4hdr *h4;
 1928         Tcp6hdr *h6;
 1929         int hdrlen;
 1930         Tcpctl *tcb;
 1931         ushort length, csum;
 1932         uchar source[IPaddrlen], dest[IPaddrlen];
 1933         Conv *s;
 1934         Fs *f;
 1935         Tcppriv *tpriv;
 1936         uchar version;
 1937 
 1938         f = tcp->f;
 1939         tpriv = tcp->priv;
 1940 
 1941         tpriv->stats[InSegs]++;
 1942 
              /* both views alias the same bytes; the version test picks the valid one */
 1943         h4 = (Tcp4hdr*)(bp->rp);
 1944         h6 = (Tcp6hdr*)(bp->rp);
 1945 
 1946         if((h4->vihl&0xF0)==IP_VER4) {
 1947                 version = V4;
 1948                 length = nhgets(h4->length);
 1949                 v4tov6(dest, h4->tcpdst);
 1950                 v4tov6(source, h4->tcpsrc);
 1951 
                      /*
                       *  header fields are overwritten in place before the
                       *  checksum (presumably forming the pseudo header).
                       *  The check is skipped when the block is flagged
                       *  Btcpck or both checksum bytes are zero.
                       */
 1952                 h4->Unused = 0;
 1953                 hnputs(h4->tcplen, length-TCP4_PKT);
 1954                 if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
 1955                         ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) {
 1956                         tpriv->stats[CsumErrs]++;
 1957                         tpriv->stats[InErrs]++;
 1958                         netlog(f, Logtcp, "bad tcp proto cksum\n");
 1959                         freeblist(bp);
 1960                         return;
 1961                 }
 1962 
 1963                 hdrlen = ntohtcp4(&seg, &bp);
                      /* NOTE(review): bp is not freed here; presumably ntohtcp4 releases it on failure - confirm */
 1964                 if(hdrlen < 0){
 1965                         tpriv->stats[HlenErrs]++;
 1966                         tpriv->stats[InErrs]++;
 1967                         netlog(f, Logtcp, "bad tcp hdr len\n");
 1968                         return;
 1969                 }
 1970 
 1971                 /* trim the packet to the size claimed by the datagram */
 1972                 length -= hdrlen+TCP4_PKT;
 1973                 bp = trimblock(bp, hdrlen+TCP4_PKT, length);
 1974                 if(bp == nil){
 1975                         tpriv->stats[LenErrs]++;
 1976                         tpriv->stats[InErrs]++;
 1977                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
 1978                         return;
 1979                 }
 1980         }
 1981         else {
                      /* saved so the header can be restored after the checksum */
 1982                 int ttl = h6->ttl;
 1983                 int proto = h6->proto;
 1984 
 1985                 version = V6;
 1986                 length = nhgets(h6->ploadlen);
 1987                 ipmove(dest, h6->tcpdst);
 1988                 ipmove(source, h6->tcpsrc);
 1989 
                      /*
                       *  rewrite ploadlen, proto, ttl and vcf in place for the
                       *  checksum (presumably the v6 pseudo header layout);
                       *  ttl, proto and ploadlen are put back afterwards.
                       *  The check is skipped when both checksum bytes are zero.
                       */
 1990                 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
 1991                 h6->ttl = proto;
 1992                 hnputl(h6->vcf, length);
 1993                 if((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
 1994                     (csum = ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) != 0) {
 1995                         tpriv->stats[CsumErrs]++;
 1996                         tpriv->stats[InErrs]++;
 1997                         netlog(f, Logtcp,
 1998                             "bad tcpv6 proto cksum: got %#ux, computed %#ux\n",
 1999                                 h6->tcpcksum[0]<<8 | h6->tcpcksum[1], csum);
 2000                         freeblist(bp);
 2001                         return;
 2002                 }
 2003                 h6->ttl = ttl;
 2004                 h6->proto = proto;
 2005                 hnputs(h6->ploadlen, length);
 2006 
 2007                 hdrlen = ntohtcp6(&seg, &bp);
                      /* NOTE(review): bp is not freed here; presumably ntohtcp6 releases it on failure - confirm */
 2008                 if(hdrlen < 0){
 2009                         tpriv->stats[HlenErrs]++;
 2010                         tpriv->stats[InErrs]++;
 2011                         netlog(f, Logtcp, "bad tcpv6 hdr len\n");
 2012                         return;
 2013                 }
 2014 
 2015                 /* trim the packet to the size claimed by the datagram */
 2016                 length -= hdrlen;
 2017                 bp = trimblock(bp, hdrlen+TCP6_PKT, length);
 2018                 if(bp == nil){
 2019                         tpriv->stats[LenErrs]++;
 2020                         tpriv->stats[InErrs]++;
 2021                         netlog(f, Logtcp, "tcpv6 len < 0 after trim\n");
 2022                         return;
 2023                 }
 2024         }
 2025 
 2026         /* lock protocol while searching for a conversation */
 2027         qlock(tcp);
 2028 
 2029         /* Look for a matching conversation */
 2030         s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
 2031         if(s == nil){
 2032                 netlog(f, Logtcp, "iphtlook failed\n");
      /* also reached by goto, with tcp still qlocked, when tcpincoming() finds no call */
 2033 reset:
 2034                 qunlock(tcp);
 2035                 sndrst(tcp, source, dest, length, &seg, version, "no conversation");
 2036                 freeblist(bp);
 2037                 return;
 2038         }
 2039 
 2040         /* if it's a listener, look for the right flags and get a new conv */
 2041         tcb = (Tcpctl*)s->ptcl;
 2042         if(tcb->state == Listen){
 2043                 if(seg.flags & RST){
 2044                         limborst(s, &seg, source, dest, version);
 2045                         qunlock(tcp);
 2046                         freeblist(bp);
 2047                         return;
 2048                 }
 2049 
 2050                 /* if this is a new SYN, put the call into limbo */
 2051                 if((seg.flags & SYN) && (seg.flags & ACK) == 0){
 2052                         limbo(s, source, dest, &seg, version);
 2053                         qunlock(tcp);
 2054                         freeblist(bp);
 2055                         return;
 2056                 }
 2057 
 2058                 /*
 2059                  *  if there's a matching call in limbo, tcpincoming will
 2060                  *  return it in state Syn_received
 2061                  */
 2062                 s = tcpincoming(s, &seg, source, dest, version);
 2063                 if(s == nil)
 2064                         goto reset;
 2065         }
 2066 
 2067         /* The rest of the input state machine is run with the control block
 2068          * locked and implements the state machine directly out of the RFC.
 2069          * Out-of-band data is ignored - it was always a bad idea.
               *
               * The conversation is locked before the protocol lock is dropped.
               * The error handler releases the conversation lock and re-raises.
 2070          */
 2071         tcb = (Tcpctl*)s->ptcl;
 2072         if(waserror()){
 2073                 qunlock(s);
 2074                 nexterror();
 2075         }
 2076         qlock(s);
 2077         qunlock(tcp);
 2078 
 2079         /* fix up window */
 2080         seg.wnd <<= tcb->rcv.scale;
 2081 
 2082         /* every input packet puts off the keep alive timeout */
 2083         tcpsetkacounter(tcb);
 2084 
              /* states that need handling before the segment is trimmed to the window */
 2085         switch(tcb->state) {
 2086         case Closed:
 2087                 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
 2088                 goto raise;
 2089         case Syn_sent:
                      /* an ack outside iss+1..snd.nxt cannot be for our SYN: answer with a reset */
 2090                 if(seg.flags & ACK) {
 2091                         if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) {
 2092                                 sndrst(tcp, source, dest, length, &seg, version,
 2093                                          "bad seq in Syn_sent");
 2094                                 goto raise;
 2095                         }
 2096                 }
                      /* a reset closes the connection only when it carries an ack */
 2097                 if(seg.flags & RST) {
 2098                         if(seg.flags & ACK)
 2099                                 localclose(s, Econrefused);
 2100                         goto raise;
 2101                 }
 2102 
 2103                 if(seg.flags & SYN) {
 2104                         procsyn(s, &seg);
                              /* SYN+ACK goes to Established; a bare SYN goes to Syn_received */
 2105                         if(seg.flags & ACK){
 2106                                 update(s, &seg);
 2107                                 tcpsynackrtt(s);
 2108                                 tcpsetstate(s, Established);
 2109                                 tcpsetscale(s, tcb, seg.ws, tcb->scale);
 2110                         }
 2111                         else {
 2112                                 tcb->time = NOW;
 2113                                 tcpsetstate(s, Syn_received);   /* DLP - shouldn't this be a reset? */
 2114                         }
 2115 
                              /* data or FIN came with the SYN: carry on into the main processing */
 2116                         if(length != 0 || (seg.flags & FIN))
 2117                                 break;
 2118 
 2119                         freeblist(bp);
 2120                         goto output;
 2121                 }
 2122                 else
 2123                         freeblist(bp);
 2124 
 2125                 qunlock(s);
 2126                 poperror();
 2127                 return;
 2128         case Syn_received:
 2129                 /* doesn't matter if it's the correct ack, we're just trying to set timing */
 2130                 if(seg.flags & ACK)
 2131                         tcpsynackrtt(s);
 2132                 break;
 2133         }
 2134 
 2135         /*
 2136          *  One DOS attack is to open connections to us and then forget about them,
 2137          *  thereby tying up a conv at no long term cost to the attacker.
 2138          *  This is an attempt to defeat these stateless DOS attacks.  See
 2139          *  corresponding code in tcpsendka().
               *
               *  NOTE(review): processing continues after localclose() here;
               *  there is no early exit.
 2140          */
 2141         if(tcb->state != Syn_received && (seg.flags & RST) == 0){
 2142                 if(tcpporthogdefense
 2143                 && seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){
 2144                         print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n",
 2145                                 source, seg.source, dest, seg.dest, seg.flags,
 2146                                 tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29));
 2147                         localclose(s, "stateless hog");
 2148                 }
 2149         }
 2150 
 2151         /* Cut the data to fit the receive window */
      /*
       *  On failure (-1) the ack is still processed and, unless the segment
       *  was a reset, a reply is forced.  bp is not freed on this path
       *  (presumably tcptrim has already released it - confirm).
       */
 2152         if(tcptrim(tcb, &seg, &bp, &length) == -1) {
 2153                 netlog(f, Logtcp, "tcp len < 0, %lud %d\n", seg.seq, length);
 2154                 update(s, &seg);
 2155                 if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) {
 2156                         tcphalt(tpriv, &tcb->rtt_timer);
 2157                         tcphalt(tpriv, &tcb->acktimer);
 2158                         tcphalt(tpriv, &tcb->katimer);
 2159                         tcpsetstate(s, Time_wait);
 2160                         tcb->timer.start = MSL2*(1000 / MSPTICK);
 2161                         tcpgo(tpriv, &tcb->timer);
 2162                 }
 2163                 if(!(seg.flags & RST)) {
 2164                         tcb->flags |= FORCE;
 2165                         goto output;
 2166                 }
 2167                 qunlock(s);
 2168                 poperror();
 2169                 return;
 2170         }
 2171 
 2172         /* Cannot accept so answer with a rst */
 2173         if(length && tcb->state == Closed) {
 2174                 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
 2175                 goto raise;
 2176         }
 2177 
 2178         /* The segment is beyond the current receive pointer so
 2179          * queue the data in the resequence queue
               * (its ack is still processed, and a reply is forced)
 2180          */
 2181         if(seg.seq != tcb->rcv.nxt)
 2182         if(length != 0 || (seg.flags & (SYN|FIN))) {
 2183                 update(s, &seg);
 2184                 if(addreseq(tcb, tpriv, &seg, bp, length) < 0)
 2185                         print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport);
 2186                 tcb->flags |= FORCE;
 2187                 goto output;
 2188         }
 2189 
 2190         /*
 2191          *  keep looping till we've processed this packet plus any
 2192          *  adjacent packets in the resequence queue
 2193          */
 2194         for(;;) {
 2195                 if(seg.flags & RST) {
 2196                         if(tcb->state == Established) {
 2197                                 tpriv->stats[EstabResets]++;
 2198                                 if(tcb->rcv.nxt != seg.seq)
 2199                                         print("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt %lux seq %lux\n", s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt, seg.seq);
 2200                         }
 2201                         localclose(s, Econrefused);
 2202                         goto raise;
 2203                 }
 2204 
                      /* from here on a segment without an ack is dropped */
 2205                 if((seg.flags&ACK) == 0)
 2206                         goto raise;
 2207 
                      /* ack processing, per state */
 2208                 switch(tcb->state) {
 2209                 case Syn_received:
 2210                         if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){
 2211                                 sndrst(tcp, source, dest, length, &seg, version,
 2212                                         "bad seq in Syn_received");
 2213                                 goto raise;
 2214                         }
 2215                         update(s, &seg);
 2216                         tcpsetstate(s, Established);
                              /* fall through: update() therefore runs a second time on this segment */
 2217                 case Established:
 2218                 case Close_wait:
 2219                         update(s, &seg);
 2220                         break;
 2221                 case Finwait1:
 2222                         update(s, &seg);
                              /* nothing left queued or flagged: move on to Finwait2 */
 2223                         if(qlen(s->wq)+tcb->flgcnt == 0){
 2224                                 tcphalt(tpriv, &tcb->rtt_timer);
 2225                                 tcphalt(tpriv, &tcb->acktimer);
 2226                                 tcpsetkacounter(tcb);
 2227                                 tcb->time = NOW;
 2228                                 tcpsetstate(s, Finwait2);
 2229                                 tcb->katimer.start = MSL2 * (1000 / MSPTICK);
 2230                                 tcpgo(tpriv, &tcb->katimer);
 2231                         }
 2232                         break;
 2233                 case Finwait2:
 2234                         update(s, &seg);
 2235                         break;
 2236                 case Closing:
 2237                         update(s, &seg);
                              /* nothing left queued or flagged: enter Time_wait */
 2238                         if(qlen(s->wq)+tcb->flgcnt == 0) {
 2239                                 tcphalt(tpriv, &tcb->rtt_timer);
 2240                                 tcphalt(tpriv, &tcb->acktimer);
 2241                                 tcphalt(tpriv, &tcb->katimer);
 2242                                 tcpsetstate(s, Time_wait);
 2243                                 tcb->timer.start = MSL2*(1000 / MSPTICK);
 2244                                 tcpgo(tpriv, &tcb->timer);
 2245                         }
 2246                         break;
 2247                 case Last_ack:
 2248                         update(s, &seg);
 2249                         if(qlen(s->wq)+tcb->flgcnt == 0) {
 2250                                 localclose(s, nil);
 2251                                 goto raise;
 2252                         }
                              /* fall through */
 2253                 case Time_wait:
 2254                         tcb->flags |= FORCE;
 2255                         if(tcb->timer.state != TcptimerON)
 2256                                 tcpgo(tpriv, &tcb->timer);
 2257                 }
 2258 
                      /*
                       *  urgent data is not delivered: seg.urg bytes are pulled
                       *  off the front of bp and only rcv.urg is tracked.
                       *  NOTE(review): length is not reduced to match.
                       */
 2259                 if((seg.flags&URG) && seg.urg) {
 2260                         if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
 2261                                 tcb->rcv.urg = seg.urg + seg.seq;
 2262                                 pullblock(&bp, seg.urg);
 2263                         }
 2264                 }
 2265                 else
 2266                 if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
 2267                         tcb->rcv.urg = tcb->rcv.nxt;
 2268 
                      /* segment text, per state */
 2269                 if(length == 0) {
 2270                         if(bp != nil)
 2271                                 freeblist(bp);
 2272                 }
 2273                 else {
 2274                         switch(tcb->state){
 2275                         default:
 2276                                 /* Ignore segment text */
 2277                                 if(bp != nil)
 2278                                         freeblist(bp);
 2279                                 break;
 2280 
 2281                         case Syn_received:
 2282                         case Established:
 2283                         case Finwait1:
 2284                                 /* If we still have some data place on
 2285                                  * receive queue
 2286                                  */
 2287                                 if(bp) {
 2288                                         bp = packblock(bp);
 2289                                         if(bp == nil)
 2290                                                 panic("tcp packblock");
 2291                                         qpassnolim(s->rq, bp);
 2292                                         bp = nil;
 2293 
 2294                                         /*
 2295                                          *  Force an ack every 2 data messages.  This is
 2296                                          *  a hack for rob to make his home system run
 2297                                          *  faster.
 2298                                          *
 2299                                          *  this also keeps the standard TCP congestion
 2300                                          *  control working since it needs an ack every
 2301                                          *  2 max segs worth.  This is not quite that,
 2302                                          *  but under a real stream is equivalent since
 2303                                          *  every packet has a max seg in it.
 2304                                          */
 2305                                         if(++(tcb->rcv.una) >= 2)
 2306                                                 tcb->flags |= FORCE;
 2307                                 }
 2308                                 tcb->rcv.nxt += length;
 2309 
 2310                                 /*
 2311                                  *  update our rcv window
 2312                                  */
 2313                                 tcprcvwin(s);
 2314 
 2315                                 /*
 2316                                  *  turn on the acktimer if there's something
 2317                                  *  to ack
 2318                                  */
 2319                                 if(tcb->acktimer.state != TcptimerON)
 2320                                         tcpgo(tpriv, &tcb->acktimer);
 2321 
 2322                                 break;
 2323                         case Finwait2:
 2324                                 /* no process to read the data, send a reset */
 2325                                 if(bp != nil)
 2326                                         freeblist(bp);
 2327                                 sndrst(tcp, source, dest, length, &seg, version,
 2328                                         "send to Finwait2");
 2329                                 qunlock(s);
 2330                                 poperror();
 2331                                 return;
 2332                         }
 2333                 }
 2334 
                      /* FIN processing, per state: rcv.nxt is advanced past the FIN where it is accepted */
 2335                 if(seg.flags & FIN) {
 2336                         tcb->flags |= FORCE;
 2337 
 2338                         switch(tcb->state) {
 2339                         case Syn_received:
 2340                         case Established:
 2341                                 tcb->rcv.nxt++;
 2342                                 tcpsetstate(s, Close_wait);
 2343                                 break;
 2344                         case Finwait1:
 2345                                 tcb->rcv.nxt++;
                                      /* Time_wait if nothing of ours is still unacked, otherwise Closing */
 2346                                 if(qlen(s->wq)+tcb->flgcnt == 0) {
 2347                                         tcphalt(tpriv, &tcb->rtt_timer);
 2348                                         tcphalt(tpriv, &tcb->acktimer);
 2349                                         tcphalt(tpriv, &tcb->katimer);
 2350                                         tcpsetstate(s, Time_wait);
 2351                                         tcb->timer.start = MSL2*(1000/MSPTICK);
 2352                                         tcpgo(tpriv, &tcb->timer);
 2353                                 }
 2354                                 else
 2355                                         tcpsetstate(s, Closing);
 2356                                 break;
 2357                         case Finwait2:
 2358                                 tcb->rcv.nxt++;
 2359                                 tcphalt(tpriv, &tcb->rtt_timer);
 2360                                 tcphalt(tpriv, &tcb->acktimer);
 2361                                 tcphalt(tpriv, &tcb->katimer);
 2362                                 tcpsetstate(s, Time_wait);
 2363                                 tcb->timer.start = MSL2 * (1000/MSPTICK);
 2364                                 tcpgo(tpriv, &tcb->timer);
 2365                                 break;
 2366                         case Close_wait:
 2367                         case Closing:
 2368                         case Last_ack:
 2369                                 break;
 2370                         case Time_wait:
 2371                                 tcpgo(tpriv, &tcb->timer);
 2372                                 break;
 2373                         }
 2374                 }
 2375 
 2376                 /*
 2377                  *  get next adjacent segment from the resequence queue.
 2378                  *  dump/trim any overlapping segments
                       *  (stop when the queue is empty or its head starts beyond
                       *  rcv.nxt; a segment that tcptrim rejects is skipped)
 2379                  */
 2380                 for(;;) {
 2381                         if(tcb->reseq == nil)
 2382                                 goto output;
 2383 
 2384                         if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
 2385                                 goto output;
 2386 
 2387                         getreseq(tcb, &seg, &bp, &length);
 2388 
 2389                         if(tcptrim(tcb, &seg, &bp, &length) == 0)
 2390                                 break;
 2391                 }
 2392         }
      /* normal exit: give tcpoutput a chance to send, then release the conversation */
 2393 output:
 2394         tcpoutput(s);
 2395         qunlock(s);
 2396         poperror();
 2397         return;
      /* drop exit: release the conversation, free the segment, then call tcpkick */
 2398 raise:
 2399         qunlock(s);
 2400         poperror();
 2401         freeblist(bp);
 2402         tcpkick(s);
 2403 }
 2404 
 2405 /*
 2406  *  always enters and exits with the s locked.  We drop
 2407  *  the lock to ipoput the packet so some care has to be
 2408  *  taken by callers.
 2409  */
 2410 void
 2411 tcpoutput(Conv *s)
 2412 {
 2413         Tcp seg;
 2414         int msgs;
 2415         Tcpctl *tcb;
 2416         Block *hbp, *bp;
 2417         int sndcnt, n;
 2418         ulong ssize, dsize, usable, sent;
 2419         Fs *f;
 2420         Tcppriv *tpriv;
 2421         uchar version;
 2422 
 2423         f = s->p->f;
 2424         tpriv = s->p->priv;
 2425         version = s->ipversion;
 2426 
 2427         for(msgs = 0; msgs < 100; msgs++) {
 2428                 tcb = (Tcpctl*)s->ptcl;
 2429 
 2430                 switch(tcb->state) {
 2431                 case Listen:
 2432                 case Closed:
 2433                 case Finwait2:
 2434                         return;
 2435                 }
 2436 
 2437                 /* force an ack when a window has opened up */
 2438                 if(tcb->rcv.blocked && tcb->rcv.wnd > 0){
 2439                         tcb->rcv.blocked = 0;
 2440                         tcb->flags |= FORCE;
 2441                 }
 2442 
 2443                 sndcnt = qlen(s->wq)+tcb->flgcnt;
 2444                 sent = tcb->snd.ptr - tcb->snd.una;
 2445 
 2446                 /* Don't send anything else until our SYN has been acked */
 2447                 if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
 2448                         break;
 2449 
 2450                 /* Compute usable segment based on offered window and limit
 2451                  * window probes to one
 2452                  */
 2453                 if(tcb->snd.wnd == 0){
 2454                         if(sent != 0) {
 2455                                 if((tcb->flags&FORCE) == 0)
 2456                                         break;
 2457 //                              tcb->snd.ptr = tcb->snd.una;
 2458                         }
 2459                         usable = 1;
 2460                 }
 2461                 else {
 2462                         usable = tcb->cwind;
 2463                         if(tcb->snd.wnd < usable)
 2464                                 usable = tcb->snd.wnd;
 2465                         usable -= sent;
 2466                 }
 2467                 ssize = sndcnt-sent;
 2468                 if(ssize && usable < 2)
 2469                         netlog(s->p->f, Logtcp, "throttled snd.wnd %lud cwind %lud\n",
 2470                                 tcb->snd.wnd, tcb->cwind);
 2471                 if(usable < ssize)
 2472                         ssize = usable;
 2473                 if(tcb->mss < ssize)
 2474                         ssize = tcb->mss;
 2475                 dsize = ssize;
 2476                 seg.urg = 0;
 2477 
 2478                 if(ssize == 0)
 2479                 if((tcb->flags&FORCE) == 0)
 2480                         break;
 2481 
 2482                 tcb->flags &= ~FORCE;
 2483                 tcprcvwin(s);
 2484 
 2485                 /* By default we will generate an ack */
 2486                 tcphalt(tpriv, &tcb->acktimer);
 2487                 tcb->rcv.una = 0;
 2488                 seg.source = s->lport;
 2489                 seg.dest = s->rport;
 2490                 seg.flags = ACK;
 2491                 seg.mss = 0;
 2492                 seg.ws = 0;
 2493                 switch(tcb->state){
 2494                 case Syn_sent:
 2495                         seg.flags = 0;
 2496                         if(tcb->snd.ptr == tcb->iss){
 2497                                 seg.flags |= SYN;
 2498                                 dsize--;
 2499                                 seg.mss = tcb->mss;
 2500                                 seg.ws = tcb->scale;
 2501                         }
 2502                         break;
 2503                 case Syn_received:
 2504                         /*
 2505                          *  don't send any data with a SYN/ACK packet
 2506                          *  because Linux rejects the packet in its
 2507                          *  attempt to solve the SYN attack problem
 2508                          */
 2509                         if(tcb->snd.ptr == tcb->iss){
 2510                                 seg.flags |= SYN;
 2511                                 dsize = 0;
 2512                                 ssize = 1;
 2513                                 seg.mss = tcb->mss;
 2514                                 seg.ws = tcb->scale;
 2515                         }
 2516                         break;
 2517                 }
 2518                 seg.seq = tcb->snd.ptr;
 2519                 seg.ack = tcb->rcv.nxt;
 2520                 seg.wnd = tcb->rcv.wnd;
 2521 
 2522                 /* Pull out data to send */
 2523                 bp = nil;
 2524                 if(dsize != 0) {
 2525                         bp = qcopy(s->wq, dsize, sent);
 2526                         if(BLEN(bp) != dsize) {
 2527                                 seg.flags |= FIN;
 2528                                 dsize--;
 2529                         }
 2530                 }
 2531 
 2532                 if(sent+dsize == sndcnt)
 2533                         seg.flags |= PSH;
 2534 
 2535                 /* keep track of balance of resent data */
 2536                 if(seq_lt(tcb->snd.ptr, tcb->snd.nxt)) {
 2537                         n = tcb->snd.nxt - tcb->snd.ptr;
 2538                         if(ssize < n)
 2539                                 n = ssize;
 2540                         tcb->resent += n;
 2541                         netlog(f, Logtcp, "rexmit: %I.%d -> %I.%d ptr %lux nxt %lux\n",
 2542                                 s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr, tcb->snd.nxt);
 2543                         tpriv->stats[RetransSegs]++;
 2544                 }
 2545 
 2546                 tcb->snd.ptr += ssize;
 2547 
 2548                 /* Pull up the send pointer so we can accept acks
 2549                  * for this window
 2550                  */
 2551                 if(seq_gt(tcb->snd.ptr,tcb->snd.nxt))
 2552                         tcb->snd.nxt = tcb->snd.ptr;
 2553 
 2554                 /* Build header, link data and compute cksum */
 2555                 switch(version){
 2556                 case V4:
 2557                         tcb->protohdr.tcp4hdr.vihl = IP_VER4;
 2558                         hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
 2559                         if(hbp == nil) {
 2560                                 freeblist(bp);
 2561                                 return;
 2562                         }
 2563                         break;
 2564                 case V6:
 2565                         tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
 2566                         hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
 2567                         if(hbp == nil) {
 2568                                 freeblist(bp);
 2569                                 return;
 2570                         }
 2571                         break;
 2572                 default:
 2573                         hbp = nil;      /* to suppress a warning */
 2574                         panic("tcpoutput: version %d", version);
 2575                 }
 2576 
 2577                 /* Start the transmission timers if there is new data and we
 2578                  * expect acknowledges
 2579                  */
 2580                 if(ssize != 0){
 2581                         if(tcb->timer.state != TcptimerON)
 2582                                 tcpgo(tpriv, &tcb->timer);
 2583 
 2584                         /*  If round trip timer isn't running, start it.
 2585                          *  measure the longest packet only in case the
 2586                          *  transmission time dominates RTT
 2587                          */
 2588                         if(tcb->rtt_timer.state != TcptimerON)
 2589                         if(ssize == tcb->mss) {
 2590                                 tcpgo(tpriv, &tcb->rtt_timer);
 2591                                 tcb->rttseq = tcb->snd.ptr;
 2592                         }
 2593                 }
 2594 
 2595                 tpriv->stats[OutSegs]++;
 2596 
 2597                 /* put off the next keep alive */
 2598                 tcpgo(tpriv, &tcb->katimer);
 2599 
 2600                 switch(version){
 2601                 case V4:
 2602                         if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){
 2603                                 /* a negative return means no route */
 2604                                 localclose(s, "no route");
 2605                         }
 2606                         break;
 2607                 case V6:
 2608                         if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){
 2609                                 /* a negative return means no route */
 2610                                 localclose(s, "no route");
 2611                         }
 2612                         break;
 2613                 default:
 2614                         panic("tcpoutput2: version %d", version);
 2615                 }
 2616                 if((msgs%4) == 1){
 2617                         qunlock(s);
 2618                         sched();
 2619                         qlock(s);
 2620                 }
 2621         }
 2622 }
 2623 
 2624 /*
 2625  *  the BSD convention (hack?) for keep alives.  resend last uchar acked.
 2626  */
 2627 void
 2628 tcpsendka(Conv *s)
 2629 {
 2630         Tcp seg;
 2631         Tcpctl *tcb;
 2632         Block *hbp,*dbp;
 2633 
 2634         tcb = (Tcpctl*)s->ptcl;
 2635 
 2636         dbp = nil;
 2637         seg.urg = 0;
 2638         seg.source = s->lport;
 2639         seg.dest = s->rport;
 2640         seg.flags = ACK|PSH;
 2641         seg.mss = 0;
 2642         seg.ws = 0;
 2643         if(tcpporthogdefense)
 2644                 seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20);
 2645         else
 2646                 seg.seq = tcb->snd.una-1;
 2647         seg.ack = tcb->rcv.nxt;
 2648         tcb->rcv.una = 0;
 2649         seg.wnd = tcb->rcv.wnd;
 2650         if(tcb->state == Finwait2){
 2651                 seg.flags |= FIN;
 2652         } else {
 2653                 dbp = allocb(1);
 2654                 dbp->wp++;
 2655         }
 2656 
 2657         if(isv4(s->raddr)) {
 2658                 /* Build header, link data and compute cksum */
 2659                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
 2660                 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
 2661                 if(hbp == nil) {
 2662                         freeblist(dbp);
 2663                         return;
 2664                 }
 2665                 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
 2666         }
 2667         else {
 2668                 /* Build header, link data and compute cksum */
 2669                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
 2670                 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
 2671                 if(hbp == nil) {
 2672                         freeblist(dbp);
 2673                         return;
 2674                 }
 2675                 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
 2676         }
 2677 }
 2678 
 2679 /*
 2680  *  set connection to time out after 12 minutes
 2681  */
 2682 void
 2683 tcpsetkacounter(Tcpctl *tcb)
 2684 {
 2685         tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK);
 2686         if(tcb->kacounter < 3)
 2687                 tcb->kacounter = 3;
 2688 }
 2689 
 2690 /*
 2691  *  if we've timed out, close the connection
 2692  *  otherwise, send a keepalive and restart the timer
 2693  */
 2694 void
 2695 tcpkeepalive(void *v)
 2696 {
 2697         Tcpctl *tcb;
 2698         Conv *s;
 2699 
 2700         s = v;
 2701         tcb = (Tcpctl*)s->ptcl;
 2702         if(waserror()){
 2703                 qunlock(s);
 2704                 nexterror();
 2705         }
 2706         qlock(s);
 2707         if(tcb->state != Closed){
 2708                 if(--(tcb->kacounter) <= 0) {
 2709                         localclose(s, Etimedout);
 2710                 } else {
 2711                         tcpsendka(s);
 2712                         tcpgo(s->p->priv, &tcb->katimer);
 2713                 }
 2714         }
 2715         qunlock(s);
 2716         poperror();
 2717 }
 2718 
 2719 /*
 2720  *  start keepalive timer
 2721  */
 2722 char*
 2723 tcpstartka(Conv *s, char **f, int n)
 2724 {
 2725         Tcpctl *tcb;
 2726         int x;
 2727 
 2728         tcb = (Tcpctl*)s->ptcl;
 2729         if(tcb->state != Established)
 2730                 return "connection must be in Establised state";
 2731         if(n > 1){
 2732                 x = atoi(f[1]);
 2733                 if(x >= MSPTICK)
 2734                         tcb->katimer.start = x/MSPTICK;
 2735         }
 2736         tcpsetkacounter(tcb);
 2737         tcpgo(s->p->priv, &tcb->katimer);
 2738 
 2739         return nil;
 2740 }
 2741 
 2742 /*
 2743  *  turn checksums on/off
 2744  */
 2745 char*
 2746 tcpsetchecksum(Conv *s, char **f, int)
 2747 {
 2748         Tcpctl *tcb;
 2749 
 2750         tcb = (Tcpctl*)s->ptcl;
 2751         tcb->nochecksum = !atoi(f[1]);
 2752 
 2753         return nil;
 2754 }
 2755 
 2756 void
 2757 tcprxmit(Conv *s)
 2758 {
 2759         Tcpctl *tcb;
 2760 
 2761         tcb = (Tcpctl*)s->ptcl;
 2762 
 2763         tcb->flags |= RETRAN|FORCE;
 2764         tcb->snd.ptr = tcb->snd.una;
 2765 
 2766         /*
 2767          *  We should be halving the slow start threshhold (down to one
 2768          *  mss) but leaving it at mss seems to work well enough
 2769          */
 2770         tcb->ssthresh = tcb->mss;
 2771 
 2772         /*
 2773          *  pull window down to a single packet
 2774          */
 2775         tcb->cwind = tcb->mss;
 2776         tcpoutput(s);
 2777 }
 2778 
 2779 void
 2780 tcptimeout(void *arg)
 2781 {
 2782         Conv *s;
 2783         Tcpctl *tcb;
 2784         int maxback;
 2785         Tcppriv *tpriv;
 2786 
 2787         s = (Conv*)arg;
 2788         tpriv = s->p->priv;
 2789         tcb = (Tcpctl*)s->ptcl;
 2790 
 2791         if(waserror()){
 2792                 qunlock(s);
 2793                 nexterror();
 2794         }
 2795         qlock(s);
 2796         switch(tcb->state){
 2797         default:
 2798                 tcb->backoff++;
 2799                 if(tcb->state == Syn_sent)
 2800                         maxback = MAXBACKMS/2;
 2801                 else
 2802                         maxback = MAXBACKMS;
 2803                 tcb->backedoff += tcb->timer.start * MSPTICK;
 2804                 if(tcb->backedoff >= maxback) {
 2805                         localclose(s, Etimedout);
 2806                         break;
 2807                 }
 2808                 netlog(s->p->f, Logtcprxmt, "timeout rexmit 0x%lux %d/%d\n", tcb->snd.una, tcb->timer.start, NOW);
 2809                 tcpsettimer(tcb);
 2810                 tcprxmit(s);
 2811                 tpriv->stats[RetransTimeouts]++;
 2812                 tcb->snd.dupacks = 0;
 2813                 break;
 2814         case Time_wait:
 2815                 localclose(s, nil);
 2816                 break;
 2817         case Closed:
 2818                 break;
 2819         }
 2820         qunlock(s);
 2821         poperror();
 2822 }
 2823 
 2824 int
 2825 inwindow(Tcpctl *tcb, int seq)
 2826 {
 2827         return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1);
 2828 }
 2829 
 2830 /*
 2831  *  set up state for a received SYN (or SYN ACK) packet
 2832  */
 2833 void
 2834 procsyn(Conv *s, Tcp *seg)
 2835 {
 2836         Tcpctl *tcb;
 2837 
 2838         tcb = (Tcpctl*)s->ptcl;
 2839         tcb->flags |= FORCE;
 2840 
 2841         tcb->rcv.nxt = seg->seq + 1;
 2842         tcb->rcv.urg = tcb->rcv.nxt;
 2843         tcb->irs = seg->seq;
 2844 
 2845         /* our sending max segment size cannot be bigger than what he asked for */
 2846         if(seg->mss != 0 && seg->mss < tcb->mss)
 2847                 tcb->mss = seg->mss;
 2848 
 2849         /* the congestion window always starts out as a single segment */
 2850         tcb->snd.wnd = seg->wnd;
 2851         tcb->cwind = tcb->mss;
 2852 }
 2853 
 2854 int
 2855 addreseq(Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length)
 2856 {
 2857         Reseq *rp, *rp1;
 2858         int i, rqlen, qmax;
 2859 
 2860         rp = malloc(sizeof(Reseq));
 2861         if(rp == nil){
 2862                 freeblist(bp);  /* bp always consumed by add_reseq */
 2863                 return 0;
 2864         }
 2865 
 2866         rp->seg = *seg;
 2867         rp->bp = bp;
 2868         rp->length = length;
 2869 
 2870         /* Place on reassembly list sorting by starting seq number */
 2871         rp1 = tcb->reseq;
 2872         if(rp1 == nil || seq_lt(seg->seq, rp1->seg.seq)) {
 2873                 rp->next = rp1;
 2874                 tcb->reseq = rp;
 2875                 if(rp->next != nil)
 2876                         tpriv->stats[OutOfOrder]++;
 2877                 return 0;
 2878         }
 2879 
 2880         rqlen = 0;
 2881         for(i = 0;; i++) {
 2882                 rqlen += rp1->length;
 2883                 if(rp1->next == nil || seq_lt(seg->seq, rp1->next->seg.seq)) {
 2884                         rp->next = rp1->next;
 2885                         rp1->next = rp;
 2886                         if(rp->next != nil)
 2887                                 tpriv->stats[OutOfOrder]++;
 2888                         break;
 2889                 }
 2890                 rp1 = rp1->next;
 2891         }
 2892         qmax = QMAX<<tcb->rcv.scale;
 2893         if(rqlen > qmax){
 2894                 print("resequence queue > window: %d > %d\n", rqlen, qmax);
 2895                 i = 0;
 2896                 for(rp1 = tcb->reseq; rp1 != nil; rp1 = rp1->next){
 2897                         print("%#lux %#lux %#ux\n", rp1->seg.seq,
 2898                                 rp1->seg.ack, rp1->seg.flags);
 2899                         if(i++ > 10){
 2900                                 print("...\n");
 2901                                 break;
 2902                         }
 2903                 }
 2904 
 2905                 /*
 2906                  * delete entire reassembly queue; wait for retransmit.
 2907                  * - should we be smarter and only delete the tail?
 2908                  */
 2909                 for(rp = tcb->reseq; rp != nil; rp = rp1){
 2910                         rp1 = rp->next;
 2911                         freeblist(rp->bp);
 2912                         free(rp);
 2913                 }
 2914                 tcb->reseq = nil;
 2915 
 2916                 return -1;
 2917         }
 2918         return 0;
 2919 }
 2920 
 2921 void
 2922 getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
 2923 {
 2924         Reseq *rp;
 2925 
 2926         rp = tcb->reseq;
 2927         if(rp == nil)
 2928                 return;
 2929 
 2930         tcb->reseq = rp->next;
 2931 
 2932         *seg = rp->seg;
 2933         *bp = rp->bp;
 2934         *length = rp->length;
 2935 
 2936         free(rp);
 2937 }
 2938 
 2939 int
 2940 tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
 2941 {
 2942         ushort len;
 2943         uchar accept;
 2944         int dupcnt, excess;
 2945 
 2946         accept = 0;
 2947         len = *length;
 2948         if(seg->flags & SYN)
 2949                 len++;
 2950         if(seg->flags & FIN)
 2951                 len++;
 2952 
 2953         if(tcb->rcv.wnd == 0) {
 2954                 if(len == 0 && seg->seq == tcb->rcv.nxt)
 2955                         return 0;
 2956         }
 2957         else {
 2958                 /* Some part of the segment should be in the window */
 2959                 if(inwindow(tcb,seg->seq))
 2960                         accept++;
 2961                 else
 2962                 if(len != 0) {
 2963                         if(inwindow(tcb, seg->seq+len-1) ||
 2964                         seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1))
 2965                                 accept++;
 2966                 }
 2967         }
 2968         if(!accept) {
 2969                 freeblist(*bp);
 2970                 return -1;
 2971         }
 2972         dupcnt = tcb->rcv.nxt - seg->seq;
 2973         if(dupcnt > 0){
 2974                 tcb->rerecv += dupcnt;
 2975                 if(seg->flags & SYN){
 2976                         seg->flags &= ~SYN;
 2977                         seg->seq++;
 2978 
 2979                         if(seg->urg > 1)
 2980                                 seg->urg--;
 2981                         else
 2982                                 seg->flags &= ~URG;
 2983                         dupcnt--;
 2984                 }
 2985                 if(dupcnt > 0){
 2986                         pullblock(bp, (ushort)dupcnt);
 2987                         seg->seq += dupcnt;
 2988                         *length -= dupcnt;
 2989 
 2990                         if(seg->urg > dupcnt)
 2991                                 seg->urg -= dupcnt;
 2992                         else {
 2993                                 seg->flags &= ~URG;
 2994                                 seg->urg = 0;
 2995                         }
 2996                 }
 2997         }
 2998         excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
 2999         if(excess > 0) {
 3000                 tcb->rerecv += excess;
 3001                 *length -= excess;
 3002                 *bp = trimblock(*bp, 0, *length);
 3003                 if(*bp == nil)
 3004                         panic("presotto is a boofhead");
 3005                 seg->flags &= ~FIN;
 3006         }
 3007         return 0;
 3008 }
 3009 
 3010 void
 3011 tcpadvise(Proto *tcp, Block *bp, char *msg)
 3012 {
 3013         Tcp4hdr *h4;
 3014         Tcp6hdr *h6;
 3015         Tcpctl *tcb;
 3016         uchar source[IPaddrlen];
 3017         uchar dest[IPaddrlen];
 3018         ushort psource, pdest;
 3019         Conv *s, **p;
 3020 
 3021         h4 = (Tcp4hdr*)(bp->rp);
 3022         h6 = (Tcp6hdr*)(bp->rp);
 3023 
 3024         if((h4->vihl&0xF0)==IP_VER4) {
 3025                 v4tov6(dest, h4->tcpdst);
 3026                 v4tov6(source, h4->tcpsrc);
 3027                 psource = nhgets(h4->tcpsport);
 3028                 pdest = nhgets(h4->tcpdport);
 3029         }
 3030         else {
 3031                 ipmove(dest, h6->tcpdst);
 3032                 ipmove(source, h6->tcpsrc);
 3033                 psource = nhgets(h6->tcpsport);
 3034                 pdest = nhgets(h6->tcpdport);
 3035         }
 3036 
 3037         /* Look for a connection */
 3038         qlock(tcp);
 3039         for(p = tcp->conv; *p; p++) {
 3040                 s = *p;
 3041                 tcb = (Tcpctl*)s->ptcl;
 3042                 if(s->rport == pdest)
 3043                 if(s->lport == psource)
 3044                 if(tcb->state != Closed)
 3045                 if(ipcmp(s->raddr, dest) == 0)
 3046                 if(ipcmp(s->laddr, source) == 0){
 3047                         qlock(s);
 3048                         qunlock(tcp);
 3049                         switch(tcb->state){
 3050                         case Syn_sent:
 3051                                 localclose(s, msg);
 3052                                 break;
 3053                         }
 3054                         qunlock(s);
 3055                         freeblist(bp);
 3056                         return;
 3057                 }
 3058         }
 3059         qunlock(tcp);
 3060         freeblist(bp);
 3061 }
 3062 
 3063 static char*
 3064 tcpporthogdefensectl(char *val)
 3065 {
 3066         if(strcmp(val, "on") == 0)
 3067                 tcpporthogdefense = 1;
 3068         else if(strcmp(val, "off") == 0)
 3069                 tcpporthogdefense = 0;
 3070         else
 3071                 return "unknown value for tcpporthogdefense";
 3072         return nil;
 3073 }
 3074 
 3075 /* called with c qlocked */
 3076 char*
 3077 tcpctl(Conv* c, char** f, int n)
 3078 {
 3079         if(n == 1 && strcmp(f[0], "hangup") == 0)
 3080                 return tcphangup(c);
 3081         if(n >= 1 && strcmp(f[0], "keepalive") == 0)
 3082                 return tcpstartka(c, f, n);
 3083         if(n >= 1 && strcmp(f[0], "checksum") == 0)
 3084                 return tcpsetchecksum(c, f, n);
 3085         if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
 3086                 return tcpporthogdefensectl(f[1]);
 3087         return "unknown control request";
 3088 }
 3089 
 3090 int
 3091 tcpstats(Proto *tcp, char *buf, int len)
 3092 {
 3093         Tcppriv *priv;
 3094         char *p, *e;
 3095         int i;
 3096 
 3097         priv = tcp->priv;
 3098         p = buf;
 3099         e = p+len;
 3100         for(i = 0; i < Nstats; i++)
 3101                 p = seprint(p, e, "%s: %lud\n", statnames[i], priv->stats[i]);
 3102         return p - buf;
 3103 }
 3104 
 3105 /*
 3106  *  garbage collect any stale conversations:
 3107  *      - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
 3108  *      - Finwait2 after 5 minutes
 3109  *
 3110  *  this is called whenever we run out of channels.  Both checks are
 3111  *  of questionable validity so we try to use them only when we're
 3112  *  up against the wall.
 3113  */
 3114 int
 3115 tcpgc(Proto *tcp)
 3116 {
 3117         Conv *c, **pp, **ep;
 3118         int n;
 3119         Tcpctl *tcb;
 3120 
 3121 
 3122         n = 0;
 3123         ep = &tcp->conv[tcp->nc];
 3124         for(pp = tcp->conv; pp < ep; pp++) {
 3125                 c = *pp;
 3126                 if(c == nil)
 3127                         break;
 3128                 if(!canqlock(c))
 3129                         continue;
 3130                 tcb = (Tcpctl*)c->ptcl;
 3131                 switch(tcb->state){
 3132                 case Syn_received:
 3133                         if(NOW - tcb->time > 5000){
 3134                                 localclose(c, "timed out");
 3135                                 n++;
 3136                         }
 3137                         break;
 3138                 case Finwait2:
 3139                         if(NOW - tcb->time > 5*60*1000){
 3140                                 localclose(c, "timed out");
 3141                                 n++;
 3142                         }
 3143                         break;
 3144                 }
 3145                 qunlock(c);
 3146         }
 3147         return n;
 3148 }
 3149 
 3150 void
 3151 tcpsettimer(Tcpctl *tcb)
 3152 {
 3153         int x;
 3154 
 3155         /* round trip dependency */
 3156         x = backoff(tcb->backoff) *
 3157                 (tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK;
 3158 
 3159         /* bounded twixt 1/2 and 64 seconds */
 3160         if(x < 500/MSPTICK)
 3161                 x = 500/MSPTICK;
 3162         else if(x > (64000/MSPTICK))
 3163                 x = 64000/MSPTICK;
 3164         tcb->timer.start = x;
 3165 }
 3166 
 3167 void
 3168 tcpinit(Fs *fs)
 3169 {
 3170         Proto *tcp;
 3171         Tcppriv *tpriv;
 3172 
 3173         tcp = smalloc(sizeof(Proto));
 3174         tpriv = tcp->priv = smalloc(sizeof(Tcppriv));
 3175         tcp->name = "tcp";
 3176         tcp->connect = tcpconnect;
 3177         tcp->announce = tcpannounce;
 3178         tcp->ctl = tcpctl;
 3179         tcp->state = tcpstate;
 3180         tcp->create = tcpcreate;
 3181         tcp->close = tcpclose;
 3182         tcp->rcv = tcpiput;
 3183         tcp->advise = tcpadvise;
 3184         tcp->stats = tcpstats;
 3185         tcp->inuse = tcpinuse;
 3186         tcp->gc = tcpgc;
 3187         tcp->ipproto = IP_TCPPROTO;
 3188         tcp->nc = scalednconv();
 3189         tcp->ptclsize = sizeof(Tcpctl);
 3190         tpriv->stats[MaxConn] = tcp->nc;
 3191 
 3192         Fsproto(fs, tcp);
 3193 }
 3194 
 3195 void
 3196 tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale)
 3197 {
 3198         if(rcvscale){
 3199                 tcb->rcv.scale = rcvscale & 0xff;
 3200                 tcb->snd.scale = sndscale & 0xff;
 3201                 tcb->window = QMAX<<tcb->snd.scale;
 3202                 qsetlimit(s->rq, tcb->window);
 3203         } else {
 3204                 tcb->rcv.scale = 0;
 3205                 tcb->snd.scale = 0;
 3206                 tcb->window = QMAX;
 3207                 qsetlimit(s->rq, tcb->window);
 3208         }
 3209 }

Cache object: a6dedd06375163a3508093f5163171b6


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.