1 /* $OpenBSD: tcp_usrreq.c,v 1.216 2023/01/22 12:05:44 mvs Exp $ */
2 /* $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */
3
4 /*
5 * Copyright (c) 1982, 1986, 1988, 1993
6 * The Regents of the University of California. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
33 *
34 * NRL grants permission for redistribution and use in source and binary
35 * forms, with or without modification, of the software and documentation
36 * created at NRL provided that the following conditions are met:
37 *
38 * 1. Redistributions of source code must retain the above copyright
39 * notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 * notice, this list of conditions and the following disclaimer in the
42 * documentation and/or other materials provided with the distribution.
43 * 3. All advertising materials mentioning features or use of this software
44 * must display the following acknowledgements:
45 * This product includes software developed by the University of
46 * California, Berkeley and its contributors.
47 * This product includes software developed at the Information
48 * Technology Division, US Naval Research Laboratory.
49 * 4. Neither the name of the NRL nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
64 *
65 * The views and conclusions contained in the software and documentation
66 * are those of the authors and should not be interpreted as representing
67 * official policies, either expressed or implied, of the US Naval
68 * Research Laboratory (NRL).
69 */
70
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/mbuf.h>
74 #include <sys/socket.h>
75 #include <sys/socketvar.h>
76 #include <sys/protosw.h>
77 #include <sys/stat.h>
78 #include <sys/sysctl.h>
79 #include <sys/domain.h>
80 #include <sys/kernel.h>
81 #include <sys/pool.h>
82 #include <sys/proc.h>
83
84 #include <net/if.h>
85 #include <net/if_var.h>
86 #include <net/route.h>
87
88 #include <netinet/in.h>
89 #include <netinet/in_var.h>
90 #include <netinet/ip.h>
91 #include <netinet/in_pcb.h>
92 #include <netinet/ip_var.h>
93 #include <netinet/tcp.h>
94 #include <netinet/tcp_fsm.h>
95 #include <netinet/tcp_seq.h>
96 #include <netinet/tcp_timer.h>
97 #include <netinet/tcp_var.h>
98 #include <netinet/tcp_debug.h>
99
100 #ifdef INET6
101 #include <netinet6/in6_var.h>
102 #endif
103
/*
 * Default send and receive buffer reservations for TCP sockets, in bytes.
 * tcp_attach() hands tcp_sendspace and tcp_recvspace to soreserve().
 * Both defaults may be overridden at build time.  The replacement lists
 * are parenthesized so the macros keep their value when they are used
 * inside a larger expression (e.g. as a divisor).
 */
#ifndef TCP_SENDSPACE
#define	TCP_SENDSPACE	(1024 * 16)
#endif
u_int	tcp_sendspace = TCP_SENDSPACE;
#ifndef TCP_RECVSPACE
#define	TCP_RECVSPACE	(1024 * 16)
#endif
u_int	tcp_recvspace = TCP_RECVSPACE;
u_int	tcp_autorcvbuf_inc = 16 * 1024;
113
/*
 * User-request dispatch table for TCP: binds each pr_usrreqs operation
 * to the tcp_*() handler defined in this file.  Control requests are
 * passed to in_control().
 * NOTE(review): presumably the table installed for PF_INET sockets,
 * given the separate INET6 variant that follows -- confirm against the
 * protosw definitions.
 */
const struct pr_usrreqs tcp_usrreqs = {
	.pru_attach	= tcp_attach,
	.pru_detach	= tcp_detach,
	.pru_bind	= tcp_bind,
	.pru_listen	= tcp_listen,
	.pru_connect	= tcp_connect,
	.pru_accept	= tcp_accept,
	.pru_disconnect	= tcp_disconnect,
	.pru_shutdown	= tcp_shutdown,
	.pru_rcvd	= tcp_rcvd,
	.pru_send	= tcp_send,
	.pru_abort	= tcp_abort,
	.pru_sense	= tcp_sense,
	.pru_rcvoob	= tcp_rcvoob,
	.pru_sendoob	= tcp_sendoob,
	.pru_control	= in_control,
	.pru_sockaddr	= tcp_sockaddr,
	.pru_peeraddr	= tcp_peeraddr,
};
133
#ifdef INET6
/*
 * User-request dispatch table built only when INET6 is defined.  It
 * uses the same tcp_*() handlers as tcp_usrreqs; the only difference is
 * that control requests go to in6_control() instead of in_control().
 */
const struct pr_usrreqs tcp6_usrreqs = {
	.pru_attach	= tcp_attach,
	.pru_detach	= tcp_detach,
	.pru_bind	= tcp_bind,
	.pru_listen	= tcp_listen,
	.pru_connect	= tcp_connect,
	.pru_accept	= tcp_accept,
	.pru_disconnect	= tcp_disconnect,
	.pru_shutdown	= tcp_shutdown,
	.pru_rcvd	= tcp_rcvd,
	.pru_send	= tcp_send,
	.pru_abort	= tcp_abort,
	.pru_sense	= tcp_sense,
	.pru_rcvoob	= tcp_rcvoob,
	.pru_sendoob	= tcp_sendoob,
	.pru_control	= in6_control,
	.pru_sockaddr	= tcp_sockaddr,
	.pru_peeraddr	= tcp_peeraddr,
};
#endif
155
/*
 * Table of integer TCP tunables, one row per TCPCTL_* identifier.
 * NOTE(review): each row looks like { sysctl id, backing variable,
 * lowest accepted value, highest accepted value } -- confirm against
 * the definition of struct sysctl_bounded_args.
 * The TCPCTL_ECN row exists only when TCP_ECN is defined.
 */
const struct sysctl_bounded_args tcpctl_vars[] = {
	{ TCPCTL_RFC1323, &tcp_do_rfc1323, 0, 1 },
	{ TCPCTL_KEEPINITTIME, &tcptv_keep_init, 1, 3 * TCPTV_KEEP_INIT },
	{ TCPCTL_KEEPIDLE, &tcp_keepidle, 1, 5 * TCPTV_KEEP_IDLE },
	{ TCPCTL_KEEPINTVL, &tcp_keepintvl, 1, 3 * TCPTV_KEEPINTVL },
	{ TCPCTL_SACK, &tcp_do_sack, 0, 1 },
	{ TCPCTL_MSSDFLT, &tcp_mssdflt, TCP_MSS, 65535 },
	{ TCPCTL_RSTPPSLIMIT, &tcp_rst_ppslim, 1, 1000 * 1000 },
	{ TCPCTL_ACK_ON_PUSH, &tcp_ack_on_push, 0, 1 },
#ifdef TCP_ECN
	{ TCPCTL_ECN, &tcp_do_ecn, 0, 1 },
#endif
	{ TCPCTL_SYN_CACHE_LIMIT, &tcp_syn_cache_limit, 1, 1000 * 1000 },
	{ TCPCTL_SYN_BUCKET_LIMIT, &tcp_syn_bucket_limit, 1, INT_MAX },
	{ TCPCTL_RFC3390, &tcp_do_rfc3390, 0, 2 },
	{ TCPCTL_ALWAYS_KEEPALIVE, &tcp_always_keepalive, 0, 1 },
};
173
/*
 * PCB table for TCP: tcp_attach() allocates new inpcbs from it and
 * tcp_ident() performs its connection and listener lookups in it.
 */
struct inpcbtable tcbtable;

/* Prototypes for functions defined later in this file. */
int	tcp_fill_info(struct tcpcb *, struct socket *, struct mbuf *);
int	tcp_ident(void *, size_t *, void *, size_t, int);
178
179 static inline int tcp_sogetpcb(struct socket *, struct inpcb **,
180 struct tcpcb **);
181
182 static inline int
183 tcp_sogetpcb(struct socket *so, struct inpcb **rinp, struct tcpcb **rtp)
184 {
185 struct inpcb *inp;
186 struct tcpcb *tp;
187
188 /*
189 * When a TCP is attached to a socket, then there will be
190 * a (struct inpcb) pointed at by the socket, and this
191 * structure will point at a subsidiary (struct tcpcb).
192 */
193 if ((inp = sotoinpcb(so)) == NULL || (tp = intotcpcb(inp)) == NULL) {
194 if (so->so_error)
195 return so->so_error;
196 return EINVAL;
197 }
198
199 *rinp = inp;
200 *rtp = tp;
201
202 return 0;
203 }
204
205 /*
206 * Export internal TCP state information via a struct tcp_info without
207 * leaking any sensitive information. Sequence numbers are reported
208 * relative to the initial sequence number.
209 */
210 int
211 tcp_fill_info(struct tcpcb *tp, struct socket *so, struct mbuf *m)
212 {
213 struct proc *p = curproc;
214 struct tcp_info *ti;
215 u_int t = 1000; /* msec => usec */
216 uint32_t now;
217
218 if (sizeof(*ti) > MLEN) {
219 MCLGETL(m, M_WAITOK, sizeof(*ti));
220 if (!ISSET(m->m_flags, M_EXT))
221 return ENOMEM;
222 }
223 ti = mtod(m, struct tcp_info *);
224 m->m_len = sizeof(*ti);
225 memset(ti, 0, sizeof(*ti));
226 now = tcp_now();
227
228 ti->tcpi_state = tp->t_state;
229 if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
230 ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
231 if (tp->t_flags & TF_SACK_PERMIT)
232 ti->tcpi_options |= TCPI_OPT_SACK;
233 if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
234 ti->tcpi_options |= TCPI_OPT_WSCALE;
235 ti->tcpi_snd_wscale = tp->snd_scale;
236 ti->tcpi_rcv_wscale = tp->rcv_scale;
237 }
238 #ifdef TCP_ECN
239 if (tp->t_flags & TF_ECN_PERMIT)
240 ti->tcpi_options |= TCPI_OPT_ECN;
241 #endif
242
243 ti->tcpi_rto = tp->t_rxtcur * t;
244 ti->tcpi_snd_mss = tp->t_maxseg;
245 ti->tcpi_rcv_mss = tp->t_peermss;
246
247 ti->tcpi_last_data_sent = (now - tp->t_sndtime) * t;
248 ti->tcpi_last_ack_sent = (now - tp->t_sndacktime) * t;
249 ti->tcpi_last_data_recv = (now - tp->t_rcvtime) * t;
250 ti->tcpi_last_ack_recv = (now - tp->t_rcvacktime) * t;
251
252 ti->tcpi_rtt = ((uint64_t)tp->t_srtt * t) >>
253 (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT);
254 ti->tcpi_rttvar = ((uint64_t)tp->t_rttvar * t) >>
255 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT);
256 ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
257 ti->tcpi_snd_cwnd = tp->snd_cwnd;
258
259 ti->tcpi_rcv_space = tp->rcv_wnd;
260
261 /*
262 * Provide only minimal information for unprivileged processes.
263 */
264 if (suser(p) != 0)
265 return 0;
266
267 /* FreeBSD-specific extension fields for tcp_info. */
268 ti->tcpi_snd_wnd = tp->snd_wnd;
269 ti->tcpi_snd_nxt = tp->snd_nxt - tp->iss;
270 ti->tcpi_rcv_nxt = tp->rcv_nxt - tp->irs;
271 /* missing tcpi_toe_tid */
272 ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
273 ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
274 ti->tcpi_snd_zerowin = tp->t_sndzerowin;
275
276 /* OpenBSD extensions */
277 ti->tcpi_rttmin = tp->t_rttmin * t;
278 ti->tcpi_max_sndwnd = tp->max_sndwnd;
279 ti->tcpi_rcv_adv = tp->rcv_adv - tp->irs;
280 ti->tcpi_rcv_up = tp->rcv_up - tp->irs;
281 ti->tcpi_snd_una = tp->snd_una - tp->iss;
282 ti->tcpi_snd_up = tp->snd_up - tp->iss;
283 ti->tcpi_snd_wl1 = tp->snd_wl1 - tp->iss;
284 ti->tcpi_snd_wl2 = tp->snd_wl2 - tp->iss;
285 ti->tcpi_snd_max = tp->snd_max - tp->iss;
286
287 ti->tcpi_ts_recent = tp->ts_recent; /* XXX value from the wire */
288 ti->tcpi_ts_recent_age = (now - tp->ts_recent_age) * t;
289 ti->tcpi_rfbuf_cnt = tp->rfbuf_cnt;
290 ti->tcpi_rfbuf_ts = (now - tp->rfbuf_ts) * t;
291
292 ti->tcpi_so_rcv_sb_cc = so->so_rcv.sb_cc;
293 ti->tcpi_so_rcv_sb_hiwat = so->so_rcv.sb_hiwat;
294 ti->tcpi_so_rcv_sb_lowat = so->so_rcv.sb_lowat;
295 ti->tcpi_so_rcv_sb_wat = so->so_rcv.sb_wat;
296 ti->tcpi_so_snd_sb_cc = so->so_snd.sb_cc;
297 ti->tcpi_so_snd_sb_hiwat = so->so_snd.sb_hiwat;
298 ti->tcpi_so_snd_sb_lowat = so->so_snd.sb_lowat;
299 ti->tcpi_so_snd_sb_wat = so->so_snd.sb_wat;
300
301 return 0;
302 }
303
304 int
305 tcp_ctloutput(int op, struct socket *so, int level, int optname,
306 struct mbuf *m)
307 {
308 int error = 0;
309 struct inpcb *inp;
310 struct tcpcb *tp;
311 int i;
312
313 inp = sotoinpcb(so);
314 if (inp == NULL)
315 return (ECONNRESET);
316 if (level != IPPROTO_TCP) {
317 switch (so->so_proto->pr_domain->dom_family) {
318 #ifdef INET6
319 case PF_INET6:
320 error = ip6_ctloutput(op, so, level, optname, m);
321 break;
322 #endif /* INET6 */
323 case PF_INET:
324 error = ip_ctloutput(op, so, level, optname, m);
325 break;
326 default:
327 error = EAFNOSUPPORT; /*?*/
328 break;
329 }
330 return (error);
331 }
332 tp = intotcpcb(inp);
333
334 switch (op) {
335
336 case PRCO_SETOPT:
337 switch (optname) {
338
339 case TCP_NODELAY:
340 if (m == NULL || m->m_len < sizeof (int))
341 error = EINVAL;
342 else if (*mtod(m, int *))
343 tp->t_flags |= TF_NODELAY;
344 else
345 tp->t_flags &= ~TF_NODELAY;
346 break;
347
348 case TCP_NOPUSH:
349 if (m == NULL || m->m_len < sizeof (int))
350 error = EINVAL;
351 else if (*mtod(m, int *))
352 tp->t_flags |= TF_NOPUSH;
353 else if (tp->t_flags & TF_NOPUSH) {
354 tp->t_flags &= ~TF_NOPUSH;
355 if (TCPS_HAVEESTABLISHED(tp->t_state))
356 error = tcp_output(tp);
357 }
358 break;
359
360 case TCP_MAXSEG:
361 if (m == NULL || m->m_len < sizeof (int)) {
362 error = EINVAL;
363 break;
364 }
365
366 i = *mtod(m, int *);
367 if (i > 0 && i <= tp->t_maxseg)
368 tp->t_maxseg = i;
369 else
370 error = EINVAL;
371 break;
372
373 case TCP_SACK_ENABLE:
374 if (m == NULL || m->m_len < sizeof (int)) {
375 error = EINVAL;
376 break;
377 }
378
379 if (TCPS_HAVEESTABLISHED(tp->t_state)) {
380 error = EPERM;
381 break;
382 }
383
384 if (tp->t_flags & TF_SIGNATURE) {
385 error = EPERM;
386 break;
387 }
388
389 if (*mtod(m, int *))
390 tp->sack_enable = 1;
391 else
392 tp->sack_enable = 0;
393 break;
394 #ifdef TCP_SIGNATURE
395 case TCP_MD5SIG:
396 if (m == NULL || m->m_len < sizeof (int)) {
397 error = EINVAL;
398 break;
399 }
400
401 if (TCPS_HAVEESTABLISHED(tp->t_state)) {
402 error = EPERM;
403 break;
404 }
405
406 if (*mtod(m, int *)) {
407 tp->t_flags |= TF_SIGNATURE;
408 tp->sack_enable = 0;
409 } else
410 tp->t_flags &= ~TF_SIGNATURE;
411 break;
412 #endif /* TCP_SIGNATURE */
413 default:
414 error = ENOPROTOOPT;
415 break;
416 }
417 break;
418
419 case PRCO_GETOPT:
420 switch (optname) {
421 case TCP_NODELAY:
422 m->m_len = sizeof(int);
423 *mtod(m, int *) = tp->t_flags & TF_NODELAY;
424 break;
425 case TCP_NOPUSH:
426 m->m_len = sizeof(int);
427 *mtod(m, int *) = tp->t_flags & TF_NOPUSH;
428 break;
429 case TCP_MAXSEG:
430 m->m_len = sizeof(int);
431 *mtod(m, int *) = tp->t_maxseg;
432 break;
433 case TCP_SACK_ENABLE:
434 m->m_len = sizeof(int);
435 *mtod(m, int *) = tp->sack_enable;
436 break;
437 case TCP_INFO:
438 error = tcp_fill_info(tp, so, m);
439 break;
440 #ifdef TCP_SIGNATURE
441 case TCP_MD5SIG:
442 m->m_len = sizeof(int);
443 *mtod(m, int *) = tp->t_flags & TF_SIGNATURE;
444 break;
445 #endif
446 default:
447 error = ENOPROTOOPT;
448 break;
449 }
450 break;
451 }
452 return (error);
453 }
454
455 /*
456 * Attach TCP protocol to socket, allocating
457 * internet protocol control block, tcp control block,
458 * buffer space, and entering LISTEN state to accept connections.
459 */
460 int
461 tcp_attach(struct socket *so, int proto, int wait)
462 {
463 struct tcpcb *tp;
464 struct inpcb *inp;
465 int error;
466
467 if (so->so_pcb)
468 return EISCONN;
469 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 ||
470 sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) ||
471 sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) {
472 error = soreserve(so, tcp_sendspace, tcp_recvspace);
473 if (error)
474 return (error);
475 }
476
477 NET_ASSERT_LOCKED();
478 error = in_pcballoc(so, &tcbtable, wait);
479 if (error)
480 return (error);
481 inp = sotoinpcb(so);
482 tp = tcp_newtcpcb(inp, wait);
483 if (tp == NULL) {
484 unsigned int nofd = so->so_state & SS_NOFDREF; /* XXX */
485
486 so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */
487 in_pcbdetach(inp);
488 so->so_state |= nofd;
489 return (ENOBUFS);
490 }
491 tp->t_state = TCPS_CLOSED;
492 #ifdef INET6
493 /* we disallow IPv4 mapped address completely. */
494 if (inp->inp_flags & INP_IPV6)
495 tp->pf = PF_INET6;
496 else
497 tp->pf = PF_INET;
498 #else
499 tp->pf = PF_INET;
500 #endif
501 if ((so->so_options & SO_LINGER) && so->so_linger == 0)
502 so->so_linger = TCP_LINGERTIME;
503
504 if (so->so_options & SO_DEBUG)
505 tcp_trace(TA_USER, TCPS_CLOSED, tp, tp, NULL, PRU_ATTACH, 0);
506 return (0);
507 }
508
509 int
510 tcp_detach(struct socket *so)
511 {
512 struct inpcb *inp;
513 struct tcpcb *otp = NULL, *tp;
514 int error = 0;
515 short ostate;
516
517 soassertlocked(so);
518
519 if ((error = tcp_sogetpcb(so, &inp, &tp)))
520 return (error);
521
522 if (so->so_options & SO_DEBUG) {
523 otp = tp;
524 ostate = tp->t_state;
525 }
526
527 /*
528 * Detach the TCP protocol from the socket.
529 * If the protocol state is non-embryonic, then can't
530 * do this directly: have to initiate a PRU_DISCONNECT,
531 * which may finish later; embryonic TCB's can just
532 * be discarded here.
533 */
534 tp = tcp_dodisconnect(tp);
535
536 if (otp)
537 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DETACH, 0);
538 return (error);
539 }
540
541 /*
542 * Give the socket an address.
543 */
544 int
545 tcp_bind(struct socket *so, struct mbuf *nam, struct proc *p)
546 {
547 struct inpcb *inp;
548 struct tcpcb *tp;
549 int error;
550 short ostate;
551
552 soassertlocked(so);
553
554 if ((error = tcp_sogetpcb(so, &inp, &tp)))
555 return (error);
556
557 if (so->so_options & SO_DEBUG)
558 ostate = tp->t_state;
559
560 error = in_pcbbind(inp, nam, p);
561
562 if (so->so_options & SO_DEBUG)
563 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_BIND, 0);
564 return (error);
565 }
566
567 /*
568 * Prepare to accept connections.
569 */
570 int
571 tcp_listen(struct socket *so)
572 {
573 struct inpcb *inp;
574 struct tcpcb *tp, *otp = NULL;
575 int error;
576 short ostate;
577
578 soassertlocked(so);
579
580 if ((error = tcp_sogetpcb(so, &inp, &tp)))
581 return (error);
582
583 if (so->so_options & SO_DEBUG) {
584 otp = tp;
585 ostate = tp->t_state;
586 }
587
588 if (inp->inp_lport == 0)
589 if ((error = in_pcbbind(inp, NULL, curproc)))
590 goto out;
591
592 /*
593 * If the in_pcbbind() above is called, the tp->pf
594 * should still be whatever it was before.
595 */
596 tp->t_state = TCPS_LISTEN;
597
598 out:
599 if (otp)
600 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_LISTEN, 0);
601 return (error);
602 }
603
604 /*
605 * Initiate connection to peer.
606 * Create a template for use in transmissions on this connection.
607 * Enter SYN_SENT state, and mark socket as connecting.
608 * Start keep-alive timer, and seed output sequence space.
609 * Send initial segment on connection.
610 */
611 int
612 tcp_connect(struct socket *so, struct mbuf *nam)
613 {
614 struct inpcb *inp;
615 struct tcpcb *tp, *otp = NULL;
616 int error;
617 short ostate;
618
619 soassertlocked(so);
620
621 if ((error = tcp_sogetpcb(so, &inp, &tp)))
622 return (error);
623
624 if (so->so_options & SO_DEBUG) {
625 otp = tp;
626 ostate = tp->t_state;
627 }
628
629 #ifdef INET6
630 if (inp->inp_flags & INP_IPV6) {
631 struct sockaddr_in6 *sin6;
632
633 if ((error = in6_nam2sin6(nam, &sin6)))
634 goto out;
635 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
636 IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
637 error = EINVAL;
638 goto out;
639 }
640 error = in6_pcbconnect(inp, nam);
641 } else
642 #endif /* INET6 */
643 {
644 struct sockaddr_in *sin;
645
646 if ((error = in_nam2sin(nam, &sin)))
647 goto out;
648 if ((sin->sin_addr.s_addr == INADDR_ANY) ||
649 (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
650 IN_MULTICAST(sin->sin_addr.s_addr) ||
651 in_broadcast(sin->sin_addr, inp->inp_rtableid)) {
652 error = EINVAL;
653 goto out;
654 }
655 error = in_pcbconnect(inp, nam);
656 }
657 if (error)
658 goto out;
659
660 tp->t_template = tcp_template(tp);
661 if (tp->t_template == 0) {
662 in_pcbdisconnect(inp);
663 error = ENOBUFS;
664 goto out;
665 }
666
667 so->so_state |= SS_CONNECTOUT;
668
669 /* Compute window scaling to request. */
670 tcp_rscale(tp, sb_max);
671
672 soisconnecting(so);
673 tcpstat_inc(tcps_connattempt);
674 tp->t_state = TCPS_SYN_SENT;
675 TCP_TIMER_ARM(tp, TCPT_KEEP, TCP_TIME(tcptv_keep_init));
676 tcp_set_iss_tsm(tp);
677 tcp_sendseqinit(tp);
678 tp->snd_last = tp->snd_una;
679 error = tcp_output(tp);
680
681 out:
682 if (otp)
683 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_CONNECT, 0);
684 return (error);
685 }
686
687 /*
688 * Accept a connection. Essentially all the work is done at higher
689 * levels; just return the address of the peer, storing through addr.
690 */
691 int
692 tcp_accept(struct socket *so, struct mbuf *nam)
693 {
694 struct inpcb *inp;
695 struct tcpcb *tp;
696 int error;
697 short ostate;
698
699 soassertlocked(so);
700
701 if ((error = tcp_sogetpcb(so, &inp, &tp)))
702 return (error);
703
704 if (so->so_options & SO_DEBUG)
705 ostate = tp->t_state;
706
707 #ifdef INET6
708 if (inp->inp_flags & INP_IPV6)
709 in6_setpeeraddr(inp, nam);
710 else
711 #endif
712 in_setpeeraddr(inp, nam);
713
714 if (so->so_options & SO_DEBUG)
715 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_ACCEPT, 0);
716 return (error);
717 }
718
719 /*
720 * Initiate disconnect from peer.
721 * If connection never passed embryonic stage, just drop;
722 * else if don't need to let data drain, then can just drop anyways,
723 * else have to begin TCP shutdown process: mark socket disconnecting,
724 * drain unread data, state switch to reflect user close, and
725 * send segment (e.g. FIN) to peer. Socket will be really disconnected
726 * when peer sends FIN and acks ours.
727 *
728 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
729 */
730 int
731 tcp_disconnect(struct socket *so)
732 {
733 struct inpcb *inp;
734 struct tcpcb *tp, *otp = NULL;
735 int error;
736 short ostate;
737
738 soassertlocked(so);
739
740 if ((error = tcp_sogetpcb(so, &inp, &tp)))
741 return (error);
742
743 if (so->so_options & SO_DEBUG) {
744 otp = tp;
745 ostate = tp->t_state;
746 }
747
748 tp = tcp_dodisconnect(tp);
749
750 if (otp)
751 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DISCONNECT, 0);
752 return (0);
753 }
754
755 /*
756 * Mark the connection as being incapable of further output.
757 */
758 int
759 tcp_shutdown(struct socket *so)
760 {
761 struct inpcb *inp;
762 struct tcpcb *tp, *otp = NULL;
763 int error;
764 short ostate;
765
766 soassertlocked(so);
767
768 if ((error = tcp_sogetpcb(so, &inp, &tp)))
769 return (error);
770
771 if (so->so_options & SO_DEBUG) {
772 otp = tp;
773 ostate = tp->t_state;
774 }
775
776 if (so->so_snd.sb_state & SS_CANTSENDMORE)
777 goto out;
778
779 socantsendmore(so);
780 tp = tcp_usrclosed(tp);
781 if (tp)
782 error = tcp_output(tp);
783
784 out:
785 if (otp)
786 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_SHUTDOWN, 0);
787 return (error);
788 }
789
790 /*
791 * After a receive, possibly send window update to peer.
792 */
793 void
794 tcp_rcvd(struct socket *so)
795 {
796 struct inpcb *inp;
797 struct tcpcb *tp;
798 short ostate;
799
800 soassertlocked(so);
801
802 if (tcp_sogetpcb(so, &inp, &tp))
803 return;
804
805 if (so->so_options & SO_DEBUG)
806 ostate = tp->t_state;
807
808 /*
809 * soreceive() calls this function when a user receives
810 * ancillary data on a listening socket. We don't call
811 * tcp_output in such a case, since there is no header
812 * template for a listening socket and hence the kernel
813 * will panic.
814 */
815 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0)
816 (void) tcp_output(tp);
817
818 if (so->so_options & SO_DEBUG)
819 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_RCVD, 0);
820 }
821
822 /*
823 * Do a send by putting data in output queue and updating urgent
824 * marker if URG set. Possibly send more data.
825 */
826 int
827 tcp_send(struct socket *so, struct mbuf *m, struct mbuf *nam,
828 struct mbuf *control)
829 {
830 struct inpcb *inp;
831 struct tcpcb *tp;
832 int error;
833 short ostate;
834
835 soassertlocked(so);
836
837 if (control && control->m_len) {
838 error = EINVAL;
839 goto out;
840 }
841
842 if ((error = tcp_sogetpcb(so, &inp, &tp)))
843 goto out;
844
845 if (so->so_options & SO_DEBUG)
846 ostate = tp->t_state;
847
848 sbappendstream(so, &so->so_snd, m);
849 m = NULL;
850
851 error = tcp_output(tp);
852
853 if (so->so_options & SO_DEBUG)
854 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SEND, 0);
855
856 out:
857 m_freem(control);
858 m_freem(m);
859
860 return (error);
861 }
862
863 /*
864 * Abort the TCP.
865 */
866 void
867 tcp_abort(struct socket *so)
868 {
869 struct inpcb *inp;
870 struct tcpcb *tp, *otp = NULL;
871 short ostate;
872
873 soassertlocked(so);
874
875 if (tcp_sogetpcb(so, &inp, &tp))
876 return;
877
878 if (so->so_options & SO_DEBUG) {
879 otp = tp;
880 ostate = tp->t_state;
881 }
882
883 tp = tcp_drop(tp, ECONNABORTED);
884
885 if (otp)
886 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_ABORT, 0);
887 }
888
889 int
890 tcp_sense(struct socket *so, struct stat *ub)
891 {
892 struct inpcb *inp;
893 struct tcpcb *tp;
894 int error;
895
896 soassertlocked(so);
897
898 if ((error = tcp_sogetpcb(so, &inp, &tp)))
899 return (error);
900
901 ub->st_blksize = so->so_snd.sb_hiwat;
902
903 if (so->so_options & SO_DEBUG)
904 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_SENSE, 0);
905 return (0);
906 }
907
908 int
909 tcp_rcvoob(struct socket *so, struct mbuf *m, int flags)
910 {
911 struct inpcb *inp;
912 struct tcpcb *tp;
913 int error;
914
915 soassertlocked(so);
916
917 if ((error = tcp_sogetpcb(so, &inp, &tp)))
918 return (error);
919
920 if ((so->so_oobmark == 0 &&
921 (so->so_rcv.sb_state & SS_RCVATMARK) == 0) ||
922 so->so_options & SO_OOBINLINE ||
923 tp->t_oobflags & TCPOOB_HADDATA) {
924 error = EINVAL;
925 goto out;
926 }
927 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
928 error = EWOULDBLOCK;
929 goto out;
930 }
931 m->m_len = 1;
932 *mtod(m, caddr_t) = tp->t_iobc;
933 if ((flags & MSG_PEEK) == 0)
934 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
935 out:
936 if (so->so_options & SO_DEBUG)
937 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_RCVOOB, 0);
938 return (error);
939 }
940
941 int
942 tcp_sendoob(struct socket *so, struct mbuf *m, struct mbuf *nam,
943 struct mbuf *control)
944 {
945 struct inpcb *inp;
946 struct tcpcb *tp;
947 int error;
948 short ostate;
949
950 soassertlocked(so);
951
952 if (control && control->m_len) {
953 error = EINVAL;
954 goto release;
955 }
956
957 if ((error = tcp_sogetpcb(so, &inp, &tp)))
958 goto release;
959
960 if (so->so_options & SO_DEBUG)
961 ostate = tp->t_state;
962
963 if (sbspace(so, &so->so_snd) < -512) {
964 error = ENOBUFS;
965 goto out;
966 }
967
968 /*
969 * According to RFC961 (Assigned Protocols),
970 * the urgent pointer points to the last octet
971 * of urgent data. We continue, however,
972 * to consider it to indicate the first octet
973 * of data past the urgent section.
974 * Otherwise, snd_up should be one lower.
975 */
976 sbappendstream(so, &so->so_snd, m);
977 m = NULL;
978 tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
979 tp->t_force = 1;
980 error = tcp_output(tp);
981 tp->t_force = 0;
982
983 out:
984 if (so->so_options & SO_DEBUG)
985 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SENDOOB, 0);
986
987 release:
988 m_freem(control);
989 m_freem(m);
990
991 return (error);
992 }
993
994 int
995 tcp_sockaddr(struct socket *so, struct mbuf *nam)
996 {
997 struct inpcb *inp;
998 struct tcpcb *tp;
999 int error;
1000
1001 soassertlocked(so);
1002
1003 if ((error = tcp_sogetpcb(so, &inp, &tp)))
1004 return (error);
1005
1006 #ifdef INET6
1007 if (inp->inp_flags & INP_IPV6)
1008 in6_setsockaddr(inp, nam);
1009 else
1010 #endif
1011 in_setsockaddr(inp, nam);
1012
1013 if (so->so_options & SO_DEBUG)
1014 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL,
1015 PRU_SOCKADDR, 0);
1016 return (0);
1017 }
1018
1019 int
1020 tcp_peeraddr(struct socket *so, struct mbuf *nam)
1021 {
1022 struct inpcb *inp;
1023 struct tcpcb *tp;
1024 int error;
1025
1026 soassertlocked(so);
1027
1028 if ((error = tcp_sogetpcb(so, &inp, &tp)))
1029 return (error);
1030
1031 #ifdef INET6
1032 if (inp->inp_flags & INP_IPV6)
1033 in6_setpeeraddr(inp, nam);
1034 else
1035 #endif
1036 in_setpeeraddr(inp, nam);
1037
1038 if (so->so_options & SO_DEBUG)
1039 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL,
1040 PRU_PEERADDR, 0);
1041 return (0);
1042 }
1043
1044 /*
1045 * Initiate (or continue) disconnect.
1046 * If embryonic state, just send reset (once).
1047 * If in ``let data drain'' option and linger null, just drop.
1048 * Otherwise (hard), mark socket disconnecting and drop
1049 * current input data; switch states based on user close, and
1050 * send segment to peer (with FIN).
1051 */
1052 struct tcpcb *
1053 tcp_dodisconnect(struct tcpcb *tp)
1054 {
1055 struct socket *so = tp->t_inpcb->inp_socket;
1056
1057 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
1058 tp = tcp_close(tp);
1059 else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
1060 tp = tcp_drop(tp, 0);
1061 else {
1062 soisdisconnecting(so);
1063 sbflush(so, &so->so_rcv);
1064 tp = tcp_usrclosed(tp);
1065 if (tp)
1066 (void) tcp_output(tp);
1067 }
1068 return (tp);
1069 }
1070
1071 /*
1072 * User issued close, and wish to trail through shutdown states:
1073 * if never received SYN, just forget it. If got a SYN from peer,
1074 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
1075 * If already got a FIN from peer, then almost done; go to LAST_ACK
1076 * state. In all other cases, have already sent FIN to peer (e.g.
1077 * after PRU_SHUTDOWN), and just have to play tedious game waiting
1078 * for peer to send FIN or not respond to keep-alives, etc.
1079 * We can let the user exit from the close as soon as the FIN is acked.
1080 */
1081 struct tcpcb *
1082 tcp_usrclosed(struct tcpcb *tp)
1083 {
1084
1085 switch (tp->t_state) {
1086
1087 case TCPS_CLOSED:
1088 case TCPS_LISTEN:
1089 case TCPS_SYN_SENT:
1090 tp->t_state = TCPS_CLOSED;
1091 tp = tcp_close(tp);
1092 break;
1093
1094 case TCPS_SYN_RECEIVED:
1095 case TCPS_ESTABLISHED:
1096 tp->t_state = TCPS_FIN_WAIT_1;
1097 break;
1098
1099 case TCPS_CLOSE_WAIT:
1100 tp->t_state = TCPS_LAST_ACK;
1101 break;
1102 }
1103 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
1104 soisdisconnected(tp->t_inpcb->inp_socket);
1105 /*
1106 * If we are in FIN_WAIT_2, we arrived here because the
1107 * application did a shutdown of the send side. Like the
1108 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
1109 * a full close, we start a timer to make sure sockets are
1110 * not left in FIN_WAIT_2 forever.
1111 */
1112 if (tp->t_state == TCPS_FIN_WAIT_2)
1113 TCP_TIMER_ARM(tp, TCPT_2MSL, TCP_TIME(tcp_maxidle));
1114 }
1115 return (tp);
1116 }
1117
1118 /*
1119 * Look up a socket for ident or tcpdrop, ...
1120 */
1121 int
1122 tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop)
1123 {
1124 int error = 0;
1125 struct tcp_ident_mapping tir;
1126 struct inpcb *inp;
1127 struct tcpcb *tp = NULL;
1128 struct sockaddr_in *fin, *lin;
1129 #ifdef INET6
1130 struct sockaddr_in6 *fin6, *lin6;
1131 struct in6_addr f6, l6;
1132 #endif
1133
1134 NET_ASSERT_LOCKED();
1135
1136 if (dodrop) {
1137 if (oldp != NULL || *oldlenp != 0)
1138 return (EINVAL);
1139 if (newp == NULL)
1140 return (EPERM);
1141 if (newlen < sizeof(tir))
1142 return (ENOMEM);
1143 if ((error = copyin(newp, &tir, sizeof (tir))) != 0 )
1144 return (error);
1145 } else {
1146 if (oldp == NULL)
1147 return (EINVAL);
1148 if (*oldlenp < sizeof(tir))
1149 return (ENOMEM);
1150 if (newp != NULL || newlen != 0)
1151 return (EINVAL);
1152 if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 )
1153 return (error);
1154 }
1155 switch (tir.faddr.ss_family) {
1156 #ifdef INET6
1157 case AF_INET6:
1158 fin6 = (struct sockaddr_in6 *)&tir.faddr;
1159 error = in6_embedscope(&f6, fin6, NULL);
1160 if (error)
1161 return EINVAL; /*?*/
1162 lin6 = (struct sockaddr_in6 *)&tir.laddr;
1163 error = in6_embedscope(&l6, lin6, NULL);
1164 if (error)
1165 return EINVAL; /*?*/
1166 break;
1167 #endif
1168 case AF_INET:
1169 fin = (struct sockaddr_in *)&tir.faddr;
1170 lin = (struct sockaddr_in *)&tir.laddr;
1171 break;
1172 default:
1173 return (EINVAL);
1174 }
1175
1176 switch (tir.faddr.ss_family) {
1177 #ifdef INET6
1178 case AF_INET6:
1179 inp = in6_pcblookup(&tcbtable, &f6,
1180 fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain);
1181 break;
1182 #endif
1183 case AF_INET:
1184 inp = in_pcblookup(&tcbtable, fin->sin_addr,
1185 fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain);
1186 break;
1187 default:
1188 unhandled_af(tir.faddr.ss_family);
1189 }
1190
1191 if (dodrop) {
1192 if (inp && (tp = intotcpcb(inp)) &&
1193 ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0))
1194 tp = tcp_drop(tp, ECONNABORTED);
1195 else
1196 error = ESRCH;
1197 in_pcbunref(inp);
1198 return (error);
1199 }
1200
1201 if (inp == NULL) {
1202 tcpstat_inc(tcps_pcbhashmiss);
1203 switch (tir.faddr.ss_family) {
1204 #ifdef INET6
1205 case AF_INET6:
1206 inp = in6_pcblookup_listen(&tcbtable,
1207 &l6, lin6->sin6_port, NULL, tir.rdomain);
1208 break;
1209 #endif
1210 case AF_INET:
1211 inp = in_pcblookup_listen(&tcbtable,
1212 lin->sin_addr, lin->sin_port, NULL, tir.rdomain);
1213 break;
1214 }
1215 }
1216
1217 if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) {
1218 tir.ruid = inp->inp_socket->so_ruid;
1219 tir.euid = inp->inp_socket->so_euid;
1220 } else {
1221 tir.ruid = -1;
1222 tir.euid = -1;
1223 }
1224
1225 *oldlenp = sizeof (tir);
1226 error = copyout((void *)&tir, oldp, sizeof (tir));
1227 in_pcbunref(inp);
1228 return (error);
1229 }
1230
1231 int
1232 tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp)
1233 {
1234 uint64_t counters[tcps_ncounters];
1235 struct tcpstat tcpstat;
1236 struct syn_cache_set *set;
1237 int i = 0;
1238
1239 #define ASSIGN(field) do { tcpstat.field = counters[i++]; } while (0)
1240
1241 memset(&tcpstat, 0, sizeof tcpstat);
1242 counters_read(tcpcounters, counters, nitems(counters));
1243 ASSIGN(tcps_connattempt);
1244 ASSIGN(tcps_accepts);
1245 ASSIGN(tcps_connects);
1246 ASSIGN(tcps_drops);
1247 ASSIGN(tcps_conndrops);
1248 ASSIGN(tcps_closed);
1249 ASSIGN(tcps_segstimed);
1250 ASSIGN(tcps_rttupdated);
1251 ASSIGN(tcps_delack);
1252 ASSIGN(tcps_timeoutdrop);
1253 ASSIGN(tcps_rexmttimeo);
1254 ASSIGN(tcps_persisttimeo);
1255 ASSIGN(tcps_persistdrop);
1256 ASSIGN(tcps_keeptimeo);
1257 ASSIGN(tcps_keepprobe);
1258 ASSIGN(tcps_keepdrops);
1259 ASSIGN(tcps_sndtotal);
1260 ASSIGN(tcps_sndpack);
1261 ASSIGN(tcps_sndbyte);
1262 ASSIGN(tcps_sndrexmitpack);
1263 ASSIGN(tcps_sndrexmitbyte);
1264 ASSIGN(tcps_sndrexmitfast);
1265 ASSIGN(tcps_sndacks);
1266 ASSIGN(tcps_sndprobe);
1267 ASSIGN(tcps_sndurg);
1268 ASSIGN(tcps_sndwinup);
1269 ASSIGN(tcps_sndctrl);
1270 ASSIGN(tcps_rcvtotal);
1271 ASSIGN(tcps_rcvpack);
1272 ASSIGN(tcps_rcvbyte);
1273 ASSIGN(tcps_rcvbadsum);
1274 ASSIGN(tcps_rcvbadoff);
1275 ASSIGN(tcps_rcvmemdrop);
1276 ASSIGN(tcps_rcvnosec);
1277 ASSIGN(tcps_rcvshort);
1278 ASSIGN(tcps_rcvduppack);
1279 ASSIGN(tcps_rcvdupbyte);
1280 ASSIGN(tcps_rcvpartduppack);
1281 ASSIGN(tcps_rcvpartdupbyte);
1282 ASSIGN(tcps_rcvoopack);
1283 ASSIGN(tcps_rcvoobyte);
1284 ASSIGN(tcps_rcvpackafterwin);
1285 ASSIGN(tcps_rcvbyteafterwin);
1286 ASSIGN(tcps_rcvafterclose);
1287 ASSIGN(tcps_rcvwinprobe);
1288 ASSIGN(tcps_rcvdupack);
1289 ASSIGN(tcps_rcvacktoomuch);
1290 ASSIGN(tcps_rcvacktooold);
1291 ASSIGN(tcps_rcvackpack);
1292 ASSIGN(tcps_rcvackbyte);
1293 ASSIGN(tcps_rcvwinupd);
1294 ASSIGN(tcps_pawsdrop);
1295 ASSIGN(tcps_predack);
1296 ASSIGN(tcps_preddat);
1297 ASSIGN(tcps_pcbhashmiss);
1298 ASSIGN(tcps_noport);
1299 ASSIGN(tcps_badsyn);
1300 ASSIGN(tcps_dropsyn);
1301 ASSIGN(tcps_rcvbadsig);
1302 ASSIGN(tcps_rcvgoodsig);
1303 ASSIGN(tcps_inswcsum);
1304 ASSIGN(tcps_outswcsum);
1305 ASSIGN(tcps_ecn_accepts);
1306 ASSIGN(tcps_ecn_rcvece);
1307 ASSIGN(tcps_ecn_rcvcwr);
1308 ASSIGN(tcps_ecn_rcvce);
1309 ASSIGN(tcps_ecn_sndect);
1310 ASSIGN(tcps_ecn_sndece);
1311 ASSIGN(tcps_ecn_sndcwr);
1312 ASSIGN(tcps_cwr_ecn);
1313 ASSIGN(tcps_cwr_frecovery);
1314 ASSIGN(tcps_cwr_timeout);
1315 ASSIGN(tcps_sc_added);
1316 ASSIGN(tcps_sc_completed);
1317 ASSIGN(tcps_sc_timed_out);
1318 ASSIGN(tcps_sc_overflowed);
1319 ASSIGN(tcps_sc_reset);
1320 ASSIGN(tcps_sc_unreach);
1321 ASSIGN(tcps_sc_bucketoverflow);
1322 ASSIGN(tcps_sc_aborted);
1323 ASSIGN(tcps_sc_dupesyn);
1324 ASSIGN(tcps_sc_dropped);
1325 ASSIGN(tcps_sc_collisions);
1326 ASSIGN(tcps_sc_retransmitted);
1327 ASSIGN(tcps_sc_seedrandom);
1328 ASSIGN(tcps_sc_hash_size);
1329 ASSIGN(tcps_sc_entry_count);
1330 ASSIGN(tcps_sc_entry_limit);
1331 ASSIGN(tcps_sc_bucket_maxlen);
1332 ASSIGN(tcps_sc_bucket_limit);
1333 ASSIGN(tcps_sc_uses_left);
1334 ASSIGN(tcps_conndrained);
1335 ASSIGN(tcps_sack_recovery_episode);
1336 ASSIGN(tcps_sack_rexmits);
1337 ASSIGN(tcps_sack_rexmit_bytes);
1338 ASSIGN(tcps_sack_rcv_opts);
1339 ASSIGN(tcps_sack_snd_opts);
1340 ASSIGN(tcps_sack_drop_opts);
1341
1342 #undef ASSIGN
1343
1344 set = &tcp_syn_cache[tcp_syn_cache_active];
1345 tcpstat.tcps_sc_hash_size = set->scs_size;
1346 tcpstat.tcps_sc_entry_count = set->scs_count;
1347 tcpstat.tcps_sc_entry_limit = tcp_syn_cache_limit;
1348 tcpstat.tcps_sc_bucket_maxlen = 0;
1349 for (i = 0; i < set->scs_size; i++) {
1350 if (tcpstat.tcps_sc_bucket_maxlen <
1351 set->scs_buckethead[i].sch_length)
1352 tcpstat.tcps_sc_bucket_maxlen =
1353 set->scs_buckethead[i].sch_length;
1354 }
1355 tcpstat.tcps_sc_bucket_limit = tcp_syn_bucket_limit;
1356 tcpstat.tcps_sc_uses_left = set->scs_use;
1357
1358 return (sysctl_rdstruct(oldp, oldlenp, newp,
1359 &tcpstat, sizeof(tcpstat)));
1360 }
1361
1362 /*
1363 * Sysctl for tcp variables.
1364 */
1365 int
1366 tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
1367 size_t newlen)
1368 {
1369 int error, nval;
1370
1371 /* All sysctl names at this level are terminal. */
1372 if (namelen != 1)
1373 return (ENOTDIR);
1374
1375 switch (name[0]) {
1376 case TCPCTL_BADDYNAMIC:
1377 NET_LOCK();
1378 error = sysctl_struct(oldp, oldlenp, newp, newlen,
1379 baddynamicports.tcp, sizeof(baddynamicports.tcp));
1380 NET_UNLOCK();
1381 return (error);
1382
1383 case TCPCTL_ROOTONLY:
1384 if (newp && securelevel > 0)
1385 return (EPERM);
1386 NET_LOCK();
1387 error = sysctl_struct(oldp, oldlenp, newp, newlen,
1388 rootonlyports.tcp, sizeof(rootonlyports.tcp));
1389 NET_UNLOCK();
1390 return (error);
1391
1392 case TCPCTL_IDENT:
1393 NET_LOCK();
1394 error = tcp_ident(oldp, oldlenp, newp, newlen, 0);
1395 NET_UNLOCK();
1396 return (error);
1397
1398 case TCPCTL_DROP:
1399 NET_LOCK();
1400 error = tcp_ident(oldp, oldlenp, newp, newlen, 1);
1401 NET_UNLOCK();
1402 return (error);
1403
1404 case TCPCTL_REASS_LIMIT:
1405 NET_LOCK();
1406 nval = tcp_reass_limit;
1407 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
1408 if (!error && nval != tcp_reass_limit) {
1409 error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0);
1410 if (!error)
1411 tcp_reass_limit = nval;
1412 }
1413 NET_UNLOCK();
1414 return (error);
1415
1416 case TCPCTL_SACKHOLE_LIMIT:
1417 NET_LOCK();
1418 nval = tcp_sackhole_limit;
1419 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
1420 if (!error && nval != tcp_sackhole_limit) {
1421 error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0);
1422 if (!error)
1423 tcp_sackhole_limit = nval;
1424 }
1425 NET_UNLOCK();
1426 return (error);
1427
1428 case TCPCTL_STATS:
1429 return (tcp_sysctl_tcpstat(oldp, oldlenp, newp));
1430
1431 case TCPCTL_SYN_USE_LIMIT:
1432 NET_LOCK();
1433 error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
1434 &tcp_syn_use_limit, 0, INT_MAX);
1435 if (!error && newp != NULL) {
1436 /*
1437 * Global tcp_syn_use_limit is used when reseeding a
1438 * new cache. Also update the value in active cache.
1439 */
1440 if (tcp_syn_cache[0].scs_use > tcp_syn_use_limit)
1441 tcp_syn_cache[0].scs_use = tcp_syn_use_limit;
1442 if (tcp_syn_cache[1].scs_use > tcp_syn_use_limit)
1443 tcp_syn_cache[1].scs_use = tcp_syn_use_limit;
1444 }
1445 NET_UNLOCK();
1446 return (error);
1447
1448 case TCPCTL_SYN_HASH_SIZE:
1449 NET_LOCK();
1450 nval = tcp_syn_hash_size;
1451 error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
1452 &nval, 1, 100000);
1453 if (!error && nval != tcp_syn_hash_size) {
1454 /*
1455 * If global hash size has been changed,
1456 * switch sets as soon as possible. Then
1457 * the actual hash array will be reallocated.
1458 */
1459 if (tcp_syn_cache[0].scs_size != nval)
1460 tcp_syn_cache[0].scs_use = 0;
1461 if (tcp_syn_cache[1].scs_size != nval)
1462 tcp_syn_cache[1].scs_use = 0;
1463 tcp_syn_hash_size = nval;
1464 }
1465 NET_UNLOCK();
1466 return (error);
1467
1468 default:
1469 NET_LOCK();
1470 error = sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars), name,
1471 namelen, oldp, oldlenp, newp, newlen);
1472 NET_UNLOCK();
1473 return (error);
1474 }
1475 /* NOTREACHED */
1476 }
1477
1478 /*
1479 * Scale the send buffer so that inflight data is not accounted against
1480 * the limit. The buffer will scale with the congestion window, if the
1481 * the receiver stops acking data the window will shrink and therefore
1482 * the buffer size will shrink as well.
1483 * In low memory situation try to shrink the buffer to the initial size
1484 * disabling the send buffer scaling as long as the situation persists.
1485 */
1486 void
1487 tcp_update_sndspace(struct tcpcb *tp)
1488 {
1489 struct socket *so = tp->t_inpcb->inp_socket;
1490 u_long nmax = so->so_snd.sb_hiwat;
1491
1492 if (sbchecklowmem()) {
1493 /* low on memory try to get rid of some */
1494 if (tcp_sendspace < nmax)
1495 nmax = tcp_sendspace;
1496 } else if (so->so_snd.sb_wat != tcp_sendspace)
1497 /* user requested buffer size, auto-scaling disabled */
1498 nmax = so->so_snd.sb_wat;
1499 else
1500 /* automatic buffer scaling */
1501 nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max -
1502 tp->snd_una);
1503
1504 /* a writable socket must be preserved because of poll(2) semantics */
1505 if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) {
1506 if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat)
1507 nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat;
1508 /* keep in sync with sbreserve() calculation */
1509 if (nmax * 8 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat)
1510 nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+7) / 8;
1511 }
1512
1513 /* round to MSS boundary */
1514 nmax = roundup(nmax, tp->t_maxseg);
1515
1516 if (nmax != so->so_snd.sb_hiwat)
1517 sbreserve(so, &so->so_snd, nmax);
1518 }
1519
1520 /*
1521 * Scale the recv buffer by looking at how much data was transferred in
1522 * one approximated RTT. If more than a big part of the recv buffer was
1523 * transferred during that time we increase the buffer by a constant.
1524 * In low memory situation try to shrink the buffer to the initial size.
1525 */
1526 void
1527 tcp_update_rcvspace(struct tcpcb *tp)
1528 {
1529 struct socket *so = tp->t_inpcb->inp_socket;
1530 u_long nmax = so->so_rcv.sb_hiwat;
1531
1532 if (sbchecklowmem()) {
1533 /* low on memory try to get rid of some */
1534 if (tcp_recvspace < nmax)
1535 nmax = tcp_recvspace;
1536 } else if (so->so_rcv.sb_wat != tcp_recvspace)
1537 /* user requested buffer size, auto-scaling disabled */
1538 nmax = so->so_rcv.sb_wat;
1539 else {
1540 /* automatic buffer scaling */
1541 if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7)
1542 nmax = MIN(sb_max, so->so_rcv.sb_hiwat +
1543 tcp_autorcvbuf_inc);
1544 }
1545
1546 /* a readable socket must be preserved because of poll(2) semantics */
1547 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat &&
1548 nmax < so->so_snd.sb_lowat)
1549 nmax = so->so_snd.sb_lowat;
1550
1551 if (nmax == so->so_rcv.sb_hiwat)
1552 return;
1553
1554 /* round to MSS boundary */
1555 nmax = roundup(nmax, tp->t_maxseg);
1556 sbreserve(so, &so->so_rcv, nmax);
1557 }
Cache object: 5523041c5f4e697d384ff49c08e3e858
|