1 /* $OpenBSD: ip_output.c,v 1.382 2022/08/12 17:04:16 bluhm Exp $ */
2 /* $NetBSD: ip_output.c,v 1.28 1996/02/13 23:43:07 christos Exp $ */
3
4 /*
5 * Copyright (c) 1982, 1986, 1988, 1990, 1993
6 * The Regents of the University of California. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94
33 */
34
35 #include "pf.h"
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/mbuf.h>
40 #include <sys/protosw.h>
41 #include <sys/socket.h>
42 #include <sys/socketvar.h>
43 #include <sys/proc.h>
44 #include <sys/kernel.h>
45
46 #include <net/if.h>
47 #include <net/if_var.h>
48 #include <net/if_enc.h>
49 #include <net/route.h>
50
51 #include <netinet/in.h>
52 #include <netinet/ip.h>
53 #include <netinet/in_pcb.h>
54 #include <netinet/in_var.h>
55 #include <netinet/ip_var.h>
56 #include <netinet/ip_icmp.h>
57 #include <netinet/tcp.h>
58 #include <netinet/udp.h>
59 #include <netinet/tcp_timer.h>
60 #include <netinet/tcp_var.h>
61 #include <netinet/udp_var.h>
62
63 #if NPF > 0
64 #include <net/pfvar.h>
65 #endif
66
#ifdef IPSEC
/*
 * DPRINTF: debug output helper for the IPsec output helpers in this file.
 * With ENCDEBUG it prints "<function name>: <formatted message>\n" through
 * printf() whenever the variable encdebug is non-zero.  Without ENCDEBUG it
 * expands to an empty statement, so its arguments are not evaluated.
 */
#ifdef ENCDEBUG
#define DPRINTF(fmt, args...)						\
	do {								\
		if (encdebug)						\
			printf("%s: " fmt "\n", __func__, ## args);	\
	} while (0)
#else
#define DPRINTF(fmt, args...)						\
	do { } while (0)
#endif
#endif /* IPSEC */
79
/* Socket option and multicast helpers used by the code in this file. */
int ip_pcbopts(struct mbuf **, struct mbuf *);
int ip_multicast_if(struct ip_mreqn *, u_int, unsigned int *);
int ip_setmoptions(int, struct ip_moptions **, struct mbuf *, u_int);
void ip_mloopback(struct ifnet *, struct mbuf *, struct sockaddr_in *);

/* Checksum helpers. */
static __inline u_int16_t __attribute__((__unused__))
    in_cksum_phdr(u_int32_t, u_int32_t, u_int32_t);
void in_delayed_cksum(struct mbuf *);
int in_ifcap_cksum(struct mbuf *, struct ifnet *, int);

/*
 * IPsec output helpers.  They are declared unconditionally, but their
 * definitions further down are compiled only when IPSEC is defined.
 * NOTE(review): ip_output() passes its u_int32_t ipsecflowinfo to the
 * "int ipsecflowinfo" parameter of ip_output_ipsec_lookup() -- confirm
 * the implicit conversion is intended.
 */
int ip_output_ipsec_lookup(struct mbuf *m, int hlen, struct inpcb *inp,
    struct tdb **, int ipsecflowinfo);
void ip_output_ipsec_pmtu_update(struct tdb *, struct route *, struct in_addr,
    int, int);
int ip_output_ipsec_send(struct tdb *, struct mbuf *, struct route *, int);
94
95 /*
96 * IP output. The packet in mbuf chain m contains a skeletal IP
97 * header (with len, off, ttl, proto, tos, src, dst).
98 * The mbuf chain containing the packet will be freed.
99 * The mbuf opt, if present, will not be freed.
100 */
101 int
102 ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
103 struct ip_moptions *imo, struct inpcb *inp, u_int32_t ipsecflowinfo)
104 {
105 struct ip *ip;
106 struct ifnet *ifp = NULL;
107 struct mbuf_list fml;
108 int hlen = sizeof (struct ip);
109 int error = 0;
110 struct route iproute;
111 struct sockaddr_in *dst;
112 struct tdb *tdb = NULL;
113 u_long mtu;
114 #if NPF > 0
115 u_int orig_rtableid;
116 #endif
117
118 NET_ASSERT_LOCKED();
119
120 #ifdef IPSEC
121 if (inp && (inp->inp_flags & INP_IPV6) != 0)
122 panic("ip_output: IPv6 pcb is passed");
123 #endif /* IPSEC */
124
125 #ifdef DIAGNOSTIC
126 if ((m->m_flags & M_PKTHDR) == 0)
127 panic("ip_output no HDR");
128 #endif
129 if (opt)
130 m = ip_insertoptions(m, opt, &hlen);
131
132 ip = mtod(m, struct ip *);
133
134 /*
135 * Fill in IP header.
136 */
137 if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
138 ip->ip_v = IPVERSION;
139 ip->ip_off &= htons(IP_DF);
140 ip->ip_id = htons(ip_randomid());
141 ip->ip_hl = hlen >> 2;
142 ipstat_inc(ips_localout);
143 } else {
144 hlen = ip->ip_hl << 2;
145 }
146
147 /*
148 * We should not send traffic to 0/8 say both Stevens and RFCs
149 * 5735 section 3 and 1122 sections 3.2.1.3 and 3.3.6.
150 */
151 if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == 0) {
152 error = ENETUNREACH;
153 goto bad;
154 }
155
156 #if NPF > 0
157 orig_rtableid = m->m_pkthdr.ph_rtableid;
158 reroute:
159 #endif
160
161 /*
162 * Do a route lookup now in case we need the source address to
163 * do an SPD lookup in IPsec; for most packets, the source address
164 * is set at a higher level protocol. ICMPs and other packets
165 * though (e.g., traceroute) have a source address of zeroes.
166 */
167 if (ro == NULL) {
168 ro = &iproute;
169 memset(ro, 0, sizeof(*ro));
170 }
171
172 dst = satosin(&ro->ro_dst);
173
174 /*
175 * If there is a cached route, check that it is to the same
176 * destination and is still up. If not, free it and try again.
177 */
178 if (!rtisvalid(ro->ro_rt) ||
179 dst->sin_addr.s_addr != ip->ip_dst.s_addr ||
180 ro->ro_tableid != m->m_pkthdr.ph_rtableid) {
181 rtfree(ro->ro_rt);
182 ro->ro_rt = NULL;
183 }
184
185 if (ro->ro_rt == NULL) {
186 dst->sin_family = AF_INET;
187 dst->sin_len = sizeof(*dst);
188 dst->sin_addr = ip->ip_dst;
189 ro->ro_tableid = m->m_pkthdr.ph_rtableid;
190 }
191
192 if ((IN_MULTICAST(ip->ip_dst.s_addr) ||
193 (ip->ip_dst.s_addr == INADDR_BROADCAST)) &&
194 imo != NULL && (ifp = if_get(imo->imo_ifidx)) != NULL) {
195
196 mtu = ifp->if_mtu;
197 if (ip->ip_src.s_addr == INADDR_ANY) {
198 struct in_ifaddr *ia;
199
200 IFP_TO_IA(ifp, ia);
201 if (ia != NULL)
202 ip->ip_src = ia->ia_addr.sin_addr;
203 }
204 } else {
205 struct in_ifaddr *ia;
206
207 if (ro->ro_rt == NULL)
208 ro->ro_rt = rtalloc_mpath(&ro->ro_dst,
209 &ip->ip_src.s_addr, ro->ro_tableid);
210
211 if (ro->ro_rt == NULL) {
212 ipstat_inc(ips_noroute);
213 error = EHOSTUNREACH;
214 goto bad;
215 }
216
217 ia = ifatoia(ro->ro_rt->rt_ifa);
218 if (ISSET(ro->ro_rt->rt_flags, RTF_LOCAL))
219 ifp = if_get(rtable_loindex(m->m_pkthdr.ph_rtableid));
220 else
221 ifp = if_get(ro->ro_rt->rt_ifidx);
222 /*
223 * We aren't using rtisvalid() here because the UP/DOWN state
224 * machine is broken with some Ethernet drivers like em(4).
225 * As a result we might try to use an invalid cached route
226 * entry while an interface is being detached.
227 */
228 if (ifp == NULL) {
229 ipstat_inc(ips_noroute);
230 error = EHOSTUNREACH;
231 goto bad;
232 }
233 if ((mtu = ro->ro_rt->rt_mtu) == 0)
234 mtu = ifp->if_mtu;
235
236 if (ro->ro_rt->rt_flags & RTF_GATEWAY)
237 dst = satosin(ro->ro_rt->rt_gateway);
238
239 /* Set the source IP address */
240 if (ip->ip_src.s_addr == INADDR_ANY && ia)
241 ip->ip_src = ia->ia_addr.sin_addr;
242 }
243
244 #ifdef IPSEC
245 if (ipsec_in_use || inp != NULL) {
246 /* Do we have any pending SAs to apply ? */
247 error = ip_output_ipsec_lookup(m, hlen, inp, &tdb,
248 ipsecflowinfo);
249 if (error) {
250 /* Should silently drop packet */
251 if (error == -EINVAL)
252 error = 0;
253 goto bad;
254 }
255 if (tdb != NULL) {
256 /*
257 * If it needs TCP/UDP hardware-checksumming, do the
258 * computation now.
259 */
260 in_proto_cksum_out(m, NULL);
261 }
262 }
263 #endif /* IPSEC */
264
265 if (IN_MULTICAST(ip->ip_dst.s_addr) ||
266 (ip->ip_dst.s_addr == INADDR_BROADCAST)) {
267
268 m->m_flags |= (ip->ip_dst.s_addr == INADDR_BROADCAST) ?
269 M_BCAST : M_MCAST;
270
271 /*
272 * IP destination address is multicast. Make sure "dst"
273 * still points to the address in "ro". (It may have been
274 * changed to point to a gateway address, above.)
275 */
276 dst = satosin(&ro->ro_dst);
277
278 /*
279 * See if the caller provided any multicast options
280 */
281 if (imo != NULL)
282 ip->ip_ttl = imo->imo_ttl;
283 else
284 ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
285
286 /*
287 * if we don't know the outgoing ifp yet, we can't generate
288 * output
289 */
290 if (!ifp) {
291 ipstat_inc(ips_noroute);
292 error = EHOSTUNREACH;
293 goto bad;
294 }
295
296 /*
297 * Confirm that the outgoing interface supports multicast,
298 * but only if the packet actually is going out on that
299 * interface (i.e., no IPsec is applied).
300 */
301 if ((((m->m_flags & M_MCAST) &&
302 (ifp->if_flags & IFF_MULTICAST) == 0) ||
303 ((m->m_flags & M_BCAST) &&
304 (ifp->if_flags & IFF_BROADCAST) == 0)) && (tdb == NULL)) {
305 ipstat_inc(ips_noroute);
306 error = ENETUNREACH;
307 goto bad;
308 }
309
310 /*
311 * If source address not specified yet, use address
312 * of outgoing interface.
313 */
314 if (ip->ip_src.s_addr == INADDR_ANY) {
315 struct in_ifaddr *ia;
316
317 IFP_TO_IA(ifp, ia);
318 if (ia != NULL)
319 ip->ip_src = ia->ia_addr.sin_addr;
320 }
321
322 if ((imo == NULL || imo->imo_loop) &&
323 in_hasmulti(&ip->ip_dst, ifp)) {
324 /*
325 * If we belong to the destination multicast group
326 * on the outgoing interface, and the caller did not
327 * forbid loopback, loop back a copy.
328 * Can't defer TCP/UDP checksumming, do the
329 * computation now.
330 */
331 in_proto_cksum_out(m, NULL);
332 ip_mloopback(ifp, m, dst);
333 }
334 #ifdef MROUTING
335 else {
336 /*
337 * If we are acting as a multicast router, perform
338 * multicast forwarding as if the packet had just
339 * arrived on the interface to which we are about
340 * to send. The multicast forwarding function
341 * recursively calls this function, using the
342 * IP_FORWARDING flag to prevent infinite recursion.
343 *
344 * Multicasts that are looped back by ip_mloopback(),
345 * above, will be forwarded by the ip_input() routine,
346 * if necessary.
347 */
348 if (ipmforwarding && ip_mrouter[ifp->if_rdomain] &&
349 (flags & IP_FORWARDING) == 0) {
350 int rv;
351
352 KERNEL_LOCK();
353 rv = ip_mforward(m, ifp);
354 KERNEL_UNLOCK();
355 if (rv != 0)
356 goto bad;
357 }
358 }
359 #endif
360 /*
361 * Multicasts with a time-to-live of zero may be looped-
362 * back, above, but must not be transmitted on a network.
363 * Also, multicasts addressed to the loopback interface
364 * are not sent -- the above call to ip_mloopback() will
365 * loop back a copy if this host actually belongs to the
366 * destination group on the loopback interface.
367 */
368 if (ip->ip_ttl == 0 || (ifp->if_flags & IFF_LOOPBACK) != 0)
369 goto bad;
370
371 goto sendit;
372 }
373
374 /*
375 * Look for broadcast address and verify user is allowed to send
376 * such a packet; if the packet is going in an IPsec tunnel, skip
377 * this check.
378 */
379 if ((tdb == NULL) && ((dst->sin_addr.s_addr == INADDR_BROADCAST) ||
380 (ro && ro->ro_rt && ISSET(ro->ro_rt->rt_flags, RTF_BROADCAST)))) {
381 if ((ifp->if_flags & IFF_BROADCAST) == 0) {
382 error = EADDRNOTAVAIL;
383 goto bad;
384 }
385 if ((flags & IP_ALLOWBROADCAST) == 0) {
386 error = EACCES;
387 goto bad;
388 }
389
390 /* Don't allow broadcast messages to be fragmented */
391 if (ntohs(ip->ip_len) > ifp->if_mtu) {
392 error = EMSGSIZE;
393 goto bad;
394 }
395 m->m_flags |= M_BCAST;
396 } else
397 m->m_flags &= ~M_BCAST;
398
399 sendit:
400 /*
401 * If we're doing Path MTU discovery, we need to set DF unless
402 * the route's MTU is locked.
403 */
404 if ((flags & IP_MTUDISC) && ro && ro->ro_rt &&
405 (ro->ro_rt->rt_locks & RTV_MTU) == 0)
406 ip->ip_off |= htons(IP_DF);
407
408 #ifdef IPSEC
409 /*
410 * Check if the packet needs encapsulation.
411 */
412 if (tdb != NULL) {
413 /* Callee frees mbuf */
414 error = ip_output_ipsec_send(tdb, m, ro,
415 (flags & IP_FORWARDING) ? 1 : 0);
416 goto done;
417 }
418 #endif /* IPSEC */
419
420 /*
421 * Packet filter
422 */
423 #if NPF > 0
424 if (pf_test(AF_INET, (flags & IP_FORWARDING) ? PF_FWD : PF_OUT,
425 ifp, &m) != PF_PASS) {
426 error = EACCES;
427 goto bad;
428 }
429 if (m == NULL)
430 goto done;
431 ip = mtod(m, struct ip *);
432 hlen = ip->ip_hl << 2;
433 if ((m->m_pkthdr.pf.flags & (PF_TAG_REROUTE | PF_TAG_GENERATED)) ==
434 (PF_TAG_REROUTE | PF_TAG_GENERATED))
435 /* already rerun the route lookup, go on */
436 m->m_pkthdr.pf.flags &= ~(PF_TAG_GENERATED | PF_TAG_REROUTE);
437 else if (m->m_pkthdr.pf.flags & PF_TAG_REROUTE) {
438 /* tag as generated to skip over pf_test on rerun */
439 m->m_pkthdr.pf.flags |= PF_TAG_GENERATED;
440 ro = NULL;
441 if_put(ifp); /* drop reference since target changed */
442 ifp = NULL;
443 goto reroute;
444 }
445 #endif
446 in_proto_cksum_out(m, ifp);
447
448 #ifdef IPSEC
449 if (ipsec_in_use && (flags & IP_FORWARDING) && (ipforwarding == 2) &&
450 (m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL) == NULL)) {
451 error = EHOSTUNREACH;
452 goto bad;
453 }
454 #endif
455
456 /*
457 * If small enough for interface, can just send directly.
458 */
459 if (ntohs(ip->ip_len) <= mtu) {
460 ip->ip_sum = 0;
461 if (in_ifcap_cksum(m, ifp, IFCAP_CSUM_IPv4))
462 m->m_pkthdr.csum_flags |= M_IPV4_CSUM_OUT;
463 else {
464 ipstat_inc(ips_outswcsum);
465 ip->ip_sum = in_cksum(m, hlen);
466 }
467
468 error = ifp->if_output(ifp, m, sintosa(dst), ro->ro_rt);
469 goto done;
470 }
471
472 /*
473 * Too large for interface; fragment if possible.
474 * Must be able to put at least 8 bytes per fragment.
475 */
476 if (ip->ip_off & htons(IP_DF)) {
477 #ifdef IPSEC
478 if (ip_mtudisc)
479 ipsec_adjust_mtu(m, ifp->if_mtu);
480 #endif
481 error = EMSGSIZE;
482 #if NPF > 0
483 /* pf changed routing table, use orig rtable for path MTU */
484 if (ro->ro_tableid != orig_rtableid) {
485 rtfree(ro->ro_rt);
486 ro->ro_tableid = orig_rtableid;
487 ro->ro_rt = icmp_mtudisc_clone(
488 satosin(&ro->ro_dst)->sin_addr, ro->ro_tableid, 0);
489 }
490 #endif
491 /*
492 * This case can happen if the user changed the MTU
493 * of an interface after enabling IP on it. Because
494 * most netifs don't keep track of routes pointing to
495 * them, there is no way for one to update all its
496 * routes when the MTU is changed.
497 */
498 if (rtisvalid(ro->ro_rt) &&
499 ISSET(ro->ro_rt->rt_flags, RTF_HOST) &&
500 !(ro->ro_rt->rt_locks & RTV_MTU) &&
501 (ro->ro_rt->rt_mtu > ifp->if_mtu)) {
502 ro->ro_rt->rt_mtu = ifp->if_mtu;
503 }
504 ipstat_inc(ips_cantfrag);
505 goto bad;
506 }
507
508 error = ip_fragment(m, &fml, ifp, mtu);
509 if (error)
510 goto done;
511
512 while ((m = ml_dequeue(&fml)) != NULL) {
513 error = ifp->if_output(ifp, m, sintosa(dst), ro->ro_rt);
514 if (error)
515 break;
516 }
517 if (error)
518 ml_purge(&fml);
519 else
520 ipstat_inc(ips_fragmented);
521
522 done:
523 if (ro == &iproute && ro->ro_rt)
524 rtfree(ro->ro_rt);
525 if_put(ifp);
526 #ifdef IPSEC
527 tdb_unref(tdb);
528 #endif /* IPSEC */
529 return (error);
530
531 bad:
532 m_freem(m);
533 goto done;
534 }
535
536 #ifdef IPSEC
537 int
538 ip_output_ipsec_lookup(struct mbuf *m, int hlen, struct inpcb *inp,
539 struct tdb **tdbout, int ipsecflowinfo)
540 {
541 struct m_tag *mtag;
542 struct tdb_ident *tdbi;
543 struct tdb *tdb;
544 struct ipsec_ids *ids = NULL;
545 int error;
546
547 /* Do we have any pending SAs to apply ? */
548 if (ipsecflowinfo)
549 ids = ipsp_ids_lookup(ipsecflowinfo);
550 error = ipsp_spd_lookup(m, AF_INET, hlen, IPSP_DIRECTION_OUT,
551 NULL, inp, &tdb, ids);
552 ipsp_ids_free(ids);
553 if (error || tdb == NULL) {
554 *tdbout = NULL;
555 return error;
556 }
557 /* Loop detection */
558 for (mtag = m_tag_first(m); mtag != NULL; mtag = m_tag_next(m, mtag)) {
559 if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE)
560 continue;
561 tdbi = (struct tdb_ident *)(mtag + 1);
562 if (tdbi->spi == tdb->tdb_spi &&
563 tdbi->proto == tdb->tdb_sproto &&
564 tdbi->rdomain == tdb->tdb_rdomain &&
565 !memcmp(&tdbi->dst, &tdb->tdb_dst,
566 sizeof(union sockaddr_union))) {
567 /* no IPsec needed */
568 tdb_unref(tdb);
569 *tdbout = NULL;
570 return 0;
571 }
572 }
573 *tdbout = tdb;
574 return 0;
575 }
576
577 void
578 ip_output_ipsec_pmtu_update(struct tdb *tdb, struct route *ro,
579 struct in_addr dst, int rtableid, int transportmode)
580 {
581 struct rtentry *rt = NULL;
582 int rt_mtucloned = 0;
583
584 /* Find a host route to store the mtu in */
585 if (ro != NULL)
586 rt = ro->ro_rt;
587 /* but don't add a PMTU route for transport mode SAs */
588 if (transportmode)
589 rt = NULL;
590 else if (rt == NULL || (rt->rt_flags & RTF_HOST) == 0) {
591 rt = icmp_mtudisc_clone(dst, rtableid, 1);
592 rt_mtucloned = 1;
593 }
594 DPRINTF("spi %08x mtu %d rt %p cloned %d",
595 ntohl(tdb->tdb_spi), tdb->tdb_mtu, rt, rt_mtucloned);
596 if (rt != NULL) {
597 rt->rt_mtu = tdb->tdb_mtu;
598 if (ro != NULL && ro->ro_rt != NULL) {
599 rtfree(ro->ro_rt);
600 ro->ro_rt = rtalloc(&ro->ro_dst, RT_RESOLVE, rtableid);
601 }
602 if (rt_mtucloned)
603 rtfree(rt);
604 }
605 }
606
607 int
608 ip_output_ipsec_send(struct tdb *tdb, struct mbuf *m, struct route *ro, int fwd)
609 {
610 #if NPF > 0
611 struct ifnet *encif;
612 #endif
613 struct ip *ip;
614 struct in_addr dst;
615 int error, rtableid;
616
617 #if NPF > 0
618 /*
619 * Packet filter
620 */
621 if ((encif = enc_getif(tdb->tdb_rdomain, tdb->tdb_tap)) == NULL ||
622 pf_test(AF_INET, fwd ? PF_FWD : PF_OUT, encif, &m) != PF_PASS) {
623 m_freem(m);
624 return EACCES;
625 }
626 if (m == NULL)
627 return 0;
628 /*
629 * PF_TAG_REROUTE handling or not...
630 * Packet is entering IPsec so the routing is
631 * already overruled by the IPsec policy.
632 * Until now the change was not reconsidered.
633 * What's the behaviour?
634 */
635 in_proto_cksum_out(m, encif);
636 #endif
637
638 /* Check if we are allowed to fragment */
639 ip = mtod(m, struct ip *);
640 dst = ip->ip_dst;
641 rtableid = m->m_pkthdr.ph_rtableid;
642 if (ip_mtudisc && (ip->ip_off & htons(IP_DF)) && tdb->tdb_mtu &&
643 ntohs(ip->ip_len) > tdb->tdb_mtu &&
644 tdb->tdb_mtutimeout > gettime()) {
645 int transportmode;
646
647 transportmode = (tdb->tdb_dst.sa.sa_family == AF_INET) &&
648 (tdb->tdb_dst.sin.sin_addr.s_addr == dst.s_addr);
649 ip_output_ipsec_pmtu_update(tdb, ro, dst, rtableid,
650 transportmode);
651 ipsec_adjust_mtu(m, tdb->tdb_mtu);
652 m_freem(m);
653 return EMSGSIZE;
654 }
655 /* propagate IP_DF for v4-over-v6 */
656 if (ip_mtudisc && ip->ip_off & htons(IP_DF))
657 SET(m->m_pkthdr.csum_flags, M_IPV6_DF_OUT);
658
659 /*
660 * Clear these -- they'll be set in the recursive invocation
661 * as needed.
662 */
663 m->m_flags &= ~(M_MCAST | M_BCAST);
664
665 /* Callee frees mbuf */
666 KERNEL_LOCK();
667 error = ipsp_process_packet(m, tdb, AF_INET, 0);
668 KERNEL_UNLOCK();
669 if (error) {
670 ipsecstat_inc(ipsec_odrops);
671 tdbstat_inc(tdb, tdb_odrops);
672 }
673 if (ip_mtudisc && error == EMSGSIZE)
674 ip_output_ipsec_pmtu_update(tdb, ro, dst, rtableid, 0);
675 return error;
676 }
677 #endif /* IPSEC */
678
679 int
680 ip_fragment(struct mbuf *m0, struct mbuf_list *fml, struct ifnet *ifp,
681 u_long mtu)
682 {
683 struct mbuf *m;
684 struct ip *ip;
685 int firstlen, hlen, tlen, len, off;
686 int error;
687
688 ml_init(fml);
689 ml_enqueue(fml, m0);
690
691 ip = mtod(m0, struct ip *);
692 hlen = ip->ip_hl << 2;
693 tlen = m0->m_pkthdr.len;
694 len = (mtu - hlen) &~ 7;
695 if (len < 8) {
696 error = EMSGSIZE;
697 goto bad;
698 }
699 firstlen = len;
700
701 /*
702 * If we are doing fragmentation, we can't defer TCP/UDP
703 * checksumming; compute the checksum and clear the flag.
704 */
705 in_proto_cksum_out(m0, NULL);
706
707 /*
708 * Loop through length of segment after first fragment,
709 * make new header and copy data of each part and link onto chain.
710 */
711 for (off = hlen + firstlen; off < tlen; off += len) {
712 struct ip *mhip;
713 int mhlen;
714
715 MGETHDR(m, M_DONTWAIT, MT_HEADER);
716 if (m == NULL) {
717 error = ENOBUFS;
718 goto bad;
719 }
720 ml_enqueue(fml, m);
721
722 if ((error = m_dup_pkthdr(m, m0, M_DONTWAIT)) != 0)
723 goto bad;
724 m->m_data += max_linkhdr;
725 mhip = mtod(m, struct ip *);
726 *mhip = *ip;
727 if (hlen > sizeof(struct ip)) {
728 mhlen = ip_optcopy(ip, mhip) + sizeof(struct ip);
729 mhip->ip_hl = mhlen >> 2;
730 } else
731 mhlen = sizeof(struct ip);
732 m->m_len = mhlen;
733
734 mhip->ip_off = ((off - hlen) >> 3) +
735 (ntohs(ip->ip_off) & ~IP_MF);
736 if (ip->ip_off & htons(IP_MF))
737 mhip->ip_off |= IP_MF;
738 if (off + len >= tlen)
739 len = tlen - off;
740 else
741 mhip->ip_off |= IP_MF;
742 mhip->ip_off = htons(mhip->ip_off);
743
744 m->m_pkthdr.len = mhlen + len;
745 mhip->ip_len = htons(m->m_pkthdr.len);
746 m->m_next = m_copym(m0, off, len, M_NOWAIT);
747 if (m->m_next == NULL) {
748 error = ENOBUFS;
749 goto bad;
750 }
751
752 mhip->ip_sum = 0;
753 if (in_ifcap_cksum(m, ifp, IFCAP_CSUM_IPv4))
754 m->m_pkthdr.csum_flags |= M_IPV4_CSUM_OUT;
755 else {
756 ipstat_inc(ips_outswcsum);
757 mhip->ip_sum = in_cksum(m, mhlen);
758 }
759 }
760
761 /*
762 * Update first fragment by trimming what's been copied out
763 * and updating header, then send each fragment (in order).
764 */
765 m = m0;
766 m_adj(m, hlen + firstlen - tlen);
767 ip->ip_off |= htons(IP_MF);
768 ip->ip_len = htons(m->m_pkthdr.len);
769
770 ip->ip_sum = 0;
771 if (in_ifcap_cksum(m, ifp, IFCAP_CSUM_IPv4))
772 m->m_pkthdr.csum_flags |= M_IPV4_CSUM_OUT;
773 else {
774 ipstat_inc(ips_outswcsum);
775 ip->ip_sum = in_cksum(m, hlen);
776 }
777
778 ipstat_add(ips_ofragments, ml_len(fml));
779 return (0);
780
781 bad:
782 ipstat_inc(ips_odropped);
783 ml_purge(fml);
784 return (error);
785 }
786
787 /*
788 * Insert IP options into preformed packet.
789 * Adjust IP destination as required for IP source routing,
790 * as indicated by a non-zero in_addr at the start of the options.
791 */
792 struct mbuf *
793 ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen)
794 {
795 struct ipoption *p = mtod(opt, struct ipoption *);
796 struct mbuf *n;
797 struct ip *ip = mtod(m, struct ip *);
798 unsigned int optlen;
799
800 optlen = opt->m_len - sizeof(p->ipopt_dst);
801 if (optlen + ntohs(ip->ip_len) > IP_MAXPACKET)
802 return (m); /* XXX should fail */
803
804 /* check if options will fit to IP header */
805 if ((optlen + sizeof(struct ip)) > (0x0f << 2)) {
806 *phlen = sizeof(struct ip);
807 return (m);
808 }
809
810 if (p->ipopt_dst.s_addr)
811 ip->ip_dst = p->ipopt_dst;
812 if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) {
813 MGETHDR(n, M_DONTWAIT, MT_HEADER);
814 if (n == NULL)
815 return (m);
816 M_MOVE_HDR(n, m);
817 n->m_pkthdr.len += optlen;
818 m->m_len -= sizeof(struct ip);
819 m->m_data += sizeof(struct ip);
820 n->m_next = m;
821 m = n;
822 m->m_len = optlen + sizeof(struct ip);
823 m->m_data += max_linkhdr;
824 memcpy(mtod(m, caddr_t), ip, sizeof(struct ip));
825 } else {
826 m->m_data -= optlen;
827 m->m_len += optlen;
828 m->m_pkthdr.len += optlen;
829 memmove(mtod(m, caddr_t), (caddr_t)ip, sizeof(struct ip));
830 }
831 ip = mtod(m, struct ip *);
832 memcpy(ip + 1, p->ipopt_list, optlen);
833 *phlen = sizeof(struct ip) + optlen;
834 ip->ip_len = htons(ntohs(ip->ip_len) + optlen);
835 return (m);
836 }
837
838 /*
839 * Copy options from ip to jp,
840 * omitting those not copied during fragmentation.
841 */
842 int
843 ip_optcopy(struct ip *ip, struct ip *jp)
844 {
845 u_char *cp, *dp;
846 int opt, optlen, cnt;
847
848 cp = (u_char *)(ip + 1);
849 dp = (u_char *)(jp + 1);
850 cnt = (ip->ip_hl << 2) - sizeof (struct ip);
851 for (; cnt > 0; cnt -= optlen, cp += optlen) {
852 opt = cp[0];
853 if (opt == IPOPT_EOL)
854 break;
855 if (opt == IPOPT_NOP) {
856 /* Preserve for IP mcast tunnel's LSRR alignment. */
857 *dp++ = IPOPT_NOP;
858 optlen = 1;
859 continue;
860 }
861 #ifdef DIAGNOSTIC
862 if (cnt < IPOPT_OLEN + sizeof(*cp))
863 panic("malformed IPv4 option passed to ip_optcopy");
864 #endif
865 optlen = cp[IPOPT_OLEN];
866 #ifdef DIAGNOSTIC
867 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
868 panic("malformed IPv4 option passed to ip_optcopy");
869 #endif
870 /* bogus lengths should have been caught by ip_dooptions */
871 if (optlen > cnt)
872 optlen = cnt;
873 if (IPOPT_COPIED(opt)) {
874 memcpy(dp, cp, optlen);
875 dp += optlen;
876 }
877 }
878 for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++)
879 *dp++ = IPOPT_EOL;
880 return (optlen);
881 }
882
/*
 * IP socket option processing.
 *
 * op is PRCO_SETOPT or PRCO_GETOPT, optname selects the option and m
 * carries the option value: it is read for PRCO_SETOPT and filled in
 * (m_len plus data) for PRCO_GETOPT.  The options act on the inpcb of
 * the socket so.
 *
 * Returns 0 or an errno value: EINVAL if level is not IPPROTO_IP or the
 * value is malformed, ENOPROTOOPT for an unknown optname, and the option
 * specific errors set below.  Any other op value returns 0 without doing
 * anything.
 *
 * Note that the PRCO_GETOPT code writes through m without a NULL check.
 */
int
ip_ctloutput(int op, struct socket *so, int level, int optname,
    struct mbuf *m)
{
	struct inpcb *inp = sotoinpcb(so);
	int optval = 0;
	struct proc *p = curproc; /* XXX */
	int error = 0;
	u_int rtableid, rtid = 0;

	if (level != IPPROTO_IP)
		return (EINVAL);

	/* Routing table of the calling process, used for SO_RTABLE. */
	rtableid = p->p_p->ps_rtableid;

	switch (op) {
	case PRCO_SETOPT:
		switch (optname) {
		case IP_OPTIONS:
			return (ip_pcbopts(&inp->inp_options, m));

		/* Options whose value is a single int. */
		case IP_TOS:
		case IP_TTL:
		case IP_MINTTL:
		case IP_RECVOPTS:
		case IP_RECVRETOPTS:
		case IP_RECVDSTADDR:
		case IP_RECVIF:
		case IP_RECVTTL:
		case IP_RECVDSTPORT:
		case IP_RECVRTABLE:
		case IP_IPSECFLOWINFO:
			if (m == NULL || m->m_len != sizeof(int))
				error = EINVAL;
			else {
				optval = *mtod(m, int *);
				switch (optname) {

				case IP_TOS:
					inp->inp_ip.ip_tos = optval;
					break;

				case IP_TTL:
					/* -1 selects the system default ttl */
					if (optval > 0 && optval <= MAXTTL)
						inp->inp_ip.ip_ttl = optval;
					else if (optval == -1)
						inp->inp_ip.ip_ttl = ip_defttl;
					else
						error = EINVAL;
					break;

				case IP_MINTTL:
					if (optval >= 0 && optval <= MAXTTL)
						inp->inp_ip_minttl = optval;
					else
						error = EINVAL;
					break;
/* Set or clear a flag bit in inp_flags depending on optval. */
#define OPTSET(bit) \
	if (optval) \
		inp->inp_flags |= bit; \
	else \
		inp->inp_flags &= ~bit;

				case IP_RECVOPTS:
					OPTSET(INP_RECVOPTS);
					break;

				case IP_RECVRETOPTS:
					OPTSET(INP_RECVRETOPTS);
					break;

				case IP_RECVDSTADDR:
					OPTSET(INP_RECVDSTADDR);
					break;
				case IP_RECVIF:
					OPTSET(INP_RECVIF);
					break;
				case IP_RECVTTL:
					OPTSET(INP_RECVTTL);
					break;
				case IP_RECVDSTPORT:
					OPTSET(INP_RECVDSTPORT);
					break;
				case IP_RECVRTABLE:
					OPTSET(INP_RECVRTABLE);
					break;
				case IP_IPSECFLOWINFO:
					OPTSET(INP_IPSECFLOWINFO);
					break;
				}
			}
			break;
#undef OPTSET

		/* Multicast options are handled by ip_setmoptions(). */
		case IP_MULTICAST_IF:
		case IP_MULTICAST_TTL:
		case IP_MULTICAST_LOOP:
		case IP_ADD_MEMBERSHIP:
		case IP_DROP_MEMBERSHIP:
			error = ip_setmoptions(optname, &inp->inp_moptions, m,
			    inp->inp_rtableid);
			break;

		case IP_PORTRANGE:
			if (m == NULL || m->m_len != sizeof(int))
				error = EINVAL;
			else {
				optval = *mtod(m, int *);

				/* INP_LOWPORT and INP_HIGHPORT are exclusive. */
				switch (optval) {

				case IP_PORTRANGE_DEFAULT:
					inp->inp_flags &= ~(INP_LOWPORT);
					inp->inp_flags &= ~(INP_HIGHPORT);
					break;

				case IP_PORTRANGE_HIGH:
					inp->inp_flags &= ~(INP_LOWPORT);
					inp->inp_flags |= INP_HIGHPORT;
					break;

				case IP_PORTRANGE_LOW:
					inp->inp_flags &= ~(INP_HIGHPORT);
					inp->inp_flags |= INP_LOWPORT;
					break;

				default:

					error = EINVAL;
					break;
				}
			}
			break;
		case IP_AUTH_LEVEL:
		case IP_ESP_TRANS_LEVEL:
		case IP_ESP_NETWORK_LEVEL:
		case IP_IPCOMP_LEVEL:
#ifndef IPSEC
			error = EOPNOTSUPP;
#else
			if (m == NULL || m->m_len != sizeof(int)) {
				error = EINVAL;
				break;
			}
			optval = *mtod(m, int *);

			if (optval < IPSEC_LEVEL_BYPASS ||
			    optval > IPSEC_LEVEL_UNIQUE) {
				error = EINVAL;
				break;
			}

			/*
			 * Setting a level below the respective default
			 * is refused with EACCES when suser() fails.
			 */
			switch (optname) {
			case IP_AUTH_LEVEL:
				if (optval < IPSEC_AUTH_LEVEL_DEFAULT &&
				    suser(p)) {
					error = EACCES;
					break;
				}
				inp->inp_seclevel[SL_AUTH] = optval;
				break;

			case IP_ESP_TRANS_LEVEL:
				if (optval < IPSEC_ESP_TRANS_LEVEL_DEFAULT &&
				    suser(p)) {
					error = EACCES;
					break;
				}
				inp->inp_seclevel[SL_ESP_TRANS] = optval;
				break;

			case IP_ESP_NETWORK_LEVEL:
				if (optval < IPSEC_ESP_NETWORK_LEVEL_DEFAULT &&
				    suser(p)) {
					error = EACCES;
					break;
				}
				inp->inp_seclevel[SL_ESP_NETWORK] = optval;
				break;
			case IP_IPCOMP_LEVEL:
				if (optval < IPSEC_IPCOMP_LEVEL_DEFAULT &&
				    suser(p)) {
					error = EACCES;
					break;
				}
				inp->inp_seclevel[SL_IPCOMP] = optval;
				break;
			}
#endif
			break;

		case IP_IPSEC_LOCAL_ID:
		case IP_IPSEC_REMOTE_ID:
			error = EOPNOTSUPP;
			break;
		case SO_RTABLE:
			if (m == NULL || m->m_len < sizeof(u_int)) {
				error = EINVAL;
				break;
			}
			rtid = *mtod(m, u_int *);
			/* nothing to do if the table does not change */
			if (inp->inp_rtableid == rtid)
				break;
			/* needs privileges to switch when already set */
			if (rtableid != rtid && rtableid != 0 &&
			    (error = suser(p)) != 0)
				break;
			/* table must exist */
			if (!rtable_exists(rtid)) {
				error = EINVAL;
				break;
			}
			/* refuse once a local port has been assigned */
			if (inp->inp_lport) {
				error = EBUSY;
				break;
			}
			inp->inp_rtableid = rtid;
			in_pcbrehash(inp);
			break;
		case IP_PIPEX:
			if (m != NULL && m->m_len == sizeof(int))
				inp->inp_pipex = *mtod(m, int *);
			else
				error = EINVAL;
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		break;

	case PRCO_GETOPT:
		switch (optname) {
		case IP_OPTIONS:
		case IP_RETOPTS:
			/* Copy out the stored options mbuf, if any. */
			if (inp->inp_options) {
				m->m_len = inp->inp_options->m_len;
				memcpy(mtod(m, caddr_t),
				    mtod(inp->inp_options, caddr_t), m->m_len);
			} else
				m->m_len = 0;
			break;

		/* Options reported as a single int. */
		case IP_TOS:
		case IP_TTL:
		case IP_MINTTL:
		case IP_RECVOPTS:
		case IP_RECVRETOPTS:
		case IP_RECVDSTADDR:
		case IP_RECVIF:
		case IP_RECVTTL:
		case IP_RECVDSTPORT:
		case IP_RECVRTABLE:
		case IP_IPSECFLOWINFO:
		case IP_IPDEFTTL:
			m->m_len = sizeof(int);
			switch (optname) {

			case IP_TOS:
				optval = inp->inp_ip.ip_tos;
				break;

			case IP_TTL:
				optval = inp->inp_ip.ip_ttl;
				break;

			case IP_MINTTL:
				optval = inp->inp_ip_minttl;
				break;

			case IP_IPDEFTTL:
				optval = ip_defttl;
				break;

/* 1 if the flag bit is set in inp_flags, 0 otherwise. */
#define OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)

			case IP_RECVOPTS:
				optval = OPTBIT(INP_RECVOPTS);
				break;

			case IP_RECVRETOPTS:
				optval = OPTBIT(INP_RECVRETOPTS);
				break;

			case IP_RECVDSTADDR:
				optval = OPTBIT(INP_RECVDSTADDR);
				break;
			case IP_RECVIF:
				optval = OPTBIT(INP_RECVIF);
				break;
			case IP_RECVTTL:
				optval = OPTBIT(INP_RECVTTL);
				break;
			case IP_RECVDSTPORT:
				optval = OPTBIT(INP_RECVDSTPORT);
				break;
			case IP_RECVRTABLE:
				optval = OPTBIT(INP_RECVRTABLE);
				break;
			case IP_IPSECFLOWINFO:
				optval = OPTBIT(INP_IPSECFLOWINFO);
				break;
			}
			*mtod(m, int *) = optval;
			break;

		/* Multicast options are handled by ip_getmoptions(). */
		case IP_MULTICAST_IF:
		case IP_MULTICAST_TTL:
		case IP_MULTICAST_LOOP:
		case IP_ADD_MEMBERSHIP:
		case IP_DROP_MEMBERSHIP:
			error = ip_getmoptions(optname, inp->inp_moptions, m);
			break;

		case IP_PORTRANGE:
			m->m_len = sizeof(int);

			/* 0 is reported when neither flag is set. */
			if (inp->inp_flags & INP_HIGHPORT)
				optval = IP_PORTRANGE_HIGH;
			else if (inp->inp_flags & INP_LOWPORT)
				optval = IP_PORTRANGE_LOW;
			else
				optval = 0;

			*mtod(m, int *) = optval;
			break;

		case IP_AUTH_LEVEL:
		case IP_ESP_TRANS_LEVEL:
		case IP_ESP_NETWORK_LEVEL:
		case IP_IPCOMP_LEVEL:
#ifndef IPSEC
			/* Without IPSEC every level reads as "none". */
			m->m_len = sizeof(int);
			*mtod(m, int *) = IPSEC_LEVEL_NONE;
#else
			m->m_len = sizeof(int);
			switch (optname) {
			case IP_AUTH_LEVEL:
				optval = inp->inp_seclevel[SL_AUTH];
				break;

			case IP_ESP_TRANS_LEVEL:
				optval = inp->inp_seclevel[SL_ESP_TRANS];
				break;

			case IP_ESP_NETWORK_LEVEL:
				optval = inp->inp_seclevel[SL_ESP_NETWORK];
				break;
			case IP_IPCOMP_LEVEL:
				optval = inp->inp_seclevel[SL_IPCOMP];
				break;
			}
			*mtod(m, int *) = optval;
#endif
			break;
		case IP_IPSEC_LOCAL_ID:
		case IP_IPSEC_REMOTE_ID:
			error = EOPNOTSUPP;
			break;
		case SO_RTABLE:
			m->m_len = sizeof(u_int);
			*mtod(m, u_int *) = inp->inp_rtableid;
			break;
		case IP_PIPEX:
			m->m_len = sizeof(int);
			*mtod(m, int *) = inp->inp_pipex;
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}
1262
1263 /*
1264 * Set up IP options in pcb for insertion in output packets.
1265 * Store in mbuf with pointer in pcbopt, adding pseudo-option
1266 * with destination address if source routed.
1267 */
1268 int
1269 ip_pcbopts(struct mbuf **pcbopt, struct mbuf *m)
1270 {
1271 struct mbuf *n;
1272 struct ipoption *p;
1273 int cnt, off, optlen;
1274 u_char *cp;
1275 u_char opt;
1276
1277 /* turn off any old options */
1278 m_freem(*pcbopt);
1279 *pcbopt = NULL;
1280 if (m == NULL || m->m_len == 0) {
1281 /*
1282 * Only turning off any previous options.
1283 */
1284 return (0);
1285 }
1286
1287 if (m->m_len % sizeof(int32_t) ||
1288 m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr))
1289 return (EINVAL);
1290
1291 /* Don't sleep because NET_LOCK() is hold. */
1292 if ((n = m_get(M_NOWAIT, MT_SOOPTS)) == NULL)
1293 return (ENOBUFS);
1294 p = mtod(n, struct ipoption *);
1295 memset(p, 0, sizeof (*p)); /* 0 = IPOPT_EOL, needed for padding */
1296 n->m_len = sizeof(struct in_addr);
1297
1298 off = 0;
1299 cnt = m->m_len;
1300 cp = mtod(m, u_char *);
1301
1302 while (cnt > 0) {
1303 opt = cp[IPOPT_OPTVAL];
1304
1305 if (opt == IPOPT_NOP || opt == IPOPT_EOL) {
1306 optlen = 1;
1307 } else {
1308 if (cnt < IPOPT_OLEN + sizeof(*cp))
1309 goto bad;
1310 optlen = cp[IPOPT_OLEN];
1311 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
1312 goto bad;
1313 }
1314 switch (opt) {
1315 default:
1316 memcpy(p->ipopt_list + off, cp, optlen);
1317 break;
1318
1319 case IPOPT_LSRR:
1320 case IPOPT_SSRR:
1321 /*
1322 * user process specifies route as:
1323 * ->A->B->C->D
1324 * D must be our final destination (but we can't
1325 * check that since we may not have connected yet).
1326 * A is first hop destination, which doesn't appear in
1327 * actual IP option, but is stored before the options.
1328 */
1329 if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr))
1330 goto bad;
1331
1332 /*
1333 * Optlen is smaller because first address is popped.
1334 * Cnt and cp will be adjusted a bit later to reflect
1335 * this.
1336 */
1337 optlen -= sizeof(struct in_addr);
1338 p->ipopt_list[off + IPOPT_OPTVAL] = opt;
1339 p->ipopt_list[off + IPOPT_OLEN] = optlen;
1340
1341 /*
1342 * Move first hop before start of options.
1343 */
1344 memcpy(&p->ipopt_dst, cp + IPOPT_OFFSET,
1345 sizeof(struct in_addr));
1346 cp += sizeof(struct in_addr);
1347 cnt -= sizeof(struct in_addr);
1348 /*
1349 * Then copy rest of options
1350 */
1351 memcpy(p->ipopt_list + off + IPOPT_OFFSET,
1352 cp + IPOPT_OFFSET, optlen - IPOPT_OFFSET);
1353 break;
1354 }
1355 off += optlen;
1356 cp += optlen;
1357 cnt -= optlen;
1358
1359 if (opt == IPOPT_EOL)
1360 break;
1361 }
1362 /* pad options to next word, since p was zeroed just adjust off */
1363 off = (off + sizeof(int32_t) - 1) & ~(sizeof(int32_t) - 1);
1364 n->m_len += off;
1365 if (n->m_len > sizeof(*p)) {
1366 bad:
1367 m_freem(n);
1368 return (EINVAL);
1369 }
1370
1371 *pcbopt = n;
1372 return (0);
1373 }
1374
1375 /*
1376 * Lookup the interface based on the information in the ip_mreqn struct.
1377 */
1378 int
1379 ip_multicast_if(struct ip_mreqn *mreq, u_int rtableid, unsigned int *ifidx)
1380 {
1381 struct sockaddr_in sin;
1382 struct rtentry *rt;
1383
1384 /*
1385 * In case userland provides the imr_ifindex use this as interface.
1386 * If no interface address was provided, use the interface of
1387 * the route to the given multicast address.
1388 */
1389 if (mreq->imr_ifindex != 0) {
1390 *ifidx = mreq->imr_ifindex;
1391 } else if (mreq->imr_address.s_addr == INADDR_ANY) {
1392 memset(&sin, 0, sizeof(sin));
1393 sin.sin_len = sizeof(sin);
1394 sin.sin_family = AF_INET;
1395 sin.sin_addr = mreq->imr_multiaddr;
1396 rt = rtalloc(sintosa(&sin), RT_RESOLVE, rtableid);
1397 if (!rtisvalid(rt)) {
1398 rtfree(rt);
1399 return EADDRNOTAVAIL;
1400 }
1401 *ifidx = rt->rt_ifidx;
1402 rtfree(rt);
1403 } else {
1404 memset(&sin, 0, sizeof(sin));
1405 sin.sin_len = sizeof(sin);
1406 sin.sin_family = AF_INET;
1407 sin.sin_addr = mreq->imr_address;
1408 rt = rtalloc(sintosa(&sin), 0, rtableid);
1409 if (!rtisvalid(rt) || !ISSET(rt->rt_flags, RTF_LOCAL)) {
1410 rtfree(rt);
1411 return EADDRNOTAVAIL;
1412 }
1413 *ifidx = rt->rt_ifidx;
1414 rtfree(rt);
1415 }
1416
1417 return 0;
1418 }
1419
1420 /*
1421 * Set the IP multicast options in response to user setsockopt().
1422 */
1423 int
1424 ip_setmoptions(int optname, struct ip_moptions **imop, struct mbuf *m,
1425 u_int rtableid)
1426 {
1427 struct in_addr addr;
1428 struct in_ifaddr *ia;
1429 struct ip_mreqn mreqn;
1430 struct ifnet *ifp = NULL;
1431 struct ip_moptions *imo = *imop;
1432 struct in_multi **immp;
1433 struct sockaddr_in sin;
1434 unsigned int ifidx;
1435 int i, error = 0;
1436 u_char loop;
1437
1438 if (imo == NULL) {
1439 /*
1440 * No multicast option buffer attached to the pcb;
1441 * allocate one and initialize to default values.
1442 */
1443 imo = malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK|M_ZERO);
1444 immp = mallocarray(IP_MIN_MEMBERSHIPS, sizeof(*immp), M_IPMOPTS,
1445 M_WAITOK|M_ZERO);
1446 *imop = imo;
1447 imo->imo_ifidx = 0;
1448 imo->imo_ttl = IP_DEFAULT_MULTICAST_TTL;
1449 imo->imo_loop = IP_DEFAULT_MULTICAST_LOOP;
1450 imo->imo_num_memberships = 0;
1451 imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
1452 imo->imo_membership = immp;
1453 }
1454
1455 switch (optname) {
1456
1457 case IP_MULTICAST_IF:
1458 /*
1459 * Select the interface for outgoing multicast packets.
1460 */
1461 if (m == NULL) {
1462 error = EINVAL;
1463 break;
1464 }
1465 if (m->m_len == sizeof(struct in_addr)) {
1466 addr = *(mtod(m, struct in_addr *));
1467 } else if (m->m_len == sizeof(struct ip_mreq) ||
1468 m->m_len == sizeof(struct ip_mreqn)) {
1469 memset(&mreqn, 0, sizeof(mreqn));
1470 memcpy(&mreqn, mtod(m, void *), m->m_len);
1471
1472 /*
1473 * If an interface index is given use this
1474 * index to set the imo_ifidx but check first
1475 * that the interface actually exists.
1476 * In the other case just set the addr to
1477 * the imr_address and fall through to the
1478 * regular code.
1479 */
1480 if (mreqn.imr_ifindex != 0) {
1481 ifp = if_get(mreqn.imr_ifindex);
1482 if (ifp == NULL ||
1483 ifp->if_rdomain != rtable_l2(rtableid)) {
1484 error = EADDRNOTAVAIL;
1485 if_put(ifp);
1486 break;
1487 }
1488 imo->imo_ifidx = ifp->if_index;
1489 if_put(ifp);
1490 break;
1491 } else
1492 addr = mreqn.imr_address;
1493 } else {
1494 error = EINVAL;
1495 break;
1496 }
1497 /*
1498 * INADDR_ANY is used to remove a previous selection.
1499 * When no interface is selected, a default one is
1500 * chosen every time a multicast packet is sent.
1501 */
1502 if (addr.s_addr == INADDR_ANY) {
1503 imo->imo_ifidx = 0;
1504 break;
1505 }
1506 /*
1507 * The selected interface is identified by its local
1508 * IP address. Find the interface and confirm that
1509 * it supports multicasting.
1510 */
1511 memset(&sin, 0, sizeof(sin));
1512 sin.sin_len = sizeof(sin);
1513 sin.sin_family = AF_INET;
1514 sin.sin_addr = addr;
1515 ia = ifatoia(ifa_ifwithaddr(sintosa(&sin), rtableid));
1516 if (ia == NULL ||
1517 (ia->ia_ifp->if_flags & IFF_MULTICAST) == 0) {
1518 error = EADDRNOTAVAIL;
1519 break;
1520 }
1521 imo->imo_ifidx = ia->ia_ifp->if_index;
1522 break;
1523
1524 case IP_MULTICAST_TTL:
1525 /*
1526 * Set the IP time-to-live for outgoing multicast packets.
1527 */
1528 if (m == NULL || m->m_len != 1) {
1529 error = EINVAL;
1530 break;
1531 }
1532 imo->imo_ttl = *(mtod(m, u_char *));
1533 break;
1534
1535 case IP_MULTICAST_LOOP:
1536 /*
1537 * Set the loopback flag for outgoing multicast packets.
1538 * Must be zero or one.
1539 */
1540 if (m == NULL || m->m_len != 1 ||
1541 (loop = *(mtod(m, u_char *))) > 1) {
1542 error = EINVAL;
1543 break;
1544 }
1545 imo->imo_loop = loop;
1546 break;
1547
1548 case IP_ADD_MEMBERSHIP:
1549 /*
1550 * Add a multicast group membership.
1551 * Group must be a valid IP multicast address.
1552 */
1553 if (m == NULL || !(m->m_len == sizeof(struct ip_mreq) ||
1554 m->m_len == sizeof(struct ip_mreqn))) {
1555 error = EINVAL;
1556 break;
1557 }
1558 memset(&mreqn, 0, sizeof(mreqn));
1559 memcpy(&mreqn, mtod(m, void *), m->m_len);
1560 if (!IN_MULTICAST(mreqn.imr_multiaddr.s_addr)) {
1561 error = EINVAL;
1562 break;
1563 }
1564
1565 error = ip_multicast_if(&mreqn, rtableid, &ifidx);
1566 if (error)
1567 break;
1568
1569 /*
1570 * See if we found an interface, and confirm that it
1571 * supports multicast.
1572 */
1573 ifp = if_get(ifidx);
1574 if (ifp == NULL || ifp->if_rdomain != rtable_l2(rtableid) ||
1575 (ifp->if_flags & IFF_MULTICAST) == 0) {
1576 error = EADDRNOTAVAIL;
1577 if_put(ifp);
1578 break;
1579 }
1580
1581 /*
1582 * See if the membership already exists or if all the
1583 * membership slots are full.
1584 */
1585 for (i = 0; i < imo->imo_num_memberships; ++i) {
1586 if (imo->imo_membership[i]->inm_ifidx == ifidx &&
1587 imo->imo_membership[i]->inm_addr.s_addr
1588 == mreqn.imr_multiaddr.s_addr)
1589 break;
1590 }
1591 if (i < imo->imo_num_memberships) {
1592 error = EADDRINUSE;
1593 if_put(ifp);
1594 break;
1595 }
1596 if (imo->imo_num_memberships == imo->imo_max_memberships) {
1597 struct in_multi **nmships, **omships;
1598 size_t newmax;
1599 /*
1600 * Resize the vector to next power-of-two minus 1. If
1601 * the size would exceed the maximum then we know we've
1602 * really run out of entries. Otherwise, we reallocate
1603 * the vector.
1604 */
1605 nmships = NULL;
1606 omships = imo->imo_membership;
1607 newmax = ((imo->imo_max_memberships + 1) * 2) - 1;
1608 if (newmax <= IP_MAX_MEMBERSHIPS) {
1609 nmships = mallocarray(newmax, sizeof(*nmships),
1610 M_IPMOPTS, M_NOWAIT|M_ZERO);
1611 if (nmships != NULL) {
1612 memcpy(nmships, omships,
1613 sizeof(*omships) *
1614 imo->imo_max_memberships);
1615 free(omships, M_IPMOPTS,
1616 sizeof(*omships) *
1617 imo->imo_max_memberships);
1618 imo->imo_membership = nmships;
1619 imo->imo_max_memberships = newmax;
1620 }
1621 }
1622 if (nmships == NULL) {
1623 error = ENOBUFS;
1624 if_put(ifp);
1625 break;
1626 }
1627 }
1628 /*
1629 * Everything looks good; add a new record to the multicast
1630 * address list for the given interface.
1631 */
1632 if ((imo->imo_membership[i] =
1633 in_addmulti(&mreqn.imr_multiaddr, ifp)) == NULL) {
1634 error = ENOBUFS;
1635 if_put(ifp);
1636 break;
1637 }
1638 ++imo->imo_num_memberships;
1639 if_put(ifp);
1640 break;
1641
1642 case IP_DROP_MEMBERSHIP:
1643 /*
1644 * Drop a multicast group membership.
1645 * Group must be a valid IP multicast address.
1646 */
1647 if (m == NULL || !(m->m_len == sizeof(struct ip_mreq) ||
1648 m->m_len == sizeof(struct ip_mreqn))) {
1649 error = EINVAL;
1650 break;
1651 }
1652 memset(&mreqn, 0, sizeof(mreqn));
1653 memcpy(&mreqn, mtod(m, void *), m->m_len);
1654 if (!IN_MULTICAST(mreqn.imr_multiaddr.s_addr)) {
1655 error = EINVAL;
1656 break;
1657 }
1658
1659 /*
1660 * If an interface address was specified, get a pointer
1661 * to its ifnet structure.
1662 */
1663 error = ip_multicast_if(&mreqn, rtableid, &ifidx);
1664 if (error)
1665 break;
1666
1667 /*
1668 * Find the membership in the membership array.
1669 */
1670 for (i = 0; i < imo->imo_num_memberships; ++i) {
1671 if ((ifidx == 0 ||
1672 imo->imo_membership[i]->inm_ifidx == ifidx) &&
1673 imo->imo_membership[i]->inm_addr.s_addr ==
1674 mreqn.imr_multiaddr.s_addr)
1675 break;
1676 }
1677 if (i == imo->imo_num_memberships) {
1678 error = EADDRNOTAVAIL;
1679 break;
1680 }
1681 /*
1682 * Give up the multicast address record to which the
1683 * membership points.
1684 */
1685 in_delmulti(imo->imo_membership[i]);
1686 /*
1687 * Remove the gap in the membership array.
1688 */
1689 for (++i; i < imo->imo_num_memberships; ++i)
1690 imo->imo_membership[i-1] = imo->imo_membership[i];
1691 --imo->imo_num_memberships;
1692 break;
1693
1694 default:
1695 error = EOPNOTSUPP;
1696 break;
1697 }
1698
1699 /*
1700 * If all options have default values, no need to keep the data.
1701 */
1702 if (imo->imo_ifidx == 0 &&
1703 imo->imo_ttl == IP_DEFAULT_MULTICAST_TTL &&
1704 imo->imo_loop == IP_DEFAULT_MULTICAST_LOOP &&
1705 imo->imo_num_memberships == 0) {
1706 free(imo->imo_membership , M_IPMOPTS,
1707 imo->imo_max_memberships * sizeof(struct in_multi *));
1708 free(*imop, M_IPMOPTS, sizeof(**imop));
1709 *imop = NULL;
1710 }
1711
1712 return (error);
1713 }
1714
1715 /*
1716 * Return the IP multicast options in response to user getsockopt().
1717 */
1718 int
1719 ip_getmoptions(int optname, struct ip_moptions *imo, struct mbuf *m)
1720 {
1721 u_char *ttl;
1722 u_char *loop;
1723 struct in_addr *addr;
1724 struct in_ifaddr *ia;
1725 struct ifnet *ifp;
1726
1727 switch (optname) {
1728
1729 case IP_MULTICAST_IF:
1730 addr = mtod(m, struct in_addr *);
1731 m->m_len = sizeof(struct in_addr);
1732 if (imo == NULL || (ifp = if_get(imo->imo_ifidx)) == NULL)
1733 addr->s_addr = INADDR_ANY;
1734 else {
1735 IFP_TO_IA(ifp, ia);
1736 addr->s_addr = (ia == NULL) ? INADDR_ANY
1737 : ia->ia_addr.sin_addr.s_addr;
1738 if_put(ifp);
1739 }
1740 return (0);
1741
1742 case IP_MULTICAST_TTL:
1743 ttl = mtod(m, u_char *);
1744 m->m_len = 1;
1745 *ttl = (imo == NULL) ? IP_DEFAULT_MULTICAST_TTL
1746 : imo->imo_ttl;
1747 return (0);
1748
1749 case IP_MULTICAST_LOOP:
1750 loop = mtod(m, u_char *);
1751 m->m_len = 1;
1752 *loop = (imo == NULL) ? IP_DEFAULT_MULTICAST_LOOP
1753 : imo->imo_loop;
1754 return (0);
1755
1756 default:
1757 return (EOPNOTSUPP);
1758 }
1759 }
1760
1761 /*
1762 * Discard the IP multicast options.
1763 */
1764 void
1765 ip_freemoptions(struct ip_moptions *imo)
1766 {
1767 int i;
1768
1769 if (imo != NULL) {
1770 for (i = 0; i < imo->imo_num_memberships; ++i)
1771 in_delmulti(imo->imo_membership[i]);
1772 free(imo->imo_membership, M_IPMOPTS,
1773 imo->imo_max_memberships * sizeof(struct in_multi *));
1774 free(imo, M_IPMOPTS, sizeof(*imo));
1775 }
1776 }
1777
1778 /*
1779 * Routine called from ip_output() to loop back a copy of an IP multicast
1780 * packet to the input queue of a specified interface.
1781 */
1782 void
1783 ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst)
1784 {
1785 struct ip *ip;
1786 struct mbuf *copym;
1787
1788 copym = m_dup_pkt(m, max_linkhdr, M_DONTWAIT);
1789 if (copym != NULL) {
1790 /*
1791 * We don't bother to fragment if the IP length is greater
1792 * than the interface's MTU. Can this possibly matter?
1793 */
1794 ip = mtod(copym, struct ip *);
1795 ip->ip_sum = 0;
1796 ip->ip_sum = in_cksum(copym, ip->ip_hl << 2);
1797 if_input_local(ifp, copym, dst->sin_family);
1798 }
1799 }
1800
/*
 * Compute significant parts of the IPv4 checksum pseudo-header
 * for use in a delayed TCP/UDP checksum calculation.
 *
 * src and dst are the two addresses and lenproto is the combined
 * length/protocol word; all three are summed as 16-bit halves using
 * one's-complement arithmetic.  The folded 16-bit sum is returned
 * (not inverted).
 *
 * lenproto is a full 32-bit quantity, so adding the four address halves
 * to it can carry out of bit 32.  The sum is therefore accumulated in
 * 64 bits and the carry is added back in; dropping it would silently
 * produce a wrong checksum.
 */
static __inline u_int16_t __attribute__((__unused__))
in_cksum_phdr(u_int32_t src, u_int32_t dst, u_int32_t lenproto)
{
	u_int64_t wide;
	u_int32_t sum;

	wide = (u_int64_t)lenproto +
	    (u_int16_t)(src >> 16) +
	    (u_int16_t)(src /*& 0xffff*/) +
	    (u_int16_t)(dst >> 16) +
	    (u_int16_t)(dst /*& 0xffff*/);

	/*
	 * End-around carry: 2^32 is congruent to 1 modulo 0xffff.  This
	 * cannot overflow again, since a carry implies a small low word.
	 */
	sum = (u_int32_t)wide + (u_int32_t)(wide >> 32);

	/* Fold the two 16-bit halves together. */
	sum = (u_int16_t)(sum >> 16) + (u_int16_t)(sum /*& 0xffff*/);

	if (sum > 0xffff)
		sum -= 0xffff;

	return (sum);
}
1823
1824 /*
1825 * Process a delayed payload checksum calculation.
1826 */
1827 void
1828 in_delayed_cksum(struct mbuf *m)
1829 {
1830 struct ip *ip;
1831 u_int16_t csum, offset;
1832
1833 ip = mtod(m, struct ip *);
1834 offset = ip->ip_hl << 2;
1835 csum = in4_cksum(m, 0, offset, m->m_pkthdr.len - offset);
1836 if (csum == 0 && ip->ip_p == IPPROTO_UDP)
1837 csum = 0xffff;
1838
1839 switch (ip->ip_p) {
1840 case IPPROTO_TCP:
1841 offset += offsetof(struct tcphdr, th_sum);
1842 break;
1843
1844 case IPPROTO_UDP:
1845 offset += offsetof(struct udphdr, uh_sum);
1846 break;
1847
1848 case IPPROTO_ICMP:
1849 offset += offsetof(struct icmp, icmp_cksum);
1850 break;
1851
1852 default:
1853 return;
1854 }
1855
1856 if ((offset + sizeof(u_int16_t)) > m->m_len)
1857 m_copyback(m, offset, sizeof(csum), &csum, M_NOWAIT);
1858 else
1859 *(u_int16_t *)(mtod(m, caddr_t) + offset) = csum;
1860 }
1861
1862 void
1863 in_proto_cksum_out(struct mbuf *m, struct ifnet *ifp)
1864 {
1865 struct ip *ip = mtod(m, struct ip *);
1866
1867 /* some hw and in_delayed_cksum need the pseudo header cksum */
1868 if (m->m_pkthdr.csum_flags &
1869 (M_TCP_CSUM_OUT|M_UDP_CSUM_OUT|M_ICMP_CSUM_OUT)) {
1870 u_int16_t csum = 0, offset;
1871
1872 offset = ip->ip_hl << 2;
1873 if (m->m_pkthdr.csum_flags & (M_TCP_CSUM_OUT|M_UDP_CSUM_OUT))
1874 csum = in_cksum_phdr(ip->ip_src.s_addr,
1875 ip->ip_dst.s_addr, htonl(ntohs(ip->ip_len) -
1876 offset + ip->ip_p));
1877 if (ip->ip_p == IPPROTO_TCP)
1878 offset += offsetof(struct tcphdr, th_sum);
1879 else if (ip->ip_p == IPPROTO_UDP)
1880 offset += offsetof(struct udphdr, uh_sum);
1881 else if (ip->ip_p == IPPROTO_ICMP)
1882 offset += offsetof(struct icmp, icmp_cksum);
1883 if ((offset + sizeof(u_int16_t)) > m->m_len)
1884 m_copyback(m, offset, sizeof(csum), &csum, M_NOWAIT);
1885 else
1886 *(u_int16_t *)(mtod(m, caddr_t) + offset) = csum;
1887 }
1888
1889 if (m->m_pkthdr.csum_flags & M_TCP_CSUM_OUT) {
1890 if (!in_ifcap_cksum(m, ifp, IFCAP_CSUM_TCPv4) ||
1891 ip->ip_hl != 5) {
1892 tcpstat_inc(tcps_outswcsum);
1893 in_delayed_cksum(m);
1894 m->m_pkthdr.csum_flags &= ~M_TCP_CSUM_OUT; /* Clear */
1895 }
1896 } else if (m->m_pkthdr.csum_flags & M_UDP_CSUM_OUT) {
1897 if (!in_ifcap_cksum(m, ifp, IFCAP_CSUM_UDPv4) ||
1898 ip->ip_hl != 5) {
1899 udpstat_inc(udps_outswcsum);
1900 in_delayed_cksum(m);
1901 m->m_pkthdr.csum_flags &= ~M_UDP_CSUM_OUT; /* Clear */
1902 }
1903 } else if (m->m_pkthdr.csum_flags & M_ICMP_CSUM_OUT) {
1904 in_delayed_cksum(m);
1905 m->m_pkthdr.csum_flags &= ~M_ICMP_CSUM_OUT; /* Clear */
1906 }
1907 }
1908
1909 int
1910 in_ifcap_cksum(struct mbuf *m, struct ifnet *ifp, int ifcap)
1911 {
1912 if ((ifp == NULL) ||
1913 !ISSET(ifp->if_capabilities, ifcap) ||
1914 (ifp->if_bridgeidx != 0))
1915 return (0);
1916 /*
1917 * Simplex interface sends packet back without hardware cksum.
1918 * Keep this check in sync with the condition where ether_resolve()
1919 * calls if_input_local().
1920 */
1921 if (ISSET(m->m_flags, M_BCAST) &&
1922 ISSET(ifp->if_flags, IFF_SIMPLEX) &&
1923 !m->m_pkthdr.pf.routed)
1924 return (0);
1925 return (1);
1926 }
Cache object: ad4acf3e2d1d152308066bc93ce7e0b0
|