The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/netinet/ip_mroute.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*      $NetBSD: ip_mroute.c,v 1.165 2022/03/15 21:39:59 andvar Exp $   */
    2 
    3 /*
    4  * Copyright (c) 1992, 1993
    5  *      The Regents of the University of California.  All rights reserved.
    6  *
    7  * This code is derived from software contributed to Berkeley by
    8  * Stephen Deering of Stanford University.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 3. Neither the name of the University nor the names of its contributors
   19  *    may be used to endorse or promote products derived from this software
   20  *    without specific prior written permission.
   21  *
   22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   32  * SUCH DAMAGE.
   33  *
   34  *      @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
   35  */
   36 
   37 /*
   38  * Copyright (c) 1989 Stephen Deering
   39  *
   40  * This code is derived from software contributed to Berkeley by
   41  * Stephen Deering of Stanford University.
   42  *
   43  * Redistribution and use in source and binary forms, with or without
   44  * modification, are permitted provided that the following conditions
   45  * are met:
   46  * 1. Redistributions of source code must retain the above copyright
   47  *    notice, this list of conditions and the following disclaimer.
   48  * 2. Redistributions in binary form must reproduce the above copyright
   49  *    notice, this list of conditions and the following disclaimer in the
   50  *    documentation and/or other materials provided with the distribution.
   51  * 3. All advertising materials mentioning features or use of this software
   52  *    must display the following acknowledgement:
   53  *      This product includes software developed by the University of
   54  *      California, Berkeley and its contributors.
   55  * 4. Neither the name of the University nor the names of its contributors
   56  *    may be used to endorse or promote products derived from this software
   57  *    without specific prior written permission.
   58  *
   59  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   60  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   61  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   62  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   63  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   64  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   65  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   66  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   67  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   68  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   69  * SUCH DAMAGE.
   70  *
   71  *      @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
   72  */
   73 
   74 /*
   75  * IP multicast forwarding procedures
   76  *
   77  * Written by David Waitzman, BBN Labs, August 1988.
   78  * Modified by Steve Deering, Stanford, February 1989.
   79  * Modified by Mark J. Steiglitz, Stanford, May, 1991
   80  * Modified by Van Jacobson, LBL, January 1993
   81  * Modified by Ajit Thyagarajan, PARC, August 1993
   82  * Modified by Bill Fenner, PARC, April 1994
   83  * Modified by Charles M. Hannum, NetBSD, May 1995.
   84  * Modified by Ahmed Helmy, SGI, June 1996
   85  * Modified by George Edmond Eddy (Rusty), ISI, February 1998
   86  * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000
   87  * Modified by Hitoshi Asaeda, WIDE, August 2000
   88  * Modified by Pavlin Radoslavov, ICSI, October 2002
   89  *
   90  * MROUTING Revision: 1.2
   91  * and PIM-SMv2 and PIM-DM support, advanced API support,
   92  * bandwidth metering and signaling
   93  */
   94 
   95 #include <sys/cdefs.h>
   96 __KERNEL_RCSID(0, "$NetBSD: ip_mroute.c,v 1.165 2022/03/15 21:39:59 andvar Exp $");
   97 
   98 #ifdef _KERNEL_OPT
   99 #include "opt_inet.h"
  100 #include "opt_ipsec.h"
  101 #include "opt_pim.h"
  102 #endif
  103 
  104 #ifdef PIM
  105 #define _PIM_VT 1
  106 #endif
  107 
  108 #include <sys/param.h>
  109 #include <sys/systm.h>
  110 #include <sys/callout.h>
  111 #include <sys/mbuf.h>
  112 #include <sys/socket.h>
  113 #include <sys/socketvar.h>
  114 #include <sys/errno.h>
  115 #include <sys/time.h>
  116 #include <sys/kernel.h>
  117 #include <sys/kmem.h>
  118 #include <sys/ioctl.h>
  119 #include <sys/syslog.h>
  120 
  121 #include <net/if.h>
  122 #include <net/raw_cb.h>
  123 
  124 #include <netinet/in.h>
  125 #include <netinet/in_var.h>
  126 #include <netinet/in_systm.h>
  127 #include <netinet/in_offload.h>
  128 #include <netinet/ip.h>
  129 #include <netinet/ip_var.h>
  130 #include <netinet/in_pcb.h>
  131 #include <netinet/udp.h>
  132 #include <netinet/igmp.h>
  133 #include <netinet/igmp_var.h>
  134 #include <netinet/ip_mroute.h>
  135 #ifdef PIM
  136 #include <netinet/pim.h>
  137 #include <netinet/pim_var.h>
  138 #endif
  139 #include <netinet/ip_encap.h>
  140 
  141 #ifdef IPSEC
  142 #include <netipsec/ipsec.h>
  143 #include <netipsec/key.h>
  144 #endif
  145 
  146 #define IP_MULTICASTOPTS 0
  147 #define M_PULLUP(m, len)                                                 \
  148         do {                                                             \
  149                 if ((m) && ((m)->m_flags & M_EXT || (m)->m_len < (len))) \
  150                         (m) = m_pullup((m), (len));                      \
  151         } while (/*CONSTCOND*/ 0)
  152 
  153 /*
  154  * Globals.  All but ip_mrouter and ip_mrtproto could be static,
  155  * except for netstat or debugging purposes.
  156  */
  157 struct socket  *ip_mrouter  = NULL;
  158 int             ip_mrtproto = IGMP_DVMRP;    /* for netstat only */
  159 
  160 #define MFCHASH(a, g)                                                   \
  161         ((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^        \
  162           ((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & mfchash)
  163 LIST_HEAD(mfchashhdr, mfc) *mfchashtbl;
  164 u_long  mfchash;
  165 
  166 u_char          nexpire[MFCTBLSIZ];
  167 struct vif      viftable[MAXVIFS];
  168 struct mrtstat  mrtstat;
  169 u_int           mrtdebug = 0;   /* debug level */
  170 #define         DEBUG_MFC       0x02
  171 #define         DEBUG_FORWARD   0x04
  172 #define         DEBUG_EXPIRE    0x08
  173 #define         DEBUG_XMIT      0x10
  174 #define         DEBUG_PIM       0x20
  175 
  176 #define         VIFI_INVALID    ((vifi_t) -1)
  177 
  178 u_int tbfdebug = 0;     /* tbf debug level */
  179 
  180 /* vif attachment using sys/netinet/ip_encap.c */
  181 static void vif_input(struct mbuf *, int, int, void *);
  182 static int vif_encapcheck(struct mbuf *, int, int, void *);
  183 
/*
 * Encapsulation switch handed to encap_attach_func() when a tunnel vif is
 * added.  Only the encapsw4 member is filled in: packets are delivered to
 * vif_input() and no control-input handler is installed.
 */
static const struct encapsw vif_encapsw = {
        .encapsw4 = {
                .pr_input       = vif_input,
                .pr_ctlinput    = NULL,
        }
};
  190 
  191 #define         EXPIRE_TIMEOUT  (hz / 4)        /* 4x / second */
  192 #define         UPCALL_EXPIRE   6               /* number of timeouts */
  193 
  194 /*
  195  * Define the token bucket filter structures
  196  */
  197 
  198 #define         TBF_REPROCESS   (hz / 100)      /* 100x / second */
  199 
  200 static int get_sg_cnt(struct sioc_sg_req *);
  201 static int get_vif_cnt(struct sioc_vif_req *);
  202 static int ip_mrouter_init(struct socket *, int);
  203 static int set_assert(int);
  204 static int add_vif(struct vifctl *);
  205 static int del_vif(vifi_t *);
  206 static void update_mfc_params(struct mfc *, struct mfcctl2 *);
  207 static void init_mfc_params(struct mfc *, struct mfcctl2 *);
  208 static void expire_mfc(struct mfc *);
  209 static int add_mfc(struct sockopt *);
  210 #ifdef UPCALL_TIMING
  211 static void collate(struct timeval *);
  212 #endif
  213 static int del_mfc(struct sockopt *);
static int set_api_config(struct sockopt *); /* choose API capabilities */
  215 static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *);
  216 static void expire_upcalls(void *);
  217 static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *);
  218 static void phyint_send(struct ip *, struct vif *, struct mbuf *);
  219 static void encap_send(struct ip *, struct vif *, struct mbuf *);
  220 static void tbf_control(struct vif *, struct mbuf *, struct ip *, u_int32_t);
  221 static void tbf_queue(struct vif *, struct mbuf *);
  222 static void tbf_process_q(struct vif *);
  223 static void tbf_reprocess_q(void *);
  224 static int tbf_dq_sel(struct vif *, struct ip *);
  225 static void tbf_send_packet(struct vif *, struct mbuf *);
  226 static void tbf_update_tokens(struct vif *);
  227 static int priority(struct vif *, struct ip *);
  228 static int ip_mforward_real(struct mbuf *, struct ifnet *);
  229 
  230 
  231 /*
  232  * Bandwidth monitoring
  233  */
  234 static void free_bw_list(struct bw_meter *);
  235 static int add_bw_upcall(struct bw_upcall *);
  236 static int del_bw_upcall(struct bw_upcall *);
  237 static void bw_meter_receive_packet(struct bw_meter *, int , struct timeval *);
  238 static void bw_meter_prepare_upcall(struct bw_meter *, struct timeval *);
  239 static void bw_upcalls_send(void);
  240 static void schedule_bw_meter(struct bw_meter *, struct timeval *);
  241 static void unschedule_bw_meter(struct bw_meter *);
  242 static void bw_meter_process(void);
  243 static void expire_bw_upcalls_send(void *);
  244 static void expire_bw_meter_process(void *);
  245 
  246 #ifdef PIM
  247 static int pim_register_send(struct ip *, struct vif *,
  248     struct mbuf *, struct mfc *);
  249 static int pim_register_send_rp(struct ip *, struct vif *,
  250     struct mbuf *, struct mfc *);
  251 static int pim_register_send_upcall(struct ip *, struct vif *,
  252     struct mbuf *, struct mfc *);
  253 static struct mbuf *pim_register_prepare(struct ip *, struct mbuf *);
  254 #endif
  255 
  256 #define ENCAP_TTL       64
  257 #define ENCAP_PROTO     IPPROTO_IPIP
  258 
  259 /* prototype IP hdr for encapsulated packets */
  260 static const struct ip multicast_encap_iphdr = {
  261         .ip_hl = sizeof(struct ip) >> 2,
  262         .ip_v = IPVERSION,
  263         .ip_len = sizeof(struct ip),
  264         .ip_ttl = ENCAP_TTL,
  265         .ip_p = ENCAP_PROTO,
  266 };
  267 
  268 /*
  269  * Bandwidth meter variables and constants
  270  */
  271 
  272 /*
  273  * Pending timeouts are stored in a hash table, the key being the
  274  * expiration time. Periodically, the entries are analysed and processed.
  275  */
  276 #define BW_METER_BUCKETS        1024
  277 static struct bw_meter *bw_meter_timers[BW_METER_BUCKETS];
  278 struct callout bw_meter_ch;
  279 #define BW_METER_PERIOD (hz)            /* periodical handling of bw meters */
  280 
  281 /*
  282  * Pending upcalls are stored in a vector which is flushed when
  283  * full, or periodically
  284  */
  285 static struct bw_upcall bw_upcalls[BW_UPCALLS_MAX];
  286 static u_int    bw_upcalls_n; /* # of pending upcalls */
  287 struct callout  bw_upcalls_ch;
  288 #define BW_UPCALLS_PERIOD (hz)          /* periodical flush of bw upcalls */
  289 
  290 #ifdef PIM
  291 struct pimstat pimstat;
  292 
  293 /*
  294  * Note: the PIM Register encapsulation adds the following in front of a
  295  * data packet:
  296  *
  297  * struct pim_encap_hdr {
  298  *     struct ip ip;
  299  *     struct pim_encap_pimhdr  pim;
  300  * }
  301  */
  302 
/*
 * PIM portion of the Register encapsulation: the PIM header itself
 * followed by a 32-bit flags word.
 */
struct pim_encap_pimhdr {
        struct pim pim;
        uint32_t   flags;
};
  307 
  308 static struct ip pim_encap_iphdr = {
  309         .ip_v = IPVERSION,
  310         .ip_hl = sizeof(struct ip) >> 2,
  311         .ip_len = sizeof(struct ip),
  312         .ip_ttl = ENCAP_TTL,
  313         .ip_p = IPPROTO_PIM,
  314 };
  315 
  316 static struct pim_encap_pimhdr pim_encap_pimhdr = {
  317     {
  318         PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */
  319         0,                      /* reserved */
  320         0,                      /* checksum */
  321     },
  322     0                           /* flags */
  323 };
  324 
  325 static struct ifnet multicast_register_if;
  326 static vifi_t reg_vif_num = VIFI_INVALID;
  327 #endif /* PIM */
  328 
  329 
  330 /*
  331  * Private variables.
  332  */
  333 static vifi_t      numvifs = 0;
  334 
  335 static struct callout expire_upcalls_ch;
  336 
  337 /*
  338  * whether or not special PIM assert processing is enabled.
  339  */
  340 static int pim_assert;
  341 /*
  342  * Rate limit for assert notification messages, in usec
  343  */
  344 #define ASSERT_MSG_TIME         3000000
  345 
  346 /*
  347  * Kernel multicast routing API capabilities and setup.
  348  * If more API capabilities are added to the kernel, they should be
  349  * recorded in `mrt_api_support'.
  350  */
  351 static const u_int32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF |
  352                                           MRT_MFC_FLAGS_BORDER_VIF |
  353                                           MRT_MFC_RP |
  354                                           MRT_MFC_BW_UPCALL);
  355 static u_int32_t mrt_api_config = 0;
  356 
  357 /*
  358  * Find a route for a given origin IP address and Multicast group address
  359  * Type of service parameter to be added in the future!!!
  360  * Statistics are updated by the caller if needed
  361  * (mrtstat.mrts_mfc_lookups and mrtstat.mrts_mfc_misses)
  362  */
  363 static struct mfc *
  364 mfc_find(struct in_addr *o, struct in_addr *g)
  365 {
  366         struct mfc *rt;
  367 
  368         LIST_FOREACH(rt, &mfchashtbl[MFCHASH(*o, *g)], mfc_hash) {
  369                 if (in_hosteq(rt->mfc_origin, *o) &&
  370                     in_hosteq(rt->mfc_mcastgrp, *g) &&
  371                     (rt->mfc_stall == NULL))
  372                         break;
  373         }
  374 
  375         return rt;
  376 }
  377 
  378 /*
  379  * Macros to compute elapsed time efficiently
  380  * Borrowed from Van Jacobson's scheduling code
  381  */
  382 #define TV_DELTA(a, b, delta) do {                                      \
  383         int xxs;                                                        \
  384         delta = (a).tv_usec - (b).tv_usec;                              \
  385         xxs = (a).tv_sec - (b).tv_sec;                                  \
  386         switch (xxs) {                                                  \
  387         case 2:                                                         \
  388                 delta += 1000000;                                       \
  389                 /* fall through */                                      \
  390         case 1:                                                         \
  391                 delta += 1000000;                                       \
  392                 /* fall through */                                      \
  393         case 0:                                                         \
  394                 break;                                                  \
  395         default:                                                        \
  396                 delta += (1000000 * xxs);                               \
  397                 break;                                                  \
  398         }                                                               \
  399 } while (/*CONSTCOND*/ 0)
  400 
  401 #ifdef UPCALL_TIMING
  402 u_int32_t upcall_data[51];
  403 #endif /* UPCALL_TIMING */
  404 
  405 /*
  406  * Handle MRT setsockopt commands to modify the multicast routing tables.
  407  */
  408 int
  409 ip_mrouter_set(struct socket *so, struct sockopt *sopt)
  410 {
  411         int error;
  412         int optval;
  413         struct vifctl vifc;
  414         vifi_t vifi;
  415         struct bw_upcall bwuc;
  416 
  417         if (sopt->sopt_name != MRT_INIT && so != ip_mrouter)
  418                 error = ENOPROTOOPT;
  419         else {
  420                 switch (sopt->sopt_name) {
  421                 case MRT_INIT:
  422                         error = sockopt_getint(sopt, &optval);
  423                         if (error)
  424                                 break;
  425 
  426                         error = ip_mrouter_init(so, optval);
  427                         break;
  428                 case MRT_DONE:
  429                         error = ip_mrouter_done();
  430                         break;
  431                 case MRT_ADD_VIF:
  432                         error = sockopt_get(sopt, &vifc, sizeof(vifc));
  433                         if (error)
  434                                 break;
  435                         error = add_vif(&vifc);
  436                         break;
  437                 case MRT_DEL_VIF:
  438                         error = sockopt_get(sopt, &vifi, sizeof(vifi));
  439                         if (error)
  440                                 break;
  441                         error = del_vif(&vifi);
  442                         break;
  443                 case MRT_ADD_MFC:
  444                         error = add_mfc(sopt);
  445                         break;
  446                 case MRT_DEL_MFC:
  447                         error = del_mfc(sopt);
  448                         break;
  449                 case MRT_ASSERT:
  450                         error = sockopt_getint(sopt, &optval);
  451                         if (error)
  452                                 break;
  453                         error = set_assert(optval);
  454                         break;
  455                 case MRT_API_CONFIG:
  456                         error = set_api_config(sopt);
  457                         break;
  458                 case MRT_ADD_BW_UPCALL:
  459                         error = sockopt_get(sopt, &bwuc, sizeof(bwuc));
  460                         if (error)
  461                                 break;
  462                         error = add_bw_upcall(&bwuc);
  463                         break;
  464                 case MRT_DEL_BW_UPCALL:
  465                         error = sockopt_get(sopt, &bwuc, sizeof(bwuc));
  466                         if (error)
  467                                 break;
  468                         error = del_bw_upcall(&bwuc);
  469                         break;
  470                 default:
  471                         error = ENOPROTOOPT;
  472                         break;
  473                 }
  474         }
  475         return error;
  476 }
  477 
  478 /*
  479  * Handle MRT getsockopt commands
  480  */
  481 int
  482 ip_mrouter_get(struct socket *so, struct sockopt *sopt)
  483 {
  484         int error;
  485 
  486         if (so != ip_mrouter)
  487                 error = ENOPROTOOPT;
  488         else {
  489                 switch (sopt->sopt_name) {
  490                 case MRT_VERSION:
  491                         error = sockopt_setint(sopt, 0x0305); /* XXX !!!! */
  492                         break;
  493                 case MRT_ASSERT:
  494                         error = sockopt_setint(sopt, pim_assert);
  495                         break;
  496                 case MRT_API_SUPPORT:
  497                         error = sockopt_set(sopt, &mrt_api_support,
  498                             sizeof(mrt_api_support));
  499                         break;
  500                 case MRT_API_CONFIG:
  501                         error = sockopt_set(sopt, &mrt_api_config,
  502                             sizeof(mrt_api_config));
  503                         break;
  504                 default:
  505                         error = ENOPROTOOPT;
  506                         break;
  507                 }
  508         }
  509         return error;
  510 }
  511 
  512 /*
  513  * Handle ioctl commands to obtain information from the cache
  514  */
  515 int
  516 mrt_ioctl(struct socket *so, u_long cmd, void *data)
  517 {
  518         int error;
  519 
  520         if (so != ip_mrouter)
  521                 error = EINVAL;
  522         else
  523                 switch (cmd) {
  524                 case SIOCGETVIFCNT:
  525                         error = get_vif_cnt((struct sioc_vif_req *)data);
  526                         break;
  527                 case SIOCGETSGCNT:
  528                         error = get_sg_cnt((struct sioc_sg_req *)data);
  529                         break;
  530                 default:
  531                         error = EINVAL;
  532                         break;
  533                 }
  534 
  535         return error;
  536 }
  537 
  538 /*
  539  * returns the packet, byte, rpf-failure count for the source group provided
  540  */
  541 static int
  542 get_sg_cnt(struct sioc_sg_req *req)
  543 {
  544         int s;
  545         struct mfc *rt;
  546 
  547         s = splsoftnet();
  548         rt = mfc_find(&req->src, &req->grp);
  549         if (rt == NULL) {
  550                 splx(s);
  551                 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
  552                 return EADDRNOTAVAIL;
  553         }
  554         req->pktcnt = rt->mfc_pkt_cnt;
  555         req->bytecnt = rt->mfc_byte_cnt;
  556         req->wrong_if = rt->mfc_wrong_if;
  557         splx(s);
  558 
  559         return 0;
  560 }
  561 
  562 /*
  563  * returns the input and output packet and byte counts on the vif provided
  564  */
  565 static int
  566 get_vif_cnt(struct sioc_vif_req *req)
  567 {
  568         vifi_t vifi = req->vifi;
  569 
  570         if (vifi >= numvifs)
  571                 return EINVAL;
  572 
  573         req->icount = viftable[vifi].v_pkt_in;
  574         req->ocount = viftable[vifi].v_pkt_out;
  575         req->ibytes = viftable[vifi].v_bytes_in;
  576         req->obytes = viftable[vifi].v_bytes_out;
  577 
  578         return 0;
  579 }
  580 
  581 /*
  582  * Enable multicast routing
  583  */
  584 static int
  585 ip_mrouter_init(struct socket *so, int v)
  586 {
  587         if (mrtdebug)
  588                 log(LOG_DEBUG,
  589                     "ip_mrouter_init: so_type = %d, pr_protocol = %d\n",
  590                     so->so_type, so->so_proto->pr_protocol);
  591 
  592         if (so->so_type != SOCK_RAW ||
  593             so->so_proto->pr_protocol != IPPROTO_IGMP)
  594                 return EOPNOTSUPP;
  595 
  596         if (v != 1)
  597                 return EINVAL;
  598 
  599         if (ip_mrouter != NULL)
  600                 return EADDRINUSE;
  601 
  602         ip_mrouter = so;
  603 
  604         mfchashtbl = hashinit(MFCTBLSIZ, HASH_LIST, true, &mfchash);
  605         memset((void *)nexpire, 0, sizeof(nexpire));
  606 
  607         pim_assert = 0;
  608 
  609         callout_init(&expire_upcalls_ch, 0);
  610         callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
  611                       expire_upcalls, NULL);
  612 
  613         callout_init(&bw_upcalls_ch, 0);
  614         callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD,
  615                       expire_bw_upcalls_send, NULL);
  616 
  617         callout_init(&bw_meter_ch, 0);
  618         callout_reset(&bw_meter_ch, BW_METER_PERIOD,
  619                       expire_bw_meter_process, NULL);
  620 
  621         if (mrtdebug)
  622                 log(LOG_DEBUG, "ip_mrouter_init\n");
  623 
  624         return 0;
  625 }
  626 
  627 /*
  628  * Disable multicast routing
  629  */
  630 int
  631 ip_mrouter_done(void)
  632 {
  633         vifi_t vifi;
  634         struct vif *vifp;
  635         int i;
  636         int s;
  637 
  638         s = splsoftnet();
  639 
  640         /* Clear out all the vifs currently in use. */
  641         for (vifi = 0; vifi < numvifs; vifi++) {
  642                 vifp = &viftable[vifi];
  643                 if (!in_nullhost(vifp->v_lcl_addr))
  644                         reset_vif(vifp);
  645         }
  646 
  647         numvifs = 0;
  648         pim_assert = 0;
  649         mrt_api_config = 0;
  650 
  651         callout_stop(&expire_upcalls_ch);
  652         callout_stop(&bw_upcalls_ch);
  653         callout_stop(&bw_meter_ch);
  654 
  655         /*
  656          * Free all multicast forwarding cache entries.
  657          */
  658         for (i = 0; i < MFCTBLSIZ; i++) {
  659                 struct mfc *rt, *nrt;
  660 
  661                 for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) {
  662                         nrt = LIST_NEXT(rt, mfc_hash);
  663 
  664                         expire_mfc(rt);
  665                 }
  666         }
  667 
  668         memset((void *)nexpire, 0, sizeof(nexpire));
  669         hashdone(mfchashtbl, HASH_LIST, mfchash);
  670         mfchashtbl = NULL;
  671 
  672         bw_upcalls_n = 0;
  673         memset(bw_meter_timers, 0, sizeof(bw_meter_timers));
  674 
  675         /* Reset de-encapsulation cache. */
  676 
  677         ip_mrouter = NULL;
  678 
  679         splx(s);
  680 
  681         if (mrtdebug)
  682                 log(LOG_DEBUG, "ip_mrouter_done\n");
  683 
  684         return 0;
  685 }
  686 
  687 void
  688 ip_mrouter_detach(struct ifnet *ifp)
  689 {
  690         int vifi, i;
  691         struct vif *vifp;
  692         struct mfc *rt;
  693         struct rtdetq *rte;
  694 
  695         /* XXX not sure about side effect to userland routing daemon */
  696         for (vifi = 0; vifi < numvifs; vifi++) {
  697                 vifp = &viftable[vifi];
  698                 if (vifp->v_ifp == ifp)
  699                         reset_vif(vifp);
  700         }
  701         for (i = 0; i < MFCTBLSIZ; i++) {
  702                 if (nexpire[i] == 0)
  703                         continue;
  704                 LIST_FOREACH(rt, &mfchashtbl[i], mfc_hash) {
  705                         for (rte = rt->mfc_stall; rte; rte = rte->next) {
  706                                 if (rte->ifp == ifp)
  707                                         rte->ifp = NULL;
  708                         }
  709                 }
  710         }
  711 }
  712 
  713 /*
  714  * Set PIM assert processing global
  715  */
  716 static int
  717 set_assert(int i)
  718 {
  719         pim_assert = !!i;
  720         return 0;
  721 }
  722 
  723 /*
  724  * Configure API capabilities
  725  */
  726 static int
  727 set_api_config(struct sockopt *sopt)
  728 {
  729         u_int32_t apival;
  730         int i, error;
  731 
  732         /*
  733          * We can set the API capabilities only if it is the first operation
  734          * after MRT_INIT. I.e.:
  735          *  - there are no vifs installed
  736          *  - pim_assert is not enabled
  737          *  - the MFC table is empty
  738          */
  739         error = sockopt_get(sopt, &apival, sizeof(apival));
  740         if (error)
  741                 return error;
  742         if (numvifs > 0)
  743                 return EPERM;
  744         if (pim_assert)
  745                 return EPERM;
  746         for (i = 0; i < MFCTBLSIZ; i++) {
  747                 if (LIST_FIRST(&mfchashtbl[i]) != NULL)
  748                         return EPERM;
  749         }
  750 
  751         mrt_api_config = apival & mrt_api_support;
  752         return 0;
  753 }
  754 
  755 /*
  756  * Add a vif to the vif table
  757  */
  758 static int
  759 add_vif(struct vifctl *vifcp)
  760 {
  761         struct vif *vifp;
  762         struct ifnet *ifp;
  763         int error, s;
  764         struct sockaddr_in sin;
  765 
  766         if (vifcp->vifc_vifi >= MAXVIFS)
  767                 return EINVAL;
  768         if (in_nullhost(vifcp->vifc_lcl_addr))
  769                 return EADDRNOTAVAIL;
  770 
  771         vifp = &viftable[vifcp->vifc_vifi];
  772         if (!in_nullhost(vifp->v_lcl_addr))
  773                 return EADDRINUSE;
  774 
  775         /* Find the interface with an address in AF_INET family. */
  776 #ifdef PIM
  777         if (vifcp->vifc_flags & VIFF_REGISTER) {
  778                 /*
  779                  * XXX: Because VIFF_REGISTER does not really need a valid
  780                  * local interface (e.g. it could be 127.0.0.2), we don't
  781                  * check its address.
  782                  */
  783                 ifp = NULL;
  784         } else
  785 #endif
  786         {
  787                 struct ifaddr *ifa;
  788 
  789                 sockaddr_in_init(&sin, &vifcp->vifc_lcl_addr, 0);
  790                 s = pserialize_read_enter();
  791                 ifa = ifa_ifwithaddr(sintosa(&sin));
  792                 if (ifa == NULL) {
  793                         pserialize_read_exit(s);
  794                         return EADDRNOTAVAIL;
  795                 }
  796                 ifp = ifa->ifa_ifp;
  797                 /* FIXME NOMPSAFE */
  798                 pserialize_read_exit(s);
  799         }
  800 
  801         if (vifcp->vifc_flags & VIFF_TUNNEL) {
  802                 if (vifcp->vifc_flags & VIFF_SRCRT) {
  803                         log(LOG_ERR, "source routed tunnels not supported\n");
  804                         return EOPNOTSUPP;
  805                 }
  806 
  807                 /* attach this vif to decapsulator dispatch table */
  808                 /*
  809                  * XXX Use addresses in registration so that matching
  810                  * can be done with radix tree in decapsulator.  But,
  811                  * we need to check inner header for multicast, so
  812                  * this requires both radix tree lookup and then a
  813                  * function to check, and this is not supported yet.
  814                  */
  815                 error = encap_lock_enter();
  816                 if (error)
  817                         return error;
  818                 vifp->v_encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV4,
  819                     vif_encapcheck, &vif_encapsw, vifp);
  820                 encap_lock_exit();
  821                 if (!vifp->v_encap_cookie)
  822                         return EINVAL;
  823 
  824                 /* Create a fake encapsulation interface. */
  825                 ifp = malloc(sizeof(*ifp), M_MRTABLE, M_WAITOK|M_ZERO);
  826                 snprintf(ifp->if_xname, sizeof(ifp->if_xname),
  827                          "mdecap%d", vifcp->vifc_vifi);
  828 
  829                 /* Prepare cached route entry. */
  830                 memset(&vifp->v_route, 0, sizeof(vifp->v_route));
  831 #ifdef PIM
  832         } else if (vifcp->vifc_flags & VIFF_REGISTER) {
  833                 ifp = &multicast_register_if;
  834                 if (mrtdebug)
  835                         log(LOG_DEBUG, "Adding a register vif, ifp: %p\n",
  836                             (void *)ifp);
  837                 if (reg_vif_num == VIFI_INVALID) {
  838                         memset(ifp, 0, sizeof(*ifp));
  839                         snprintf(ifp->if_xname, sizeof(ifp->if_xname),
  840                                  "register_vif");
  841                         ifp->if_flags = IFF_LOOPBACK;
  842                         memset(&vifp->v_route, 0, sizeof(vifp->v_route));
  843                         reg_vif_num = vifcp->vifc_vifi;
  844                 }
  845 #endif
  846         } else {
  847                 /* Make sure the interface supports multicast. */
  848                 if ((ifp->if_flags & IFF_MULTICAST) == 0)
  849                         return EOPNOTSUPP;
  850 
  851                 /* Enable promiscuous reception of all IP multicasts. */
  852                 sockaddr_in_init(&sin, &zeroin_addr, 0);
  853                 error = if_mcast_op(ifp, SIOCADDMULTI, sintosa(&sin));
  854                 if (error)
  855                         return error;
  856         }
  857 
  858         s = splsoftnet();
  859 
  860         /* Define parameters for the tbf structure. */
  861         vifp->tbf_q = NULL;
  862         vifp->tbf_t = &vifp->tbf_q;
  863         microtime(&vifp->tbf_last_pkt_t);
  864         vifp->tbf_n_tok = 0;
  865         vifp->tbf_q_len = 0;
  866         vifp->tbf_max_q_len = MAXQSIZE;
  867 
  868         vifp->v_flags = vifcp->vifc_flags;
  869         vifp->v_threshold = vifcp->vifc_threshold;
  870         /* scaling up here allows division by 1024 in critical code */
  871         vifp->v_rate_limit = vifcp->vifc_rate_limit * 1024 / 1000;
  872         vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
  873         vifp->v_rmt_addr = vifcp->vifc_rmt_addr;
  874         vifp->v_ifp = ifp;
  875         /* Initialize per vif pkt counters. */
  876         vifp->v_pkt_in = 0;
  877         vifp->v_pkt_out = 0;
  878         vifp->v_bytes_in = 0;
  879         vifp->v_bytes_out = 0;
  880 
  881         callout_init(&vifp->v_repq_ch, 0);
  882 
  883         splx(s);
  884 
  885         /* Adjust numvifs up if the vifi is higher than numvifs. */
  886         if (numvifs <= vifcp->vifc_vifi)
  887                 numvifs = vifcp->vifc_vifi + 1;
  888 
  889         if (mrtdebug)
  890                 log(LOG_DEBUG, "add_vif #%d, lcladdr %x, %s %x, thresh %x, rate %d\n",
  891                     vifcp->vifc_vifi,
  892                     ntohl(vifcp->vifc_lcl_addr.s_addr),
  893                     (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
  894                     ntohl(vifcp->vifc_rmt_addr.s_addr),
  895                     vifcp->vifc_threshold,
  896                     vifcp->vifc_rate_limit);
  897 
  898         return 0;
  899 }
  900 
  901 void
  902 reset_vif(struct vif *vifp)
  903 {
  904         struct mbuf *m, *n;
  905         struct ifnet *ifp;
  906         struct sockaddr_in sin;
  907 
  908         callout_stop(&vifp->v_repq_ch);
  909 
  910         /* detach this vif from decapsulator dispatch table */
  911         encap_lock_enter();
  912         encap_detach(vifp->v_encap_cookie);
  913         encap_lock_exit();
  914         vifp->v_encap_cookie = NULL;
  915 
  916         /*
  917          * Free packets queued at the interface
  918          */
  919         for (m = vifp->tbf_q; m != NULL; m = n) {
  920                 n = m->m_nextpkt;
  921                 m_freem(m);
  922         }
  923 
  924         if (vifp->v_flags & VIFF_TUNNEL)
  925                 free(vifp->v_ifp, M_MRTABLE);
  926         else if (vifp->v_flags & VIFF_REGISTER) {
  927 #ifdef PIM
  928                 reg_vif_num = VIFI_INVALID;
  929 #endif
  930         } else {
  931                 sockaddr_in_init(&sin, &zeroin_addr, 0);
  932                 ifp = vifp->v_ifp;
  933                 if_mcast_op(ifp, SIOCDELMULTI, sintosa(&sin));
  934         }
  935         memset((void *)vifp, 0, sizeof(*vifp));
  936 }
  937 
  938 /*
  939  * Delete a vif from the vif table
  940  */
  941 static int
  942 del_vif(vifi_t *vifip)
  943 {
  944         struct vif *vifp;
  945         vifi_t vifi;
  946         int s;
  947 
  948         if (*vifip >= numvifs)
  949                 return EINVAL;
  950 
  951         vifp = &viftable[*vifip];
  952         if (in_nullhost(vifp->v_lcl_addr))
  953                 return EADDRNOTAVAIL;
  954 
  955         s = splsoftnet();
  956 
  957         reset_vif(vifp);
  958 
  959         /* Adjust numvifs down */
  960         for (vifi = numvifs; vifi > 0; vifi--)
  961                 if (!in_nullhost(viftable[vifi - 1].v_lcl_addr))
  962                         break;
  963         numvifs = vifi;
  964 
  965         splx(s);
  966 
  967         if (mrtdebug)
  968                 log(LOG_DEBUG, "del_vif %d, numvifs %d\n", *vifip, numvifs);
  969 
  970         return 0;
  971 }
  972 
  973 /*
  974  * update an mfc entry without resetting counters and S,G addresses.
  975  */
  976 static void
  977 update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
  978 {
  979         int i;
  980 
  981         rt->mfc_parent = mfccp->mfcc_parent;
  982         for (i = 0; i < numvifs; i++) {
  983                 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
  984                 rt->mfc_flags[i] = mfccp->mfcc_flags[i] & mrt_api_config &
  985                         MRT_MFC_FLAGS_ALL;
  986         }
  987         /* set the RP address */
  988         if (mrt_api_config & MRT_MFC_RP)
  989                 rt->mfc_rp = mfccp->mfcc_rp;
  990         else
  991                 rt->mfc_rp = zeroin_addr;
  992 }
  993 
  994 /*
  995  * fully initialize an mfc entry from the parameter.
  996  */
  997 static void
  998 init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
  999 {
 1000         rt->mfc_origin     = mfccp->mfcc_origin;
 1001         rt->mfc_mcastgrp   = mfccp->mfcc_mcastgrp;
 1002 
 1003         update_mfc_params(rt, mfccp);
 1004 
 1005         /* initialize pkt counters per src-grp */
 1006         rt->mfc_pkt_cnt    = 0;
 1007         rt->mfc_byte_cnt   = 0;
 1008         rt->mfc_wrong_if   = 0;
 1009         timerclear(&rt->mfc_last_assert);
 1010 }
 1011 
 1012 static void
 1013 expire_mfc(struct mfc *rt)
 1014 {
 1015         struct rtdetq *rte, *nrte;
 1016 
 1017         free_bw_list(rt->mfc_bw_meter);
 1018 
 1019         for (rte = rt->mfc_stall; rte != NULL; rte = nrte) {
 1020                 nrte = rte->next;
 1021                 m_freem(rte->m);
 1022                 free(rte, M_MRTABLE);
 1023         }
 1024 
 1025         LIST_REMOVE(rt, mfc_hash);
 1026         free(rt, M_MRTABLE);
 1027 }
 1028 
 1029 /*
 1030  * Add an mfc entry
 1031  */
 1032 static int
 1033 add_mfc(struct sockopt *sopt)
 1034 {
 1035         struct mfcctl2 mfcctl2;
 1036         struct mfcctl2 *mfccp;
 1037         struct mfc *rt;
 1038         u_int32_t hash = 0;
 1039         struct rtdetq *rte, *nrte;
 1040         u_short nstl;
 1041         int s;
 1042         int error;
 1043 
 1044         /*
 1045          * select data size depending on API version.
 1046          */
 1047         mfccp = &mfcctl2;
 1048         memset(&mfcctl2, 0, sizeof(mfcctl2));
 1049 
 1050         if (mrt_api_config & MRT_API_FLAGS_ALL)
 1051                 error = sockopt_get(sopt, mfccp, sizeof(struct mfcctl2));
 1052         else
 1053                 error = sockopt_get(sopt, mfccp, sizeof(struct mfcctl));
 1054 
 1055         if (error)
 1056                 return error;
 1057 
 1058         s = splsoftnet();
 1059         rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp);
 1060 
 1061         /* If an entry already exists, just update the fields */
 1062         if (rt) {
 1063                 if (mrtdebug & DEBUG_MFC)
 1064                         log(LOG_DEBUG, "add_mfc update o %x g %x p %x\n",
 1065                             ntohl(mfccp->mfcc_origin.s_addr),
 1066                             ntohl(mfccp->mfcc_mcastgrp.s_addr),
 1067                             mfccp->mfcc_parent);
 1068 
 1069                 update_mfc_params(rt, mfccp);
 1070 
 1071                 splx(s);
 1072                 return 0;
 1073         }
 1074 
 1075         /*
 1076          * Find the entry for which the upcall was made and update
 1077          */
 1078         nstl = 0;
 1079         hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp);
 1080         LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
 1081                 if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
 1082                     in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) &&
 1083                     rt->mfc_stall != NULL) {
 1084                         if (nstl++)
 1085                                 log(LOG_ERR, "add_mfc %s o %x g %x p %x dbx %p\n",
 1086                                     "multiple kernel entries",
 1087                                     ntohl(mfccp->mfcc_origin.s_addr),
 1088                                     ntohl(mfccp->mfcc_mcastgrp.s_addr),
 1089                                     mfccp->mfcc_parent, rt->mfc_stall);
 1090 
 1091                         if (mrtdebug & DEBUG_MFC)
 1092                                 log(LOG_DEBUG, "add_mfc o %x g %x p %x dbg %p\n",
 1093                                     ntohl(mfccp->mfcc_origin.s_addr),
 1094                                     ntohl(mfccp->mfcc_mcastgrp.s_addr),
 1095                                     mfccp->mfcc_parent, rt->mfc_stall);
 1096 
 1097                         rte = rt->mfc_stall;
 1098                         init_mfc_params(rt, mfccp);
 1099                         rt->mfc_stall = NULL;
 1100 
 1101                         rt->mfc_expire = 0; /* Don't clean this guy up */
 1102                         nexpire[hash]--;
 1103 
 1104                         /* free packets Qed at the end of this entry */
 1105                         for (; rte != NULL; rte = nrte) {
 1106                                 nrte = rte->next;
 1107                                 if (rte->ifp) {
 1108                                         ip_mdq(rte->m, rte->ifp, rt);
 1109                                 }
 1110                                 m_freem(rte->m);
 1111 #ifdef UPCALL_TIMING
 1112                                 collate(&rte->t);
 1113 #endif /* UPCALL_TIMING */
 1114                                 free(rte, M_MRTABLE);
 1115                         }
 1116                 }
 1117         }
 1118 
 1119         /*
 1120          * It is possible that an entry is being inserted without an upcall
 1121          */
 1122         if (nstl == 0) {
 1123                 /*
 1124                  * No mfc; make a new one
 1125                  */
 1126                 if (mrtdebug & DEBUG_MFC)
 1127                         log(LOG_DEBUG, "add_mfc no upcall o %x g %x p %x\n",
 1128                             ntohl(mfccp->mfcc_origin.s_addr),
 1129                             ntohl(mfccp->mfcc_mcastgrp.s_addr),
 1130                             mfccp->mfcc_parent);
 1131 
 1132                 LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
 1133                         if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
 1134                             in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp)) {
 1135                                 init_mfc_params(rt, mfccp);
 1136                                 if (rt->mfc_expire)
 1137                                         nexpire[hash]--;
 1138                                 rt->mfc_expire = 0;
 1139                                 break; /* XXX */
 1140                         }
 1141                 }
 1142                 if (rt == NULL) {       /* no upcall, so make a new entry */
 1143                         rt = malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
 1144                         if (rt == NULL) {
 1145                                 splx(s);
 1146                                 return ENOBUFS;
 1147                         }
 1148 
 1149                         init_mfc_params(rt, mfccp);
 1150                         rt->mfc_expire  = 0;
 1151                         rt->mfc_stall   = NULL;
 1152                         rt->mfc_bw_meter = NULL;
 1153 
 1154                         /* insert new entry at head of hash chain */
 1155                         LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash);
 1156                 }
 1157         }
 1158 
 1159         splx(s);
 1160         return 0;
 1161 }
 1162 
 #ifdef UPCALL_TIMING
 /*
  * Collect delay statistics on the upcalls.
  *
  * If the timestamp *t lies before the current time, the difference
  * computed by TV_DELTA() is scaled down by 1024 and counted in the
  * matching upcall_data[] bucket; everything above 50 lands in bucket 50.
  * A timestamp that is not in the past is ignored.
  */
 static void
 collate(struct timeval *t)
 {
         struct timeval now;
         u_int32_t elapsed;
         u_int32_t bucket;

         microtime(&now);

         if (!timercmp(t, &now, <))
                 return;

         TV_DELTA(now, *t, elapsed);

         bucket = elapsed >> 10;
         if (bucket > 50)
                 bucket = 50;

         ++upcall_data[bucket];
 }
 #endif /* UPCALL_TIMING */
 1187 
 1188 /*
 1189  * Delete an mfc entry
 1190  */
 1191 static int
 1192 del_mfc(struct sockopt *sopt)
 1193 {
 1194         struct mfcctl2 mfcctl2;
 1195         struct mfcctl2 *mfccp;
 1196         struct mfc *rt;
 1197         int s;
 1198         int error;
 1199 
 1200         /*
 1201          * XXX: for deleting MFC entries the information in entries
 1202          * of size "struct mfcctl" is sufficient.
 1203          */
 1204 
 1205         mfccp = &mfcctl2;
 1206         memset(&mfcctl2, 0, sizeof(mfcctl2));
 1207 
 1208         error = sockopt_get(sopt, mfccp, sizeof(struct mfcctl));
 1209         if (error) {
 1210                 /* Try with the size of mfcctl2. */
 1211                 error = sockopt_get(sopt, mfccp, sizeof(struct mfcctl2));
 1212                 if (error)
 1213                         return error;
 1214         }
 1215 
 1216         if (mrtdebug & DEBUG_MFC)
 1217                 log(LOG_DEBUG, "del_mfc origin %x mcastgrp %x\n",
 1218                     ntohl(mfccp->mfcc_origin.s_addr),
 1219                     ntohl(mfccp->mfcc_mcastgrp.s_addr));
 1220 
 1221         s = splsoftnet();
 1222 
 1223         rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp);
 1224         if (rt == NULL) {
 1225                 splx(s);
 1226                 return EADDRNOTAVAIL;
 1227         }
 1228 
 1229         /*
 1230          * free the bw_meter entries
 1231          */
 1232         free_bw_list(rt->mfc_bw_meter);
 1233         rt->mfc_bw_meter = NULL;
 1234 
 1235         LIST_REMOVE(rt, mfc_hash);
 1236         free(rt, M_MRTABLE);
 1237 
 1238         splx(s);
 1239         return 0;
 1240 }
 1241 
 1242 static int
 1243 socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src)
 1244 {
 1245         if (s) {
 1246                 if (sbappendaddr(&s->so_rcv, sintosa(src), mm, NULL) != 0) {
 1247                         sorwakeup(s);
 1248                         return 0;
 1249                 }
 1250                 soroverflow(s);
 1251         }
 1252         m_freem(mm);
 1253         return -1;
 1254 }
 1255 
 1256 /*
 1257  * IP multicast forwarding function. This function assumes that the packet
 1258  * pointed to by "ip" has arrived on (or is about to be sent to) the interface
 1259  * pointed to by "ifp", and the packet is to be relayed to other networks
 1260  * that have members of the packet's destination IP multicast group.
 1261  *
 1262  * The packet is returned unscathed to the caller, unless it is
 1263  * erroneous, in which case a non-zero return value tells the caller to
 1264  * discard it.
 1265  */
 1266 
 1267 #define IP_HDR_LEN  20  /* # bytes of fixed IP header (excluding options) */
 1268 #define TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */
 1269 
 1270 int
 1271 ip_mforward(struct mbuf *m, struct ifnet *ifp)
 1272 {
 1273         int rc;
 1274         /*
 1275          * save csum_flags to uphold the 
 1276          * "unscathed" guarantee.
 1277          * ip_output() relies on that and
 1278          * without it we send out
 1279          * multicast packets with an invalid
 1280          * checksum
 1281          *
 1282          * see PR kern/55779
 1283          */
 1284         int csum_flags = m->m_pkthdr.csum_flags;
 1285 
 1286         /*
 1287          * Temporarily clear any in-bound checksum flags for this packet.
 1288          */
 1289         m->m_pkthdr.csum_flags = 0;
 1290 
 1291         rc = ip_mforward_real(m, ifp);
 1292 
 1293         m->m_pkthdr.csum_flags = csum_flags;
 1294 
 1295         return rc;
 1296 }
 1297 
 1298 static int
 1299 ip_mforward_real(struct mbuf *m, struct ifnet *ifp)
 1300 {
 1301         struct ip *ip = mtod(m, struct ip *);
 1302         struct mfc *rt;
 1303         static int srctun = 0;
 1304         struct mbuf *mm;
 1305         struct sockaddr_in sin;
 1306         int s;
 1307         vifi_t vifi;
 1308 
 1309         if (mrtdebug & DEBUG_FORWARD)
 1310                 log(LOG_DEBUG, "ip_mforward: src %x, dst %x, ifp %p\n",
 1311                     ntohl(ip->ip_src.s_addr), ntohl(ip->ip_dst.s_addr), ifp);
 1312 
 1313         /*
 1314          * XXX XXX: Why do we check [1] against IPOPT_LSRR? Because we
 1315          * expect [0] to be IPOPT_NOP, maybe? In all cases that doesn't
 1316          * make a lot of sense, a forged packet can just put two IPOPT_NOPs
 1317          * followed by one IPOPT_LSRR, and bypass the check.
 1318          */
 1319         if (ip->ip_hl < (IP_HDR_LEN + TUNNEL_LEN) >> 2 ||
 1320             ((u_char *)(ip + 1))[1] != IPOPT_LSRR) {
 1321                 /*
 1322                  * Packet arrived via a physical interface or
 1323                  * an encapsulated tunnel or a register_vif.
 1324                  */
 1325         } else {
 1326                 /*
 1327                  * Packet arrived through a source-route tunnel.
 1328                  * Source-route tunnels are no longer supported.
 1329                  */
 1330                 if ((srctun++ % 1000) == 0)
 1331                         log(LOG_ERR,
 1332                             "ip_mforward: received source-routed packet from %x\n",
 1333                             ntohl(ip->ip_src.s_addr));
 1334                 return EOPNOTSUPP;
 1335         }
 1336 
 1337         /*
 1338          * Don't forward a packet with time-to-live of zero or one,
 1339          * or a packet destined to a local-only group.
 1340          */
 1341         if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ip->ip_dst.s_addr))
 1342                 return 0;
 1343 
 1344         /*
 1345          * Determine forwarding vifs from the forwarding cache table
 1346          */
 1347         s = splsoftnet();
 1348         ++mrtstat.mrts_mfc_lookups;
 1349         rt = mfc_find(&ip->ip_src, &ip->ip_dst);
 1350 
 1351         /* Entry exists, so forward if necessary */
 1352         if (rt != NULL) {
 1353                 splx(s);
 1354                 return ip_mdq(m, ifp, rt);
 1355         } else {
 1356                 /*
 1357                  * If we don't have a route for packet's origin, make a copy
 1358                  * of the packet and send message to routing daemon.
 1359                  */
 1360 
 1361                 struct mbuf *mb0;
 1362                 struct rtdetq *rte;
 1363                 u_int32_t hash;
 1364                 const int hlen = ip->ip_hl << 2;
 1365 #ifdef UPCALL_TIMING
 1366                 struct timeval tp;
 1367                 microtime(&tp);
 1368 #endif
 1369 
 1370                 ++mrtstat.mrts_mfc_misses;
 1371 
 1372                 mrtstat.mrts_no_route++;
 1373                 if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC))
 1374                         log(LOG_DEBUG, "ip_mforward: no rte s %x g %x\n",
 1375                             ntohl(ip->ip_src.s_addr),
 1376                             ntohl(ip->ip_dst.s_addr));
 1377 
 1378                 /*
 1379                  * Allocate mbufs early so that we don't do extra work if we are
 1380                  * just going to fail anyway.  Make sure to pullup the header so
 1381                  * that other people can't step on it.
 1382                  */
 1383                 rte = malloc(sizeof(*rte), M_MRTABLE, M_NOWAIT);
 1384                 if (rte == NULL) {
 1385                         splx(s);
 1386                         return ENOBUFS;
 1387                 }
 1388                 mb0 = m_copypacket(m, M_DONTWAIT);
 1389                 M_PULLUP(mb0, hlen);
 1390                 if (mb0 == NULL) {
 1391                         free(rte, M_MRTABLE);
 1392                         splx(s);
 1393                         return ENOBUFS;
 1394                 }
 1395 
 1396                 /* is there an upcall waiting for this flow? */
 1397                 hash = MFCHASH(ip->ip_src, ip->ip_dst);
 1398                 LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
 1399                         if (in_hosteq(ip->ip_src, rt->mfc_origin) &&
 1400                             in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) &&
 1401                             rt->mfc_stall != NULL)
 1402                                 break;
 1403                 }
 1404 
 1405                 if (rt == NULL) {
 1406                         int i;
 1407                         struct igmpmsg *im;
 1408 
 1409                         /*
 1410                          * Locate the vifi for the incoming interface for
 1411                          * this packet.
 1412                          * If none found, drop packet.
 1413                          */
 1414                         for (vifi = 0; vifi < numvifs &&
 1415                                  viftable[vifi].v_ifp != ifp; vifi++)
 1416                                 ;
 1417                         if (vifi >= numvifs) /* vif not found, drop packet */
 1418                                 goto non_fatal;
 1419 
 1420                         /* no upcall, so make a new entry */
 1421                         rt = malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
 1422                         if (rt == NULL)
 1423                                 goto fail;
 1424 
 1425                         /*
 1426                          * Make a copy of the header to send to the user level
 1427                          * process
 1428                          */
 1429                         mm = m_copym(m, 0, hlen, M_DONTWAIT);
 1430                         M_PULLUP(mm, hlen);
 1431                         if (mm == NULL)
 1432                                 goto fail1;
 1433 
 1434                         /*
 1435                          * Send message to routing daemon to install
 1436                          * a route into the kernel table
 1437                          */
 1438 
 1439                         im = mtod(mm, struct igmpmsg *);
 1440                         im->im_msgtype = IGMPMSG_NOCACHE;
 1441                         im->im_mbz = 0;
 1442                         im->im_vif = vifi;
 1443 
 1444                         mrtstat.mrts_upcalls++;
 1445 
 1446                         sockaddr_in_init(&sin, &ip->ip_src, 0);
 1447                         if (socket_send(ip_mrouter, mm, &sin) < 0) {
 1448                                 log(LOG_WARNING,
 1449                                     "ip_mforward: ip_mrouter socket queue full\n");
 1450                                 ++mrtstat.mrts_upq_sockfull;
 1451                         fail1:
 1452                                 free(rt, M_MRTABLE);
 1453                         fail:
 1454                                 free(rte, M_MRTABLE);
 1455                                 m_freem(mb0);
 1456                                 splx(s);
 1457                                 return ENOBUFS;
 1458                         }
 1459 
 1460                         /* insert new entry at head of hash chain */
 1461                         rt->mfc_origin = ip->ip_src;
 1462                         rt->mfc_mcastgrp = ip->ip_dst;
 1463                         rt->mfc_pkt_cnt = 0;
 1464                         rt->mfc_byte_cnt = 0;
 1465                         rt->mfc_wrong_if = 0;
 1466                         rt->mfc_expire = UPCALL_EXPIRE;
 1467                         nexpire[hash]++;
 1468                         for (i = 0; i < numvifs; i++) {
 1469                                 rt->mfc_ttls[i] = 0;
 1470                                 rt->mfc_flags[i] = 0;
 1471                         }
 1472                         rt->mfc_parent = -1;
 1473 
 1474                         /* clear the RP address */
 1475                         rt->mfc_rp = zeroin_addr;
 1476 
 1477                         rt->mfc_bw_meter = NULL;
 1478 
 1479                         /* link into table */
 1480                         LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash);
 1481                         /* Add this entry to the end of the queue */
 1482                         rt->mfc_stall = rte;
 1483                 } else {
 1484                         /* determine if q has overflowed */
 1485                         struct rtdetq **p;
 1486                         int npkts = 0;
 1487 
 1488                         /*
 1489                          * XXX ouch! we need to append to the list, but we
 1490                          * only have a pointer to the front, so we have to
 1491                          * scan the entire list every time.
 1492                          */
 1493                         for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next)
 1494                                 if (++npkts > MAX_UPQ) {
 1495                                         mrtstat.mrts_upq_ovflw++;
 1496                                 non_fatal:
 1497                                         free(rte, M_MRTABLE);
 1498                                         m_freem(mb0);
 1499                                         splx(s);
 1500                                         return 0;
 1501                                 }
 1502 
 1503                         /* Add this entry to the end of the queue */
 1504                         *p = rte;
 1505                 }
 1506 
 1507                 rte->next = NULL;
 1508                 rte->m = mb0;
 1509                 rte->ifp = ifp;
 1510 #ifdef UPCALL_TIMING
 1511                 rte->t = tp;
 1512 #endif
 1513 
 1514                 splx(s);
 1515 
 1516                 return 0;
 1517         }
 1518 }
 1519 
 1520 /*ARGSUSED*/
 1521 static void
 1522 expire_upcalls(void *v)
 1523 {
 1524         int i;
 1525 
 1526         /* XXX NOMPSAFE still need softnet_lock */
 1527         mutex_enter(softnet_lock);
 1528         KERNEL_LOCK(1, NULL);
 1529 
 1530         for (i = 0; i < MFCTBLSIZ; i++) {
 1531                 struct mfc *rt, *nrt;
 1532 
 1533                 if (nexpire[i] == 0)
 1534                         continue;
 1535 
 1536                 for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) {
 1537                         nrt = LIST_NEXT(rt, mfc_hash);
 1538 
 1539                         if (rt->mfc_expire == 0 || --rt->mfc_expire > 0)
 1540                                 continue;
 1541                         nexpire[i]--;
 1542 
 1543                         /*
 1544                          * free the bw_meter entries
 1545                          */
 1546                         while (rt->mfc_bw_meter != NULL) {
 1547                                 struct bw_meter *x = rt->mfc_bw_meter;
 1548 
 1549                                 rt->mfc_bw_meter = x->bm_mfc_next;
 1550                                 kmem_intr_free(x, sizeof(*x));
 1551                         }
 1552 
 1553                         ++mrtstat.mrts_cache_cleanups;
 1554                         if (mrtdebug & DEBUG_EXPIRE)
 1555                                 log(LOG_DEBUG,
 1556                                     "expire_upcalls: expiring (%x %x)\n",
 1557                                     ntohl(rt->mfc_origin.s_addr),
 1558                                     ntohl(rt->mfc_mcastgrp.s_addr));
 1559 
 1560                         expire_mfc(rt);
 1561                 }
 1562         }
 1563 
 1564         callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
 1565             expire_upcalls, NULL);
 1566 
 1567         KERNEL_UNLOCK_ONE(NULL);
 1568         mutex_exit(softnet_lock);
 1569 }
 1570 
 /*
  * Macro to send packet on vif: tunnel vifs (VIFF_TUNNEL set in v_flags)
  * go through encap_send(), every other vif through phyint_send().
  */
 #define MC_SEND(ip, vifp, m) do {                                       \
         if (((vifp)->v_flags & VIFF_TUNNEL) == 0)                       \
                 phyint_send((ip), (vifp), (m));                         \
         else                                                            \
                 encap_send((ip), (vifp), (m));                          \
 } while (/*CONSTCOND*/ 0)
 1580 
 1581 /*
 1582  * Packet forwarding routine once entry in the cache is made
 1583  */
 1584 static int
 1585 ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt)
 1586 {
 1587         struct ip *ip = mtod(m, struct ip *);
 1588         vifi_t vifi;
 1589         struct vif *vifp;
 1590         struct sockaddr_in sin;
 1591         const int plen = ntohs(ip->ip_len) - (ip->ip_hl << 2);
 1592 
 1593         /*
 1594          * Don't forward if it didn't arrive from the parent vif for its origin.
 1595          */
 1596         vifi = rt->mfc_parent;
 1597         if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) {
 1598                 /* came in the wrong interface */
 1599                 if (mrtdebug & DEBUG_FORWARD)
 1600                         log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n",
 1601                             ifp, vifi,
 1602                             vifi >= numvifs ? 0 : viftable[vifi].v_ifp);
 1603                 ++mrtstat.mrts_wrong_if;
 1604                 ++rt->mfc_wrong_if;
 1605 
 1606                 /*
 1607                  * If we are doing PIM assert processing, send a message
 1608                  * to the routing daemon.
 1609                  *
 1610                  * XXX: A PIM-SM router needs the WRONGVIF detection so it
 1611                  * can complete the SPT switch, regardless of the type
 1612                  * of the iif (broadcast media, GRE tunnel, etc).
 1613                  */
 1614                 if (pim_assert && (vifi < numvifs) && viftable[vifi].v_ifp) {
 1615                         struct timeval now;
 1616                         u_int32_t delta;
 1617 
 1618 #ifdef PIM
 1619                         if (ifp == &multicast_register_if)
 1620                                 pimstat.pims_rcv_registers_wrongiif++;
 1621 #endif
 1622 
 1623                         /* Get vifi for the incoming packet */
 1624                         for (vifi = 0;
 1625                              vifi < numvifs && viftable[vifi].v_ifp != ifp;
 1626                              vifi++)
 1627                             ;
 1628                         if (vifi >= numvifs) {
 1629                                 /* The iif is not found: ignore the packet. */
 1630                                 return 0;
 1631                         }
 1632 
 1633                         if (rt->mfc_flags[vifi] &
 1634                             MRT_MFC_FLAGS_DISABLE_WRONGVIF) {
 1635                                 /* WRONGVIF disabled: ignore the packet */
 1636                                 return 0;
 1637                         }
 1638 
 1639                         microtime(&now);
 1640 
 1641                         TV_DELTA(rt->mfc_last_assert, now, delta);
 1642 
 1643                         if (delta > ASSERT_MSG_TIME) {
 1644                                 struct igmpmsg *im;
 1645                                 const int hlen = ip->ip_hl << 2;
 1646                                 struct mbuf *mm =
 1647                                     m_copym(m, 0, hlen, M_DONTWAIT);
 1648 
 1649                                 M_PULLUP(mm, hlen);
 1650                                 if (mm == NULL)
 1651                                         return ENOBUFS;
 1652 
 1653                                 rt->mfc_last_assert = now;
 1654 
 1655                                 im = mtod(mm, struct igmpmsg *);
 1656                                 im->im_msgtype  = IGMPMSG_WRONGVIF;
 1657                                 im->im_mbz      = 0;
 1658                                 im->im_vif      = vifi;
 1659 
 1660                                 mrtstat.mrts_upcalls++;
 1661 
 1662                                 sockaddr_in_init(&sin, &im->im_src, 0);
 1663                                 if (socket_send(ip_mrouter, mm, &sin) < 0) {
 1664                                         log(LOG_WARNING,
 1665                                             "ip_mforward: ip_mrouter socket queue full\n");
 1666                                         ++mrtstat.mrts_upq_sockfull;
 1667                                         return ENOBUFS;
 1668                                 }
 1669                         }
 1670                 }
 1671                 return 0;
 1672         }
 1673 
 1674         /* If I sourced this packet, it counts as output, else it was input. */
 1675         if (in_hosteq(ip->ip_src, viftable[vifi].v_lcl_addr)) {
 1676                 viftable[vifi].v_pkt_out++;
 1677                 viftable[vifi].v_bytes_out += plen;
 1678         } else {
 1679                 viftable[vifi].v_pkt_in++;
 1680                 viftable[vifi].v_bytes_in += plen;
 1681         }
 1682         rt->mfc_pkt_cnt++;
 1683         rt->mfc_byte_cnt += plen;
 1684 
 1685         /*
 1686          * For each vif, decide if a copy of the packet should be forwarded.
 1687          * Forward if:
 1688          *  - the ttl exceeds the vif's threshold
 1689          *  - there are group members downstream on interface
 1690          */
 1691         for (vifp = viftable, vifi = 0; vifi < numvifs; vifp++, vifi++) {
 1692                 if ((rt->mfc_ttls[vifi] > 0) &&
 1693                         (ip->ip_ttl > rt->mfc_ttls[vifi])) {
 1694                         vifp->v_pkt_out++;
 1695                         vifp->v_bytes_out += plen;
 1696 #ifdef PIM
 1697                         if (vifp->v_flags & VIFF_REGISTER)
 1698                                 pim_register_send(ip, vifp, m, rt);
 1699                         else
 1700 #endif
 1701                         MC_SEND(ip, vifp, m);
 1702                 }
 1703         }
 1704 
 1705         /*
 1706          * Perform upcall-related bw measuring.
 1707          */
 1708         if (rt->mfc_bw_meter != NULL) {
 1709                 struct bw_meter *x;
 1710                 struct timeval now;
 1711 
 1712                 microtime(&now);
 1713                 for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next)
 1714                         bw_meter_receive_packet(x, plen, &now);
 1715         }
 1716 
 1717         return 0;
 1718 }
 1719 
 1720 static void
 1721 phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
 1722 {
 1723         struct mbuf *mb_copy;
 1724         const int hlen = ip->ip_hl << 2;
 1725 
 1726         /*
 1727          * Make a new reference to the packet; make sure that
 1728          * the IP header is actually copied, not just referenced,
 1729          * so that ip_output() only scribbles on the copy.
 1730          */
 1731         mb_copy = m_copypacket(m, M_DONTWAIT);
 1732         M_PULLUP(mb_copy, hlen);
 1733         if (mb_copy == NULL)
 1734                 return;
 1735 
 1736         if (vifp->v_rate_limit <= 0)
 1737                 tbf_send_packet(vifp, mb_copy);
 1738         else
 1739                 tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *),
 1740                     ntohs(ip->ip_len));
 1741 }
 1742 
 1743 static void
 1744 encap_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
 1745 {
 1746         struct mbuf *mb_copy;
 1747         struct ip *ip_copy;
 1748         int i, len = ntohs(ip->ip_len) + sizeof(multicast_encap_iphdr);
 1749 
 1750         /* Take care of delayed checksums */
 1751         if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
 1752                 in_undefer_cksum_tcpudp(m);
 1753                 m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
 1754         }
 1755 
 1756         /*
 1757          * copy the old packet & pullup its IP header into the
 1758          * new mbuf so we can modify it.  Try to fill the new
 1759          * mbuf since if we don't the ethernet driver will.
 1760          */
 1761         MGETHDR(mb_copy, M_DONTWAIT, MT_DATA);
 1762         if (mb_copy == NULL)
 1763                 return;
 1764         mb_copy->m_data += max_linkhdr;
 1765         mb_copy->m_pkthdr.len = len;
 1766         mb_copy->m_len = sizeof(multicast_encap_iphdr);
 1767 
 1768         if ((mb_copy->m_next = m_copypacket(m, M_DONTWAIT)) == NULL) {
 1769                 m_freem(mb_copy);
 1770                 return;
 1771         }
 1772         i = MHLEN - max_linkhdr;
 1773         if (i > len)
 1774                 i = len;
 1775         mb_copy = m_pullup(mb_copy, i);
 1776         if (mb_copy == NULL)
 1777                 return;
 1778 
 1779         /*
 1780          * fill in the encapsulating IP header.
 1781          */
 1782         ip_copy = mtod(mb_copy, struct ip *);
 1783         *ip_copy = multicast_encap_iphdr;
 1784         if (len < IP_MINFRAGSIZE)
 1785                 ip_copy->ip_id = 0;
 1786         else
 1787                 ip_copy->ip_id = ip_newid(NULL);
 1788         ip_copy->ip_len = htons(len);
 1789         ip_copy->ip_src = vifp->v_lcl_addr;
 1790         ip_copy->ip_dst = vifp->v_rmt_addr;
 1791 
 1792         /*
 1793          * turn the encapsulated IP header back into a valid one.
 1794          */
 1795         ip = (struct ip *)((char *)ip_copy + sizeof(multicast_encap_iphdr));
 1796         --ip->ip_ttl;
 1797         ip->ip_sum = 0;
 1798         mb_copy->m_data += sizeof(multicast_encap_iphdr);
 1799         ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
 1800         mb_copy->m_data -= sizeof(multicast_encap_iphdr);
 1801 
 1802         if (vifp->v_rate_limit <= 0)
 1803                 tbf_send_packet(vifp, mb_copy);
 1804         else
 1805                 tbf_control(vifp, mb_copy, ip, ntohs(ip_copy->ip_len));
 1806 }
 1807 
 1808 /*
 1809  * De-encapsulate a packet and feed it back through ip input.
 1810  */
 1811 static void
 1812 vif_input(struct mbuf *m, int off, int proto, void *eparg)
 1813 {
 1814         struct vif *vifp = eparg;
 1815 
 1816         KASSERT(vifp != NULL);
 1817 
 1818         if (proto != ENCAP_PROTO) {
 1819                 m_freem(m);
 1820                 mrtstat.mrts_bad_tunnel++;
 1821                 return;
 1822         }
 1823 
 1824         m_adj(m, off);
 1825         m_set_rcvif(m, vifp->v_ifp);
 1826 
 1827         if (__predict_false(!pktq_enqueue(ip_pktq, m, 0))) {
 1828                 m_freem(m);
 1829         }
 1830 }
 1831 
 1832 /*
 1833  * Check if the packet should be received on the vif denoted by arg.
 1834  * (The encap selection code will call this once per vif since each is
 1835  * registered separately.)
 1836  */
 1837 static int
 1838 vif_encapcheck(struct mbuf *m, int off, int proto, void *arg)
 1839 {
 1840         struct vif *vifp;
 1841         struct ip ip;
 1842 
 1843 #ifdef DIAGNOSTIC
 1844         if (!arg || proto != IPPROTO_IPV4)
 1845                 panic("unexpected arg in vif_encapcheck");
 1846 #endif
 1847 
 1848         /*
 1849          * Accept the packet only if the inner header is multicast
 1850          * and the outer header matches a tunnel-mode vif.  Order
 1851          * checks in the hope that common non-matching packets will be
 1852          * rejected quickly.  Assume that unicast IPv4 traffic in a
 1853          * parallel tunnel (e.g. gif(4)) is unlikely.
 1854          */
 1855 
 1856         /* Obtain the outer IP header and the vif pointer. */
 1857         m_copydata(m, 0, sizeof(ip), (void *)&ip);
 1858         vifp = (struct vif *)arg;
 1859 
 1860         /*
 1861          * The outer source must match the vif's remote peer address.
 1862          * For a multicast router with several tunnels, this is the
 1863          * only check that will fail on packets in other tunnels,
 1864          * assuming the local address is the same.
 1865          */
 1866         if (!in_hosteq(vifp->v_rmt_addr, ip.ip_src))
 1867                 return 0;
 1868 
 1869         /* The outer destination must match the vif's local address. */
 1870         if (!in_hosteq(vifp->v_lcl_addr, ip.ip_dst))
 1871                 return 0;
 1872 
 1873         /* The vif must be of tunnel type. */
 1874         if ((vifp->v_flags & VIFF_TUNNEL) == 0)
 1875                 return 0;
 1876 
 1877         /* Check that the inner destination is multicast. */
 1878         if (off + sizeof(ip) > m->m_pkthdr.len)
 1879                 return 0;
 1880         m_copydata(m, off, sizeof(ip), (void *)&ip);
 1881         if (!IN_MULTICAST(ip.ip_dst.s_addr))
 1882                 return 0;
 1883 
 1884         /*
 1885          * We have checked that both the outer src and dst addresses
 1886          * match the vif, and that the inner destination is multicast
 1887          * (224/5).  By claiming more than 64, we intend to
 1888          * preferentially take packets that also match a parallel
 1889          * gif(4).
 1890          */
 1891         return 32 + 32 + 5;
 1892 }
 1893 
 1894 /*
 1895  * Token bucket filter module
 1896  */
 1897 static void
 1898 tbf_control(struct vif *vifp, struct mbuf *m, struct ip *ip, u_int32_t len)
 1899 {
 1900 
 1901         if (len > MAX_BKT_SIZE) {
 1902                 /* drop if packet is too large */
 1903                 mrtstat.mrts_pkt2large++;
 1904                 m_freem(m);
 1905                 return;
 1906         }
 1907 
 1908         tbf_update_tokens(vifp);
 1909 
 1910         /*
 1911          * If there are enough tokens, and the queue is empty, send this packet
 1912          * out immediately.  Otherwise, try to insert it on this vif's queue.
 1913          */
 1914         if (vifp->tbf_q_len == 0) {
 1915                 if (len <= vifp->tbf_n_tok) {
 1916                         vifp->tbf_n_tok -= len;
 1917                         tbf_send_packet(vifp, m);
 1918                 } else {
 1919                         /* queue packet and timeout till later */
 1920                         tbf_queue(vifp, m);
 1921                         callout_reset(&vifp->v_repq_ch, TBF_REPROCESS,
 1922                             tbf_reprocess_q, vifp);
 1923                 }
 1924         } else {
 1925                 if (vifp->tbf_q_len >= vifp->tbf_max_q_len &&
 1926                     !tbf_dq_sel(vifp, ip)) {
 1927                         /* queue full, and couldn't make room */
 1928                         mrtstat.mrts_q_overflow++;
 1929                         m_freem(m);
 1930                 } else {
 1931                         /* queue length low enough, or made room */
 1932                         tbf_queue(vifp, m);
 1933                         tbf_process_q(vifp);
 1934                 }
 1935         }
 1936 }
 1937 
 1938 /*
 1939  * adds a packet to the queue at the interface
 1940  */
 1941 static void
 1942 tbf_queue(struct vif *vifp, struct mbuf *m)
 1943 {
 1944         int s = splsoftnet();
 1945 
 1946         /* insert at tail */
 1947         *vifp->tbf_t = m;
 1948         vifp->tbf_t = &m->m_nextpkt;
 1949         vifp->tbf_q_len++;
 1950 
 1951         splx(s);
 1952 }
 1953 
 1954 /*
 1955  * processes the queue at the interface
 1956  */
 1957 static void
 1958 tbf_process_q(struct vif *vifp)
 1959 {
 1960         struct mbuf *m;
 1961         int len;
 1962         int s = splsoftnet();
 1963 
 1964         /*
 1965          * Loop through the queue at the interface and send as many packets
 1966          * as possible.
 1967          */
 1968         for (m = vifp->tbf_q; m != NULL; m = vifp->tbf_q) {
 1969                 len = ntohs(mtod(m, struct ip *)->ip_len);
 1970 
 1971                 /* determine if the packet can be sent */
 1972                 if (len <= vifp->tbf_n_tok) {
 1973                         /* if so,
 1974                          * reduce no of tokens, dequeue the packet,
 1975                          * send the packet.
 1976                          */
 1977                         if ((vifp->tbf_q = m->m_nextpkt) == NULL)
 1978                                 vifp->tbf_t = &vifp->tbf_q;
 1979                         --vifp->tbf_q_len;
 1980 
 1981                         m->m_nextpkt = NULL;
 1982                         vifp->tbf_n_tok -= len;
 1983                         tbf_send_packet(vifp, m);
 1984                 } else
 1985                         break;
 1986         }
 1987         splx(s);
 1988 }
 1989 
 1990 static void
 1991 tbf_reprocess_q(void *arg)
 1992 {
 1993         struct vif *vifp = arg;
 1994 
 1995         if (ip_mrouter == NULL)
 1996                 return;
 1997 
 1998         tbf_update_tokens(vifp);
 1999         tbf_process_q(vifp);
 2000 
 2001         if (vifp->tbf_q_len != 0)
 2002                 callout_reset(&vifp->v_repq_ch, TBF_REPROCESS,
 2003                     tbf_reprocess_q, vifp);
 2004 }
 2005 
 2006 /* function that will selectively discard a member of the queue
 2007  * based on the precedence value and the priority
 2008  */
 2009 static int
 2010 tbf_dq_sel(struct vif *vifp, struct ip *ip)
 2011 {
 2012         u_int p;
 2013         struct mbuf **mp, *m;
 2014         int s = splsoftnet();
 2015 
 2016         p = priority(vifp, ip);
 2017 
 2018         for (mp = &vifp->tbf_q, m = *mp;
 2019             m != NULL;
 2020             mp = &m->m_nextpkt, m = *mp) {
 2021                 if (p > priority(vifp, mtod(m, struct ip *))) {
 2022                         if ((*mp = m->m_nextpkt) == NULL)
 2023                                 vifp->tbf_t = mp;
 2024                         --vifp->tbf_q_len;
 2025 
 2026                         m_freem(m);
 2027                         mrtstat.mrts_drop_sel++;
 2028                         splx(s);
 2029                         return 1;
 2030                 }
 2031         }
 2032         splx(s);
 2033         return 0;
 2034 }
 2035 
 2036 static void
 2037 tbf_send_packet(struct vif *vifp, struct mbuf *m)
 2038 {
 2039         int error;
 2040         int s = splsoftnet();
 2041 
 2042         if (vifp->v_flags & VIFF_TUNNEL) {
 2043                 /* If tunnel options */
 2044                 ip_output(m, NULL, &vifp->v_route, IP_FORWARDING, NULL, NULL);
 2045         } else {
 2046                 /* if physical interface option, extract the options and then send */
 2047                 struct ip_moptions imo;
 2048 
 2049                 imo.imo_multicast_if_index = if_get_index(vifp->v_ifp);
 2050                 imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1;
 2051                 imo.imo_multicast_loop = 1;
 2052 
 2053                 error = ip_output(m, NULL, NULL, IP_FORWARDING|IP_MULTICASTOPTS,
 2054                     &imo, NULL);
 2055 
 2056                 if (mrtdebug & DEBUG_XMIT)
 2057                         log(LOG_DEBUG, "phyint_send on vif %ld err %d\n",
 2058                             (long)(vifp - viftable), error);
 2059         }
 2060         splx(s);
 2061 }
 2062 
 2063 /* determine the current time and then
 2064  * the elapsed time (between the last time and time now)
 2065  * in milliseconds & update the no. of tokens in the bucket
 2066  */
 2067 static void
 2068 tbf_update_tokens(struct vif *vifp)
 2069 {
 2070         struct timeval tp;
 2071         u_int32_t tm;
 2072         int s = splsoftnet();
 2073 
 2074         microtime(&tp);
 2075 
 2076         TV_DELTA(tp, vifp->tbf_last_pkt_t, tm);
 2077 
 2078         /*
 2079          * This formula is actually
 2080          * "time in seconds" * "bytes/second".
 2081          *
 2082          * (tm / 1000000) * (v_rate_limit * 1000 * (1000/1024) / 8)
 2083          *
 2084          * The (1000/1024) was introduced in add_vif to optimize
 2085          * this divide into a shift.
 2086          */
 2087         vifp->tbf_n_tok += tm * vifp->v_rate_limit / 8192;
 2088         vifp->tbf_last_pkt_t = tp;
 2089 
 2090         if (vifp->tbf_n_tok > MAX_BKT_SIZE)
 2091                 vifp->tbf_n_tok = MAX_BKT_SIZE;
 2092 
 2093         splx(s);
 2094 }
 2095 
 2096 static int
 2097 priority(struct vif *vifp, struct ip *ip)
 2098 {
 2099         int prio = 50;  /* the lowest priority -- default case */
 2100 
 2101         /* temporary hack; may add general packet classifier some day */
 2102 
 2103         /*
 2104          * XXX XXX: We're reading the UDP header, but we didn't ensure
 2105          * it was present in the packet.
 2106          */
 2107 
 2108         /*
 2109          * The UDP port space is divided up into four priority ranges:
 2110          * [0, 16384)     : unclassified - lowest priority
 2111          * [16384, 32768) : audio - highest priority
 2112          * [32768, 49152) : whiteboard - medium priority
 2113          * [49152, 65536) : video - low priority
 2114          */
 2115         if (ip->ip_p == IPPROTO_UDP) {
 2116                 struct udphdr *udp = (struct udphdr *)(((char *)ip) + (ip->ip_hl << 2));
 2117 
 2118                 switch (ntohs(udp->uh_dport) & 0xc000) {
 2119                 case 0x4000:
 2120                         prio = 70;
 2121                         break;
 2122                 case 0x8000:
 2123                         prio = 60;
 2124                         break;
 2125                 case 0xc000:
 2126                         prio = 55;
 2127                         break;
 2128                 }
 2129 
 2130                 if (tbfdebug > 1)
 2131                         log(LOG_DEBUG, "port %x prio %d\n",
 2132                             ntohs(udp->uh_dport), prio);
 2133         }
 2134 
 2135         return prio;
 2136 }
 2137 
 /*
  * Code for bandwidth monitors
  */

 /*
  * Common interface for the timeval operations used by the bandwidth
  * meters: compare two timevals, and subtract from or add to a timeval
  * in place (the first argument is both operand and destination).
  */
 #define BW_TIMEVALCMP(lhs, rhs, cmp) timercmp((lhs), (rhs), cmp)
 #define BW_TIMEVALDECR(dst, amount) timersub((dst), (amount), (dst))
 #define BW_TIMEVALADD(dst, amount) timeradd((dst), (amount), (dst))
 2148 
 2149 static uint32_t
 2150 compute_bw_meter_flags(struct bw_upcall *req)
 2151 {
 2152         uint32_t flags = 0;
 2153 
 2154         if (req->bu_flags & BW_UPCALL_UNIT_PACKETS)
 2155                 flags |= BW_METER_UNIT_PACKETS;
 2156         if (req->bu_flags & BW_UPCALL_UNIT_BYTES)
 2157                 flags |= BW_METER_UNIT_BYTES;
 2158         if (req->bu_flags & BW_UPCALL_GEQ)
 2159                 flags |= BW_METER_GEQ;
 2160         if (req->bu_flags & BW_UPCALL_LEQ)
 2161                 flags |= BW_METER_LEQ;
 2162 
 2163         return flags;
 2164 }
 2165 
 2166 /*
 2167  * Add a bw_meter entry
 2168  */
 2169 static int
 2170 add_bw_upcall(struct bw_upcall *req)
 2171 {
 2172         int s;
 2173         struct mfc *mfc;
 2174         struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC,
 2175                 BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC };
 2176         struct timeval now;
 2177         struct bw_meter *x;
 2178         uint32_t flags;
 2179 
 2180         if (!(mrt_api_config & MRT_MFC_BW_UPCALL))
 2181                 return EOPNOTSUPP;
 2182 
 2183         /* Test if the flags are valid */
 2184         if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES)))
 2185                 return EINVAL;
 2186         if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)))
 2187                 return EINVAL;
 2188         if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
 2189             == (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
 2190                 return EINVAL;
 2191 
 2192         /* Test if the threshold time interval is valid */
 2193         if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <))
 2194                 return EINVAL;
 2195 
 2196         flags = compute_bw_meter_flags(req);
 2197 
 2198         /*
 2199          * Find if we have already same bw_meter entry
 2200          */
 2201         s = splsoftnet();
 2202         mfc = mfc_find(&req->bu_src, &req->bu_dst);
 2203         if (mfc == NULL) {
 2204                 splx(s);
 2205                 return EADDRNOTAVAIL;
 2206         }
 2207         for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) {
 2208                 if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
 2209                     &req->bu_threshold.b_time, ==)) &&
 2210                     (x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
 2211                     (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
 2212                     (x->bm_flags & BW_METER_USER_FLAGS) == flags)  {
 2213                         splx(s);
 2214                         return 0;               /* XXX Already installed */
 2215                 }
 2216         }
 2217 
 2218         /* Allocate the new bw_meter entry */
 2219         x = kmem_intr_alloc(sizeof(*x), KM_NOSLEEP);
 2220         if (x == NULL) {
 2221                 splx(s);
 2222                 return ENOBUFS;
 2223         }
 2224 
 2225         /* Set the new bw_meter entry */
 2226         x->bm_threshold.b_time = req->bu_threshold.b_time;
 2227         microtime(&now);
 2228         x->bm_start_time = now;
 2229         x->bm_threshold.b_packets = req->bu_threshold.b_packets;
 2230         x->bm_threshold.b_bytes = req->bu_threshold.b_bytes;
 2231         x->bm_measured.b_packets = 0;
 2232         x->bm_measured.b_bytes = 0;
 2233         x->bm_flags = flags;
 2234         x->bm_time_next = NULL;
 2235         x->bm_time_hash = BW_METER_BUCKETS;
 2236 
 2237         /* Add the new bw_meter entry to the front of entries for this MFC */
 2238         x->bm_mfc = mfc;
 2239         x->bm_mfc_next = mfc->mfc_bw_meter;
 2240         mfc->mfc_bw_meter = x;
 2241         schedule_bw_meter(x, &now);
 2242         splx(s);
 2243 
 2244         return 0;
 2245 }
 2246 
 2247 static void
 2248 free_bw_list(struct bw_meter *list)
 2249 {
 2250         while (list != NULL) {
 2251                 struct bw_meter *x = list;
 2252 
 2253                 list = list->bm_mfc_next;
 2254                 unschedule_bw_meter(x);
 2255                 kmem_intr_free(x, sizeof(*x));
 2256         }
 2257 }
 2258 
 2259 /*
 2260  * Delete one or multiple bw_meter entries
 2261  */
 2262 static int
 2263 del_bw_upcall(struct bw_upcall *req)
 2264 {
 2265         int s;
 2266         struct mfc *mfc;
 2267         struct bw_meter *x;
 2268 
 2269         if (!(mrt_api_config & MRT_MFC_BW_UPCALL))
 2270                 return EOPNOTSUPP;
 2271 
 2272         s = splsoftnet();
 2273         /* Find the corresponding MFC entry */
 2274         mfc = mfc_find(&req->bu_src, &req->bu_dst);
 2275         if (mfc == NULL) {
 2276                 splx(s);
 2277                 return EADDRNOTAVAIL;
 2278         } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) {
 2279                 /*
 2280                  * Delete all bw_meter entries for this mfc
 2281                  */
 2282                 struct bw_meter *list;
 2283 
 2284                 list = mfc->mfc_bw_meter;
 2285                 mfc->mfc_bw_meter = NULL;
 2286                 free_bw_list(list);
 2287                 splx(s);
 2288                 return 0;
 2289         } else {                        /* Delete a single bw_meter entry */
 2290                 struct bw_meter *prev;
 2291                 uint32_t flags = 0;
 2292 
 2293                 flags = compute_bw_meter_flags(req);
 2294 
 2295                 /* Find the bw_meter entry to delete */
 2296                 for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL;
 2297                      prev = x, x = x->bm_mfc_next) {
 2298                         if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
 2299                             &req->bu_threshold.b_time, ==)) &&
 2300                             (x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
 2301                             (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
 2302                             (x->bm_flags & BW_METER_USER_FLAGS) == flags)
 2303                                 break;
 2304                 }
 2305                 if (x != NULL) { /* Delete entry from the list for this MFC */
 2306                         if (prev != NULL)
 2307                                 prev->bm_mfc_next = x->bm_mfc_next;     /* remove from middle*/
 2308                         else
 2309                                 x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */
 2310 
 2311                         unschedule_bw_meter(x);
 2312                         splx(s);
 2313                         /* Free the bw_meter entry */
 2314                         kmem_intr_free(x, sizeof(*x));
 2315                         return 0;
 2316                 } else {
 2317                         splx(s);
 2318                         return EINVAL;
 2319                 }
 2320         }
 2321         /* NOTREACHED */
 2322 }
 2323 
 2324 /*
 2325  * Perform bandwidth measurement processing that may result in an upcall
 2326  */
 2327 static void
 2328 bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp)
 2329 {
 2330         struct timeval delta;
 2331 
 2332         delta = *nowp;
 2333         BW_TIMEVALDECR(&delta, &x->bm_start_time);
 2334 
 2335         if (x->bm_flags & BW_METER_GEQ) {
 2336                 /*
 2337                  * Processing for ">=" type of bw_meter entry
 2338                  */
 2339                 if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
 2340                         /* Reset the bw_meter entry */
 2341                         x->bm_start_time = *nowp;
 2342                         x->bm_measured.b_packets = 0;
 2343                         x->bm_measured.b_bytes = 0;
 2344                         x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
 2345                 }
 2346 
 2347                 /* Record that a packet is received */
 2348                 x->bm_measured.b_packets++;
 2349                 x->bm_measured.b_bytes += plen;
 2350 
 2351                 /*
 2352                  * Test if we should deliver an upcall
 2353                  */
 2354                 if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) {
 2355                         if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
 2356                                  (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) ||
 2357                                 ((x->bm_flags & BW_METER_UNIT_BYTES) &&
 2358                                  (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) {
 2359                                 /* Prepare an upcall for delivery */
 2360                                 bw_meter_prepare_upcall(x, nowp);
 2361                                 x->bm_flags |= BW_METER_UPCALL_DELIVERED;
 2362                         }
 2363                 }
 2364         } else if (x->bm_flags & BW_METER_LEQ) {
 2365                 /*
 2366                  * Processing for "<=" type of bw_meter entry
 2367                  */
 2368                 if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
 2369                         /*
 2370                          * We are behind time with the multicast forwarding table
 2371                          * scanning for "<=" type of bw_meter entries, so test now
 2372                          * if we should deliver an upcall.
 2373                          */
 2374                         if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
 2375                                  (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
 2376                                 ((x->bm_flags & BW_METER_UNIT_BYTES) &&
 2377                                  (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
 2378                                 /* Prepare an upcall for delivery */
 2379                                 bw_meter_prepare_upcall(x, nowp);
 2380                         }
 2381                         /* Reschedule the bw_meter entry */
 2382                         unschedule_bw_meter(x);
 2383                         schedule_bw_meter(x, nowp);
 2384                 }
 2385 
 2386                 /* Record that a packet is received */
 2387                 x->bm_measured.b_packets++;
 2388                 x->bm_measured.b_bytes += plen;
 2389 
 2390                 /*
 2391                  * Test if we should restart the measuring interval
 2392                  */
 2393                 if ((x->bm_flags & BW_METER_UNIT_PACKETS &&
 2394                      x->bm_measured.b_packets <= x->bm_threshold.b_packets) ||
 2395                     (x->bm_flags & BW_METER_UNIT_BYTES &&
 2396                      x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) {
 2397                         /* Don't restart the measuring interval */
 2398                 } else {
 2399                         /* Do restart the measuring interval */
 2400                         /*
 2401                          * XXX: note that we don't unschedule and schedule, because this
 2402                          * might be too much overhead per packet. Instead, when we process
 2403                          * all entries for a given timer hash bin, we check whether it is
 2404                          * really a timeout. If not, we reschedule at that time.
 2405                          */
 2406                         x->bm_start_time = *nowp;
 2407                         x->bm_measured.b_packets = 0;
 2408                         x->bm_measured.b_bytes = 0;
 2409                         x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
 2410                 }
 2411         }
 2412 }
 2413 
 2414 /*
 2415  * Prepare a bandwidth-related upcall
 2416  */
 2417 static void
 2418 bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp)
 2419 {
 2420         struct timeval delta;
 2421         struct bw_upcall *u;
 2422 
 2423         /*
 2424          * Compute the measured time interval
 2425          */
 2426         delta = *nowp;
 2427         BW_TIMEVALDECR(&delta, &x->bm_start_time);
 2428 
 2429         /*
 2430          * If there are too many pending upcalls, deliver them now
 2431          */
 2432         if (bw_upcalls_n >= BW_UPCALLS_MAX)
 2433                 bw_upcalls_send();
 2434 
 2435         /*
 2436          * Set the bw_upcall entry
 2437          */
 2438         u = &bw_upcalls[bw_upcalls_n++];
 2439         u->bu_src = x->bm_mfc->mfc_origin;
 2440         u->bu_dst = x->bm_mfc->mfc_mcastgrp;
 2441         u->bu_threshold.b_time = x->bm_threshold.b_time;
 2442         u->bu_threshold.b_packets = x->bm_threshold.b_packets;
 2443         u->bu_threshold.b_bytes = x->bm_threshold.b_bytes;
 2444         u->bu_measured.b_time = delta;
 2445         u->bu_measured.b_packets = x->bm_measured.b_packets;
 2446         u->bu_measured.b_bytes = x->bm_measured.b_bytes;
 2447         u->bu_flags = 0;
 2448         if (x->bm_flags & BW_METER_UNIT_PACKETS)
 2449                 u->bu_flags |= BW_UPCALL_UNIT_PACKETS;
 2450         if (x->bm_flags & BW_METER_UNIT_BYTES)
 2451                 u->bu_flags |= BW_UPCALL_UNIT_BYTES;
 2452         if (x->bm_flags & BW_METER_GEQ)
 2453                 u->bu_flags |= BW_UPCALL_GEQ;
 2454         if (x->bm_flags & BW_METER_LEQ)
 2455                 u->bu_flags |= BW_UPCALL_LEQ;
 2456 }
 2457 
 2458 /*
 2459  * Send the pending bandwidth-related upcalls
 2460  */
 2461 static void
 2462 bw_upcalls_send(void)
 2463 {
 2464         struct mbuf *m;
 2465         int len = bw_upcalls_n * sizeof(bw_upcalls[0]);
 2466         struct sockaddr_in k_igmpsrc = {
 2467                 .sin_len = sizeof(k_igmpsrc),
 2468                 .sin_family = AF_INET,
 2469         };
 2470         static struct igmpmsg igmpmsg = {
 2471                 0,              /* unused1 */
 2472                 0,              /* unused2 */
 2473                 IGMPMSG_BW_UPCALL,/* im_msgtype */
 2474                 0,              /* im_mbz */
 2475                 0,              /* im_vif */
 2476                 0,              /* unused3 */
 2477                 { 0 },          /* im_src */
 2478                 { 0 }           /* im_dst */
 2479         };
 2480 
 2481         if (bw_upcalls_n == 0)
 2482                 return;                 /* No pending upcalls */
 2483 
 2484         bw_upcalls_n = 0;
 2485 
 2486         /*
 2487          * Allocate a new mbuf, initialize it with the header and
 2488          * the payload for the pending calls.
 2489          */
 2490         MGETHDR(m, M_DONTWAIT, MT_HEADER);
 2491         if (m == NULL) {
 2492                 log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n");
 2493                 return;
 2494         }
 2495 
 2496         m->m_len = m->m_pkthdr.len = 0;
 2497         m_copyback(m, 0, sizeof(struct igmpmsg), (void *)&igmpmsg);
 2498         m_copyback(m, sizeof(struct igmpmsg), len, (void *)&bw_upcalls[0]);
 2499 
 2500         /*
 2501          * Send the upcalls
 2502          * XXX do we need to set the address in k_igmpsrc ?
 2503          */
 2504         mrtstat.mrts_upcalls++;
 2505         if (socket_send(ip_mrouter, m, &k_igmpsrc) < 0) {
 2506                 log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n");
 2507                 ++mrtstat.mrts_upq_sockfull;
 2508         }
 2509 }
 2510 
 /*
  * Compute the timeout hash value for the bw_meter entries.
  *
  * "hash" is set to the index of the timer bucket for the second in which
  * the measuring interval of "bw_meter" ends (start time + threshold time,
  * rounded up to a whole second so the timeout never fires early).
  *
  * The modulo is taken on the tv_sec value itself, before it is narrowed
  * to the (int) hash. Narrowing first would turn seconds values above
  * INT_MAX into a negative hash, i.e. an out-of-range bucket index.
  */
 #define BW_METER_TIMEHASH(bw_meter, hash)                               \
     do {                                                                \
         struct timeval next_timeval = (bw_meter)->bm_start_time;        \
         BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \
         if (next_timeval.tv_usec)                                       \
                 next_timeval.tv_sec++; /* XXX: don't timeout early */   \
         (hash) = (int)(next_timeval.tv_sec % BW_METER_BUCKETS);         \
     } while (/*CONSTCOND*/ 0)
 2523 
 2524 /*
 2525  * Schedule a timer to process periodically bw_meter entry of type "<="
 2526  * by linking the entry in the proper hash bucket.
 2527  */
 2528 static void
 2529 schedule_bw_meter(struct bw_meter *x, struct timeval *nowp)
 2530 {
 2531         int time_hash;
 2532 
 2533         if (!(x->bm_flags & BW_METER_LEQ))
 2534                 return;         /* XXX: we schedule timers only for "<=" entries */
 2535 
 2536         /*
 2537          * Reset the bw_meter entry
 2538          */
 2539         x->bm_start_time = *nowp;
 2540         x->bm_measured.b_packets = 0;
 2541         x->bm_measured.b_bytes = 0;
 2542         x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
 2543 
 2544         /*
 2545          * Compute the timeout hash value and insert the entry
 2546          */
 2547         BW_METER_TIMEHASH(x, time_hash);
 2548         x->bm_time_next = bw_meter_timers[time_hash];
 2549         bw_meter_timers[time_hash] = x;
 2550         x->bm_time_hash = time_hash;
 2551 }
 2552 
 2553 /*
 2554  * Unschedule the periodic timer that processes bw_meter entry of type "<="
 2555  * by removing the entry from the proper hash bucket.
 2556  */
 2557 static void
 2558 unschedule_bw_meter(struct bw_meter *x)
 2559 {
 2560         int time_hash;
 2561         struct bw_meter *prev, *tmp;
 2562 
 2563         if (!(x->bm_flags & BW_METER_LEQ))
 2564                 return;         /* XXX: we schedule timers only for "<=" entries */
 2565 
 2566         /*
 2567          * Compute the timeout hash value and delete the entry
 2568          */
 2569         time_hash = x->bm_time_hash;
 2570         if (time_hash >= BW_METER_BUCKETS)
 2571                 return;         /* Entry was not scheduled */
 2572 
 2573         for (prev = NULL, tmp = bw_meter_timers[time_hash];
 2574              tmp != NULL; prev = tmp, tmp = tmp->bm_time_next)
 2575                 if (tmp == x)
 2576                         break;
 2577 
 2578         if (tmp == NULL)
 2579                 panic("unschedule_bw_meter: bw_meter entry not found");
 2580 
 2581         if (prev != NULL)
 2582                 prev->bm_time_next = x->bm_time_next;
 2583         else
 2584                 bw_meter_timers[time_hash] = x->bm_time_next;
 2585 
 2586         x->bm_time_next = NULL;
 2587         x->bm_time_hash = BW_METER_BUCKETS;
 2588 }
 2589 
 2590 /*
 2591  * Process all "<=" type of bw_meter that should be processed now,
 2592  * and for each entry prepare an upcall if necessary. Each processed
 2593  * entry is rescheduled again for the (periodic) processing.
 2594  *
 2595  * This is run periodically (once per second normally). On each round,
 2596  * all the potentially matching entries are in the hash slot that we are
 2597  * looking at.
 2598  */
 2599 static void
 2600 bw_meter_process(void)
 2601 {
 2602         int s;
 2603         static uint32_t last_tv_sec;    /* last time we processed this */
 2604 
 2605         uint32_t loops;
 2606         int i;
 2607         struct timeval now, process_endtime;
 2608 
 2609         microtime(&now);
 2610         if (last_tv_sec == now.tv_sec)
 2611                 return;         /* nothing to do */
 2612 
 2613         loops = now.tv_sec - last_tv_sec;
 2614         last_tv_sec = now.tv_sec;
 2615         if (loops > BW_METER_BUCKETS)
 2616                 loops = BW_METER_BUCKETS;
 2617 
 2618         s = splsoftnet();
 2619         /*
 2620          * Process all bins of bw_meter entries from the one after the last
 2621          * processed to the current one. On entry, i points to the last bucket
 2622          * visited, so we need to increment i at the beginning of the loop.
 2623          */
 2624         for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) {
 2625                 struct bw_meter *x, *tmp_list;
 2626 
 2627                 if (++i >= BW_METER_BUCKETS)
 2628                         i = 0;
 2629 
 2630                 /* Disconnect the list of bw_meter entries from the bin */
 2631                 tmp_list = bw_meter_timers[i];
 2632                 bw_meter_timers[i] = NULL;
 2633 
 2634                 /* Process the list of bw_meter entries */
 2635                 while (tmp_list != NULL) {
 2636                         x = tmp_list;
 2637                         tmp_list = tmp_list->bm_time_next;
 2638 
 2639                         /* Test if the time interval is over */
 2640                         process_endtime = x->bm_start_time;
 2641                         BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time);
 2642                         if (BW_TIMEVALCMP(&process_endtime, &now, >)) {
 2643                                 /* Not yet: reschedule, but don't reset */
 2644                                 int time_hash;
 2645 
 2646                                 BW_METER_TIMEHASH(x, time_hash);
 2647                                 if (time_hash == i && process_endtime.tv_sec == now.tv_sec) {
 2648                                         /*
 2649                                          * XXX: somehow the bin processing is a bit ahead of time.
 2650                                          * Put the entry in the next bin.
 2651                                          */
 2652                                         if (++time_hash >= BW_METER_BUCKETS)
 2653                                                 time_hash = 0;
 2654                                 }
 2655                                 x->bm_time_next = bw_meter_timers[time_hash];
 2656                                 bw_meter_timers[time_hash] = x;
 2657                                 x->bm_time_hash = time_hash;
 2658 
 2659                                 continue;
 2660                         }
 2661 
 2662                         /*
 2663                          * Test if we should deliver an upcall
 2664                          */
 2665                         if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
 2666                             (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
 2667                             ((x->bm_flags & BW_METER_UNIT_BYTES) &&
 2668                             (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
 2669                                 /* Prepare an upcall for delivery */
 2670                                 bw_meter_prepare_upcall(x, &now);
 2671                         }
 2672 
 2673                         /*
 2674                           * Reschedule for next processing
 2675                          */
 2676                         schedule_bw_meter(x, &now);
 2677                 }
 2678         }
 2679 
 2680         /* Send all upcalls that are pending delivery */
 2681         bw_upcalls_send();
 2682 
 2683         splx(s);
 2684 }
 2685 
 2686 /*
 2687  * A periodic function for sending all upcalls that are pending delivery
 2688  */
 2689 static void
 2690 expire_bw_upcalls_send(void *unused)
 2691 {
 2692         int s;
 2693 
 2694         s = splsoftnet();
 2695         bw_upcalls_send();
 2696         splx(s);
 2697 
 2698         callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD,
 2699             expire_bw_upcalls_send, NULL);
 2700 }
 2701 
 2702 /*
 2703  * A periodic function for periodic scanning of the multicast forwarding
 2704  * table for processing all "<=" bw_meter entries.
 2705  */
 2706 static void
 2707 expire_bw_meter_process(void *unused)
 2708 {
 2709         if (mrt_api_config & MRT_MFC_BW_UPCALL)
 2710                 bw_meter_process();
 2711 
 2712         callout_reset(&bw_meter_ch, BW_METER_PERIOD,
 2713             expire_bw_meter_process, NULL);
 2714 }
 2715 
 2716 /*
 2717  * End of bandwidth monitoring code
 2718  */
 2719 
 2720 #ifdef PIM
 2721 /*
 2722  * Send the packet up to the user daemon, or eventually do kernel encapsulation
 2723  */
 2724 static int
 2725 pim_register_send(struct ip *ip, struct vif *vifp, struct mbuf *m,
 2726     struct mfc *rt)
 2727 {
 2728         struct mbuf *mb_copy, *mm;
 2729 
 2730         if (mrtdebug & DEBUG_PIM)
 2731                 log(LOG_DEBUG, "pim_register_send: \n");
 2732 
 2733         mb_copy = pim_register_prepare(ip, m);
 2734         if (mb_copy == NULL)
 2735                 return ENOBUFS;
 2736 
 2737         /*
 2738          * Send all the fragments. Note that the mbuf for each fragment
 2739          * is freed by the sending machinery.
 2740          */
 2741         for (mm = mb_copy; mm; mm = mb_copy) {
 2742                 mb_copy = mm->m_nextpkt;
 2743                 mm->m_nextpkt = NULL;
 2744                 mm = m_pullup(mm, sizeof(struct ip));
 2745                 if (mm != NULL) {
 2746                         ip = mtod(mm, struct ip *);
 2747                         if ((mrt_api_config & MRT_MFC_RP) &&
 2748                             !in_nullhost(rt->mfc_rp)) {
 2749                                 pim_register_send_rp(ip, vifp, mm, rt);
 2750                         } else {
 2751                                 pim_register_send_upcall(ip, vifp, mm, rt);
 2752                         }
 2753                 }
 2754         }
 2755 
 2756         return 0;
 2757 }
 2758 
 2759 /*
 2760  * Return a copy of the data packet that is ready for PIM Register
 2761  * encapsulation.
 2762  * XXX: Note that in the returned copy the IP header is a valid one.
 2763  */
 2764 static struct mbuf *
 2765 pim_register_prepare(struct ip *ip, struct mbuf *m)
 2766 {
 2767         struct mbuf *mb_copy = NULL;
 2768         int mtu;
 2769 
 2770         /* Take care of delayed checksums */
 2771         if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
 2772                 in_undefer_cksum_tcpudp(m);
 2773                 m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
 2774         }
 2775 
 2776         /*
 2777          * Copy the old packet & pullup its IP header into the
 2778          * new mbuf so we can modify it.
 2779          */
 2780         mb_copy = m_copypacket(m, M_DONTWAIT);
 2781         if (mb_copy == NULL)
 2782                 return NULL;
 2783         mb_copy = m_pullup(mb_copy, ip->ip_hl << 2);
 2784         if (mb_copy == NULL)
 2785                 return NULL;
 2786 
 2787         /* take care of the TTL */
 2788         ip = mtod(mb_copy, struct ip *);
 2789         --ip->ip_ttl;
 2790 
 2791         /* Compute the MTU after the PIM Register encapsulation */
 2792         mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr);
 2793 
 2794         if (ntohs(ip->ip_len) <= mtu) {
 2795                 /* Turn the IP header into a valid one */
 2796                 ip->ip_sum = 0;
 2797                 ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
 2798         } else {
 2799                 /* Fragment the packet */
 2800                 if (ip_fragment(mb_copy, NULL, mtu) != 0) {
 2801                         /* XXX: mb_copy was freed by ip_fragment() */
 2802                         return NULL;
 2803                 }
 2804         }
 2805         return mb_copy;
 2806 }
 2807 
 2808 /*
 2809  * Send an upcall with the data packet to the user-level process.
 2810  */
 2811 static int
 2812 pim_register_send_upcall(struct ip *ip, struct vif *vifp,
 2813     struct mbuf *mb_copy, struct mfc *rt)
 2814 {
 2815         struct mbuf *mb_first;
 2816         int len = ntohs(ip->ip_len);
 2817         struct igmpmsg *im;
 2818         struct sockaddr_in k_igmpsrc = {
 2819                 .sin_len = sizeof(k_igmpsrc),
 2820                 .sin_family = AF_INET,
 2821         };
 2822 
 2823         /*
 2824          * Add a new mbuf with an upcall header
 2825          */
 2826         MGETHDR(mb_first, M_DONTWAIT, MT_HEADER);
 2827         if (mb_first == NULL) {
 2828                 m_freem(mb_copy);
 2829                 return ENOBUFS;
 2830         }
 2831         mb_first->m_data += max_linkhdr;
 2832         mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg);
 2833         mb_first->m_len = sizeof(struct igmpmsg);
 2834         mb_first->m_next = mb_copy;
 2835 
 2836         /* Send message to routing daemon */
 2837         im = mtod(mb_first, struct igmpmsg *);
 2838         im->im_msgtype  = IGMPMSG_WHOLEPKT;
 2839         im->im_mbz      = 0;
 2840         im->im_vif      = vifp - viftable;
 2841         im->im_src      = ip->ip_src;
 2842         im->im_dst      = ip->ip_dst;
 2843 
 2844         k_igmpsrc.sin_addr      = ip->ip_src;
 2845 
 2846         mrtstat.mrts_upcalls++;
 2847 
 2848         if (socket_send(ip_mrouter, mb_first, &k_igmpsrc) < 0) {
 2849                 if (mrtdebug & DEBUG_PIM)
 2850                         log(LOG_WARNING,
 2851                             "mcast: pim_register_send_upcall: ip_mrouter socket queue full\n");
 2852                 ++mrtstat.mrts_upq_sockfull;
 2853                 return ENOBUFS;
 2854         }
 2855 
 2856         /* Keep statistics */
 2857         pimstat.pims_snd_registers_msgs++;
 2858         pimstat.pims_snd_registers_bytes += len;
 2859 
 2860         return 0;
 2861 }
 2862 
 2863 /*
 2864  * Encapsulate the data packet in PIM Register message and send it to the RP.
 2865  */
 2866 static int
 2867 pim_register_send_rp(struct ip *ip, struct vif *vifp,
 2868     struct mbuf *mb_copy, struct mfc *rt)
 2869 {
 2870         struct mbuf *mb_first;
 2871         struct ip *ip_outer;
 2872         struct pim_encap_pimhdr *pimhdr;
 2873         int len = ntohs(ip->ip_len);
 2874         vifi_t vifi = rt->mfc_parent;
 2875 
 2876         if ((vifi >= numvifs) || in_nullhost(viftable[vifi].v_lcl_addr)) {
 2877                 m_freem(mb_copy);
 2878                 return EADDRNOTAVAIL;           /* The iif vif is invalid */
 2879         }
 2880 
 2881         /*
 2882          * Add a new mbuf with the encapsulating header
 2883          */
 2884         MGETHDR(mb_first, M_DONTWAIT, MT_HEADER);
 2885         if (mb_first == NULL) {
 2886                 m_freem(mb_copy);
 2887                 return ENOBUFS;
 2888         }
 2889         mb_first->m_data += max_linkhdr;
 2890         mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
 2891         mb_first->m_next = mb_copy;
 2892 
 2893         mb_first->m_pkthdr.len = len + mb_first->m_len;
 2894 
 2895         /*
 2896          * Fill in the encapsulating IP and PIM header
 2897          */
 2898         ip_outer = mtod(mb_first, struct ip *);
 2899         *ip_outer = pim_encap_iphdr;
 2900         if (mb_first->m_pkthdr.len < IP_MINFRAGSIZE)
 2901                 ip_outer->ip_id = 0;
 2902         else
 2903                 ip_outer->ip_id = ip_newid(NULL);
 2904         ip_outer->ip_len = htons(len + sizeof(pim_encap_iphdr) +
 2905             sizeof(pim_encap_pimhdr));
 2906         ip_outer->ip_src = viftable[vifi].v_lcl_addr;
 2907         ip_outer->ip_dst = rt->mfc_rp;
 2908         /*
 2909          * Copy the inner header TOS to the outer header, and take care of the
 2910          * IP_DF bit.
 2911          */
 2912         ip_outer->ip_tos = ip->ip_tos;
 2913         if (ntohs(ip->ip_off) & IP_DF)
 2914                 ip_outer->ip_off |= htons(IP_DF);
 2915         pimhdr = (struct pim_encap_pimhdr *)((char *)ip_outer
 2916             + sizeof(pim_encap_iphdr));
 2917         *pimhdr = pim_encap_pimhdr;
 2918         /* If the iif crosses a border, set the Border-bit */
 2919         if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & mrt_api_config)
 2920                 pimhdr->flags |= htonl(PIM_BORDER_REGISTER);
 2921 
 2922         mb_first->m_data += sizeof(pim_encap_iphdr);
 2923         pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr));
 2924         mb_first->m_data -= sizeof(pim_encap_iphdr);
 2925 
 2926         if (vifp->v_rate_limit == 0)
 2927                 tbf_send_packet(vifp, mb_first);
 2928         else
 2929                 tbf_control(vifp, mb_first, ip, ntohs(ip_outer->ip_len));
 2930 
 2931         /* Keep statistics */
 2932         pimstat.pims_snd_registers_msgs++;
 2933         pimstat.pims_snd_registers_bytes += len;
 2934 
 2935         return 0;
 2936 }
 2937 
 2938 /*
 2939  * PIM-SMv2 and PIM-DM messages processing.
 2940  * Receives and verifies the PIM control messages, and passes them
 2941  * up to the listening socket, using rip_input().
 2942  * The only message with special processing is the PIM_REGISTER message
 2943  * (used by PIM-SM): the PIM header is stripped off, and the inner packet
 2944  * is passed to if_simloop().
 2945  */
 2946 void
 2947 pim_input(struct mbuf *m, int off, int proto)
 2948 {
 2949         struct ip *ip = mtod(m, struct ip *);
 2950         struct pim *pim;
 2951         int minlen;
 2952         int datalen;
 2953         int ip_tos;
 2954         int iphlen;
 2955 
 2956         iphlen = off;
 2957         datalen = ntohs(ip->ip_len) - iphlen;
 2958 
 2959         /* Keep statistics */
 2960         pimstat.pims_rcv_total_msgs++;
 2961         pimstat.pims_rcv_total_bytes += datalen;
 2962 
 2963         /*
 2964          * Validate lengths
 2965          */
 2966         if (datalen < PIM_MINLEN) {
 2967                 pimstat.pims_rcv_tooshort++;
 2968                 log(LOG_ERR, "pim_input: packet size too small %d from %lx\n",
 2969                     datalen, (u_long)ip->ip_src.s_addr);
 2970                 m_freem(m);
 2971                 return;
 2972         }
 2973 
 2974         /*
 2975          * If the packet is at least as big as a REGISTER, go ahead
 2976          * and grab the PIM REGISTER header size, to avoid another
 2977          * possible m_pullup() later.
 2978          *
 2979          * PIM_MINLEN       == pimhdr + u_int32_t == 4 + 4 = 8
 2980          * PIM_REG_MINLEN   == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28
 2981          */
 2982         minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN);
 2983 
 2984         /*
 2985          * Get the IP and PIM headers in contiguous memory, and
 2986          * possibly the PIM REGISTER header.
 2987          */
 2988         if ((m->m_flags & M_EXT || m->m_len < minlen) &&
 2989             (m = m_pullup(m, minlen)) == NULL) {
 2990                 log(LOG_ERR, "pim_input: m_pullup failure\n");
 2991                 return;
 2992         }
 2993         ip = mtod(m, struct ip *);
 2994         ip_tos = ip->ip_tos;
 2995 
 2996         /* adjust mbuf to point to the PIM header */
 2997         m->m_data += iphlen;
 2998         m->m_len  -= iphlen;
 2999         pim = mtod(m, struct pim *);
 3000 
 3001         /*
 3002          * Validate checksum. If PIM REGISTER, exclude the data packet.
 3003          *
 3004          * XXX: some older PIMv2 implementations don't make this distinction,
 3005          * so for compatibility reason perform the checksum over part of the
 3006          * message, and if error, then over the whole message.
 3007          */
 3008         if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) {
 3009                 /* do nothing, checksum okay */
 3010         } else if (in_cksum(m, datalen)) {
 3011                 pimstat.pims_rcv_badsum++;
 3012                 if (mrtdebug & DEBUG_PIM)
 3013                         log(LOG_DEBUG, "pim_input: invalid checksum\n");
 3014                 m_freem(m);
 3015                 return;
 3016         }
 3017 
 3018         /* PIM version check */
 3019         if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) {
 3020                 pimstat.pims_rcv_badversion++;
 3021                 log(LOG_ERR, "pim_input: incorrect version %d, expecting %d\n",
 3022                     PIM_VT_V(pim->pim_vt), PIM_VERSION);
 3023                 m_freem(m);
 3024                 return;
 3025         }
 3026 
 3027         /* restore mbuf back to the outer IP */
 3028         m->m_data -= iphlen;
 3029         m->m_len  += iphlen;
 3030 
 3031         if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) {
 3032                 /*
 3033                  * Since this is a REGISTER, we'll make a copy of the register
 3034                  * headers ip + pim + u_int32 + encap_ip, to be passed up to the
 3035                  * routing daemon.
 3036                  */
 3037                 int s;
 3038                 struct sockaddr_in dst = {
 3039                         .sin_len = sizeof(dst),
 3040                         .sin_family = AF_INET,
 3041                 };
 3042                 struct mbuf *mcp;
 3043                 struct ip *encap_ip;
 3044                 u_int32_t *reghdr;
 3045                 struct ifnet *vifp;
 3046 
 3047                 s = splsoftnet();
 3048                 if ((reg_vif_num >= numvifs) || (reg_vif_num == VIFI_INVALID)) {
 3049                         splx(s);
 3050                         if (mrtdebug & DEBUG_PIM)
 3051                                 log(LOG_DEBUG,
 3052                                     "pim_input: register vif not set: %d\n", reg_vif_num);
 3053                         m_freem(m);
 3054                         return;
 3055                 }
 3056                 /* XXX need refcnt? */
 3057                 vifp = viftable[reg_vif_num].v_ifp;
 3058                 splx(s);
 3059 
 3060                 /*
 3061                  * Validate length
 3062                  */
 3063                 if (datalen < PIM_REG_MINLEN) {
 3064                         pimstat.pims_rcv_tooshort++;
 3065                         pimstat.pims_rcv_badregisters++;
 3066                         log(LOG_ERR,
 3067                             "pim_input: register packet size too small %d from %lx\n",
 3068                             datalen, (u_long)ip->ip_src.s_addr);
 3069                         m_freem(m);
 3070                         return;
 3071                 }
 3072 
 3073                 reghdr = (u_int32_t *)(pim + 1);
 3074                 encap_ip = (struct ip *)(reghdr + 1);
 3075 
 3076                 if (mrtdebug & DEBUG_PIM) {
 3077                         log(LOG_DEBUG,
 3078                             "pim_input[register], encap_ip: %lx -> %lx, encap_ip len %d\n",
 3079                             (u_long)ntohl(encap_ip->ip_src.s_addr),
 3080                             (u_long)ntohl(encap_ip->ip_dst.s_addr),
 3081                             ntohs(encap_ip->ip_len));
 3082                 }
 3083 
 3084                 /* verify the version number of the inner packet */
 3085                 if (encap_ip->ip_v != IPVERSION) {
 3086                         pimstat.pims_rcv_badregisters++;
 3087                         if (mrtdebug & DEBUG_PIM) {
 3088                                 log(LOG_DEBUG, "pim_input: invalid IP version (%d) "
 3089                                     "of the inner packet\n", encap_ip->ip_v);
 3090                         }
 3091                         m_freem(m);
 3092                         return;
 3093                 }
 3094 
 3095                 /* verify the inner packet doesn't have options */
 3096                 if (encap_ip->ip_hl != (sizeof(struct ip) >> 2)) {
 3097                         pimstat.pims_rcv_badregisters++;
 3098                         m_freem(m);
 3099                         return;
 3100                 }
 3101 
 3102                 /* verify the inner packet is destined to a mcast group */
 3103                 if (!IN_MULTICAST(encap_ip->ip_dst.s_addr)) {
 3104                         pimstat.pims_rcv_badregisters++;
 3105                          if (mrtdebug & DEBUG_PIM)
 3106                                 log(LOG_DEBUG,
 3107                                     "pim_input: inner packet of register is not "
 3108                                     "multicast %lx\n",
 3109                                     (u_long)ntohl(encap_ip->ip_dst.s_addr));
 3110                         m_freem(m);
 3111                         return;
 3112                 }
 3113 
 3114                 /* If a NULL_REGISTER, pass it to the daemon */
 3115                 if ((ntohl(*reghdr) & PIM_NULL_REGISTER))
 3116                         goto pim_input_to_daemon;
 3117 
 3118                 /*
 3119                  * Copy the TOS from the outer IP header to the inner IP header.
 3120                  */
 3121                 if (encap_ip->ip_tos != ip_tos) {
 3122                         /* Outer TOS -> inner TOS */
 3123                         encap_ip->ip_tos = ip_tos;
 3124                         /* Recompute the inner header checksum. Sigh... */
 3125 
 3126                         /* adjust mbuf to point to the inner IP header */
 3127                         m->m_data += (iphlen + PIM_MINLEN);
 3128                         m->m_len  -= (iphlen + PIM_MINLEN);
 3129 
 3130                         encap_ip->ip_sum = 0;
 3131                         encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2);
 3132 
 3133                         /* restore mbuf to point back to the outer IP header */
 3134                         m->m_data -= (iphlen + PIM_MINLEN);
 3135                         m->m_len  += (iphlen + PIM_MINLEN);
 3136                 }
 3137 
 3138                 /*
 3139                  * Decapsulate the inner IP packet and loopback to forward it
 3140                  * as a normal multicast packet. Also, make a copy of the
 3141                  *     outer_iphdr + pimhdr + reghdr + encap_iphdr
 3142                  * to pass to the daemon later, so it can take the appropriate
 3143                  * actions (e.g., send back PIM_REGISTER_STOP).
 3144                  * XXX: here m->m_data points to the outer IP header.
 3145                  */
 3146                 mcp = m_copym(m, 0, iphlen + PIM_REG_MINLEN, M_DONTWAIT);
 3147                 if (mcp == NULL) {
 3148                         log(LOG_ERR,
 3149                             "pim_input: pim register: could not copy register head\n");
 3150                         m_freem(m);
 3151                         return;
 3152                 }
 3153 
 3154                 /* Keep statistics */
 3155                 /* XXX: registers_bytes include only the encap. mcast pkt */
 3156                 pimstat.pims_rcv_registers_msgs++;
 3157                 pimstat.pims_rcv_registers_bytes += ntohs(encap_ip->ip_len);
 3158 
 3159                 /*
 3160                  * forward the inner ip packet; point m_data at the inner ip.
 3161                  */
 3162                 m_adj(m, iphlen + PIM_MINLEN);
 3163 
 3164                 if (mrtdebug & DEBUG_PIM) {
 3165                         log(LOG_DEBUG,
 3166                             "pim_input: forwarding decapsulated register: "
 3167                             "src %lx, dst %lx, vif %d\n",
 3168                             (u_long)ntohl(encap_ip->ip_src.s_addr),
 3169                             (u_long)ntohl(encap_ip->ip_dst.s_addr),
 3170                             reg_vif_num);
 3171                 }
 3172                 /* NB: vifp was collected above; can it change on us? */
 3173                 looutput(vifp, m, (struct sockaddr *)&dst, NULL);
 3174 
 3175                 /* prepare the register head to send to the mrouting daemon */
 3176                 m = mcp;
 3177         }
 3178 
 3179 pim_input_to_daemon:
 3180         /*
 3181          * Pass the PIM message up to the daemon; if it is a Register message,
 3182          * pass the 'head' only up to the daemon. This includes the
 3183          * outer IP header, PIM header, PIM-Register header and the
 3184          * inner IP header.
 3185          * XXX: the outer IP header pkt size of a Register is not adjust to
 3186          * reflect the fact that the inner multicast data is truncated.
 3187          */
 3188         /*
 3189          * Currently, pim_input() is always called holding softnet_lock
 3190          * by ipintr()(!NET_MPSAFE) or PR_INPUT_WRAP()(NET_MPSAFE).
 3191          */
 3192         KASSERT(mutex_owned(softnet_lock));
 3193         rip_input(m, iphlen, proto);
 3194 
 3195         return;
 3196 }
 3197 #endif /* PIM */

Cache object: c9d8f070c9fe9911a98d809d60731578


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.