The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/netinet/ip_mroute.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*      $NetBSD: ip_mroute.c,v 1.116 2008/10/01 16:01:51 rmind Exp $    */
    2 
    3 /*
    4  * Copyright (c) 1992, 1993
    5  *      The Regents of the University of California.  All rights reserved.
    6  *
    7  * This code is derived from software contributed to Berkeley by
    8  * Stephen Deering of Stanford University.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 3. Neither the name of the University nor the names of its contributors
   19  *    may be used to endorse or promote products derived from this software
   20  *    without specific prior written permission.
   21  *
   22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   32  * SUCH DAMAGE.
   33  *
   34  *      @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
   35  */
   36 
   37 /*
   38  * Copyright (c) 1989 Stephen Deering
   39  *
   40  * This code is derived from software contributed to Berkeley by
   41  * Stephen Deering of Stanford University.
   42  *
   43  * Redistribution and use in source and binary forms, with or without
   44  * modification, are permitted provided that the following conditions
   45  * are met:
   46  * 1. Redistributions of source code must retain the above copyright
   47  *    notice, this list of conditions and the following disclaimer.
   48  * 2. Redistributions in binary form must reproduce the above copyright
   49  *    notice, this list of conditions and the following disclaimer in the
   50  *    documentation and/or other materials provided with the distribution.
   51  * 3. All advertising materials mentioning features or use of this software
   52  *    must display the following acknowledgement:
   53  *      This product includes software developed by the University of
   54  *      California, Berkeley and its contributors.
   55  * 4. Neither the name of the University nor the names of its contributors
   56  *    may be used to endorse or promote products derived from this software
   57  *    without specific prior written permission.
   58  *
   59  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   60  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   61  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   62  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   63  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   64  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   65  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   66  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   67  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   68  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   69  * SUCH DAMAGE.
   70  *
   71  *      @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
   72  */
   73 
   74 /*
   75  * IP multicast forwarding procedures
   76  *
   77  * Written by David Waitzman, BBN Labs, August 1988.
   78  * Modified by Steve Deering, Stanford, February 1989.
   79  * Modified by Mark J. Steiglitz, Stanford, May, 1991
   80  * Modified by Van Jacobson, LBL, January 1993
   81  * Modified by Ajit Thyagarajan, PARC, August 1993
   82  * Modified by Bill Fenner, PARC, April 1994
   83  * Modified by Charles M. Hannum, NetBSD, May 1995.
   84  * Modified by Ahmed Helmy, SGI, June 1996
   85  * Modified by George Edmond Eddy (Rusty), ISI, February 1998
   86  * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000
   87  * Modified by Hitoshi Asaeda, WIDE, August 2000
   88  * Modified by Pavlin Radoslavov, ICSI, October 2002
   89  *
   90  * MROUTING Revision: 1.2
   91  * and PIM-SMv2 and PIM-DM support, advanced API support,
   92  * bandwidth metering and signaling
   93  */
   94 
   95 #include <sys/cdefs.h>
   96 __KERNEL_RCSID(0, "$NetBSD: ip_mroute.c,v 1.116 2008/10/01 16:01:51 rmind Exp $");
   97 
   98 #include "opt_inet.h"
   99 #include "opt_ipsec.h"
  100 #include "opt_pim.h"
  101 
  102 #ifdef PIM
  103 #define _PIM_VT 1
  104 #endif
  105 
  106 #include <sys/param.h>
  107 #include <sys/systm.h>
  108 #include <sys/callout.h>
  109 #include <sys/mbuf.h>
  110 #include <sys/socket.h>
  111 #include <sys/socketvar.h>
  112 #include <sys/protosw.h>
  113 #include <sys/errno.h>
  114 #include <sys/time.h>
  115 #include <sys/kernel.h>
  116 #include <sys/ioctl.h>
  117 #include <sys/syslog.h>
  118 
  119 #include <net/if.h>
  120 #include <net/route.h>
  121 #include <net/raw_cb.h>
  122 
  123 #include <netinet/in.h>
  124 #include <netinet/in_var.h>
  125 #include <netinet/in_systm.h>
  126 #include <netinet/ip.h>
  127 #include <netinet/ip_var.h>
  128 #include <netinet/in_pcb.h>
  129 #include <netinet/udp.h>
  130 #include <netinet/igmp.h>
  131 #include <netinet/igmp_var.h>
  132 #include <netinet/ip_mroute.h>
  133 #ifdef PIM
  134 #include <netinet/pim.h>
  135 #include <netinet/pim_var.h>
  136 #endif
  137 #include <netinet/ip_encap.h>
  138 
  139 #ifdef IPSEC
  140 #include <netinet6/ipsec.h>
  141 #include <netkey/key.h>
  142 #endif
  143 
  144 #ifdef FAST_IPSEC
  145 #include <netipsec/ipsec.h>
  146 #include <netipsec/key.h>
  147 #endif
  148 
  149 #include <machine/stdarg.h>
  150 
  151 #define IP_MULTICASTOPTS 0
  /*
   * M_PULLUP(m, len): when (m) is non-NULL and either has M_EXT set in
   * m_flags or has m_len smaller than (len), replace (m) with the result
   * of m_pullup((m), (len)).  Otherwise (m) is left untouched.
   *
   * (m) is evaluated more than once and is assigned to, so it must be a
   * side-effect-free lvalue.
   */
  #define M_PULLUP(m, len)                                                 \
          do {                                                             \
                  if ((m) != NULL &&                                       \
                      ((((m)->m_flags & M_EXT) != 0) ||                    \
                       ((m)->m_len < (len))))                              \
                          (m) = m_pullup((m), (len));                      \
          } while (/*CONSTCOND*/ 0)
  157 
  158 /*
  159  * Globals.  All but ip_mrouter and ip_mrtproto could be static,
  160  * except for netstat or debugging purposes.
  161  */
  162 struct socket  *ip_mrouter  = NULL;
  163 int             ip_mrtproto = IGMP_DVMRP;    /* for netstat only */
  164 
  165 #define NO_RTE_FOUND    0x1
  166 #define RTE_FOUND       0x2
  167 
  168 #define MFCHASH(a, g)                                                   \
  169         ((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^        \
  170           ((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & mfchash)
  171 LIST_HEAD(mfchashhdr, mfc) *mfchashtbl;
  172 u_long  mfchash;
  173 
  174 u_char          nexpire[MFCTBLSIZ];
  175 struct vif      viftable[MAXVIFS];
  176 struct mrtstat  mrtstat;
  177 u_int           mrtdebug = 0;     /* debug level        */
  178 #define         DEBUG_MFC       0x02
  179 #define         DEBUG_FORWARD   0x04
  180 #define         DEBUG_EXPIRE    0x08
  181 #define         DEBUG_XMIT      0x10
  182 #define         DEBUG_PIM       0x20
  183 
  184 #define         VIFI_INVALID    ((vifi_t) -1)
  185 
  186 u_int           tbfdebug = 0;     /* tbf debug level    */
  187 #ifdef RSVP_ISI
  188 u_int           rsvpdebug = 0;    /* rsvp debug level   */
  189 extern struct socket *ip_rsvpd;
  190 extern int rsvp_on;
  191 #endif /* RSVP_ISI */
  192 
  193 /*
       * vif attachment using sys/netinet/ip_encap.c
       *
       * For a tunnel vif (VIFF_TUNNEL) add_vif() passes vif_encapcheck and
       * &vif_protosw to encap_attach_func(), together with the vif itself
       * as the callback argument.
       */
  194 static void vif_input(struct mbuf *, ...);
  195 static int vif_encapcheck(struct mbuf *, int, int, void *);
  196 
      /*
       * NOTE(review): positional initializer -- the entries are matched to
       * struct protosw members purely by order; confirm against the
       * structure declaration before adding or moving anything here.
       */
  197 static const struct protosw vif_protosw =
  198 { SOCK_RAW,     &inetdomain,    IPPROTO_IPV4,   PR_ATOMIC|PR_ADDR,
  199   vif_input,    rip_output,     0,              rip_ctloutput,
  200   rip_usrreq,
  201   0,            0,              0,              0,
  202 };
  203 
  204 #define         EXPIRE_TIMEOUT  (hz / 4)        /* 4x / second */
  205 #define         UPCALL_EXPIRE   6               /* number of timeouts */
  206 
  207 /*
  208  * Define the token bucket filter structures
  209  */
  210 
  211 #define         TBF_REPROCESS   (hz / 100)      /* 100x / second */
  212 
  213 static int get_sg_cnt(struct sioc_sg_req *);
  214 static int get_vif_cnt(struct sioc_vif_req *);
  215 static int ip_mrouter_init(struct socket *, int);
  216 static int set_assert(int);
  217 static int add_vif(struct vifctl *);
  218 static int del_vif(vifi_t *);
  219 static void update_mfc_params(struct mfc *, struct mfcctl2 *);
  220 static void init_mfc_params(struct mfc *, struct mfcctl2 *);
  221 static void expire_mfc(struct mfc *);
  222 static int add_mfc(struct sockopt *);
  223 #ifdef UPCALL_TIMING
  224 static void collate(struct timeval *);
  225 #endif
  226 static int del_mfc(struct sockopt *);
  227 static int set_api_config(struct sockopt *); /* choose API capabilities */
  228 static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *);
  229 static void expire_upcalls(void *);
  230 #ifdef RSVP_ISI
  231 static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t);
  232 #else
  233 static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *);
  234 #endif
  235 static void phyint_send(struct ip *, struct vif *, struct mbuf *);
  236 static void encap_send(struct ip *, struct vif *, struct mbuf *);
  237 static void tbf_control(struct vif *, struct mbuf *, struct ip *, u_int32_t);
  238 static void tbf_queue(struct vif *, struct mbuf *);
  239 static void tbf_process_q(struct vif *);
  240 static void tbf_reprocess_q(void *);
  241 static int tbf_dq_sel(struct vif *, struct ip *);
  242 static void tbf_send_packet(struct vif *, struct mbuf *);
  243 static void tbf_update_tokens(struct vif *);
  244 static int priority(struct vif *, struct ip *);
  245 
  246 /*
  247  * Bandwidth monitoring
  248  */
  249 static void free_bw_list(struct bw_meter *);
  250 static int add_bw_upcall(struct bw_upcall *);
  251 static int del_bw_upcall(struct bw_upcall *);
  252 static void bw_meter_receive_packet(struct bw_meter *, int , struct timeval *);
  253 static void bw_meter_prepare_upcall(struct bw_meter *, struct timeval *);
  254 static void bw_upcalls_send(void);
  255 static void schedule_bw_meter(struct bw_meter *, struct timeval *);
  256 static void unschedule_bw_meter(struct bw_meter *);
  257 static void bw_meter_process(void);
  258 static void expire_bw_upcalls_send(void *);
  259 static void expire_bw_meter_process(void *);
  260 
  261 #ifdef PIM
  262 static int pim_register_send(struct ip *, struct vif *,
  263                 struct mbuf *, struct mfc *);
  264 static int pim_register_send_rp(struct ip *, struct vif *,
  265                 struct mbuf *, struct mfc *);
  266 static int pim_register_send_upcall(struct ip *, struct vif *,
  267                 struct mbuf *, struct mfc *);
  268 static struct mbuf *pim_register_prepare(struct ip *, struct mbuf *);
  269 #endif
  270 
  271 /*
  272  * 'Interfaces' associated with decapsulator (so we can tell
  273  * packets that went through it from ones that get reflected
  274  * by a broken gateway).  These interfaces are never linked into
  275  * the system ifnet list & no routes point to them.  I.e., packets
  276  * can't be sent this way.  They only exist as a placeholder for
  277  * multicast source verification.
  278  */
  279 #if 0
  280 struct ifnet multicast_decap_if[MAXVIFS];
  281 #endif
  282 
  283 #define ENCAP_TTL       64
  284 #define ENCAP_PROTO     IPPROTO_IPIP    /* 4 */
  285 
  286 /* prototype IP hdr for encapsulated packets */
  287 struct ip multicast_encap_iphdr = {
  288         .ip_hl = sizeof(struct ip) >> 2,
  289         .ip_v = IPVERSION,
  290         .ip_len = sizeof(struct ip),
  291         .ip_ttl = ENCAP_TTL,
  292         .ip_p = ENCAP_PROTO,
  293 };
  294 
  295 /*
  296  * Bandwidth meter variables and constants
  297  */
  298 
  299 /*
  300  * Pending timeouts are stored in a hash table, the key being the
  301  * expiration time. Periodically, the entries are analysed and processed.
  302  */
  303 #define BW_METER_BUCKETS        1024
  304 static struct bw_meter *bw_meter_timers[BW_METER_BUCKETS];
  305 struct callout bw_meter_ch;
  306 #define BW_METER_PERIOD (hz)            /* periodical handling of bw meters */
  307 
  308 /*
  309  * Pending upcalls are stored in a vector which is flushed when
  310  * full, or periodically
  311  */
  312 static struct bw_upcall bw_upcalls[BW_UPCALLS_MAX];
  313 static u_int    bw_upcalls_n; /* # of pending upcalls */
  314 struct callout  bw_upcalls_ch;
  315 #define BW_UPCALLS_PERIOD (hz)          /* periodical flush of bw upcalls */
  316 
  317 #ifdef PIM
  318 struct pimstat pimstat;
  319 
  320 /*
  321  * Note: the PIM Register encapsulation adds the following in front of a
  322  * data packet:
  323  *
  324  * struct pim_encap_hdr {
  325  *    struct ip ip;
  326  *    struct pim_encap_pimhdr  pim;
  327  * }
  328  *
  329  */
  330 
  331 struct pim_encap_pimhdr {
  332         struct pim pim;
  333         uint32_t   flags;
  334 };
  335 
  336 static struct ip pim_encap_iphdr = {
  337         .ip_v = IPVERSION,
  338         .ip_hl = sizeof(struct ip) >> 2,
  339         .ip_len = sizeof(struct ip),
  340         .ip_ttl = ENCAP_TTL,
  341         .ip_p = IPPROTO_PIM,
  342 };
  343 
  344 static struct pim_encap_pimhdr pim_encap_pimhdr = {
  345     {
  346         PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */
  347         0,                      /* reserved */
  348         0,                      /* checksum */
  349     },
  350     0                           /* flags */
  351 };
  352 
  353 static struct ifnet multicast_register_if;
  354 static vifi_t reg_vif_num = VIFI_INVALID;
  355 #endif /* PIM */
  356 
  357 
  358 /*
  359  * Private variables.
  360  */
  361 static vifi_t      numvifs = 0;
  362 
  363 static struct callout expire_upcalls_ch;
  364 
  365 /*
  366  * whether or not special PIM assert processing is enabled.
  367  */
  368 static int pim_assert;
  369 /*
  370  * Rate limit for assert notification messages, in usec
  371  */
  372 #define ASSERT_MSG_TIME         3000000
  373 
  374 /*
  375  * Kernel multicast routing API capabilities and setup.
  376  * If more API capabilities are added to the kernel, they should be
  377  * recorded in `mrt_api_support'.
  378  */
  379 static const u_int32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF |
  380                                           MRT_MFC_FLAGS_BORDER_VIF |
  381                                           MRT_MFC_RP |
  382                                           MRT_MFC_BW_UPCALL);
  383 static u_int32_t mrt_api_config = 0;
  384 
  385 /*
  386  * Find a route for a given origin IP address and Multicast group address
  387  * Type of service parameter to be added in the future!!!
  388  * Statistics are updated by the caller if needed
  389  * (mrtstat.mrts_mfc_lookups and mrtstat.mrts_mfc_misses)
  390  */
  391 static struct mfc *
  392 mfc_find(struct in_addr *o, struct in_addr *g)
  393 {
  394         struct mfc *rt;
  395 
  396         LIST_FOREACH(rt, &mfchashtbl[MFCHASH(*o, *g)], mfc_hash) {
  397                 if (in_hosteq(rt->mfc_origin, *o) &&
  398                     in_hosteq(rt->mfc_mcastgrp, *g) &&
  399                     (rt->mfc_stall == NULL))
  400                         break;
  401         }
  402 
  403         return (rt);
  404 }
  405 
  /*
   * Macro to compute elapsed time efficiently.
   * Borrowed from Van Jacobson's scheduling code.
   *
   * TV_DELTA(a, b, delta) stores (a - b), in microseconds, into `delta'.
   * `a' and `b' are struct timeval objects (not pointers); `delta' must be
   * an assignable integer lvalue.
   *
   * Differences of 0, 1 or 2 whole seconds take the multiplication-free
   * path.  Any other difference is scaled in 64-bit arithmetic: the
   * seconds difference used to be truncated to int and multiplied as an
   * int, which overflowed for intervals longer than about 2147 seconds.
   * A `delta' narrower than 64 bits still receives a truncated value.
   */
  #define TV_DELTA(a, b, delta) do {                                      \
          int64_t xxs;                                                    \
          (delta) = (a).tv_usec - (b).tv_usec;                            \
          xxs = (int64_t)(a).tv_sec - (int64_t)(b).tv_sec;                \
          switch (xxs) {                                                  \
          case 2:                                                         \
                  (delta) += 1000000;                                     \
                  /* FALLTHROUGH */                                       \
          case 1:                                                         \
                  (delta) += 1000000;                                     \
                  /* FALLTHROUGH */                                       \
          case 0:                                                         \
                  break;                                                  \
          default:                                                        \
                  (delta) += (int64_t)1000000 * xxs;                      \
                  break;                                                  \
          }                                                               \
  } while (/*CONSTCOND*/ 0)
  428 
  429 #ifdef UPCALL_TIMING
  430 u_int32_t upcall_data[51];
  431 #endif /* UPCALL_TIMING */
  432 
  433 /*
  434  * Handle MRT setsockopt commands to modify the multicast routing tables.
  435  */
  436 int
  437 ip_mrouter_set(struct socket *so, struct sockopt *sopt)
  438 {
  439         int error;
  440         int optval;
  441         struct vifctl vifc;
  442         vifi_t vifi;
  443         struct bw_upcall bwuc;
  444 
  445         if (sopt->sopt_name != MRT_INIT && so != ip_mrouter)
  446                 error = ENOPROTOOPT;
  447         else {
  448                 switch (sopt->sopt_name) {
  449                 case MRT_INIT:
  450                         error = sockopt_getint(sopt, &optval);
  451                         if (error)
  452                                 break;
  453 
  454                         error = ip_mrouter_init(so, optval);
  455                         break;
  456                 case MRT_DONE:
  457                         error = ip_mrouter_done();
  458                         break;
  459                 case MRT_ADD_VIF:
  460                         error = sockopt_get(sopt, &vifc, sizeof(vifc));
  461                         if (error)
  462                                 break;
  463                         error = add_vif(&vifc);
  464                         break;
  465                 case MRT_DEL_VIF:
  466                         error = sockopt_get(sopt, &vifi, sizeof(vifi));
  467                         if (error)
  468                                 break;
  469                         error = del_vif(&vifi);
  470                         break;
  471                 case MRT_ADD_MFC:
  472                         error = add_mfc(sopt);
  473                         break;
  474                 case MRT_DEL_MFC:
  475                         error = del_mfc(sopt);
  476                         break;
  477                 case MRT_ASSERT:
  478                         error = sockopt_getint(sopt, &optval);
  479                         if (error)
  480                                 break;
  481                         error = set_assert(optval);
  482                         break;
  483                 case MRT_API_CONFIG:
  484                         error = set_api_config(sopt);
  485                         break;
  486                 case MRT_ADD_BW_UPCALL:
  487                         error = sockopt_get(sopt, &bwuc, sizeof(bwuc));
  488                         if (error)
  489                                 break;
  490                         error = add_bw_upcall(&bwuc);
  491                         break;
  492                 case MRT_DEL_BW_UPCALL:
  493                         error = sockopt_get(sopt, &bwuc, sizeof(bwuc));
  494                         if (error)
  495                                 break;
  496                         error = del_bw_upcall(&bwuc);
  497                         break;
  498                 default:
  499                         error = ENOPROTOOPT;
  500                         break;
  501                 }
  502         }
  503         return (error);
  504 }
  505 
  506 /*
  507  * Handle MRT getsockopt commands
  508  */
  509 int
  510 ip_mrouter_get(struct socket *so, struct sockopt *sopt)
  511 {
  512         int error;
  513 
  514         if (so != ip_mrouter)
  515                 error = ENOPROTOOPT;
  516         else {
  517                 switch (sopt->sopt_name) {
  518                 case MRT_VERSION:
  519                         error = sockopt_setint(sopt, 0x0305); /* XXX !!!! */
  520                         break;
  521                 case MRT_ASSERT:
  522                         error = sockopt_setint(sopt, pim_assert);
  523                         break;
  524                 case MRT_API_SUPPORT:
  525                         error = sockopt_set(sopt, &mrt_api_support,
  526                             sizeof(mrt_api_support));
  527                         break;
  528                 case MRT_API_CONFIG:
  529                         error = sockopt_set(sopt, &mrt_api_config,
  530                             sizeof(mrt_api_config));
  531                         break;
  532                 default:
  533                         error = ENOPROTOOPT;
  534                         break;
  535                 }
  536         }
  537         return (error);
  538 }
  539 
  540 /*
  541  * Handle ioctl commands to obtain information from the cache
  542  */
  543 int
  544 mrt_ioctl(struct socket *so, u_long cmd, void *data)
  545 {
  546         int error;
  547 
  548         if (so != ip_mrouter)
  549                 error = EINVAL;
  550         else
  551                 switch (cmd) {
  552                 case SIOCGETVIFCNT:
  553                         error = get_vif_cnt((struct sioc_vif_req *)data);
  554                         break;
  555                 case SIOCGETSGCNT:
  556                         error = get_sg_cnt((struct sioc_sg_req *)data);
  557                         break;
  558                 default:
  559                         error = EINVAL;
  560                         break;
  561                 }
  562 
  563         return (error);
  564 }
  565 
  566 /*
  567  * returns the packet, byte, rpf-failure count for the source group provided
  568  */
  569 static int
  570 get_sg_cnt(struct sioc_sg_req *req)
  571 {
  572         int s;
  573         struct mfc *rt;
  574 
  575         s = splsoftnet();
  576         rt = mfc_find(&req->src, &req->grp);
  577         if (rt == NULL) {
  578                 splx(s);
  579                 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
  580                 return (EADDRNOTAVAIL);
  581         }
  582         req->pktcnt = rt->mfc_pkt_cnt;
  583         req->bytecnt = rt->mfc_byte_cnt;
  584         req->wrong_if = rt->mfc_wrong_if;
  585         splx(s);
  586 
  587         return (0);
  588 }
  589 
  590 /*
  591  * returns the input and output packet and byte counts on the vif provided
  592  */
  593 static int
  594 get_vif_cnt(struct sioc_vif_req *req)
  595 {
  596         vifi_t vifi = req->vifi;
  597 
  598         if (vifi >= numvifs)
  599                 return (EINVAL);
  600 
  601         req->icount = viftable[vifi].v_pkt_in;
  602         req->ocount = viftable[vifi].v_pkt_out;
  603         req->ibytes = viftable[vifi].v_bytes_in;
  604         req->obytes = viftable[vifi].v_bytes_out;
  605 
  606         return (0);
  607 }
  608 
  609 /*
  610  * Enable multicast routing
  611  */
  612 static int
  613 ip_mrouter_init(struct socket *so, int v)
  614 {
  615         if (mrtdebug)
  616                 log(LOG_DEBUG,
  617                     "ip_mrouter_init: so_type = %d, pr_protocol = %d\n",
  618                     so->so_type, so->so_proto->pr_protocol);
  619 
  620         if (so->so_type != SOCK_RAW ||
  621             so->so_proto->pr_protocol != IPPROTO_IGMP)
  622                 return (EOPNOTSUPP);
  623 
  624         if (v != 1)
  625                 return (EINVAL);
  626 
  627         if (ip_mrouter != NULL)
  628                 return (EADDRINUSE);
  629 
  630         ip_mrouter = so;
  631 
  632         mfchashtbl = hashinit(MFCTBLSIZ, HASH_LIST, true, &mfchash);
  633         bzero((void *)nexpire, sizeof(nexpire));
  634 
  635         pim_assert = 0;
  636 
  637         callout_init(&expire_upcalls_ch, 0);
  638         callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
  639                       expire_upcalls, NULL);
  640 
  641         callout_init(&bw_upcalls_ch, 0);
  642         callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD,
  643                       expire_bw_upcalls_send, NULL);
  644 
  645         callout_init(&bw_meter_ch, 0);
  646         callout_reset(&bw_meter_ch, BW_METER_PERIOD,
  647                       expire_bw_meter_process, NULL);
  648 
  649         if (mrtdebug)
  650                 log(LOG_DEBUG, "ip_mrouter_init\n");
  651 
  652         return (0);
  653 }
  654 
  655 /*
  656  * Disable multicast routing
  657  */
  658 int
  659 ip_mrouter_done(void)
  660 {
  661         vifi_t vifi;
  662         struct vif *vifp;
  663         int i;
  664         int s;
  665 
  666         s = splsoftnet();
  667 
  668         /* Clear out all the vifs currently in use. */
  669         for (vifi = 0; vifi < numvifs; vifi++) {
  670                 vifp = &viftable[vifi];
  671                 if (!in_nullhost(vifp->v_lcl_addr))
  672                         reset_vif(vifp);
  673         }
  674 
  675         numvifs = 0;
  676         pim_assert = 0;
  677         mrt_api_config = 0;
  678 
  679         callout_stop(&expire_upcalls_ch);
  680         callout_stop(&bw_upcalls_ch);
  681         callout_stop(&bw_meter_ch);
  682 
  683         /*
  684          * Free all multicast forwarding cache entries.
  685          */
  686         for (i = 0; i < MFCTBLSIZ; i++) {
  687                 struct mfc *rt, *nrt;
  688 
  689                 for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) {
  690                         nrt = LIST_NEXT(rt, mfc_hash);
  691 
  692                         expire_mfc(rt);
  693                 }
  694         }
  695 
  696         bzero((void *)nexpire, sizeof(nexpire));
  697         hashdone(mfchashtbl, HASH_LIST, mfchash);
  698         mfchashtbl = NULL;
  699 
  700         bw_upcalls_n = 0;
  701         bzero(bw_meter_timers, sizeof(bw_meter_timers));
  702 
  703         /* Reset de-encapsulation cache. */
  704 
  705         ip_mrouter = NULL;
  706 
  707         splx(s);
  708 
  709         if (mrtdebug)
  710                 log(LOG_DEBUG, "ip_mrouter_done\n");
  711 
  712         return (0);
  713 }
  714 
  715 void
  716 ip_mrouter_detach(struct ifnet *ifp)
  717 {
  718         int vifi, i;
  719         struct vif *vifp;
  720         struct mfc *rt;
  721         struct rtdetq *rte;
  722 
  723         /* XXX not sure about side effect to userland routing daemon */
  724         for (vifi = 0; vifi < numvifs; vifi++) {
  725                 vifp = &viftable[vifi];
  726                 if (vifp->v_ifp == ifp)
  727                         reset_vif(vifp);
  728         }
  729         for (i = 0; i < MFCTBLSIZ; i++) {
  730                 if (nexpire[i] == 0)
  731                         continue;
  732                 LIST_FOREACH(rt, &mfchashtbl[i], mfc_hash) {
  733                         for (rte = rt->mfc_stall; rte; rte = rte->next) {
  734                                 if (rte->ifp == ifp)
  735                                         rte->ifp = NULL;
  736                         }
  737                 }
  738         }
  739 }
  740 
  741 /*
  742  * Set PIM assert processing global
  743  */
  744 static int
  745 set_assert(int i)
  746 {
  747         pim_assert = !!i;
  748         return (0);
  749 }
  750 
  751 /*
  752  * Configure API capabilities
  753  */
  754 static int
  755 set_api_config(struct sockopt *sopt)
  756 {
  757         u_int32_t apival;
  758         int i, error;
  759 
  760         /*
  761          * We can set the API capabilities only if it is the first operation
  762          * after MRT_INIT. I.e.:
  763          *  - there are no vifs installed
  764          *  - pim_assert is not enabled
  765          *  - the MFC table is empty
  766          */
  767         error = sockopt_get(sopt, &apival, sizeof(apival));
  768         if (error)
  769                 return (error);
  770         if (numvifs > 0)
  771                 return (EPERM);
  772         if (pim_assert)
  773                 return (EPERM);
  774         for (i = 0; i < MFCTBLSIZ; i++) {
  775                 if (LIST_FIRST(&mfchashtbl[i]) != NULL)
  776                         return (EPERM);
  777         }
  778 
  779         mrt_api_config = apival & mrt_api_support;
  780         return (0);
  781 }
  782 
  783 /*
  784  * Add a vif to the vif table
  785  */
  786 static int
  787 add_vif(struct vifctl *vifcp)
  788 {
  789         struct vif *vifp;
  790         struct ifaddr *ifa;
  791         struct ifnet *ifp;
  792         struct ifreq ifr;
  793         int error, s;
  794         struct sockaddr_in sin;
  795 
  796         if (vifcp->vifc_vifi >= MAXVIFS)
  797                 return (EINVAL);
  798         if (in_nullhost(vifcp->vifc_lcl_addr))
  799                 return (EADDRNOTAVAIL);
  800 
  801         vifp = &viftable[vifcp->vifc_vifi];
  802         if (!in_nullhost(vifp->v_lcl_addr))
  803                 return (EADDRINUSE);
  804 
  805         /* Find the interface with an address in AF_INET family. */
  806 #ifdef PIM
  807         if (vifcp->vifc_flags & VIFF_REGISTER) {
  808                 /*
  809                  * XXX: Because VIFF_REGISTER does not really need a valid
  810                  * local interface (e.g. it could be 127.0.0.2), we don't
  811                  * check its address.
  812                  */
  813             ifp = NULL;
  814         } else
  815 #endif
  816         {
  817                 sockaddr_in_init(&sin, &vifcp->vifc_lcl_addr, 0);
  818                 ifa = ifa_ifwithaddr(sintosa(&sin));
  819                 if (ifa == NULL)
  820                         return (EADDRNOTAVAIL);
  821                 ifp = ifa->ifa_ifp;
  822         }
  823 
  824         if (vifcp->vifc_flags & VIFF_TUNNEL) {
  825                 if (vifcp->vifc_flags & VIFF_SRCRT) {
  826                         log(LOG_ERR, "source routed tunnels not supported\n");
  827                         return (EOPNOTSUPP);
  828                 }
  829 
  830                 /* attach this vif to decapsulator dispatch table */
  831                 /*
  832                  * XXX Use addresses in registration so that matching
  833                  * can be done with radix tree in decapsulator.  But,
  834                  * we need to check inner header for multicast, so
  835                  * this requires both radix tree lookup and then a
  836                  * function to check, and this is not supported yet.
  837                  */
  838                 vifp->v_encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV4,
  839                     vif_encapcheck, &vif_protosw, vifp);
  840                 if (!vifp->v_encap_cookie)
  841                         return (EINVAL);
  842 
  843                 /* Create a fake encapsulation interface. */
  844                 ifp = (struct ifnet *)malloc(sizeof(*ifp), M_MRTABLE, M_WAITOK);
  845                 bzero(ifp, sizeof(*ifp));
  846                 snprintf(ifp->if_xname, sizeof(ifp->if_xname),
  847                          "mdecap%d", vifcp->vifc_vifi);
  848 
  849                 /* Prepare cached route entry. */
  850                 bzero(&vifp->v_route, sizeof(vifp->v_route));
  851 #ifdef PIM
  852         } else if (vifcp->vifc_flags & VIFF_REGISTER) {
  853                 ifp = &multicast_register_if;
  854                 if (mrtdebug)
  855                         log(LOG_DEBUG, "Adding a register vif, ifp: %p\n",
  856                             (void *)ifp);
  857                 if (reg_vif_num == VIFI_INVALID) {
  858                         bzero(ifp, sizeof(*ifp));
  859                         snprintf(ifp->if_xname, sizeof(ifp->if_xname),
  860                                  "register_vif");
  861                         ifp->if_flags = IFF_LOOPBACK;
  862                         bzero(&vifp->v_route, sizeof(vifp->v_route));
  863                         reg_vif_num = vifcp->vifc_vifi;
  864                 }
  865 #endif
  866         } else {
  867                 /* Make sure the interface supports multicast. */
  868                 if ((ifp->if_flags & IFF_MULTICAST) == 0)
  869                         return (EOPNOTSUPP);
  870 
  871                 /* Enable promiscuous reception of all IP multicasts. */
  872                 sockaddr_in_init(&sin, &zeroin_addr, 0);
  873                 ifreq_setaddr(SIOCADDMULTI, &ifr, sintosa(&sin));
  874                 error = (*ifp->if_ioctl)(ifp, SIOCADDMULTI, &ifr);
  875                 if (error)
  876                         return (error);
  877         }
  878 
  879         s = splsoftnet();
  880 
  881         /* Define parameters for the tbf structure. */
  882         vifp->tbf_q = NULL;
  883         vifp->tbf_t = &vifp->tbf_q;
  884         microtime(&vifp->tbf_last_pkt_t);
  885         vifp->tbf_n_tok = 0;
  886         vifp->tbf_q_len = 0;
  887         vifp->tbf_max_q_len = MAXQSIZE;
  888 
  889         vifp->v_flags = vifcp->vifc_flags;
  890         vifp->v_threshold = vifcp->vifc_threshold;
  891         /* scaling up here allows division by 1024 in critical code */
  892         vifp->v_rate_limit = vifcp->vifc_rate_limit * 1024 / 1000;
  893         vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
  894         vifp->v_rmt_addr = vifcp->vifc_rmt_addr;
  895         vifp->v_ifp = ifp;
  896         /* Initialize per vif pkt counters. */
  897         vifp->v_pkt_in = 0;
  898         vifp->v_pkt_out = 0;
  899         vifp->v_bytes_in = 0;
  900         vifp->v_bytes_out = 0;
  901 
  902         callout_init(&vifp->v_repq_ch, 0);
  903 
  904 #ifdef RSVP_ISI
  905         vifp->v_rsvp_on = 0;
  906         vifp->v_rsvpd = NULL;
  907 #endif /* RSVP_ISI */
  908 
  909         splx(s);
  910 
  911         /* Adjust numvifs up if the vifi is higher than numvifs. */
  912         if (numvifs <= vifcp->vifc_vifi)
  913                 numvifs = vifcp->vifc_vifi + 1;
  914 
  915         if (mrtdebug)
  916                 log(LOG_DEBUG, "add_vif #%d, lcladdr %x, %s %x, thresh %x, rate %d\n",
  917                     vifcp->vifc_vifi,
  918                     ntohl(vifcp->vifc_lcl_addr.s_addr),
  919                     (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
  920                     ntohl(vifcp->vifc_rmt_addr.s_addr),
  921                     vifcp->vifc_threshold,
  922                     vifcp->vifc_rate_limit);
  923 
  924         return (0);
  925 }
  926 
  927 void
  928 reset_vif(struct vif *vifp)
  929 {
  930         struct mbuf *m, *n;
  931         struct ifnet *ifp;
  932         struct ifreq ifr;
  933         struct sockaddr_in sin;
  934 
  935         callout_stop(&vifp->v_repq_ch);
  936 
  937         /* detach this vif from decapsulator dispatch table */
  938         encap_detach(vifp->v_encap_cookie);
  939         vifp->v_encap_cookie = NULL;
  940 
  941         /*
  942          * Free packets queued at the interface
  943          */
  944         for (m = vifp->tbf_q; m != NULL; m = n) {
  945                 n = m->m_nextpkt;
  946                 m_freem(m);
  947         }
  948 
  949         if (vifp->v_flags & VIFF_TUNNEL)
  950                 free(vifp->v_ifp, M_MRTABLE);
  951         else if (vifp->v_flags & VIFF_REGISTER) {
  952 #ifdef PIM
  953                 reg_vif_num = VIFI_INVALID;
  954 #endif
  955         } else {
  956                 sockaddr_in_init(&sin, &zeroin_addr, 0);
  957                 ifreq_setaddr(SIOCDELMULTI, &ifr, sintosa(&sin));
  958                 ifp = vifp->v_ifp;
  959                 (*ifp->if_ioctl)(ifp, SIOCDELMULTI, &ifr);
  960         }
  961         bzero((void *)vifp, sizeof(*vifp));
  962 }
  963 
  964 /*
  965  * Delete a vif from the vif table
  966  */
  967 static int
  968 del_vif(vifi_t *vifip)
  969 {
  970         struct vif *vifp;
  971         vifi_t vifi;
  972         int s;
  973 
  974         if (*vifip >= numvifs)
  975                 return (EINVAL);
  976 
  977         vifp = &viftable[*vifip];
  978         if (in_nullhost(vifp->v_lcl_addr))
  979                 return (EADDRNOTAVAIL);
  980 
  981         s = splsoftnet();
  982 
  983         reset_vif(vifp);
  984 
  985         /* Adjust numvifs down */
  986         for (vifi = numvifs; vifi > 0; vifi--)
  987                 if (!in_nullhost(viftable[vifi - 1].v_lcl_addr))
  988                         break;
  989         numvifs = vifi;
  990 
  991         splx(s);
  992 
  993         if (mrtdebug)
  994                 log(LOG_DEBUG, "del_vif %d, numvifs %d\n", *vifip, numvifs);
  995 
  996         return (0);
  997 }
  998 
  999 /*
 1000  * update an mfc entry without resetting counters and S,G addresses.
 1001  */
 1002 static void
 1003 update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
 1004 {
 1005         int i;
 1006 
 1007         rt->mfc_parent = mfccp->mfcc_parent;
 1008         for (i = 0; i < numvifs; i++) {
 1009                 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
 1010                 rt->mfc_flags[i] = mfccp->mfcc_flags[i] & mrt_api_config &
 1011                         MRT_MFC_FLAGS_ALL;
 1012         }
 1013         /* set the RP address */
 1014         if (mrt_api_config & MRT_MFC_RP)
 1015                 rt->mfc_rp = mfccp->mfcc_rp;
 1016         else
 1017                 rt->mfc_rp = zeroin_addr;
 1018 }
 1019 
 1020 /*
 1021  * fully initialize an mfc entry from the parameter.
 1022  */
 1023 static void
 1024 init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
 1025 {
 1026         rt->mfc_origin     = mfccp->mfcc_origin;
 1027         rt->mfc_mcastgrp   = mfccp->mfcc_mcastgrp;
 1028 
 1029         update_mfc_params(rt, mfccp);
 1030 
 1031         /* initialize pkt counters per src-grp */
 1032         rt->mfc_pkt_cnt    = 0;
 1033         rt->mfc_byte_cnt   = 0;
 1034         rt->mfc_wrong_if   = 0;
 1035         timerclear(&rt->mfc_last_assert);
 1036 }
 1037 
 1038 static void
 1039 expire_mfc(struct mfc *rt)
 1040 {
 1041         struct rtdetq *rte, *nrte;
 1042 
 1043         free_bw_list(rt->mfc_bw_meter);
 1044 
 1045         for (rte = rt->mfc_stall; rte != NULL; rte = nrte) {
 1046                 nrte = rte->next;
 1047                 m_freem(rte->m);
 1048                 free(rte, M_MRTABLE);
 1049         }
 1050 
 1051         LIST_REMOVE(rt, mfc_hash);
 1052         free(rt, M_MRTABLE);
 1053 }
 1054 
 1055 /*
 1056  * Add an mfc entry
 1057  */
 1058 static int
 1059 add_mfc(struct sockopt *sopt)
 1060 {
 1061         struct mfcctl2 mfcctl2;
 1062         struct mfcctl2 *mfccp;
 1063         struct mfc *rt;
 1064         u_int32_t hash = 0;
 1065         struct rtdetq *rte, *nrte;
 1066         u_short nstl;
 1067         int s;
 1068         int mfcctl_size = sizeof(struct mfcctl);
 1069         int error;
 1070 
 1071         if (mrt_api_config & MRT_API_FLAGS_ALL)
 1072                 mfcctl_size = sizeof(struct mfcctl2);
 1073 
 1074         /*
 1075          * select data size depending on API version.
 1076          */
 1077         mfccp = &mfcctl2;
 1078         memset(&mfcctl2, 0, sizeof(mfcctl2));
 1079 
 1080         if (mrt_api_config & MRT_API_FLAGS_ALL)
 1081                 error = sockopt_get(sopt, mfccp, sizeof(struct mfcctl2));
 1082         else
 1083                 error = sockopt_get(sopt, mfccp, sizeof(struct mfcctl));
 1084 
 1085         if (error)
 1086                 return (error);
 1087 
 1088         s = splsoftnet();
 1089         rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp);
 1090 
 1091         /* If an entry already exists, just update the fields */
 1092         if (rt) {
 1093                 if (mrtdebug & DEBUG_MFC)
 1094                         log(LOG_DEBUG, "add_mfc update o %x g %x p %x\n",
 1095                             ntohl(mfccp->mfcc_origin.s_addr),
 1096                             ntohl(mfccp->mfcc_mcastgrp.s_addr),
 1097                             mfccp->mfcc_parent);
 1098 
 1099                 update_mfc_params(rt, mfccp);
 1100 
 1101                 splx(s);
 1102                 return (0);
 1103         }
 1104 
 1105         /*
 1106          * Find the entry for which the upcall was made and update
 1107          */
 1108         nstl = 0;
 1109         hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp);
 1110         LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
 1111                 if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
 1112                     in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) &&
 1113                     rt->mfc_stall != NULL) {
 1114                         if (nstl++)
 1115                                 log(LOG_ERR, "add_mfc %s o %x g %x p %x dbx %p\n",
 1116                                     "multiple kernel entries",
 1117                                     ntohl(mfccp->mfcc_origin.s_addr),
 1118                                     ntohl(mfccp->mfcc_mcastgrp.s_addr),
 1119                                     mfccp->mfcc_parent, rt->mfc_stall);
 1120 
 1121                         if (mrtdebug & DEBUG_MFC)
 1122                                 log(LOG_DEBUG, "add_mfc o %x g %x p %x dbg %p\n",
 1123                                     ntohl(mfccp->mfcc_origin.s_addr),
 1124                                     ntohl(mfccp->mfcc_mcastgrp.s_addr),
 1125                                     mfccp->mfcc_parent, rt->mfc_stall);
 1126 
 1127                         rte = rt->mfc_stall;
 1128                         init_mfc_params(rt, mfccp);
 1129                         rt->mfc_stall = NULL;
 1130 
 1131                         rt->mfc_expire = 0; /* Don't clean this guy up */
 1132                         nexpire[hash]--;
 1133 
 1134                         /* free packets Qed at the end of this entry */
 1135                         for (; rte != NULL; rte = nrte) {
 1136                                 nrte = rte->next;
 1137                                 if (rte->ifp) {
 1138 #ifdef RSVP_ISI
 1139                                         ip_mdq(rte->m, rte->ifp, rt, -1);
 1140 #else
 1141                                         ip_mdq(rte->m, rte->ifp, rt);
 1142 #endif /* RSVP_ISI */
 1143                                 }
 1144                                 m_freem(rte->m);
 1145 #ifdef UPCALL_TIMING
 1146                                 collate(&rte->t);
 1147 #endif /* UPCALL_TIMING */
 1148                                 free(rte, M_MRTABLE);
 1149                         }
 1150                 }
 1151         }
 1152 
 1153         /*
 1154          * It is possible that an entry is being inserted without an upcall
 1155          */
 1156         if (nstl == 0) {
 1157                 /*
 1158                  * No mfc; make a new one
 1159                  */
 1160                 if (mrtdebug & DEBUG_MFC)
 1161                         log(LOG_DEBUG, "add_mfc no upcall o %x g %x p %x\n",
 1162                             ntohl(mfccp->mfcc_origin.s_addr),
 1163                             ntohl(mfccp->mfcc_mcastgrp.s_addr),
 1164                             mfccp->mfcc_parent);
 1165 
 1166                 LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
 1167                         if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
 1168                             in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp)) {
 1169                                 init_mfc_params(rt, mfccp);
 1170                                 if (rt->mfc_expire)
 1171                                         nexpire[hash]--;
 1172                                 rt->mfc_expire = 0;
 1173                                 break; /* XXX */
 1174                         }
 1175                 }
 1176                 if (rt == NULL) {       /* no upcall, so make a new entry */
 1177                         rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE,
 1178                                                   M_NOWAIT);
 1179                         if (rt == NULL) {
 1180                                 splx(s);
 1181                                 return (ENOBUFS);
 1182                         }
 1183 
 1184                         init_mfc_params(rt, mfccp);
 1185                         rt->mfc_expire  = 0;
 1186                         rt->mfc_stall   = NULL;
 1187                         rt->mfc_bw_meter = NULL;
 1188 
 1189                         /* insert new entry at head of hash chain */
 1190                         LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash);
 1191                 }
 1192         }
 1193 
 1194         splx(s);
 1195         return (0);
 1196 }
 1197 
 1198 #ifdef UPCALL_TIMING
 1199 /*
 1200  * collect delay statistics on the upcalls
 1201  */
 1202 static void
 1203 collate(struct timeval *t)
 1204 {
 1205         u_int32_t d;
 1206         struct timeval tp;
 1207         u_int32_t delta;
 1208 
 1209         microtime(&tp);
 1210 
 1211         if (timercmp(t, &tp, <)) {
 1212                 TV_DELTA(tp, *t, delta);
 1213 
 1214                 d = delta >> 10;
 1215                 if (d > 50)
 1216                         d = 50;
 1217 
 1218                 ++upcall_data[d];
 1219         }
 1220 }
 1221 #endif /* UPCALL_TIMING */
 1222 
 1223 /*
 1224  * Delete an mfc entry
 1225  */
 1226 static int
 1227 del_mfc(struct sockopt *sopt)
 1228 {
 1229         struct mfcctl2 mfcctl2;
 1230         struct mfcctl2 *mfccp;
 1231         struct mfc *rt;
 1232         int s;
 1233         int error;
 1234 
 1235         /*
 1236          * XXX: for deleting MFC entries the information in entries
 1237          * of size "struct mfcctl" is sufficient.
 1238          */
 1239 
 1240         mfccp = &mfcctl2;
 1241         memset(&mfcctl2, 0, sizeof(mfcctl2));
 1242 
 1243         error = sockopt_get(sopt, mfccp, sizeof(struct mfcctl));
 1244         if (error) {
 1245                 /* Try with the size of mfcctl2. */
 1246                 error = sockopt_get(sopt, mfccp, sizeof(struct mfcctl2));
 1247                 if (error)
 1248                         return (error);
 1249         }
 1250 
 1251         if (mrtdebug & DEBUG_MFC)
 1252                 log(LOG_DEBUG, "del_mfc origin %x mcastgrp %x\n",
 1253                     ntohl(mfccp->mfcc_origin.s_addr),
 1254                     ntohl(mfccp->mfcc_mcastgrp.s_addr));
 1255 
 1256         s = splsoftnet();
 1257 
 1258         rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp);
 1259         if (rt == NULL) {
 1260                 splx(s);
 1261                 return (EADDRNOTAVAIL);
 1262         }
 1263 
 1264         /*
 1265          * free the bw_meter entries
 1266          */
 1267         free_bw_list(rt->mfc_bw_meter);
 1268         rt->mfc_bw_meter = NULL;
 1269 
 1270         LIST_REMOVE(rt, mfc_hash);
 1271         free(rt, M_MRTABLE);
 1272 
 1273         splx(s);
 1274         return (0);
 1275 }
 1276 
 1277 static int
 1278 socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src)
 1279 {
 1280         if (s) {
 1281                 if (sbappendaddr(&s->so_rcv, sintosa(src), mm,
 1282                     (struct mbuf *)NULL) != 0) {
 1283                         sorwakeup(s);
 1284                         return (0);
 1285                 }
 1286         }
 1287         m_freem(mm);
 1288         return (-1);
 1289 }
 1290 
 1291 /*
 1292  * IP multicast forwarding function. This function assumes that the packet
 1293  * pointed to by "ip" has arrived on (or is about to be sent to) the interface
 1294  * pointed to by "ifp", and the packet is to be relayed to other networks
 1295  * that have members of the packet's destination IP multicast group.
 1296  *
 1297  * The packet is returned unscathed to the caller, unless it is
 1298  * erroneous, in which case a non-zero return value tells the caller to
 1299  * discard it.
 1300  */
 1301 
 1302 #define IP_HDR_LEN  20  /* # bytes of fixed IP header (excluding options) */
 1303 #define TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */
 1304 
 1305 int
 1306 #ifdef RSVP_ISI
 1307 ip_mforward(struct mbuf *m, struct ifnet *ifp, struct ip_moptions *imo)
 1308 #else
 1309 ip_mforward(struct mbuf *m, struct ifnet *ifp)
 1310 #endif /* RSVP_ISI */
 1311 {
 1312         struct ip *ip = mtod(m, struct ip *);
 1313         struct mfc *rt;
 1314         static int srctun = 0;
 1315         struct mbuf *mm;
 1316         struct sockaddr_in sin;
 1317         int s;
 1318         vifi_t vifi;
 1319 
 1320         if (mrtdebug & DEBUG_FORWARD)
 1321                 log(LOG_DEBUG, "ip_mforward: src %x, dst %x, ifp %p\n",
 1322                     ntohl(ip->ip_src.s_addr), ntohl(ip->ip_dst.s_addr), ifp);
 1323 
 1324         if (ip->ip_hl < (IP_HDR_LEN + TUNNEL_LEN) >> 2 ||
 1325             ((u_char *)(ip + 1))[1] != IPOPT_LSRR) {
 1326                 /*
 1327                  * Packet arrived via a physical interface or
 1328                  * an encapsulated tunnel or a register_vif.
 1329                  */
 1330         } else {
 1331                 /*
 1332                  * Packet arrived through a source-route tunnel.
 1333                  * Source-route tunnels are no longer supported.
 1334                  */
 1335                 if ((srctun++ % 1000) == 0)
 1336                         log(LOG_ERR,
 1337                             "ip_mforward: received source-routed packet from %x\n",
 1338                             ntohl(ip->ip_src.s_addr));
 1339 
 1340                 return (1);
 1341         }
 1342 
 1343         /*
 1344          * Clear any in-bound checksum flags for this packet.
 1345          */
 1346         m->m_pkthdr.csum_flags = 0;
 1347 
 1348 #ifdef RSVP_ISI
 1349         if (imo && ((vifi = imo->imo_multicast_vif) < numvifs)) {
 1350                 if (ip->ip_ttl < MAXTTL)
 1351                         ip->ip_ttl++;   /* compensate for -1 in *_send routines */
 1352                 if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
 1353                         struct vif *vifp = viftable + vifi;
 1354                         printf("Sending IPPROTO_RSVP from %x to %x on vif %d (%s%s)\n",
 1355                             ntohl(ip->ip_src), ntohl(ip->ip_dst), vifi,
 1356                             (vifp->v_flags & VIFF_TUNNEL) ? "tunnel on " : "",
 1357                             vifp->v_ifp->if_xname);
 1358                 }
 1359                 return (ip_mdq(m, ifp, (struct mfc *)NULL, vifi));
 1360         }
 1361         if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
 1362                 printf("Warning: IPPROTO_RSVP from %x to %x without vif option\n",
 1363                     ntohl(ip->ip_src), ntohl(ip->ip_dst));
 1364         }
 1365 #endif /* RSVP_ISI */
 1366 
 1367         /*
 1368          * Don't forward a packet with time-to-live of zero or one,
 1369          * or a packet destined to a local-only group.
 1370          */
 1371         if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ip->ip_dst.s_addr))
 1372                 return (0);
 1373 
 1374         /*
 1375          * Determine forwarding vifs from the forwarding cache table
 1376          */
 1377         s = splsoftnet();
 1378         ++mrtstat.mrts_mfc_lookups;
 1379         rt = mfc_find(&ip->ip_src, &ip->ip_dst);
 1380 
 1381         /* Entry exists, so forward if necessary */
 1382         if (rt != NULL) {
 1383                 splx(s);
 1384 #ifdef RSVP_ISI
 1385                 return (ip_mdq(m, ifp, rt, -1));
 1386 #else
 1387                 return (ip_mdq(m, ifp, rt));
 1388 #endif /* RSVP_ISI */
 1389         } else {
 1390                 /*
 1391                  * If we don't have a route for packet's origin,
 1392                  * Make a copy of the packet & send message to routing daemon
 1393                  */
 1394 
 1395                 struct mbuf *mb0;
 1396                 struct rtdetq *rte;
 1397                 u_int32_t hash;
 1398                 int hlen = ip->ip_hl << 2;
 1399 #ifdef UPCALL_TIMING
 1400                 struct timeval tp;
 1401 
 1402                 microtime(&tp);
 1403 #endif /* UPCALL_TIMING */
 1404 
 1405                 ++mrtstat.mrts_mfc_misses;
 1406 
 1407                 mrtstat.mrts_no_route++;
 1408                 if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC))
 1409                         log(LOG_DEBUG, "ip_mforward: no rte s %x g %x\n",
 1410                             ntohl(ip->ip_src.s_addr),
 1411                             ntohl(ip->ip_dst.s_addr));
 1412 
 1413                 /*
 1414                  * Allocate mbufs early so that we don't do extra work if we are
 1415                  * just going to fail anyway.  Make sure to pullup the header so
 1416                  * that other people can't step on it.
 1417                  */
 1418                 rte = (struct rtdetq *)malloc(sizeof(*rte), M_MRTABLE,
 1419                                               M_NOWAIT);
 1420                 if (rte == NULL) {
 1421                         splx(s);
 1422                         return (ENOBUFS);
 1423                 }
 1424                 mb0 = m_copypacket(m, M_DONTWAIT);
 1425                 M_PULLUP(mb0, hlen);
 1426                 if (mb0 == NULL) {
 1427                         free(rte, M_MRTABLE);
 1428                         splx(s);
 1429                         return (ENOBUFS);
 1430                 }
 1431 
 1432                 /* is there an upcall waiting for this flow? */
 1433                 hash = MFCHASH(ip->ip_src, ip->ip_dst);
 1434                 LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
 1435                         if (in_hosteq(ip->ip_src, rt->mfc_origin) &&
 1436                             in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) &&
 1437                             rt->mfc_stall != NULL)
 1438                                 break;
 1439                 }
 1440 
 1441                 if (rt == NULL) {
 1442                         int i;
 1443                         struct igmpmsg *im;
 1444 
 1445                         /*
 1446                          * Locate the vifi for the incoming interface for
 1447                          * this packet.
 1448                          * If none found, drop packet.
 1449                          */
 1450                         for (vifi = 0; vifi < numvifs &&
 1451                                  viftable[vifi].v_ifp != ifp; vifi++)
 1452                                 ;
 1453                         if (vifi >= numvifs) /* vif not found, drop packet */
 1454                                 goto non_fatal;
 1455 
 1456                         /* no upcall, so make a new entry */
 1457                         rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE,
 1458                                                   M_NOWAIT);
 1459                         if (rt == NULL)
 1460                                 goto fail;
 1461 
 1462                         /*
 1463                          * Make a copy of the header to send to the user level
 1464                          * process
 1465                          */
 1466                         mm = m_copym(m, 0, hlen, M_DONTWAIT);
 1467                         M_PULLUP(mm, hlen);
 1468                         if (mm == NULL)
 1469                                 goto fail1;
 1470 
 1471                         /*
 1472                          * Send message to routing daemon to install
 1473                          * a route into the kernel table
 1474                          */
 1475 
 1476                         im = mtod(mm, struct igmpmsg *);
 1477                         im->im_msgtype = IGMPMSG_NOCACHE;
 1478                         im->im_mbz = 0;
 1479                         im->im_vif = vifi;
 1480 
 1481                         mrtstat.mrts_upcalls++;
 1482 
 1483                         sockaddr_in_init(&sin, &ip->ip_src, 0);
 1484                         if (socket_send(ip_mrouter, mm, &sin) < 0) {
 1485                                 log(LOG_WARNING,
 1486                                     "ip_mforward: ip_mrouter socket queue full\n");
 1487                                 ++mrtstat.mrts_upq_sockfull;
 1488                         fail1:
 1489                                 free(rt, M_MRTABLE);
 1490                         fail:
 1491                                 free(rte, M_MRTABLE);
 1492                                 m_freem(mb0);
 1493                                 splx(s);
 1494                                 return (ENOBUFS);
 1495                         }
 1496 
 1497                         /* insert new entry at head of hash chain */
 1498                         rt->mfc_origin = ip->ip_src;
 1499                         rt->mfc_mcastgrp = ip->ip_dst;
 1500                         rt->mfc_pkt_cnt = 0;
 1501                         rt->mfc_byte_cnt = 0;
 1502                         rt->mfc_wrong_if = 0;
 1503                         rt->mfc_expire = UPCALL_EXPIRE;
 1504                         nexpire[hash]++;
 1505                         for (i = 0; i < numvifs; i++) {
 1506                                 rt->mfc_ttls[i] = 0;
 1507                                 rt->mfc_flags[i] = 0;
 1508                         }
 1509                         rt->mfc_parent = -1;
 1510 
 1511                         /* clear the RP address */
 1512                         rt->mfc_rp = zeroin_addr;
 1513 
 1514                         rt->mfc_bw_meter = NULL;
 1515 
 1516                         /* link into table */
 1517                         LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash);
 1518                         /* Add this entry to the end of the queue */
 1519                         rt->mfc_stall = rte;
 1520                 } else {
 1521                         /* determine if q has overflowed */
 1522                         struct rtdetq **p;
 1523                         int npkts = 0;
 1524 
 1525                         /*
 1526                          * XXX ouch! we need to append to the list, but we
 1527                          * only have a pointer to the front, so we have to
 1528                          * scan the entire list every time.
 1529                          */
 1530                         for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next)
 1531                                 if (++npkts > MAX_UPQ) {
 1532                                         mrtstat.mrts_upq_ovflw++;
 1533                                 non_fatal:
 1534                                         free(rte, M_MRTABLE);
 1535                                         m_freem(mb0);
 1536                                         splx(s);
 1537                                         return (0);
 1538                                 }
 1539 
 1540                         /* Add this entry to the end of the queue */
 1541                         *p = rte;
 1542                 }
 1543 
 1544                 rte->next = NULL;
 1545                 rte->m = mb0;
 1546                 rte->ifp = ifp;
 1547 #ifdef UPCALL_TIMING
 1548                 rte->t = tp;
 1549 #endif /* UPCALL_TIMING */
 1550 
 1551                 splx(s);
 1552 
 1553                 return (0);
 1554         }
 1555 }
 1556 
 1557 
 1558 /*ARGSUSED*/
 1559 static void
 1560 expire_upcalls(void *v)
 1561 {
 1562         int i;
 1563         int s;
 1564 
 1565         s = splsoftnet();
 1566 
 1567         for (i = 0; i < MFCTBLSIZ; i++) {
 1568                 struct mfc *rt, *nrt;
 1569 
 1570                 if (nexpire[i] == 0)
 1571                         continue;
 1572 
 1573                 for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) {
 1574                         nrt = LIST_NEXT(rt, mfc_hash);
 1575 
 1576                         if (rt->mfc_expire == 0 || --rt->mfc_expire > 0)
 1577                                 continue;
 1578                         nexpire[i]--;
 1579 
 1580                         /*
 1581                          * free the bw_meter entries
 1582                          */
 1583                         while (rt->mfc_bw_meter != NULL) {
 1584                                 struct bw_meter *x = rt->mfc_bw_meter;
 1585 
 1586                                 rt->mfc_bw_meter = x->bm_mfc_next;
 1587                                 free(x, M_BWMETER);
 1588                         }
 1589 
 1590                         ++mrtstat.mrts_cache_cleanups;
 1591                         if (mrtdebug & DEBUG_EXPIRE)
 1592                                 log(LOG_DEBUG,
 1593                                     "expire_upcalls: expiring (%x %x)\n",
 1594                                     ntohl(rt->mfc_origin.s_addr),
 1595                                     ntohl(rt->mfc_mcastgrp.s_addr));
 1596 
 1597                         expire_mfc(rt);
 1598                 }
 1599         }
 1600 
 1601         splx(s);
 1602         callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
 1603             expire_upcalls, NULL);
 1604 }
 1605 
 1606 /*
 1607  * Packet forwarding routine once entry in the cache is made
 1608  */
 1609 static int
 1610 #ifdef RSVP_ISI
 1611 ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif)
 1612 #else
 1613 ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt)
 1614 #endif /* RSVP_ISI */
 1615 {
 1616         struct ip  *ip = mtod(m, struct ip *);
 1617         vifi_t vifi;
 1618         struct vif *vifp;
 1619         struct sockaddr_in sin;
 1620         int plen = ntohs(ip->ip_len) - (ip->ip_hl << 2);
 1621 
 1622 /*
 1623  * Macro to send packet on vif.  Since RSVP packets don't get counted on
 1624  * input, they shouldn't get counted on output, so statistics keeping is
 1625  * separate.
 1626  */
 1627 #define MC_SEND(ip, vifp, m) do {                                       \
 1628         if ((vifp)->v_flags & VIFF_TUNNEL)                              \
 1629                 encap_send((ip), (vifp), (m));                          \
 1630         else                                                            \
 1631                 phyint_send((ip), (vifp), (m));                         \
 1632 } while (/*CONSTCOND*/ 0)
 1633 
 1634 #ifdef RSVP_ISI
 1635         /*
 1636          * If xmt_vif is not -1, send on only the requested vif.
 1637          *
 1638          * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.
 1639          */
 1640         if (xmt_vif < numvifs) {
 1641 #ifdef PIM
 1642                 if (viftable[xmt_vif].v_flags & VIFF_REGISTER)
 1643                         pim_register_send(ip, viftable + xmt_vif, m, rt);
 1644                 else
 1645 #endif
 1646                 MC_SEND(ip, viftable + xmt_vif, m);
 1647                 return (1);
 1648         }
 1649 #endif /* RSVP_ISI */
 1650 
 1651         /*
 1652          * Don't forward if it didn't arrive from the parent vif for its origin.
 1653          */
 1654         vifi = rt->mfc_parent;
 1655         if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) {
 1656                 /* came in the wrong interface */
 1657                 if (mrtdebug & DEBUG_FORWARD)
 1658                         log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n",
 1659                             ifp, vifi,
 1660                             vifi >= numvifs ? 0 : viftable[vifi].v_ifp);
 1661                 ++mrtstat.mrts_wrong_if;
 1662                 ++rt->mfc_wrong_if;
 1663                 /*
 1664                  * If we are doing PIM assert processing, send a message
 1665                  * to the routing daemon.
 1666                  *
 1667                  * XXX: A PIM-SM router needs the WRONGVIF detection so it
 1668                  * can complete the SPT switch, regardless of the type
 1669                  * of the iif (broadcast media, GRE tunnel, etc).
 1670                  */
 1671                 if (pim_assert && (vifi < numvifs) && viftable[vifi].v_ifp) {
 1672                         struct timeval now;
 1673                         u_int32_t delta;
 1674 
 1675 #ifdef PIM
 1676                         if (ifp == &multicast_register_if)
 1677                                 pimstat.pims_rcv_registers_wrongiif++;
 1678 #endif
 1679 
 1680                         /* Get vifi for the incoming packet */
 1681                         for (vifi = 0;
 1682                              vifi < numvifs && viftable[vifi].v_ifp != ifp;
 1683                              vifi++)
 1684                             ;
 1685                         if (vifi >= numvifs) {
 1686                                 /* The iif is not found: ignore the packet. */
 1687                                 return (0);
 1688                         }
 1689 
 1690                         if (rt->mfc_flags[vifi] &
 1691                             MRT_MFC_FLAGS_DISABLE_WRONGVIF) {
 1692                                 /* WRONGVIF disabled: ignore the packet */
 1693                                 return (0);
 1694                         }
 1695 
 1696                         microtime(&now);
 1697 
 1698                         TV_DELTA(rt->mfc_last_assert, now, delta);
 1699 
 1700                         if (delta > ASSERT_MSG_TIME) {
 1701                                 struct igmpmsg *im;
 1702                                 int hlen = ip->ip_hl << 2;
 1703                                 struct mbuf *mm =
 1704                                     m_copym(m, 0, hlen, M_DONTWAIT);
 1705 
 1706                                 M_PULLUP(mm, hlen);
 1707                                 if (mm == NULL)
 1708                                         return (ENOBUFS);
 1709 
 1710                                 rt->mfc_last_assert = now;
 1711 
 1712                                 im = mtod(mm, struct igmpmsg *);
 1713                                 im->im_msgtype  = IGMPMSG_WRONGVIF;
 1714                                 im->im_mbz      = 0;
 1715                                 im->im_vif      = vifi;
 1716 
 1717                                 mrtstat.mrts_upcalls++;
 1718 
 1719                                 sockaddr_in_init(&sin, &im->im_src, 0);
 1720                                 if (socket_send(ip_mrouter, mm, &sin) < 0) {
 1721                                         log(LOG_WARNING,
 1722                                             "ip_mforward: ip_mrouter socket queue full\n");
 1723                                         ++mrtstat.mrts_upq_sockfull;
 1724                                         return (ENOBUFS);
 1725                                 }
 1726                         }
 1727                 }
 1728                 return (0);
 1729         }
 1730 
 1731         /* If I sourced this packet, it counts as output, else it was input. */
 1732         if (in_hosteq(ip->ip_src, viftable[vifi].v_lcl_addr)) {
 1733                 viftable[vifi].v_pkt_out++;
 1734                 viftable[vifi].v_bytes_out += plen;
 1735         } else {
 1736                 viftable[vifi].v_pkt_in++;
 1737                 viftable[vifi].v_bytes_in += plen;
 1738         }
 1739         rt->mfc_pkt_cnt++;
 1740         rt->mfc_byte_cnt += plen;
 1741 
 1742         /*
 1743          * For each vif, decide if a copy of the packet should be forwarded.
 1744          * Forward if:
 1745          *              - the ttl exceeds the vif's threshold
 1746          *              - there are group members downstream on interface
 1747          */
 1748         for (vifp = viftable, vifi = 0; vifi < numvifs; vifp++, vifi++)
 1749                 if ((rt->mfc_ttls[vifi] > 0) &&
 1750                         (ip->ip_ttl > rt->mfc_ttls[vifi])) {
 1751                         vifp->v_pkt_out++;
 1752                         vifp->v_bytes_out += plen;
 1753 #ifdef PIM
 1754                         if (vifp->v_flags & VIFF_REGISTER)
 1755                                 pim_register_send(ip, vifp, m, rt);
 1756                         else
 1757 #endif
 1758                         MC_SEND(ip, vifp, m);
 1759                 }
 1760 
 1761         /*
 1762          * Perform upcall-related bw measuring.
 1763          */
 1764         if (rt->mfc_bw_meter != NULL) {
 1765                 struct bw_meter *x;
 1766                 struct timeval now;
 1767 
 1768                 microtime(&now);
 1769                 for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next)
 1770                         bw_meter_receive_packet(x, plen, &now);
 1771         }
 1772 
 1773         return (0);
 1774 }
 1775 
 1776 #ifdef RSVP_ISI
 1777 /*
 1778  * check if a vif number is legal/ok. This is used by ip_output.
 1779  */
 1780 int
 1781 legal_vif_num(int vif)
 1782 {
 1783         if (vif >= 0 && vif < numvifs)
 1784                 return (1);
 1785         else
 1786                 return (0);
 1787 }
 1788 #endif /* RSVP_ISI */
 1789 
 1790 static void
 1791 phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
 1792 {
 1793         struct mbuf *mb_copy;
 1794         int hlen = ip->ip_hl << 2;
 1795 
 1796         /*
 1797          * Make a new reference to the packet; make sure that
 1798          * the IP header is actually copied, not just referenced,
 1799          * so that ip_output() only scribbles on the copy.
 1800          */
 1801         mb_copy = m_copypacket(m, M_DONTWAIT);
 1802         M_PULLUP(mb_copy, hlen);
 1803         if (mb_copy == NULL)
 1804                 return;
 1805 
 1806         if (vifp->v_rate_limit <= 0)
 1807                 tbf_send_packet(vifp, mb_copy);
 1808         else
 1809                 tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *),
 1810                     ntohs(ip->ip_len));
 1811 }
 1812 
 1813 static void
 1814 encap_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
 1815 {
 1816         struct mbuf *mb_copy;
 1817         struct ip *ip_copy;
 1818         int i, len = ntohs(ip->ip_len) + sizeof(multicast_encap_iphdr);
 1819 
 1820         /* Take care of delayed checksums */
 1821         if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
 1822                 in_delayed_cksum(m);
 1823                 m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
 1824         }
 1825 
 1826         /*
 1827          * copy the old packet & pullup it's IP header into the
 1828          * new mbuf so we can modify it.  Try to fill the new
 1829          * mbuf since if we don't the ethernet driver will.
 1830          */
 1831         MGETHDR(mb_copy, M_DONTWAIT, MT_DATA);
 1832         if (mb_copy == NULL)
 1833                 return;
 1834         mb_copy->m_data += max_linkhdr;
 1835         mb_copy->m_pkthdr.len = len;
 1836         mb_copy->m_len = sizeof(multicast_encap_iphdr);
 1837 
 1838         if ((mb_copy->m_next = m_copypacket(m, M_DONTWAIT)) == NULL) {
 1839                 m_freem(mb_copy);
 1840                 return;
 1841         }
 1842         i = MHLEN - max_linkhdr;
 1843         if (i > len)
 1844                 i = len;
 1845         mb_copy = m_pullup(mb_copy, i);
 1846         if (mb_copy == NULL)
 1847                 return;
 1848 
 1849         /*
 1850          * fill in the encapsulating IP header.
 1851          */
 1852         ip_copy = mtod(mb_copy, struct ip *);
 1853         *ip_copy = multicast_encap_iphdr;
 1854         if (len < IP_MINFRAGSIZE)
 1855                 ip_copy->ip_id = 0;
 1856         else
 1857                 ip_copy->ip_id = ip_newid(NULL);
 1858         ip_copy->ip_len = htons(len);
 1859         ip_copy->ip_src = vifp->v_lcl_addr;
 1860         ip_copy->ip_dst = vifp->v_rmt_addr;
 1861 
 1862         /*
 1863          * turn the encapsulated IP header back into a valid one.
 1864          */
 1865         ip = (struct ip *)((char *)ip_copy + sizeof(multicast_encap_iphdr));
 1866         --ip->ip_ttl;
 1867         ip->ip_sum = 0;
 1868         mb_copy->m_data += sizeof(multicast_encap_iphdr);
 1869         ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
 1870         mb_copy->m_data -= sizeof(multicast_encap_iphdr);
 1871 
 1872         if (vifp->v_rate_limit <= 0)
 1873                 tbf_send_packet(vifp, mb_copy);
 1874         else
 1875                 tbf_control(vifp, mb_copy, ip, ntohs(ip_copy->ip_len));
 1876 }
 1877 
 1878 /*
 1879  * De-encapsulate a packet and feed it back through ip input.
 1880  */
 1881 static void
 1882 vif_input(struct mbuf *m, ...)
 1883 {
 1884         int off, proto;
 1885         va_list ap;
 1886         struct vif *vifp;
 1887         int s;
 1888         struct ifqueue *ifq;
 1889 
 1890         va_start(ap, m);
 1891         off = va_arg(ap, int);
 1892         proto = va_arg(ap, int);
 1893         va_end(ap);
 1894 
 1895         vifp = (struct vif *)encap_getarg(m);
 1896         if (!vifp || proto != ENCAP_PROTO) {
 1897                 m_freem(m);
 1898                 mrtstat.mrts_bad_tunnel++;
 1899                 return;
 1900         }
 1901 
 1902         m_adj(m, off);
 1903         m->m_pkthdr.rcvif = vifp->v_ifp;
 1904         ifq = &ipintrq;
 1905         s = splnet();
 1906         if (IF_QFULL(ifq)) {
 1907                 IF_DROP(ifq);
 1908                 m_freem(m);
 1909         } else {
 1910                 IF_ENQUEUE(ifq, m);
 1911                 /*
 1912                  * normally we would need a "schednetisr(NETISR_IP)"
 1913                  * here but we were called by ip_input and it is going
 1914                  * to loop back & try to dequeue the packet we just
 1915                  * queued as soon as we return so we avoid the
 1916                  * unnecessary software interrrupt.
 1917                  */
 1918         }
 1919         splx(s);
 1920 }
 1921 
 1922 /*
 1923  * Check if the packet should be received on the vif denoted by arg.
 1924  * (The encap selection code will call this once per vif since each is
 1925  * registered separately.)
 1926  */
 1927 static int
 1928 vif_encapcheck(struct mbuf *m, int off, int proto, void *arg)
 1929 {
 1930         struct vif *vifp;
 1931         struct ip ip;
 1932 
 1933 #ifdef DIAGNOSTIC
 1934         if (!arg || proto != IPPROTO_IPV4)
 1935                 panic("unexpected arg in vif_encapcheck");
 1936 #endif
 1937 
 1938         /*
 1939          * Accept the packet only if the inner heaader is multicast
 1940          * and the outer header matches a tunnel-mode vif.  Order
 1941          * checks in the hope that common non-matching packets will be
 1942          * rejected quickly.  Assume that unicast IPv4 traffic in a
 1943          * parallel tunnel (e.g. gif(4)) is unlikely.
 1944          */
 1945 
 1946         /* Obtain the outer IP header and the vif pointer. */
 1947         m_copydata((struct mbuf *)m, 0, sizeof(ip), (void *)&ip);
 1948         vifp = (struct vif *)arg;
 1949 
 1950         /*
 1951          * The outer source must match the vif's remote peer address.
 1952          * For a multicast router with several tunnels, this is the
 1953          * only check that will fail on packets in other tunnels,
 1954          * assuming the local address is the same.         
 1955          */
 1956         if (!in_hosteq(vifp->v_rmt_addr, ip.ip_src))
 1957                 return 0;
 1958 
 1959         /* The outer destination must match the vif's local address. */
 1960         if (!in_hosteq(vifp->v_lcl_addr, ip.ip_dst))
 1961                 return 0;
 1962 
 1963         /* The vif must be of tunnel type. */
 1964         if ((vifp->v_flags & VIFF_TUNNEL) == 0)
 1965                 return 0;
 1966 
 1967         /* Check that the inner destination is multicast. */
 1968         m_copydata((struct mbuf *)m, off, sizeof(ip), (void *)&ip);
 1969         if (!IN_MULTICAST(ip.ip_dst.s_addr))
 1970                 return 0;
 1971 
 1972         /*
 1973          * We have checked that both the outer src and dst addresses
 1974          * match the vif, and that the inner destination is multicast
 1975          * (224/5).  By claiming more than 64, we intend to
 1976          * preferentially take packets that also match a parallel
 1977          * gif(4).
 1978          */
 1979         return 32 + 32 + 5;
 1980 }
 1981 
 1982 /*
 1983  * Token bucket filter module
 1984  */
 1985 static void
 1986 tbf_control(struct vif *vifp, struct mbuf *m, struct ip *ip, u_int32_t len)
 1987 {
 1988 
 1989         if (len > MAX_BKT_SIZE) {
 1990                 /* drop if packet is too large */
 1991                 mrtstat.mrts_pkt2large++;
 1992                 m_freem(m);
 1993                 return;
 1994         }
 1995 
 1996         tbf_update_tokens(vifp);
 1997 
 1998         /*
 1999          * If there are enough tokens, and the queue is empty, send this packet
 2000          * out immediately.  Otherwise, try to insert it on this vif's queue.
 2001          */
 2002         if (vifp->tbf_q_len == 0) {
 2003                 if (len <= vifp->tbf_n_tok) {
 2004                         vifp->tbf_n_tok -= len;
 2005                         tbf_send_packet(vifp, m);
 2006                 } else {
 2007                         /* queue packet and timeout till later */
 2008                         tbf_queue(vifp, m);
 2009                         callout_reset(&vifp->v_repq_ch, TBF_REPROCESS,
 2010                             tbf_reprocess_q, vifp);
 2011                 }
 2012         } else {
 2013                 if (vifp->tbf_q_len >= vifp->tbf_max_q_len &&
 2014                     !tbf_dq_sel(vifp, ip)) {
 2015                         /* queue full, and couldn't make room */
 2016                         mrtstat.mrts_q_overflow++;
 2017                         m_freem(m);
 2018                 } else {
 2019                         /* queue length low enough, or made room */
 2020                         tbf_queue(vifp, m);
 2021                         tbf_process_q(vifp);
 2022                 }
 2023         }
 2024 }
 2025 
 2026 /*
 2027  * adds a packet to the queue at the interface
 2028  */
 2029 static void
 2030 tbf_queue(struct vif *vifp, struct mbuf *m)
 2031 {
 2032         int s = splsoftnet();
 2033 
 2034         /* insert at tail */
 2035         *vifp->tbf_t = m;
 2036         vifp->tbf_t = &m->m_nextpkt;
 2037         vifp->tbf_q_len++;
 2038 
 2039         splx(s);
 2040 }
 2041 
 2042 
 2043 /*
 2044  * processes the queue at the interface
 2045  */
 2046 static void
 2047 tbf_process_q(struct vif *vifp)
 2048 {
 2049         struct mbuf *m;
 2050         int len;
 2051         int s = splsoftnet();
 2052 
 2053         /*
 2054          * Loop through the queue at the interface and send as many packets
 2055          * as possible.
 2056          */
 2057         for (m = vifp->tbf_q; m != NULL; m = vifp->tbf_q) {
 2058                 len = ntohs(mtod(m, struct ip *)->ip_len);
 2059 
 2060                 /* determine if the packet can be sent */
 2061                 if (len <= vifp->tbf_n_tok) {
 2062                         /* if so,
 2063                          * reduce no of tokens, dequeue the packet,
 2064                          * send the packet.
 2065                          */
 2066                         if ((vifp->tbf_q = m->m_nextpkt) == NULL)
 2067                                 vifp->tbf_t = &vifp->tbf_q;
 2068                         --vifp->tbf_q_len;
 2069 
 2070                         m->m_nextpkt = NULL;
 2071                         vifp->tbf_n_tok -= len;
 2072                         tbf_send_packet(vifp, m);
 2073                 } else
 2074                         break;
 2075         }
 2076         splx(s);
 2077 }
 2078 
 2079 static void
 2080 tbf_reprocess_q(void *arg)
 2081 {
 2082         struct vif *vifp = arg;
 2083 
 2084         if (ip_mrouter == NULL)
 2085                 return;
 2086 
 2087         tbf_update_tokens(vifp);
 2088         tbf_process_q(vifp);
 2089 
 2090         if (vifp->tbf_q_len != 0)
 2091                 callout_reset(&vifp->v_repq_ch, TBF_REPROCESS,
 2092                     tbf_reprocess_q, vifp);
 2093 }
 2094 
 2095 /* function that will selectively discard a member of the queue
 2096  * based on the precedence value and the priority
 2097  */
 2098 static int
 2099 tbf_dq_sel(struct vif *vifp, struct ip *ip)
 2100 {
 2101         u_int p;
 2102         struct mbuf **mp, *m;
 2103         int s = splsoftnet();
 2104 
 2105         p = priority(vifp, ip);
 2106 
 2107         for (mp = &vifp->tbf_q, m = *mp;
 2108             m != NULL;
 2109             mp = &m->m_nextpkt, m = *mp) {
 2110                 if (p > priority(vifp, mtod(m, struct ip *))) {
 2111                         if ((*mp = m->m_nextpkt) == NULL)
 2112                                 vifp->tbf_t = mp;
 2113                         --vifp->tbf_q_len;
 2114 
 2115                         m_freem(m);
 2116                         mrtstat.mrts_drop_sel++;
 2117                         splx(s);
 2118                         return (1);
 2119                 }
 2120         }
 2121         splx(s);
 2122         return (0);
 2123 }
 2124 
 2125 static void
 2126 tbf_send_packet(struct vif *vifp, struct mbuf *m)
 2127 {
 2128         int error;
 2129         int s = splsoftnet();
 2130 
 2131         if (vifp->v_flags & VIFF_TUNNEL) {
 2132                 /* If tunnel options */
 2133                 ip_output(m, (struct mbuf *)NULL, &vifp->v_route,
 2134                     IP_FORWARDING, (struct ip_moptions *)NULL,
 2135                     (struct socket *)NULL);
 2136         } else {
 2137                 /* if physical interface option, extract the options and then send */
 2138                 struct ip_moptions imo;
 2139 
 2140                 imo.imo_multicast_ifp = vifp->v_ifp;
 2141                 imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1;
 2142                 imo.imo_multicast_loop = 1;
 2143 #ifdef RSVP_ISI
 2144                 imo.imo_multicast_vif = -1;
 2145 #endif
 2146 
 2147                 error = ip_output(m, NULL, NULL, IP_FORWARDING|IP_MULTICASTOPTS,
 2148                     &imo, NULL);
 2149 
 2150                 if (mrtdebug & DEBUG_XMIT)
 2151                         log(LOG_DEBUG, "phyint_send on vif %ld err %d\n",
 2152                             (long)(vifp - viftable), error);
 2153         }
 2154         splx(s);
 2155 }
 2156 
 2157 /* determine the current time and then
 2158  * the elapsed time (between the last time and time now)
 2159  * in milliseconds & update the no. of tokens in the bucket
 2160  */
 2161 static void
 2162 tbf_update_tokens(struct vif *vifp)
 2163 {
 2164         struct timeval tp;
 2165         u_int32_t tm;
 2166         int s = splsoftnet();
 2167 
 2168         microtime(&tp);
 2169 
 2170         TV_DELTA(tp, vifp->tbf_last_pkt_t, tm);
 2171 
 2172         /*
 2173          * This formula is actually
 2174          * "time in seconds" * "bytes/second".
 2175          *
 2176          * (tm / 1000000) * (v_rate_limit * 1000 * (1000/1024) / 8)
 2177          *
 2178          * The (1000/1024) was introduced in add_vif to optimize
 2179          * this divide into a shift.
 2180          */
 2181         vifp->tbf_n_tok += tm * vifp->v_rate_limit / 8192;
 2182         vifp->tbf_last_pkt_t = tp;
 2183 
 2184         if (vifp->tbf_n_tok > MAX_BKT_SIZE)
 2185                 vifp->tbf_n_tok = MAX_BKT_SIZE;
 2186 
 2187         splx(s);
 2188 }
 2189 
 2190 static int
 2191 priority(struct vif *vifp, struct ip *ip)
 2192 {
 2193         int prio = 50;  /* the lowest priority -- default case */
 2194 
 2195         /* temporary hack; may add general packet classifier some day */
 2196 
 2197         /*
 2198          * The UDP port space is divided up into four priority ranges:
 2199          * [0, 16384)     : unclassified - lowest priority
 2200          * [16384, 32768) : audio - highest priority
 2201          * [32768, 49152) : whiteboard - medium priority
 2202          * [49152, 65536) : video - low priority
 2203          */
 2204         if (ip->ip_p == IPPROTO_UDP) {
 2205                 struct udphdr *udp = (struct udphdr *)(((char *)ip) + (ip->ip_hl << 2));
 2206 
 2207                 switch (ntohs(udp->uh_dport) & 0xc000) {
 2208                 case 0x4000:
 2209                         prio = 70;
 2210                         break;
 2211                 case 0x8000:
 2212                         prio = 60;
 2213                         break;
 2214                 case 0xc000:
 2215                         prio = 55;
 2216                         break;
 2217                 }
 2218 
 2219                 if (tbfdebug > 1)
 2220                         log(LOG_DEBUG, "port %x prio %d\n",
 2221                             ntohs(udp->uh_dport), prio);
 2222         }
 2223 
 2224         return (prio);
 2225 }
 2226 
 2227 /*
 2228  * End of token bucket filter modifications
 2229  */
 2230 #ifdef RSVP_ISI
 2231 int
 2232 ip_rsvp_vif_init(struct socket *so, struct mbuf *m)
 2233 {
 2234         int vifi, s;
 2235 
 2236         if (rsvpdebug)
 2237                 printf("ip_rsvp_vif_init: so_type = %d, pr_protocol = %d\n",
 2238                     so->so_type, so->so_proto->pr_protocol);
 2239 
 2240         if (so->so_type != SOCK_RAW ||
 2241             so->so_proto->pr_protocol != IPPROTO_RSVP)
 2242                 return (EOPNOTSUPP);
 2243 
 2244         /* Check mbuf. */
 2245         if (m == NULL || m->m_len != sizeof(int)) {
 2246                 return (EINVAL);
 2247         }
 2248         vifi = *(mtod(m, int *));
 2249 
 2250         if (rsvpdebug)
 2251                 printf("ip_rsvp_vif_init: vif = %d rsvp_on = %d\n",
 2252                        vifi, rsvp_on);
 2253 
 2254         s = splsoftnet();
 2255 
 2256         /* Check vif. */
 2257         if (!legal_vif_num(vifi)) {
 2258                 splx(s);
 2259                 return (EADDRNOTAVAIL);
 2260         }
 2261 
 2262         /* Check if socket is available. */
 2263         if (viftable[vifi].v_rsvpd != NULL) {
 2264                 splx(s);
 2265                 return (EADDRINUSE);
 2266         }
 2267 
 2268         viftable[vifi].v_rsvpd = so;
 2269         /*
 2270          * This may seem silly, but we need to be sure we don't over-increment
 2271          * the RSVP counter, in case something slips up.
 2272          */
 2273         if (!viftable[vifi].v_rsvp_on) {
 2274                 viftable[vifi].v_rsvp_on = 1;
 2275                 rsvp_on++;
 2276         }
 2277 
 2278         splx(s);
 2279         return (0);
 2280 }
 2281 
 2282 int
 2283 ip_rsvp_vif_done(struct socket *so, struct mbuf *m)
 2284 {
 2285         int vifi, s;
 2286 
 2287         if (rsvpdebug)
 2288                 printf("ip_rsvp_vif_done: so_type = %d, pr_protocol = %d\n",
 2289                     so->so_type, so->so_proto->pr_protocol);
 2290 
 2291         if (so->so_type != SOCK_RAW ||
 2292             so->so_proto->pr_protocol != IPPROTO_RSVP)
 2293                 return (EOPNOTSUPP);
 2294 
 2295         /* Check mbuf. */
 2296         if (m == NULL || m->m_len != sizeof(int)) {
 2297                 return (EINVAL);
 2298         }
 2299         vifi = *(mtod(m, int *));
 2300 
 2301         s = splsoftnet();
 2302 
 2303         /* Check vif. */
 2304         if (!legal_vif_num(vifi)) {
 2305                 splx(s);
 2306                 return (EADDRNOTAVAIL);
 2307         }
 2308 
 2309         if (rsvpdebug)
 2310                 printf("ip_rsvp_vif_done: v_rsvpd = %x so = %x\n",
 2311                     viftable[vifi].v_rsvpd, so);
 2312 
 2313         viftable[vifi].v_rsvpd = NULL;
 2314         /*
 2315          * This may seem silly, but we need to be sure we don't over-decrement
 2316          * the RSVP counter, in case something slips up.
 2317          */
 2318         if (viftable[vifi].v_rsvp_on) {
 2319                 viftable[vifi].v_rsvp_on = 0;
 2320                 rsvp_on--;
 2321         }
 2322 
 2323         splx(s);
 2324         return (0);
 2325 }
 2326 
 2327 void
 2328 ip_rsvp_force_done(struct socket *so)
 2329 {
 2330         int vifi, s;
 2331 
 2332         /* Don't bother if it is not the right type of socket. */
 2333         if (so->so_type != SOCK_RAW ||
 2334             so->so_proto->pr_protocol != IPPROTO_RSVP)
 2335                 return;
 2336 
 2337         s = splsoftnet();
 2338 
 2339         /*
 2340          * The socket may be attached to more than one vif...this
 2341          * is perfectly legal.
 2342          */
 2343         for (vifi = 0; vifi < numvifs; vifi++) {
 2344                 if (viftable[vifi].v_rsvpd == so) {
 2345                         viftable[vifi].v_rsvpd = NULL;
 2346                         /*
 2347                          * This may seem silly, but we need to be sure we don't
 2348                          * over-decrement the RSVP counter, in case something
 2349                          * slips up.
 2350                          */
 2351                         if (viftable[vifi].v_rsvp_on) {
 2352                                 viftable[vifi].v_rsvp_on = 0;
 2353                                 rsvp_on--;
 2354                         }
 2355                 }
 2356         }
 2357 
 2358         splx(s);
 2359         return;
 2360 }
 2361 
 2362 void
 2363 rsvp_input(struct mbuf *m, struct ifnet *ifp)
 2364 {
 2365         int vifi, s;
 2366         struct ip *ip = mtod(m, struct ip *);
 2367         struct sockaddr_in rsvp_src;
 2368 
 2369         if (rsvpdebug)
 2370                 printf("rsvp_input: rsvp_on %d\n", rsvp_on);
 2371 
 2372         /*
 2373          * Can still get packets with rsvp_on = 0 if there is a local member
 2374          * of the group to which the RSVP packet is addressed.  But in this
 2375          * case we want to throw the packet away.
 2376          */
 2377         if (!rsvp_on) {
 2378                 m_freem(m);
 2379                 return;
 2380         }
 2381 
 2382         /*
 2383          * If the old-style non-vif-associated socket is set, then use
 2384          * it and ignore the new ones.
 2385          */
 2386         if (ip_rsvpd != NULL) {
 2387                 if (rsvpdebug)
 2388                         printf("rsvp_input: "
 2389                             "Sending packet up old-style socket\n");
 2390                 rip_input(m);   /*XXX*/
 2391                 return;
 2392         }
 2393 
 2394         s = splsoftnet();
 2395 
 2396         if (rsvpdebug)
 2397                 printf("rsvp_input: check vifs\n");
 2398 
 2399         /* Find which vif the packet arrived on. */
 2400         for (vifi = 0; vifi < numvifs; vifi++) {
 2401                 if (viftable[vifi].v_ifp == ifp)
 2402                         break;
 2403         }
 2404 
 2405         if (vifi == numvifs) {
 2406                 /* Can't find vif packet arrived on. Drop packet. */
 2407                 if (rsvpdebug)
 2408                         printf("rsvp_input: "
 2409                             "Can't find vif for packet...dropping it.\n");
 2410                 m_freem(m);
 2411                 splx(s);
 2412                 return;
 2413         }
 2414 
 2415         if (rsvpdebug)
 2416                 printf("rsvp_input: check socket\n");
 2417 
 2418         if (viftable[vifi].v_rsvpd == NULL) {
 2419                 /*
 2420                  * drop packet, since there is no specific socket for this
 2421                  * interface
 2422                  */
 2423                 if (rsvpdebug)
 2424                         printf("rsvp_input: No socket defined for vif %d\n",
 2425                             vifi);
 2426                 m_freem(m);
 2427                 splx(s);
 2428                 return;
 2429         }
 2430 
 2431         sockaddr_in_init(&rsvp_src, &ip->ip_src, 0);
 2432 
 2433         if (rsvpdebug && m)
 2434                 printf("rsvp_input: m->m_len = %d, sbspace() = %d\n",
 2435                     m->m_len, sbspace(&viftable[vifi].v_rsvpd->so_rcv));
 2436 
 2437         if (socket_send(viftable[vifi].v_rsvpd, m, &rsvp_src) < 0)
 2438                 if (rsvpdebug)
 2439                         printf("rsvp_input: Failed to append to socket\n");
 2440         else
 2441                 if (rsvpdebug)
 2442                         printf("rsvp_input: send packet up\n");
 2443 
 2444         splx(s);
 2445 }
 2446 #endif /* RSVP_ISI */
 2447 
 2448 /*
 2449  * Code for bandwidth monitors
 2450  */
 2451 
 2452 /*
 2453  * Define common interface for timeval-related methods
 2454  */
 2455 #define BW_TIMEVALCMP(tvp, uvp, cmp) timercmp((tvp), (uvp), cmp)
 2456 #define BW_TIMEVALDECR(vvp, uvp) timersub((vvp), (uvp), (vvp))
 2457 #define BW_TIMEVALADD(vvp, uvp) timeradd((vvp), (uvp), (vvp))
 2458 
 2459 static uint32_t
 2460 compute_bw_meter_flags(struct bw_upcall *req)
 2461 {
 2462     uint32_t flags = 0;
 2463 
 2464     if (req->bu_flags & BW_UPCALL_UNIT_PACKETS)
 2465         flags |= BW_METER_UNIT_PACKETS;
 2466     if (req->bu_flags & BW_UPCALL_UNIT_BYTES)
 2467         flags |= BW_METER_UNIT_BYTES;
 2468     if (req->bu_flags & BW_UPCALL_GEQ)
 2469         flags |= BW_METER_GEQ;
 2470     if (req->bu_flags & BW_UPCALL_LEQ)
 2471         flags |= BW_METER_LEQ;
 2472 
 2473     return flags;
 2474 }
 2475 
 2476 /*
 2477  * Add a bw_meter entry
 2478  */
 2479 static int
 2480 add_bw_upcall(struct bw_upcall *req)
 2481 {
 2482     int s;
 2483     struct mfc *mfc;
 2484     struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC,
 2485                 BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC };
 2486     struct timeval now;
 2487     struct bw_meter *x;
 2488     uint32_t flags;
 2489 
 2490     if (!(mrt_api_config & MRT_MFC_BW_UPCALL))
 2491         return EOPNOTSUPP;
 2492 
 2493     /* Test if the flags are valid */
 2494     if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES)))
 2495         return EINVAL;
 2496     if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)))
 2497         return EINVAL;
 2498     if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
 2499             == (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
 2500         return EINVAL;
 2501 
 2502     /* Test if the threshold time interval is valid */
 2503     if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <))
 2504         return EINVAL;
 2505 
 2506     flags = compute_bw_meter_flags(req);
 2507 
 2508     /*
 2509      * Find if we have already same bw_meter entry
 2510      */
 2511     s = splsoftnet();
 2512     mfc = mfc_find(&req->bu_src, &req->bu_dst);
 2513     if (mfc == NULL) {
 2514         splx(s);
 2515         return EADDRNOTAVAIL;
 2516     }
 2517     for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) {
 2518         if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
 2519                            &req->bu_threshold.b_time, ==)) &&
 2520             (x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
 2521             (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
 2522             (x->bm_flags & BW_METER_USER_FLAGS) == flags)  {
 2523             splx(s);
 2524             return 0;           /* XXX Already installed */
 2525         }
 2526     }
 2527 
 2528     /* Allocate the new bw_meter entry */
 2529     x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT);
 2530     if (x == NULL) {
 2531         splx(s);
 2532         return ENOBUFS;
 2533     }
 2534 
 2535     /* Set the new bw_meter entry */
 2536     x->bm_threshold.b_time = req->bu_threshold.b_time;
 2537     microtime(&now);
 2538     x->bm_start_time = now;
 2539     x->bm_threshold.b_packets = req->bu_threshold.b_packets;
 2540     x->bm_threshold.b_bytes = req->bu_threshold.b_bytes;
 2541     x->bm_measured.b_packets = 0;
 2542     x->bm_measured.b_bytes = 0;
 2543     x->bm_flags = flags;
 2544     x->bm_time_next = NULL;
 2545     x->bm_time_hash = BW_METER_BUCKETS;
 2546 
 2547     /* Add the new bw_meter entry to the front of entries for this MFC */
 2548     x->bm_mfc = mfc;
 2549     x->bm_mfc_next = mfc->mfc_bw_meter;
 2550     mfc->mfc_bw_meter = x;
 2551     schedule_bw_meter(x, &now);
 2552     splx(s);
 2553 
 2554     return 0;
 2555 }
 2556 
 2557 static void
 2558 free_bw_list(struct bw_meter *list)
 2559 {
 2560     while (list != NULL) {
 2561         struct bw_meter *x = list;
 2562 
 2563         list = list->bm_mfc_next;
 2564         unschedule_bw_meter(x);
 2565         free(x, M_BWMETER);
 2566     }
 2567 }
 2568 
 2569 /*
 2570  * Delete one or multiple bw_meter entries
 2571  */
 2572 static int
 2573 del_bw_upcall(struct bw_upcall *req)
 2574 {
 2575     int s;
 2576     struct mfc *mfc;
 2577     struct bw_meter *x;
 2578 
 2579     if (!(mrt_api_config & MRT_MFC_BW_UPCALL))
 2580         return EOPNOTSUPP;
 2581 
 2582     s = splsoftnet();
 2583     /* Find the corresponding MFC entry */
 2584     mfc = mfc_find(&req->bu_src, &req->bu_dst);
 2585     if (mfc == NULL) {
 2586         splx(s);
 2587         return EADDRNOTAVAIL;
 2588     } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) {
 2589         /*
 2590          * Delete all bw_meter entries for this mfc
 2591          */
 2592         struct bw_meter *list;
 2593 
 2594         list = mfc->mfc_bw_meter;
 2595         mfc->mfc_bw_meter = NULL;
 2596         free_bw_list(list);
 2597         splx(s);
 2598         return 0;
 2599     } else {                    /* Delete a single bw_meter entry */
 2600         struct bw_meter *prev;
 2601         uint32_t flags = 0;
 2602 
 2603         flags = compute_bw_meter_flags(req);
 2604 
 2605         /* Find the bw_meter entry to delete */
 2606         for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL;
 2607              prev = x, x = x->bm_mfc_next) {
 2608             if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
 2609                                &req->bu_threshold.b_time, ==)) &&
 2610                 (x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
 2611                 (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
 2612                 (x->bm_flags & BW_METER_USER_FLAGS) == flags)
 2613                 break;
 2614         }
 2615         if (x != NULL) { /* Delete entry from the list for this MFC */
 2616             if (prev != NULL)
 2617                 prev->bm_mfc_next = x->bm_mfc_next;     /* remove from middle*/
 2618             else
 2619                 x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */
 2620 
 2621             unschedule_bw_meter(x);
 2622             splx(s);
 2623             /* Free the bw_meter entry */
 2624             free(x, M_BWMETER);
 2625             return 0;
 2626         } else {
 2627             splx(s);
 2628             return EINVAL;
 2629         }
 2630     }
 2631     /* NOTREACHED */
 2632 }
 2633 
 2634 /*
 2635  * Perform bandwidth measurement processing that may result in an upcall
 2636  */
 2637 static void
 2638 bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp)
 2639 {
 2640     struct timeval delta;
 2641 
 2642     delta = *nowp;
 2643     BW_TIMEVALDECR(&delta, &x->bm_start_time);
 2644 
 2645     if (x->bm_flags & BW_METER_GEQ) {
 2646         /*
 2647          * Processing for ">=" type of bw_meter entry
 2648          */
 2649         if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
 2650             /* Reset the bw_meter entry */
 2651             x->bm_start_time = *nowp;
 2652             x->bm_measured.b_packets = 0;
 2653             x->bm_measured.b_bytes = 0;
 2654             x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
 2655         }
 2656 
 2657         /* Record that a packet is received */
 2658         x->bm_measured.b_packets++;
 2659         x->bm_measured.b_bytes += plen;
 2660 
 2661         /*
 2662          * Test if we should deliver an upcall
 2663          */
 2664         if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) {
 2665             if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
 2666                  (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) ||
 2667                 ((x->bm_flags & BW_METER_UNIT_BYTES) &&
 2668                  (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) {
 2669                 /* Prepare an upcall for delivery */
 2670                 bw_meter_prepare_upcall(x, nowp);
 2671                 x->bm_flags |= BW_METER_UPCALL_DELIVERED;
 2672             }
 2673         }
 2674     } else if (x->bm_flags & BW_METER_LEQ) {
 2675         /*
 2676          * Processing for "<=" type of bw_meter entry
 2677          */
 2678         if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
 2679             /*
 2680              * We are behind time with the multicast forwarding table
 2681              * scanning for "<=" type of bw_meter entries, so test now
 2682              * if we should deliver an upcall.
 2683              */
 2684             if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
 2685                  (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
 2686                 ((x->bm_flags & BW_METER_UNIT_BYTES) &&
 2687                  (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
 2688                 /* Prepare an upcall for delivery */
 2689                 bw_meter_prepare_upcall(x, nowp);
 2690             }
 2691             /* Reschedule the bw_meter entry */
 2692             unschedule_bw_meter(x);
 2693             schedule_bw_meter(x, nowp);
 2694         }
 2695 
 2696         /* Record that a packet is received */
 2697         x->bm_measured.b_packets++;
 2698         x->bm_measured.b_bytes += plen;
 2699 
 2700         /*
 2701          * Test if we should restart the measuring interval
 2702          */
 2703         if ((x->bm_flags & BW_METER_UNIT_PACKETS &&
 2704              x->bm_measured.b_packets <= x->bm_threshold.b_packets) ||
 2705             (x->bm_flags & BW_METER_UNIT_BYTES &&
 2706              x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) {
 2707             /* Don't restart the measuring interval */
 2708         } else {
 2709             /* Do restart the measuring interval */
 2710             /*
 2711              * XXX: note that we don't unschedule and schedule, because this
 2712              * might be too much overhead per packet. Instead, when we process
 2713              * all entries for a given timer hash bin, we check whether it is
 2714              * really a timeout. If not, we reschedule at that time.
 2715              */
 2716             x->bm_start_time = *nowp;
 2717             x->bm_measured.b_packets = 0;
 2718             x->bm_measured.b_bytes = 0;
 2719             x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
 2720         }
 2721     }
 2722 }
 2723 
 2724 /*
 2725  * Prepare a bandwidth-related upcall
 2726  */
 2727 static void
 2728 bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp)
 2729 {
 2730     struct timeval delta;
 2731     struct bw_upcall *u;
 2732 
 2733     /*
 2734      * Compute the measured time interval
 2735      */
 2736     delta = *nowp;
 2737     BW_TIMEVALDECR(&delta, &x->bm_start_time);
 2738 
 2739     /*
 2740      * If there are too many pending upcalls, deliver them now
 2741      */
 2742     if (bw_upcalls_n >= BW_UPCALLS_MAX)
 2743         bw_upcalls_send();
 2744 
 2745     /*
 2746      * Set the bw_upcall entry
 2747      */
 2748     u = &bw_upcalls[bw_upcalls_n++];
 2749     u->bu_src = x->bm_mfc->mfc_origin;
 2750     u->bu_dst = x->bm_mfc->mfc_mcastgrp;
 2751     u->bu_threshold.b_time = x->bm_threshold.b_time;
 2752     u->bu_threshold.b_packets = x->bm_threshold.b_packets;
 2753     u->bu_threshold.b_bytes = x->bm_threshold.b_bytes;
 2754     u->bu_measured.b_time = delta;
 2755     u->bu_measured.b_packets = x->bm_measured.b_packets;
 2756     u->bu_measured.b_bytes = x->bm_measured.b_bytes;
 2757     u->bu_flags = 0;
 2758     if (x->bm_flags & BW_METER_UNIT_PACKETS)
 2759         u->bu_flags |= BW_UPCALL_UNIT_PACKETS;
 2760     if (x->bm_flags & BW_METER_UNIT_BYTES)
 2761         u->bu_flags |= BW_UPCALL_UNIT_BYTES;
 2762     if (x->bm_flags & BW_METER_GEQ)
 2763         u->bu_flags |= BW_UPCALL_GEQ;
 2764     if (x->bm_flags & BW_METER_LEQ)
 2765         u->bu_flags |= BW_UPCALL_LEQ;
 2766 }
 2767 
 2768 /*
 2769  * Send the pending bandwidth-related upcalls
 2770  */
 2771 static void
 2772 bw_upcalls_send(void)
 2773 {
 2774     struct mbuf *m;
 2775     int len = bw_upcalls_n * sizeof(bw_upcalls[0]);
 2776     struct sockaddr_in k_igmpsrc = { 
 2777             .sin_len = sizeof(k_igmpsrc),
 2778             .sin_family = AF_INET,
 2779     };
 2780     static struct igmpmsg igmpmsg = { 0,                /* unused1 */
 2781                                       0,                /* unused2 */
 2782                                       IGMPMSG_BW_UPCALL,/* im_msgtype */
 2783                                       0,                /* im_mbz  */
 2784                                       0,                /* im_vif  */
 2785                                       0,                /* unused3 */
 2786                                       { 0 },            /* im_src  */
 2787                                       { 0 } };          /* im_dst  */
 2788 
 2789     if (bw_upcalls_n == 0)
 2790         return;                 /* No pending upcalls */
 2791 
 2792     bw_upcalls_n = 0;
 2793 
 2794     /*
 2795      * Allocate a new mbuf, initialize it with the header and
 2796      * the payload for the pending calls.
 2797      */
 2798     MGETHDR(m, M_DONTWAIT, MT_HEADER);
 2799     if (m == NULL) {
 2800         log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n");
 2801         return;
 2802     }
 2803 
 2804     m->m_len = m->m_pkthdr.len = 0;
 2805     m_copyback(m, 0, sizeof(struct igmpmsg), (void *)&igmpmsg);
 2806     m_copyback(m, sizeof(struct igmpmsg), len, (void *)&bw_upcalls[0]);
 2807 
 2808     /*
 2809      * Send the upcalls
 2810      * XXX do we need to set the address in k_igmpsrc ?
 2811      */
 2812     mrtstat.mrts_upcalls++;
 2813     if (socket_send(ip_mrouter, m, &k_igmpsrc) < 0) {
 2814         log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n");
 2815         ++mrtstat.mrts_upq_sockfull;
 2816     }
 2817 }
 2818 
 2819 /*
 2820  * Compute the timeout hash value for the bw_meter entries
 2821  */
 2822 #define BW_METER_TIMEHASH(bw_meter, hash)                               \
 2823     do {                                                                \
 2824         struct timeval next_timeval = (bw_meter)->bm_start_time;        \
 2825                                                                         \
 2826         BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \
 2827         (hash) = next_timeval.tv_sec;                                   \
 2828         if (next_timeval.tv_usec)                                       \
 2829             (hash)++; /* XXX: make sure we don't timeout early */       \
 2830         (hash) %= BW_METER_BUCKETS;                                     \
 2831     } while (/*CONSTCOND*/ 0)
 2832 
 2833 /*
 2834  * Schedule a timer to process periodically bw_meter entry of type "<="
 2835  * by linking the entry in the proper hash bucket.
 2836  */
 2837 static void
 2838 schedule_bw_meter(struct bw_meter *x, struct timeval *nowp)
 2839 {
 2840     int time_hash;
 2841 
 2842     if (!(x->bm_flags & BW_METER_LEQ))
 2843         return;         /* XXX: we schedule timers only for "<=" entries */
 2844 
 2845     /*
 2846      * Reset the bw_meter entry
 2847      */
 2848     x->bm_start_time = *nowp;
 2849     x->bm_measured.b_packets = 0;
 2850     x->bm_measured.b_bytes = 0;
 2851     x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
 2852 
 2853     /*
 2854      * Compute the timeout hash value and insert the entry
 2855      */
 2856     BW_METER_TIMEHASH(x, time_hash);
 2857     x->bm_time_next = bw_meter_timers[time_hash];
 2858     bw_meter_timers[time_hash] = x;
 2859     x->bm_time_hash = time_hash;
 2860 }
 2861 
 2862 /*
 2863  * Unschedule the periodic timer that processes bw_meter entry of type "<="
 2864  * by removing the entry from the proper hash bucket.
 2865  */
 2866 static void
 2867 unschedule_bw_meter(struct bw_meter *x)
 2868 {
 2869     int time_hash;
 2870     struct bw_meter *prev, *tmp;
 2871 
 2872     if (!(x->bm_flags & BW_METER_LEQ))
 2873         return;         /* XXX: we schedule timers only for "<=" entries */
 2874 
 2875     /*
 2876      * Compute the timeout hash value and delete the entry
 2877      */
 2878     time_hash = x->bm_time_hash;
 2879     if (time_hash >= BW_METER_BUCKETS)
 2880         return;         /* Entry was not scheduled */
 2881 
 2882     for (prev = NULL, tmp = bw_meter_timers[time_hash];
 2883              tmp != NULL; prev = tmp, tmp = tmp->bm_time_next)
 2884         if (tmp == x)
 2885             break;
 2886 
 2887     if (tmp == NULL)
 2888         panic("unschedule_bw_meter: bw_meter entry not found");
 2889 
 2890     if (prev != NULL)
 2891         prev->bm_time_next = x->bm_time_next;
 2892     else
 2893         bw_meter_timers[time_hash] = x->bm_time_next;
 2894 
 2895     x->bm_time_next = NULL;
 2896     x->bm_time_hash = BW_METER_BUCKETS;
 2897 }
 2898 
 2899 /*
 2900  * Process all "<=" type of bw_meter that should be processed now,
 2901  * and for each entry prepare an upcall if necessary. Each processed
 2902  * entry is rescheduled again for the (periodic) processing.
 2903  *
 2904  * This is run periodically (once per second normally). On each round,
 2905  * all the potentially matching entries are in the hash slot that we are
 2906  * looking at.
 2907  */
 2908 static void
 2909 bw_meter_process(void)
 2910 {
 2911     int s;
 2912     static uint32_t last_tv_sec;        /* last time we processed this */
 2913 
 2914     uint32_t loops;
 2915     int i;
 2916     struct timeval now, process_endtime;
 2917 
 2918     microtime(&now);
 2919     if (last_tv_sec == now.tv_sec)
 2920         return;         /* nothing to do */
 2921 
 2922     loops = now.tv_sec - last_tv_sec;
 2923     last_tv_sec = now.tv_sec;
 2924     if (loops > BW_METER_BUCKETS)
 2925         loops = BW_METER_BUCKETS;
 2926 
 2927     s = splsoftnet();
 2928     /*
 2929      * Process all bins of bw_meter entries from the one after the last
 2930      * processed to the current one. On entry, i points to the last bucket
 2931      * visited, so we need to increment i at the beginning of the loop.
 2932      */
 2933     for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) {
 2934         struct bw_meter *x, *tmp_list;
 2935 
 2936         if (++i >= BW_METER_BUCKETS)
 2937             i = 0;
 2938 
 2939         /* Disconnect the list of bw_meter entries from the bin */
 2940         tmp_list = bw_meter_timers[i];
 2941         bw_meter_timers[i] = NULL;
 2942 
 2943         /* Process the list of bw_meter entries */
 2944         while (tmp_list != NULL) {
 2945             x = tmp_list;
 2946             tmp_list = tmp_list->bm_time_next;
 2947 
 2948             /* Test if the time interval is over */
 2949             process_endtime = x->bm_start_time;
 2950             BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time);
 2951             if (BW_TIMEVALCMP(&process_endtime, &now, >)) {
 2952                 /* Not yet: reschedule, but don't reset */
 2953                 int time_hash;
 2954 
 2955                 BW_METER_TIMEHASH(x, time_hash);
 2956                 if (time_hash == i && process_endtime.tv_sec == now.tv_sec) {
 2957                     /*
 2958                      * XXX: somehow the bin processing is a bit ahead of time.
 2959                      * Put the entry in the next bin.
 2960                      */
 2961                     if (++time_hash >= BW_METER_BUCKETS)
 2962                         time_hash = 0;
 2963                 }
 2964                 x->bm_time_next = bw_meter_timers[time_hash];
 2965                 bw_meter_timers[time_hash] = x;
 2966                 x->bm_time_hash = time_hash;
 2967 
 2968                 continue;
 2969             }
 2970 
 2971             /*
 2972              * Test if we should deliver an upcall
 2973              */
 2974             if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
 2975                  (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
 2976                 ((x->bm_flags & BW_METER_UNIT_BYTES) &&
 2977                  (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
 2978                 /* Prepare an upcall for delivery */
 2979                 bw_meter_prepare_upcall(x, &now);
 2980             }
 2981 
 2982             /*
 2983              * Reschedule for next processing
 2984              */
 2985             schedule_bw_meter(x, &now);
 2986         }
 2987     }
 2988 
 2989     /* Send all upcalls that are pending delivery */
 2990     bw_upcalls_send();
 2991 
 2992     splx(s);
 2993 }
 2994 
 2995 /*
 2996  * A periodic function for sending all upcalls that are pending delivery
 2997  */
 2998 static void
 2999 expire_bw_upcalls_send(void *unused)
 3000 {
 3001     int s;
 3002 
 3003     s = splsoftnet();
 3004     bw_upcalls_send();
 3005     splx(s);
 3006 
 3007     callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD,
 3008                   expire_bw_upcalls_send, NULL);
 3009 }
 3010 
 3011 /*
 3012  * A periodic function for periodic scanning of the multicast forwarding
 3013  * table for processing all "<=" bw_meter entries.
 3014  */
 3015 static void
 3016 expire_bw_meter_process(void *unused)
 3017 {
 3018     if (mrt_api_config & MRT_MFC_BW_UPCALL)
 3019         bw_meter_process();
 3020 
 3021     callout_reset(&bw_meter_ch, BW_METER_PERIOD,
 3022                   expire_bw_meter_process, NULL);
 3023 }
 3024 
 3025 /*
 3026  * End of bandwidth monitoring code
 3027  */
 3028 
 3029 #ifdef PIM
 3030 /*
 3031  * Send the packet up to the user daemon, or eventually do kernel encapsulation
 3032  */
 3033 static int
 3034 pim_register_send(struct ip *ip, struct vif *vifp,
 3035         struct mbuf *m, struct mfc *rt)
 3036 {
 3037     struct mbuf *mb_copy, *mm;
 3038 
 3039     if (mrtdebug & DEBUG_PIM)
 3040         log(LOG_DEBUG, "pim_register_send: ");
 3041 
 3042     mb_copy = pim_register_prepare(ip, m);
 3043     if (mb_copy == NULL)
 3044         return ENOBUFS;
 3045 
 3046     /*
 3047      * Send all the fragments. Note that the mbuf for each fragment
 3048      * is freed by the sending machinery.
 3049      */
 3050     for (mm = mb_copy; mm; mm = mb_copy) {
 3051         mb_copy = mm->m_nextpkt;
 3052         mm->m_nextpkt = NULL;
 3053         mm = m_pullup(mm, sizeof(struct ip));
 3054         if (mm != NULL) {
 3055             ip = mtod(mm, struct ip *);
 3056             if ((mrt_api_config & MRT_MFC_RP) &&
 3057                 !in_nullhost(rt->mfc_rp)) {
 3058                 pim_register_send_rp(ip, vifp, mm, rt);
 3059             } else {
 3060                 pim_register_send_upcall(ip, vifp, mm, rt);
 3061             }
 3062         }
 3063     }
 3064 
 3065     return 0;
 3066 }
 3067 
 3068 /*
 3069  * Return a copy of the data packet that is ready for PIM Register
 3070  * encapsulation.
 3071  * XXX: Note that in the returned copy the IP header is a valid one.
 3072  */
 3073 static struct mbuf *
 3074 pim_register_prepare(struct ip *ip, struct mbuf *m)
 3075 {
 3076     struct mbuf *mb_copy = NULL;
 3077     int mtu;
 3078 
 3079     /* Take care of delayed checksums */
 3080     if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
 3081         in_delayed_cksum(m);
 3082         m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
 3083     }
 3084 
 3085     /*
 3086      * Copy the old packet & pullup its IP header into the
 3087      * new mbuf so we can modify it.
 3088      */
 3089     mb_copy = m_copypacket(m, M_DONTWAIT);
 3090     if (mb_copy == NULL)
 3091         return NULL;
 3092     mb_copy = m_pullup(mb_copy, ip->ip_hl << 2);
 3093     if (mb_copy == NULL)
 3094         return NULL;
 3095 
 3096     /* take care of the TTL */
 3097     ip = mtod(mb_copy, struct ip *);
 3098     --ip->ip_ttl;
 3099 
 3100     /* Compute the MTU after the PIM Register encapsulation */
 3101     mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr);
 3102 
 3103     if (ntohs(ip->ip_len) <= mtu) {
 3104         /* Turn the IP header into a valid one */
 3105         ip->ip_sum = 0;
 3106         ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
 3107     } else {
 3108         /* Fragment the packet */
 3109         if (ip_fragment(mb_copy, NULL, mtu) != 0) {
 3110             /* XXX: mb_copy was freed by ip_fragment() */
 3111             return NULL;
 3112         }
 3113     }
 3114     return mb_copy;
 3115 }
 3116 
 3117 /*
 3118  * Send an upcall with the data packet to the user-level process.
 3119  */
 3120 static int
 3121 pim_register_send_upcall(struct ip *ip, struct vif *vifp,
 3122     struct mbuf *mb_copy, struct mfc *rt)
 3123 {
 3124     struct mbuf *mb_first;
 3125     int len = ntohs(ip->ip_len);
 3126     struct igmpmsg *im;
 3127     struct sockaddr_in k_igmpsrc = {
 3128             .sin_len = sizeof(k_igmpsrc),
 3129             .sin_family = AF_INET,
 3130     };
 3131 
 3132     /*
 3133      * Add a new mbuf with an upcall header
 3134      */
 3135     MGETHDR(mb_first, M_DONTWAIT, MT_HEADER);
 3136     if (mb_first == NULL) {
 3137         m_freem(mb_copy);
 3138         return ENOBUFS;
 3139     }
 3140     mb_first->m_data += max_linkhdr;
 3141     mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg);
 3142     mb_first->m_len = sizeof(struct igmpmsg);
 3143     mb_first->m_next = mb_copy;
 3144 
 3145     /* Send message to routing daemon */
 3146     im = mtod(mb_first, struct igmpmsg *);
 3147     im->im_msgtype      = IGMPMSG_WHOLEPKT;
 3148     im->im_mbz          = 0;
 3149     im->im_vif          = vifp - viftable;
 3150     im->im_src          = ip->ip_src;
 3151     im->im_dst          = ip->ip_dst;
 3152 
 3153     k_igmpsrc.sin_addr  = ip->ip_src;
 3154 
 3155     mrtstat.mrts_upcalls++;
 3156 
 3157     if (socket_send(ip_mrouter, mb_first, &k_igmpsrc) < 0) {
 3158         if (mrtdebug & DEBUG_PIM)
 3159             log(LOG_WARNING,
 3160                 "mcast: pim_register_send_upcall: ip_mrouter socket queue full");
 3161         ++mrtstat.mrts_upq_sockfull;
 3162         return ENOBUFS;
 3163     }
 3164 
 3165     /* Keep statistics */
 3166     pimstat.pims_snd_registers_msgs++;
 3167     pimstat.pims_snd_registers_bytes += len;
 3168 
 3169     return 0;
 3170 }
 3171 
 3172 /*
 3173  * Encapsulate the data packet in PIM Register message and send it to the RP.
 3174  */
 3175 static int
 3176 pim_register_send_rp(struct ip *ip, struct vif *vifp,
 3177         struct mbuf *mb_copy, struct mfc *rt)
 3178 {
 3179     struct mbuf *mb_first;
 3180     struct ip *ip_outer;
 3181     struct pim_encap_pimhdr *pimhdr;
 3182     int len = ntohs(ip->ip_len);
 3183     vifi_t vifi = rt->mfc_parent;
 3184 
 3185     if ((vifi >= numvifs) || in_nullhost(viftable[vifi].v_lcl_addr)) {
 3186         m_freem(mb_copy);
 3187         return EADDRNOTAVAIL;           /* The iif vif is invalid */
 3188     }
 3189 
 3190     /*
 3191      * Add a new mbuf with the encapsulating header
 3192      */
 3193     MGETHDR(mb_first, M_DONTWAIT, MT_HEADER);
 3194     if (mb_first == NULL) {
 3195         m_freem(mb_copy);
 3196         return ENOBUFS;
 3197     }
 3198     mb_first->m_data += max_linkhdr;
 3199     mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
 3200     mb_first->m_next = mb_copy;
 3201 
 3202     mb_first->m_pkthdr.len = len + mb_first->m_len;
 3203 
 3204     /*
 3205      * Fill in the encapsulating IP and PIM header
 3206      */
 3207     ip_outer = mtod(mb_first, struct ip *);
 3208     *ip_outer = pim_encap_iphdr;
 3209      if (mb_first->m_pkthdr.len < IP_MINFRAGSIZE)
 3210         ip_outer->ip_id = 0;
 3211     else
 3212         ip_outer->ip_id = ip_newid(NULL);
 3213     ip_outer->ip_len = htons(len + sizeof(pim_encap_iphdr) +
 3214                              sizeof(pim_encap_pimhdr));
 3215     ip_outer->ip_src = viftable[vifi].v_lcl_addr;
 3216     ip_outer->ip_dst = rt->mfc_rp;
 3217     /*
 3218      * Copy the inner header TOS to the outer header, and take care of the
 3219      * IP_DF bit.
 3220      */
 3221     ip_outer->ip_tos = ip->ip_tos;
 3222     if (ntohs(ip->ip_off) & IP_DF)
 3223         ip_outer->ip_off |= htons(IP_DF);
 3224     pimhdr = (struct pim_encap_pimhdr *)((char *)ip_outer
 3225                                          + sizeof(pim_encap_iphdr));
 3226     *pimhdr = pim_encap_pimhdr;
 3227     /* If the iif crosses a border, set the Border-bit */
 3228     if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & mrt_api_config)
 3229         pimhdr->flags |= htonl(PIM_BORDER_REGISTER);
 3230 
 3231     mb_first->m_data += sizeof(pim_encap_iphdr);
 3232     pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr));
 3233     mb_first->m_data -= sizeof(pim_encap_iphdr);
 3234 
 3235     if (vifp->v_rate_limit == 0)
 3236         tbf_send_packet(vifp, mb_first);
 3237     else
 3238         tbf_control(vifp, mb_first, ip, ntohs(ip_outer->ip_len));
 3239 
 3240     /* Keep statistics */
 3241     pimstat.pims_snd_registers_msgs++;
 3242     pimstat.pims_snd_registers_bytes += len;
 3243 
 3244     return 0;
 3245 }
 3246 
 3247 /*
 3248  * PIM-SMv2 and PIM-DM messages processing.
 3249  * Receives and verifies the PIM control messages, and passes them
 3250  * up to the listening socket, using rip_input().
 3251  * The only message with special processing is the PIM_REGISTER message
 3252  * (used by PIM-SM): the PIM header is stripped off, and the inner packet
 3253  * is passed to if_simloop().
 3254  */
 3255 void
 3256 pim_input(struct mbuf *m, ...)
 3257 {
 3258     struct ip *ip = mtod(m, struct ip *);
 3259     struct pim *pim;
 3260     int minlen;
 3261     int datalen;
 3262     int ip_tos;
 3263     int proto;
 3264     int iphlen;
 3265     va_list ap;
 3266 
 3267     va_start(ap, m);
 3268     iphlen = va_arg(ap, int);
 3269     proto = va_arg(ap, int);
 3270     va_end(ap);
 3271 
 3272     datalen = ntohs(ip->ip_len) - iphlen;
 3273 
 3274     /* Keep statistics */
 3275     pimstat.pims_rcv_total_msgs++;
 3276     pimstat.pims_rcv_total_bytes += datalen;
 3277 
 3278     /*
 3279      * Validate lengths
 3280      */
 3281     if (datalen < PIM_MINLEN) {
 3282         pimstat.pims_rcv_tooshort++;
 3283         log(LOG_ERR, "pim_input: packet size too small %d from %lx\n",
 3284             datalen, (u_long)ip->ip_src.s_addr);
 3285         m_freem(m);
 3286         return;
 3287     }
 3288 
 3289     /*
 3290      * If the packet is at least as big as a REGISTER, go ahead
 3291      * and grab the PIM REGISTER header size, to avoid another
 3292      * possible m_pullup() later.
 3293      *
 3294      * PIM_MINLEN       == pimhdr + u_int32_t == 4 + 4 = 8
 3295      * PIM_REG_MINLEN   == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28
 3296      */
 3297     minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN);
 3298     /*
 3299      * Get the IP and PIM headers in contiguous memory, and
 3300      * possibly the PIM REGISTER header.
 3301      */
 3302     if ((m->m_flags & M_EXT || m->m_len < minlen) &&
 3303         (m = m_pullup(m, minlen)) == NULL) {
 3304         log(LOG_ERR, "pim_input: m_pullup failure\n");
 3305         return;
 3306     }
 3307     /* m_pullup() may have given us a new mbuf so reset ip. */
 3308     ip = mtod(m, struct ip *);
 3309     ip_tos = ip->ip_tos;
 3310 
 3311     /* adjust mbuf to point to the PIM header */
 3312     m->m_data += iphlen;
 3313     m->m_len  -= iphlen;
 3314     pim = mtod(m, struct pim *);
 3315 
 3316     /*
 3317      * Validate checksum. If PIM REGISTER, exclude the data packet.
 3318      *
 3319      * XXX: some older PIMv2 implementations don't make this distinction,
 3320      * so for compatibility reason perform the checksum over part of the
 3321      * message, and if error, then over the whole message.
 3322      */
 3323     if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) {
 3324         /* do nothing, checksum okay */
 3325     } else if (in_cksum(m, datalen)) {
 3326         pimstat.pims_rcv_badsum++;
 3327         if (mrtdebug & DEBUG_PIM)
 3328             log(LOG_DEBUG, "pim_input: invalid checksum");
 3329         m_freem(m);
 3330         return;
 3331     }
 3332 
 3333     /* PIM version check */
 3334     if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) {
 3335         pimstat.pims_rcv_badversion++;
 3336         log(LOG_ERR, "pim_input: incorrect version %d, expecting %d\n",
 3337             PIM_VT_V(pim->pim_vt), PIM_VERSION);
 3338         m_freem(m);
 3339         return;
 3340     }
 3341 
 3342     /* restore mbuf back to the outer IP */
 3343     m->m_data -= iphlen;
 3344     m->m_len  += iphlen;
 3345 
 3346     if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) {
 3347         /*
 3348          * Since this is a REGISTER, we'll make a copy of the register
 3349          * headers ip + pim + u_int32 + encap_ip, to be passed up to the
 3350          * routing daemon.
 3351          */
 3352         int s;
 3353         struct sockaddr_in dst = {
 3354                 .sin_len = sizeof(dst),
 3355                 .sin_family = AF_INET,
 3356         };
 3357         struct mbuf *mcp;
 3358         struct ip *encap_ip;
 3359         u_int32_t *reghdr;
 3360         struct ifnet *vifp;
 3361 
 3362         s = splsoftnet();
 3363         if ((reg_vif_num >= numvifs) || (reg_vif_num == VIFI_INVALID)) {
 3364             splx(s);
 3365             if (mrtdebug & DEBUG_PIM)
 3366                 log(LOG_DEBUG,
 3367                     "pim_input: register vif not set: %d\n", reg_vif_num);
 3368             m_freem(m);
 3369             return;
 3370         }
 3371         /* XXX need refcnt? */
 3372         vifp = viftable[reg_vif_num].v_ifp;
 3373         splx(s);
 3374 
 3375         /*
 3376          * Validate length
 3377          */
 3378         if (datalen < PIM_REG_MINLEN) {
 3379             pimstat.pims_rcv_tooshort++;
 3380             pimstat.pims_rcv_badregisters++;
 3381             log(LOG_ERR,
 3382                 "pim_input: register packet size too small %d from %lx\n",
 3383                 datalen, (u_long)ip->ip_src.s_addr);
 3384             m_freem(m);
 3385             return;
 3386         }
 3387 
 3388         reghdr = (u_int32_t *)(pim + 1);
 3389         encap_ip = (struct ip *)(reghdr + 1);
 3390 
 3391         if (mrtdebug & DEBUG_PIM) {
 3392             log(LOG_DEBUG,
 3393                 "pim_input[register], encap_ip: %lx -> %lx, encap_ip len %d\n",
 3394                 (u_long)ntohl(encap_ip->ip_src.s_addr),
 3395                 (u_long)ntohl(encap_ip->ip_dst.s_addr),
 3396                 ntohs(encap_ip->ip_len));
 3397         }
 3398 
 3399         /* verify the version number of the inner packet */
 3400         if (encap_ip->ip_v != IPVERSION) {
 3401             pimstat.pims_rcv_badregisters++;
 3402             if (mrtdebug & DEBUG_PIM) {
 3403                 log(LOG_DEBUG, "pim_input: invalid IP version (%d) "
 3404                     "of the inner packet\n", encap_ip->ip_v);
 3405             }
 3406             m_freem(m);
 3407             return;
 3408         }
 3409 
 3410         /* verify the inner packet is destined to a mcast group */
 3411         if (!IN_MULTICAST(encap_ip->ip_dst.s_addr)) {
 3412             pimstat.pims_rcv_badregisters++;
 3413             if (mrtdebug & DEBUG_PIM)
 3414                 log(LOG_DEBUG,
 3415                     "pim_input: inner packet of register is not "
 3416                     "multicast %lx\n",
 3417                     (u_long)ntohl(encap_ip->ip_dst.s_addr));
 3418             m_freem(m);
 3419             return;
 3420         }
 3421 
 3422         /* If a NULL_REGISTER, pass it to the daemon */
 3423         if ((ntohl(*reghdr) & PIM_NULL_REGISTER))
 3424             goto pim_input_to_daemon;
 3425 
 3426         /*
 3427          * Copy the TOS from the outer IP header to the inner IP header.
 3428          */
 3429         if (encap_ip->ip_tos != ip_tos) {
 3430             /* Outer TOS -> inner TOS */
 3431             encap_ip->ip_tos = ip_tos;
 3432             /* Recompute the inner header checksum. Sigh... */
 3433 
 3434             /* adjust mbuf to point to the inner IP header */
 3435             m->m_data += (iphlen + PIM_MINLEN);
 3436             m->m_len  -= (iphlen + PIM_MINLEN);
 3437 
 3438             encap_ip->ip_sum = 0;
 3439             encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2);
 3440 
 3441             /* restore mbuf to point back to the outer IP header */
 3442             m->m_data -= (iphlen + PIM_MINLEN);
 3443             m->m_len  += (iphlen + PIM_MINLEN);
 3444         }
 3445 
 3446         /*
 3447          * Decapsulate the inner IP packet and loopback to forward it
 3448          * as a normal multicast packet. Also, make a copy of the
 3449          *     outer_iphdr + pimhdr + reghdr + encap_iphdr
 3450          * to pass to the daemon later, so it can take the appropriate
 3451          * actions (e.g., send back PIM_REGISTER_STOP).
 3452          * XXX: here m->m_data points to the outer IP header.
 3453          */
 3454         mcp = m_copym(m, 0, iphlen + PIM_REG_MINLEN, M_DONTWAIT);
 3455         if (mcp == NULL) {
 3456             log(LOG_ERR,
 3457                 "pim_input: pim register: could not copy register head\n");
 3458             m_freem(m);
 3459             return;
 3460         }
 3461 
 3462         /* Keep statistics */
 3463         /* XXX: registers_bytes include only the encap. mcast pkt */
 3464         pimstat.pims_rcv_registers_msgs++;
 3465         pimstat.pims_rcv_registers_bytes += ntohs(encap_ip->ip_len);
 3466 
 3467         /*
 3468          * forward the inner ip packet; point m_data at the inner ip.
 3469          */
 3470         m_adj(m, iphlen + PIM_MINLEN);
 3471 
 3472         if (mrtdebug & DEBUG_PIM) {
 3473             log(LOG_DEBUG,
 3474                 "pim_input: forwarding decapsulated register: "
 3475                 "src %lx, dst %lx, vif %d\n",
 3476                 (u_long)ntohl(encap_ip->ip_src.s_addr),
 3477                 (u_long)ntohl(encap_ip->ip_dst.s_addr),
 3478                 reg_vif_num);
 3479         }
 3480         /* NB: vifp was collected above; can it change on us? */
 3481         looutput(vifp, m, (struct sockaddr *)&dst, (struct rtentry *)NULL);
 3482 
 3483         /* prepare the register head to send to the mrouting daemon */
 3484         m = mcp;
 3485     }
 3486 
 3487 pim_input_to_daemon:
 3488     /*
 3489      * Pass the PIM message up to the daemon; if it is a Register message,
 3490      * pass the 'head' only up to the daemon. This includes the
 3491      * outer IP header, PIM header, PIM-Register header and the
 3492      * inner IP header.
 3493      * XXX: the outer IP header pkt size of a Register is not adjusted to
 3494      * reflect the fact that the inner multicast data is truncated.
 3495      */
 3496     rip_input(m, iphlen, proto);
 3497 
 3498     return;
 3499 }
 3500 #endif /* PIM */

Cache object: 2eda539c9b2fe588321e3556fa506701


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.