The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/netinet/ip_mroute.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*      $NetBSD: ip_mroute.c,v 1.90 2005/02/26 22:45:12 perry Exp $     */
    2 
    3 /*
    4  * Copyright (c) 1992, 1993
    5  *      The Regents of the University of California.  All rights reserved.
    6  *
    7  * This code is derived from software contributed to Berkeley by
    8  * Stephen Deering of Stanford University.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 3. Neither the name of the University nor the names of its contributors
   19  *    may be used to endorse or promote products derived from this software
   20  *    without specific prior written permission.
   21  *
   22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   32  * SUCH DAMAGE.
   33  *
   34  *      @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
   35  */
   36 
   37 /*
   38  * Copyright (c) 1989 Stephen Deering
   39  *
   40  * This code is derived from software contributed to Berkeley by
   41  * Stephen Deering of Stanford University.
   42  *
   43  * Redistribution and use in source and binary forms, with or without
   44  * modification, are permitted provided that the following conditions
   45  * are met:
   46  * 1. Redistributions of source code must retain the above copyright
   47  *    notice, this list of conditions and the following disclaimer.
   48  * 2. Redistributions in binary form must reproduce the above copyright
   49  *    notice, this list of conditions and the following disclaimer in the
   50  *    documentation and/or other materials provided with the distribution.
   51  * 3. All advertising materials mentioning features or use of this software
   52  *    must display the following acknowledgement:
   53  *      This product includes software developed by the University of
   54  *      California, Berkeley and its contributors.
   55  * 4. Neither the name of the University nor the names of its contributors
   56  *    may be used to endorse or promote products derived from this software
   57  *    without specific prior written permission.
   58  *
   59  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   60  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   61  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   62  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   63  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   64  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   65  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   66  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   67  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   68  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   69  * SUCH DAMAGE.
   70  *
   71  *      @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
   72  */
   73 
   74 /*
   75  * IP multicast forwarding procedures
   76  *
   77  * Written by David Waitzman, BBN Labs, August 1988.
   78  * Modified by Steve Deering, Stanford, February 1989.
   79  * Modified by Mark J. Steiglitz, Stanford, May, 1991
   80  * Modified by Van Jacobson, LBL, January 1993
   81  * Modified by Ajit Thyagarajan, PARC, August 1993
   82  * Modified by Bill Fenner, PARC, April 1994
   83  * Modified by Charles M. Hannum, NetBSD, May 1995.
   84  * Modified by Ahmed Helmy, SGI, June 1996
   85  * Modified by George Edmond Eddy (Rusty), ISI, February 1998
   86  * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000
   87  * Modified by Hitoshi Asaeda, WIDE, August 2000
   88  * Modified by Pavlin Radoslavov, ICSI, October 2002
   89  *
   90  * MROUTING Revision: 1.2
   91  * and PIM-SMv2 and PIM-DM support, advanced API support,
   92  * bandwidth metering and signaling
   93  */
   94 
   95 #include <sys/cdefs.h>
   96 __KERNEL_RCSID(0, "$NetBSD: ip_mroute.c,v 1.90 2005/02/26 22:45:12 perry Exp $");
   97 
   98 #include "opt_inet.h"
   99 #include "opt_ipsec.h"
  100 #include "opt_pim.h"
  101 
  102 #ifdef PIM
  103 #define _PIM_VT 1
  104 #endif
  105 
  106 #include <sys/param.h>
  107 #include <sys/systm.h>
  108 #include <sys/callout.h>
  109 #include <sys/mbuf.h>
  110 #include <sys/socket.h>
  111 #include <sys/socketvar.h>
  112 #include <sys/protosw.h>
  113 #include <sys/errno.h>
  114 #include <sys/time.h>
  115 #include <sys/kernel.h>
  116 #include <sys/ioctl.h>
  117 #include <sys/syslog.h>
  118 
  119 #include <net/if.h>
  120 #include <net/route.h>
  121 #include <net/raw_cb.h>
  122 
  123 #include <netinet/in.h>
  124 #include <netinet/in_var.h>
  125 #include <netinet/in_systm.h>
  126 #include <netinet/ip.h>
  127 #include <netinet/ip_var.h>
  128 #include <netinet/in_pcb.h>
  129 #include <netinet/udp.h>
  130 #include <netinet/igmp.h>
  131 #include <netinet/igmp_var.h>
  132 #include <netinet/ip_mroute.h>
  133 #ifdef PIM
  134 #include <netinet/pim.h>
  135 #include <netinet/pim_var.h>
  136 #endif
  137 #include <netinet/ip_encap.h>
  138 
  139 #ifdef IPSEC
  140 #include <netinet6/ipsec.h>
  141 #include <netkey/key.h>
  142 #endif
  143 
  144 #ifdef FAST_IPSEC
  145 #include <netipsec/ipsec.h>
  146 #include <netipsec/key.h>
  147 #endif
  148 
  149 #include <machine/stdarg.h>
  150 
  151 #define IP_MULTICASTOPTS 0
  152 #define M_PULLUP(m, len)                                                 \
  153         do {                                                             \
  154                 if ((m) && ((m)->m_flags & M_EXT || (m)->m_len < (len))) \
  155                         (m) = m_pullup((m), (len));                      \
  156         } while (/*CONSTCOND*/ 0)
  157 
  158 /*
  159  * Globals.  All but ip_mrouter and ip_mrtproto could be static,
  160  * except for netstat or debugging purposes.
  161  */
  162 struct socket  *ip_mrouter  = NULL;
  163 int             ip_mrtproto = IGMP_DVMRP;    /* for netstat only */
  164 
  165 #define NO_RTE_FOUND    0x1
  166 #define RTE_FOUND       0x2
  167 
  168 #define MFCHASH(a, g)                                                   \
  169         ((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^        \
  170           ((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & mfchash)
  171 LIST_HEAD(mfchashhdr, mfc) *mfchashtbl;
  172 u_long  mfchash;
  173 
  174 u_char          nexpire[MFCTBLSIZ];
  175 struct vif      viftable[MAXVIFS];
  176 struct mrtstat  mrtstat;
  177 u_int           mrtdebug = 0;     /* debug level        */
  178 #define         DEBUG_MFC       0x02
  179 #define         DEBUG_FORWARD   0x04
  180 #define         DEBUG_EXPIRE    0x08
  181 #define         DEBUG_XMIT      0x10
  182 #define         DEBUG_PIM       0x20
  183 
  184 #define         VIFI_INVALID    ((vifi_t) -1)
  185 
  186 u_int           tbfdebug = 0;     /* tbf debug level    */
  187 #ifdef RSVP_ISI
  188 u_int           rsvpdebug = 0;    /* rsvp debug level   */
  189 extern struct socket *ip_rsvpd;
  190 extern int rsvp_on;
  191 #endif /* RSVP_ISI */
  192 
  193 /* vif attachment using sys/netinet/ip_encap.c */
  194 static void vif_input(struct mbuf *, ...);
  195 static int vif_encapcheck(const struct mbuf *, int, int, void *);
  196 
  197 static const struct protosw vif_protosw =
  198 { SOCK_RAW,     &inetdomain,    IPPROTO_IPV4,   PR_ATOMIC|PR_ADDR,
  199   vif_input,    rip_output,     0,              rip_ctloutput,
  200   rip_usrreq,
  201   0,            0,              0,              0,
  202 };
  203 
  204 #define         EXPIRE_TIMEOUT  (hz / 4)        /* 4x / second */
  205 #define         UPCALL_EXPIRE   6               /* number of timeouts */
  206 
  207 /*
  208  * Define the token bucket filter structures
  209  */
  210 
  211 #define         TBF_REPROCESS   (hz / 100)      /* 100x / second */
  212 
  213 static int get_sg_cnt(struct sioc_sg_req *);
  214 static int get_vif_cnt(struct sioc_vif_req *);
  215 static int ip_mrouter_init(struct socket *, struct mbuf *);
  216 static int get_version(struct mbuf *);
  217 static int set_assert(struct mbuf *);
  218 static int get_assert(struct mbuf *);
  219 static int add_vif(struct mbuf *);
  220 static int del_vif(struct mbuf *);
  221 static void update_mfc_params(struct mfc *, struct mfcctl2 *);
  222 static void init_mfc_params(struct mfc *, struct mfcctl2 *);
  223 static void expire_mfc(struct mfc *);
  224 static int add_mfc(struct mbuf *);
  225 #ifdef UPCALL_TIMING
  226 static void collate(struct timeval *);
  227 #endif
  228 static int del_mfc(struct mbuf *);
  229 static int set_api_config(struct mbuf *); /* chose API capabilities */
  230 static int get_api_support(struct mbuf *);
  231 static int get_api_config(struct mbuf *);
  232 static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *);
  233 static void expire_upcalls(void *);
  234 #ifdef RSVP_ISI
  235 static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t);
  236 #else
  237 static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *);
  238 #endif
  239 static void phyint_send(struct ip *, struct vif *, struct mbuf *);
  240 static void encap_send(struct ip *, struct vif *, struct mbuf *);
  241 static void tbf_control(struct vif *, struct mbuf *, struct ip *, u_int32_t);
  242 static void tbf_queue(struct vif *, struct mbuf *);
  243 static void tbf_process_q(struct vif *);
  244 static void tbf_reprocess_q(void *);
  245 static int tbf_dq_sel(struct vif *, struct ip *);
  246 static void tbf_send_packet(struct vif *, struct mbuf *);
  247 static void tbf_update_tokens(struct vif *);
  248 static int priority(struct vif *, struct ip *);
  249 
  250 /*
  251  * Bandwidth monitoring
  252  */
  253 static void free_bw_list(struct bw_meter *);
  254 static int add_bw_upcall(struct mbuf *);
  255 static int del_bw_upcall(struct mbuf *);
  256 static void bw_meter_receive_packet(struct bw_meter *, int , struct timeval *);
  257 static void bw_meter_prepare_upcall(struct bw_meter *, struct timeval *);
  258 static void bw_upcalls_send(void);
  259 static void schedule_bw_meter(struct bw_meter *, struct timeval *);
  260 static void unschedule_bw_meter(struct bw_meter *);
  261 static void bw_meter_process(void);
  262 static void expire_bw_upcalls_send(void *);
  263 static void expire_bw_meter_process(void *);
  264 
  265 #ifdef PIM
  266 static int pim_register_send(struct ip *, struct vif *,
  267                 struct mbuf *, struct mfc *);
  268 static int pim_register_send_rp(struct ip *, struct vif *,
  269                 struct mbuf *, struct mfc *);
  270 static int pim_register_send_upcall(struct ip *, struct vif *,
  271                 struct mbuf *, struct mfc *);
  272 static struct mbuf *pim_register_prepare(struct ip *, struct mbuf *);
  273 #endif
  274 
  275 /*
  276  * 'Interfaces' associated with decapsulator (so we can tell
  277  * packets that went through it from ones that get reflected
  278  * by a broken gateway).  These interfaces are never linked into
  279  * the system ifnet list & no routes point to them.  I.e., packets
  280  * can't be sent this way.  They only exist as a placeholder for
  281  * multicast source verification.
  282  */
  283 #if 0
  284 struct ifnet multicast_decap_if[MAXVIFS];
  285 #endif
  286 
  287 #define ENCAP_TTL       64
  288 #define ENCAP_PROTO     IPPROTO_IPIP    /* 4 */
  289 
  290 /* prototype IP hdr for encapsulated packets */
  291 struct ip multicast_encap_iphdr = {
  292 #if BYTE_ORDER == LITTLE_ENDIAN
  293         sizeof(struct ip) >> 2, IPVERSION,
  294 #else
  295         IPVERSION, sizeof(struct ip) >> 2,
  296 #endif
  297         0,                              /* tos */
  298         sizeof(struct ip),              /* total length */
  299         0,                              /* id */
  300         0,                              /* frag offset */
  301         ENCAP_TTL, ENCAP_PROTO,
  302         0,                              /* checksum */
  303 };
  304 
  305 /*
  306  * Bandwidth meter variables and constants
  307  */
  308 
  309 /*
  310  * Pending timeouts are stored in a hash table, the key being the
  311  * expiration time. Periodically, the entries are analysed and processed.
  312  */
  313 #define BW_METER_BUCKETS        1024
  314 static struct bw_meter *bw_meter_timers[BW_METER_BUCKETS];
  315 struct callout bw_meter_ch;
  316 #define BW_METER_PERIOD (hz)            /* periodical handling of bw meters */
  317 
  318 /*
  319  * Pending upcalls are stored in a vector which is flushed when
  320  * full, or periodically
  321  */
  322 static struct bw_upcall bw_upcalls[BW_UPCALLS_MAX];
  323 static u_int    bw_upcalls_n; /* # of pending upcalls */
  324 struct callout  bw_upcalls_ch;
  325 #define BW_UPCALLS_PERIOD (hz)          /* periodical flush of bw upcalls */
  326 
  327 #ifdef PIM
  328 struct pimstat pimstat;
  329 
  330 /*
  331  * Note: the PIM Register encapsulation adds the following in front of a
  332  * data packet:
  333  *
  334  * struct pim_encap_hdr {
  335  *    struct ip ip;
  336  *    struct pim_encap_pimhdr  pim;
  337  * }
  338  *
  339  */
  340 
  341 struct pim_encap_pimhdr {
  342         struct pim pim;
  343         uint32_t   flags;
  344 };
  345 
  346 static struct ip pim_encap_iphdr = {
  347 #if BYTE_ORDER == LITTLE_ENDIAN
  348         sizeof(struct ip) >> 2,
  349         IPVERSION,
  350 #else
  351         IPVERSION,
  352         sizeof(struct ip) >> 2,
  353 #endif
  354         0,                      /* tos */
  355         sizeof(struct ip),      /* total length */
  356         0,                      /* id */
  357         0,                      /* frag offset */
  358         ENCAP_TTL,
  359         IPPROTO_PIM,
  360         0,                      /* checksum */
  361 };
  362 
  363 static struct pim_encap_pimhdr pim_encap_pimhdr = {
  364     {
  365         PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */
  366         0,                      /* reserved */
  367         0,                      /* checksum */
  368     },
  369     0                           /* flags */
  370 };
  371 
  372 static struct ifnet multicast_register_if;
  373 static vifi_t reg_vif_num = VIFI_INVALID;
  374 #endif /* PIM */
  375 
  376 
  377 /*
  378  * Private variables.
  379  */
  380 static vifi_t      numvifs = 0;
  381 
  382 static struct callout expire_upcalls_ch;
  383 
  384 /*
  385  * one-back cache used by vif_encapcheck to locate a tunnel's vif
  386  * given a datagram's src ip address.
  387  */
  388 static struct in_addr last_encap_src;
  389 static struct vif *last_encap_vif;
  390 
  391 /*
  392  * whether or not special PIM assert processing is enabled.
  393  */
  394 static int pim_assert;
  395 /*
  396  * Rate limit for assert notification messages, in usec
  397  */
  398 #define ASSERT_MSG_TIME         3000000
  399 
  400 /*
  401  * Kernel multicast routing API capabilities and setup.
  402  * If more API capabilities are added to the kernel, they should be
  403  * recorded in `mrt_api_support'.
  404  */
  405 static const u_int32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF |
  406                                           MRT_MFC_FLAGS_BORDER_VIF |
  407                                           MRT_MFC_RP |
  408                                           MRT_MFC_BW_UPCALL);
  409 static u_int32_t mrt_api_config = 0;
  410 
  411 /*
  412  * Find a route for a given origin IP address and Multicast group address
  413  * Type of service parameter to be added in the future!!!
  414  * Statistics are updated by the caller if needed
  415  * (mrtstat.mrts_mfc_lookups and mrtstat.mrts_mfc_misses)
  416  */
  417 static struct mfc *
  418 mfc_find(struct in_addr *o, struct in_addr *g)
  419 {
  420         struct mfc *rt;
  421 
  422         LIST_FOREACH(rt, &mfchashtbl[MFCHASH(*o, *g)], mfc_hash) {
  423                 if (in_hosteq(rt->mfc_origin, *o) &&
  424                     in_hosteq(rt->mfc_mcastgrp, *g) &&
  425                     (rt->mfc_stall == NULL))
  426                         break;
  427         }
  428 
  429         return (rt);
  430 }
  431 
  432 /*
  433  * Macros to compute elapsed time efficiently
  434  * Borrowed from Van Jacobson's scheduling code
  435  */
  436 #define TV_DELTA(a, b, delta) do {                                      \
  437         int xxs;                                                        \
  438         delta = (a).tv_usec - (b).tv_usec;                              \
  439         xxs = (a).tv_sec - (b).tv_sec;                                  \
  440         switch (xxs) {                                                  \
  441         case 2:                                                         \
  442                 delta += 1000000;                                       \
  443                 /* fall through */                                      \
  444         case 1:                                                         \
  445                 delta += 1000000;                                       \
  446                 /* fall through */                                      \
  447         case 0:                                                         \
  448                 break;                                                  \
  449         default:                                                        \
  450                 delta += (1000000 * xxs);                               \
  451                 break;                                                  \
  452         }                                                               \
  453 } while (/*CONSTCOND*/ 0)
  454 
  455 #ifdef UPCALL_TIMING
  456 u_int32_t upcall_data[51];
  457 #endif /* UPCALL_TIMING */
  458 
  459 /*
  460  * Handle MRT setsockopt commands to modify the multicast routing tables.
  461  */
  462 int
  463 ip_mrouter_set(struct socket *so, int optname, struct mbuf **m)
  464 {
  465         int error;
  466 
  467         if (optname != MRT_INIT && so != ip_mrouter)
  468                 error = ENOPROTOOPT;
  469         else
  470                 switch (optname) {
  471                 case MRT_INIT:
  472                         error = ip_mrouter_init(so, *m);
  473                         break;
  474                 case MRT_DONE:
  475                         error = ip_mrouter_done();
  476                         break;
  477                 case MRT_ADD_VIF:
  478                         error = add_vif(*m);
  479                         break;
  480                 case MRT_DEL_VIF:
  481                         error = del_vif(*m);
  482                         break;
  483                 case MRT_ADD_MFC:
  484                         error = add_mfc(*m);
  485                         break;
  486                 case MRT_DEL_MFC:
  487                         error = del_mfc(*m);
  488                         break;
  489                 case MRT_ASSERT:
  490                         error = set_assert(*m);
  491                         break;
  492                 case MRT_API_CONFIG:
  493                         error = set_api_config(*m);
  494                         break;
  495                 case MRT_ADD_BW_UPCALL:
  496                         error = add_bw_upcall(*m);
  497                         break;
  498                 case MRT_DEL_BW_UPCALL:
  499                         error = del_bw_upcall(*m);
  500                         break;
  501                 default:
  502                         error = ENOPROTOOPT;
  503                         break;
  504                 }
  505 
  506         if (*m)
  507                 m_free(*m);
  508         return (error);
  509 }
  510 
  511 /*
  512  * Handle MRT getsockopt commands
  513  */
  514 int
  515 ip_mrouter_get(struct socket *so, int optname, struct mbuf **m)
  516 {
  517         int error;
  518 
  519         if (so != ip_mrouter)
  520                 error = ENOPROTOOPT;
  521         else {
  522                 *m = m_get(M_WAIT, MT_SOOPTS);
  523                 MCLAIM(*m, so->so_mowner);
  524 
  525                 switch (optname) {
  526                 case MRT_VERSION:
  527                         error = get_version(*m);
  528                         break;
  529                 case MRT_ASSERT:
  530                         error = get_assert(*m);
  531                         break;
  532                 case MRT_API_SUPPORT:
  533                         error = get_api_support(*m);
  534                         break;
  535                 case MRT_API_CONFIG:
  536                         error = get_api_config(*m);
  537                         break;
  538                 default:
  539                         error = ENOPROTOOPT;
  540                         break;
  541                 }
  542 
  543                 if (error)
  544                         m_free(*m);
  545         }
  546 
  547         return (error);
  548 }
  549 
  550 /*
  551  * Handle ioctl commands to obtain information from the cache
  552  */
  553 int
  554 mrt_ioctl(struct socket *so, u_long cmd, caddr_t data)
  555 {
  556         int error;
  557 
  558         if (so != ip_mrouter)
  559                 error = EINVAL;
  560         else
  561                 switch (cmd) {
  562                 case SIOCGETVIFCNT:
  563                         error = get_vif_cnt((struct sioc_vif_req *)data);
  564                         break;
  565                 case SIOCGETSGCNT:
  566                         error = get_sg_cnt((struct sioc_sg_req *)data);
  567                         break;
  568                 default:
  569                         error = EINVAL;
  570                         break;
  571                 }
  572 
  573         return (error);
  574 }
  575 
  576 /*
  577  * returns the packet, byte, rpf-failure count for the source group provided
  578  */
  579 static int
  580 get_sg_cnt(struct sioc_sg_req *req)
  581 {
  582         int s;
  583         struct mfc *rt;
  584 
  585         s = splsoftnet();
  586         rt = mfc_find(&req->src, &req->grp);
  587         if (rt == NULL) {
  588                 splx(s);
  589                 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
  590                 return (EADDRNOTAVAIL);
  591         }
  592         req->pktcnt = rt->mfc_pkt_cnt;
  593         req->bytecnt = rt->mfc_byte_cnt;
  594         req->wrong_if = rt->mfc_wrong_if;
  595         splx(s);
  596 
  597         return (0);
  598 }
  599 
  600 /*
  601  * returns the input and output packet and byte counts on the vif provided
  602  */
  603 static int
  604 get_vif_cnt(struct sioc_vif_req *req)
  605 {
  606         vifi_t vifi = req->vifi;
  607 
  608         if (vifi >= numvifs)
  609                 return (EINVAL);
  610 
  611         req->icount = viftable[vifi].v_pkt_in;
  612         req->ocount = viftable[vifi].v_pkt_out;
  613         req->ibytes = viftable[vifi].v_bytes_in;
  614         req->obytes = viftable[vifi].v_bytes_out;
  615 
  616         return (0);
  617 }
  618 
  619 /*
  620  * Enable multicast routing
  621  */
  622 static int
  623 ip_mrouter_init(struct socket *so, struct mbuf *m)
  624 {
  625         int *v;
  626 
  627         if (mrtdebug)
  628                 log(LOG_DEBUG,
  629                     "ip_mrouter_init: so_type = %d, pr_protocol = %d\n",
  630                     so->so_type, so->so_proto->pr_protocol);
  631 
  632         if (so->so_type != SOCK_RAW ||
  633             so->so_proto->pr_protocol != IPPROTO_IGMP)
  634                 return (EOPNOTSUPP);
  635 
  636         if (m == NULL || m->m_len < sizeof(int))
  637                 return (EINVAL);
  638 
  639         v = mtod(m, int *);
  640         if (*v != 1)
  641                 return (EINVAL);
  642 
  643         if (ip_mrouter != NULL)
  644                 return (EADDRINUSE);
  645 
  646         ip_mrouter = so;
  647 
  648         mfchashtbl =
  649             hashinit(MFCTBLSIZ, HASH_LIST, M_MRTABLE, M_WAITOK, &mfchash);
  650         bzero((caddr_t)nexpire, sizeof(nexpire));
  651 
  652         pim_assert = 0;
  653 
  654         callout_init(&expire_upcalls_ch);
  655         callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
  656                       expire_upcalls, NULL);
  657 
  658         callout_init(&bw_upcalls_ch);
  659         callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD,
  660                       expire_bw_upcalls_send, NULL);
  661 
  662         callout_init(&bw_meter_ch);
  663         callout_reset(&bw_meter_ch, BW_METER_PERIOD,
  664                       expire_bw_meter_process, NULL);
  665 
  666         if (mrtdebug)
  667                 log(LOG_DEBUG, "ip_mrouter_init\n");
  668 
  669         return (0);
  670 }
  671 
  672 /*
  673  * Disable multicast routing
  674  */
  675 int
  676 ip_mrouter_done(void)
  677 {
  678         vifi_t vifi;
  679         struct vif *vifp;
  680         int i;
  681         int s;
  682 
  683         s = splsoftnet();
  684 
  685         /* Clear out all the vifs currently in use. */
  686         for (vifi = 0; vifi < numvifs; vifi++) {
  687                 vifp = &viftable[vifi];
  688                 if (!in_nullhost(vifp->v_lcl_addr))
  689                         reset_vif(vifp);
  690         }
  691 
  692         numvifs = 0;
  693         pim_assert = 0;
  694         mrt_api_config = 0;
  695 
  696         callout_stop(&expire_upcalls_ch);
  697         callout_stop(&bw_upcalls_ch);
  698         callout_stop(&bw_meter_ch);
  699 
  700         /*
  701          * Free all multicast forwarding cache entries.
  702          */
  703         for (i = 0; i < MFCTBLSIZ; i++) {
  704                 struct mfc *rt, *nrt;
  705 
  706                 for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) {
  707                         nrt = LIST_NEXT(rt, mfc_hash);
  708 
  709                         expire_mfc(rt);
  710                 }
  711         }
  712 
  713         bzero((caddr_t)nexpire, sizeof(nexpire));
  714         free(mfchashtbl, M_MRTABLE);
  715         mfchashtbl = NULL;
  716 
  717         bw_upcalls_n = 0;
  718         bzero(bw_meter_timers, sizeof(bw_meter_timers));
  719 
  720         /* Reset de-encapsulation cache. */
  721 
  722         ip_mrouter = NULL;
  723 
  724         splx(s);
  725 
  726         if (mrtdebug)
  727                 log(LOG_DEBUG, "ip_mrouter_done\n");
  728 
  729         return (0);
  730 }
  731 
  732 void
  733 ip_mrouter_detach(struct ifnet *ifp)
  734 {
  735         int vifi, i;
  736         struct vif *vifp;
  737         struct mfc *rt;
  738         struct rtdetq *rte;
  739 
  740         /* XXX not sure about side effect to userland routing daemon */
  741         for (vifi = 0; vifi < numvifs; vifi++) {
  742                 vifp = &viftable[vifi];
  743                 if (vifp->v_ifp == ifp)
  744                         reset_vif(vifp);
  745         }
  746         for (i = 0; i < MFCTBLSIZ; i++) {
  747                 if (nexpire[i] == 0)
  748                         continue;
  749                 LIST_FOREACH(rt, &mfchashtbl[i], mfc_hash) {
  750                         for (rte = rt->mfc_stall; rte; rte = rte->next) {
  751                                 if (rte->ifp == ifp)
  752                                         rte->ifp = NULL;
  753                         }
  754                 }
  755         }
  756 }
  757 
  758 static int
  759 get_version(struct mbuf *m)
  760 {
  761         int *v = mtod(m, int *);
  762 
  763         *v = 0x0305;    /* XXX !!!! */
  764         m->m_len = sizeof(int);
  765         return (0);
  766 }
  767 
  768 /*
  769  * Set PIM assert processing global
  770  */
  771 static int
  772 set_assert(struct mbuf *m)
  773 {
  774         int *i;
  775 
  776         if (m == NULL || m->m_len < sizeof(int))
  777                 return (EINVAL);
  778 
  779         i = mtod(m, int *);
  780         pim_assert = !!*i;
  781         return (0);
  782 }
  783 
  784 /*
  785  * Get PIM assert processing global
  786  */
  787 static int
  788 get_assert(struct mbuf *m)
  789 {
  790         int *i = mtod(m, int *);
  791 
  792         *i = pim_assert;
  793         m->m_len = sizeof(int);
  794         return (0);
  795 }
  796 
  797 /*
  798  * Configure API capabilities
  799  */
  800 static int
  801 set_api_config(struct mbuf *m)
  802 {
  803         int i;
  804         u_int32_t *apival;
  805 
  806         if (m == NULL || m->m_len < sizeof(u_int32_t))
  807                 return (EINVAL);
  808 
  809         apival = mtod(m, u_int32_t *);
  810 
  811         /*
  812          * We can set the API capabilities only if it is the first operation
  813          * after MRT_INIT. I.e.:
  814          *  - there are no vifs installed
  815          *  - pim_assert is not enabled
  816          *  - the MFC table is empty
  817          */
  818         if (numvifs > 0) {
  819                 *apival = 0;
  820                 return (EPERM);
  821         }
  822         if (pim_assert) {
  823                 *apival = 0;
  824                 return (EPERM);
  825         }
  826         for (i = 0; i < MFCTBLSIZ; i++) {
  827                 if (LIST_FIRST(&mfchashtbl[i]) != NULL) {
  828                         *apival = 0;
  829                         return (EPERM);
  830                 }
  831         }
  832 
  833         mrt_api_config = *apival & mrt_api_support;
  834         *apival = mrt_api_config;
  835 
  836         return (0);
  837 }
  838 
  839 /*
  840  * Get API capabilities
  841  */
  842 static int
  843 get_api_support(struct mbuf *m)
  844 {
  845         u_int32_t *apival;
  846 
  847         if (m == NULL || m->m_len < sizeof(u_int32_t))
  848                 return (EINVAL);
  849 
  850         apival = mtod(m, u_int32_t *);
  851 
  852         *apival = mrt_api_support;
  853 
  854         return (0);
  855 }
  856 
  857 /*
  858  * Get API configured capabilities
  859  */
  860 static int
  861 get_api_config(struct mbuf *m)
  862 {
  863         u_int32_t *apival;
  864 
  865         if (m == NULL || m->m_len < sizeof(u_int32_t))
  866                 return (EINVAL);
  867 
  868         apival = mtod(m, u_int32_t *);
  869 
  870         *apival = mrt_api_config;
  871 
  872         return (0);
  873 }
  874 
  875 static struct sockaddr_in sin = { sizeof(sin), AF_INET };
  876 
  877 /*
  878  * Add a vif to the vif table
  879  */
  880 static int
  881 add_vif(struct mbuf *m)
  882 {
  883         struct vifctl *vifcp;
  884         struct vif *vifp;
  885         struct ifaddr *ifa;
  886         struct ifnet *ifp;
  887         struct ifreq ifr;
  888         int error, s;
  889 
  890         if (m == NULL || m->m_len < sizeof(struct vifctl))
  891                 return (EINVAL);
  892 
  893         vifcp = mtod(m, struct vifctl *);
  894         if (vifcp->vifc_vifi >= MAXVIFS)
  895                 return (EINVAL);
  896         if (in_nullhost(vifcp->vifc_lcl_addr))
  897                 return (EADDRNOTAVAIL);
  898 
  899         vifp = &viftable[vifcp->vifc_vifi];
  900         if (!in_nullhost(vifp->v_lcl_addr))
  901                 return (EADDRINUSE);
  902 
  903         /* Find the interface with an address in AF_INET family. */
  904 #ifdef PIM
  905         if (vifcp->vifc_flags & VIFF_REGISTER) {
  906                 /*
  907                  * XXX: Because VIFF_REGISTER does not really need a valid
  908                  * local interface (e.g. it could be 127.0.0.2), we don't
  909                  * check its address.
  910                  */
  911             ifp = NULL;
  912         } else
  913 #endif
  914         {
  915                 sin.sin_addr = vifcp->vifc_lcl_addr;
  916                 ifa = ifa_ifwithaddr(sintosa(&sin));
  917                 if (ifa == NULL)
  918                         return (EADDRNOTAVAIL);
  919                 ifp = ifa->ifa_ifp;
  920         }
  921 
  922         if (vifcp->vifc_flags & VIFF_TUNNEL) {
  923                 if (vifcp->vifc_flags & VIFF_SRCRT) {
  924                         log(LOG_ERR, "source routed tunnels not supported\n");
  925                         return (EOPNOTSUPP);
  926                 }
  927 
  928                 /* attach this vif to decapsulator dispatch table */
  929                 vifp->v_encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV4,
  930                     vif_encapcheck, &vif_protosw, vifp);
  931                 if (!vifp->v_encap_cookie)
  932                         return (EINVAL);
  933 
  934                 /* Create a fake encapsulation interface. */
  935                 ifp = (struct ifnet *)malloc(sizeof(*ifp), M_MRTABLE, M_WAITOK);
  936                 bzero(ifp, sizeof(*ifp));
  937                 snprintf(ifp->if_xname, sizeof(ifp->if_xname),
  938                          "mdecap%d", vifcp->vifc_vifi);
  939 
  940                 /* Prepare cached route entry. */
  941                 bzero(&vifp->v_route, sizeof(vifp->v_route));
  942 #ifdef PIM
  943         } else if (vifcp->vifc_flags & VIFF_REGISTER) {
  944                 ifp = &multicast_register_if;
  945                 if (mrtdebug)
  946                         log(LOG_DEBUG, "Adding a register vif, ifp: %p\n",
  947                             (void *)ifp);
  948                 if (reg_vif_num == VIFI_INVALID) {
  949                         bzero(ifp, sizeof(*ifp));
  950                         snprintf(ifp->if_xname, sizeof(ifp->if_xname),
  951                                  "register_vif");
  952                         ifp->if_flags = IFF_LOOPBACK;
  953                         bzero(&vifp->v_route, sizeof(vifp->v_route));
  954                         reg_vif_num = vifcp->vifc_vifi;
  955                 }
  956 #endif
  957         } else {
  958                 /* Make sure the interface supports multicast. */
  959                 if ((ifp->if_flags & IFF_MULTICAST) == 0)
  960                         return (EOPNOTSUPP);
  961 
  962                 /* Enable promiscuous reception of all IP multicasts. */
  963                 satosin(&ifr.ifr_addr)->sin_len = sizeof(struct sockaddr_in);
  964                 satosin(&ifr.ifr_addr)->sin_family = AF_INET;
  965                 satosin(&ifr.ifr_addr)->sin_addr = zeroin_addr;
  966                 error = (*ifp->if_ioctl)(ifp, SIOCADDMULTI, (caddr_t)&ifr);
  967                 if (error)
  968                         return (error);
  969         }
  970 
  971         s = splsoftnet();
  972 
  973         /* Define parameters for the tbf structure. */
  974         vifp->tbf_q = NULL;
  975         vifp->tbf_t = &vifp->tbf_q;
  976         microtime(&vifp->tbf_last_pkt_t);
  977         vifp->tbf_n_tok = 0;
  978         vifp->tbf_q_len = 0;
  979         vifp->tbf_max_q_len = MAXQSIZE;
  980 
  981         vifp->v_flags = vifcp->vifc_flags;
  982         vifp->v_threshold = vifcp->vifc_threshold;
  983         /* scaling up here allows division by 1024 in critical code */
  984         vifp->v_rate_limit = vifcp->vifc_rate_limit * 1024 / 1000;
  985         vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
  986         vifp->v_rmt_addr = vifcp->vifc_rmt_addr;
  987         vifp->v_ifp = ifp;
  988         /* Initialize per vif pkt counters. */
  989         vifp->v_pkt_in = 0;
  990         vifp->v_pkt_out = 0;
  991         vifp->v_bytes_in = 0;
  992         vifp->v_bytes_out = 0;
  993 
  994         callout_init(&vifp->v_repq_ch);
  995 
  996 #ifdef RSVP_ISI
  997         vifp->v_rsvp_on = 0;
  998         vifp->v_rsvpd = NULL;
  999 #endif /* RSVP_ISI */
 1000 
 1001         splx(s);
 1002 
 1003         /* Adjust numvifs up if the vifi is higher than numvifs. */
 1004         if (numvifs <= vifcp->vifc_vifi)
 1005                 numvifs = vifcp->vifc_vifi + 1;
 1006 
 1007         if (mrtdebug)
 1008                 log(LOG_DEBUG, "add_vif #%d, lcladdr %x, %s %x, thresh %x, rate %d\n",
 1009                     vifcp->vifc_vifi,
 1010                     ntohl(vifcp->vifc_lcl_addr.s_addr),
 1011                     (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
 1012                     ntohl(vifcp->vifc_rmt_addr.s_addr),
 1013                     vifcp->vifc_threshold,
 1014                     vifcp->vifc_rate_limit);
 1015 
 1016         return (0);
 1017 }
 1018 
 1019 void
 1020 reset_vif(struct vif *vifp)
 1021 {
 1022         struct mbuf *m, *n;
 1023         struct ifnet *ifp;
 1024         struct ifreq ifr;
 1025 
 1026         callout_stop(&vifp->v_repq_ch);
 1027 
 1028         /* detach this vif from decapsulator dispatch table */
 1029         encap_detach(vifp->v_encap_cookie);
 1030         vifp->v_encap_cookie = NULL;
 1031 
 1032         /*
 1033          * Free packets queued at the interface
 1034          */
 1035         for (m = vifp->tbf_q; m != NULL; m = n) {
 1036                 n = m->m_nextpkt;
 1037                 m_freem(m);
 1038         }
 1039 
 1040         if (vifp->v_flags & VIFF_TUNNEL) {
 1041                 free(vifp->v_ifp, M_MRTABLE);
 1042                 if (vifp == last_encap_vif) {
 1043                         last_encap_vif = NULL;
 1044                         last_encap_src = zeroin_addr;
 1045                 }
 1046         } else if (vifp->v_flags & VIFF_REGISTER) {
 1047 #ifdef PIM
 1048                 reg_vif_num = VIFI_INVALID;
 1049 #endif
 1050         } else {
 1051                 satosin(&ifr.ifr_addr)->sin_len = sizeof(struct sockaddr_in);
 1052                 satosin(&ifr.ifr_addr)->sin_family = AF_INET;
 1053                 satosin(&ifr.ifr_addr)->sin_addr = zeroin_addr;
 1054                 ifp = vifp->v_ifp;
 1055                 (*ifp->if_ioctl)(ifp, SIOCDELMULTI, (caddr_t)&ifr);
 1056         }
 1057         bzero((caddr_t)vifp, sizeof(*vifp));
 1058 }
 1059 
 1060 /*
 1061  * Delete a vif from the vif table
 1062  */
 1063 static int
 1064 del_vif(struct mbuf *m)
 1065 {
 1066         vifi_t *vifip;
 1067         struct vif *vifp;
 1068         vifi_t vifi;
 1069         int s;
 1070 
 1071         if (m == NULL || m->m_len < sizeof(vifi_t))
 1072                 return (EINVAL);
 1073 
 1074         vifip = mtod(m, vifi_t *);
 1075         if (*vifip >= numvifs)
 1076                 return (EINVAL);
 1077 
 1078         vifp = &viftable[*vifip];
 1079         if (in_nullhost(vifp->v_lcl_addr))
 1080                 return (EADDRNOTAVAIL);
 1081 
 1082         s = splsoftnet();
 1083 
 1084         reset_vif(vifp);
 1085 
 1086         /* Adjust numvifs down */
 1087         for (vifi = numvifs; vifi > 0; vifi--)
 1088                 if (!in_nullhost(viftable[vifi - 1].v_lcl_addr))
 1089                         break;
 1090         numvifs = vifi;
 1091 
 1092         splx(s);
 1093 
 1094         if (mrtdebug)
 1095                 log(LOG_DEBUG, "del_vif %d, numvifs %d\n", *vifip, numvifs);
 1096 
 1097         return (0);
 1098 }
 1099 
 1100 /*
 1101  * update an mfc entry without resetting counters and S,G addresses.
 1102  */
 1103 static void
 1104 update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
 1105 {
 1106         int i;
 1107 
 1108         rt->mfc_parent = mfccp->mfcc_parent;
 1109         for (i = 0; i < numvifs; i++) {
 1110                 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
 1111                 rt->mfc_flags[i] = mfccp->mfcc_flags[i] & mrt_api_config &
 1112                         MRT_MFC_FLAGS_ALL;
 1113         }
 1114         /* set the RP address */
 1115         if (mrt_api_config & MRT_MFC_RP)
 1116                 rt->mfc_rp = mfccp->mfcc_rp;
 1117         else
 1118                 rt->mfc_rp = zeroin_addr;
 1119 }
 1120 
 1121 /*
 1122  * fully initialize an mfc entry from the parameter.
 1123  */
 1124 static void
 1125 init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
 1126 {
 1127         rt->mfc_origin     = mfccp->mfcc_origin;
 1128         rt->mfc_mcastgrp   = mfccp->mfcc_mcastgrp;
 1129 
 1130         update_mfc_params(rt, mfccp);
 1131 
 1132         /* initialize pkt counters per src-grp */
 1133         rt->mfc_pkt_cnt    = 0;
 1134         rt->mfc_byte_cnt   = 0;
 1135         rt->mfc_wrong_if   = 0;
 1136         timerclear(&rt->mfc_last_assert);
 1137 }
 1138 
 1139 static void
 1140 expire_mfc(struct mfc *rt)
 1141 {
 1142         struct rtdetq *rte, *nrte;
 1143 
 1144         free_bw_list(rt->mfc_bw_meter);
 1145 
 1146         for (rte = rt->mfc_stall; rte != NULL; rte = nrte) {
 1147                 nrte = rte->next;
 1148                 m_freem(rte->m);
 1149                 free(rte, M_MRTABLE);
 1150         }
 1151 
 1152         LIST_REMOVE(rt, mfc_hash);
 1153         free(rt, M_MRTABLE);
 1154 }
 1155 
 1156 /*
 1157  * Add an mfc entry
 1158  */
 1159 static int
 1160 add_mfc(struct mbuf *m)
 1161 {
 1162         struct mfcctl2 mfcctl2;
 1163         struct mfcctl2 *mfccp;
 1164         struct mfc *rt;
 1165         u_int32_t hash = 0;
 1166         struct rtdetq *rte, *nrte;
 1167         u_short nstl;
 1168         int s;
 1169         int mfcctl_size = sizeof(struct mfcctl);
 1170 
 1171         if (mrt_api_config & MRT_API_FLAGS_ALL)
 1172                 mfcctl_size = sizeof(struct mfcctl2);
 1173 
 1174         if (m == NULL || m->m_len < mfcctl_size)
 1175                 return (EINVAL);
 1176 
 1177         /*
 1178          * select data size depending on API version.
 1179          */
 1180         if (mrt_api_config & MRT_API_FLAGS_ALL) {
 1181                 struct mfcctl2 *mp2 = mtod(m, struct mfcctl2 *);
 1182                 bcopy(mp2, (caddr_t)&mfcctl2, sizeof(*mp2));
 1183         } else {
 1184                 struct mfcctl *mp = mtod(m, struct mfcctl *);
 1185                 bcopy(mp, (caddr_t)&mfcctl2, sizeof(*mp));
 1186                 bzero((caddr_t)&mfcctl2 + sizeof(struct mfcctl),
 1187                       sizeof(mfcctl2) - sizeof(struct mfcctl));
 1188         }
 1189         mfccp = &mfcctl2;
 1190 
 1191         s = splsoftnet();
 1192         rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp);
 1193 
 1194         /* If an entry already exists, just update the fields */
 1195         if (rt) {
 1196                 if (mrtdebug & DEBUG_MFC)
 1197                         log(LOG_DEBUG, "add_mfc update o %x g %x p %x\n",
 1198                             ntohl(mfccp->mfcc_origin.s_addr),
 1199                             ntohl(mfccp->mfcc_mcastgrp.s_addr),
 1200                             mfccp->mfcc_parent);
 1201 
 1202                 update_mfc_params(rt, mfccp);
 1203 
 1204                 splx(s);
 1205                 return (0);
 1206         }
 1207 
 1208         /*
 1209          * Find the entry for which the upcall was made and update
 1210          */
 1211         nstl = 0;
 1212         hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp);
 1213         LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
 1214                 if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
 1215                     in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) &&
 1216                     rt->mfc_stall != NULL) {
 1217                         if (nstl++)
 1218                                 log(LOG_ERR, "add_mfc %s o %x g %x p %x dbx %p\n",
 1219                                     "multiple kernel entries",
 1220                                     ntohl(mfccp->mfcc_origin.s_addr),
 1221                                     ntohl(mfccp->mfcc_mcastgrp.s_addr),
 1222                                     mfccp->mfcc_parent, rt->mfc_stall);
 1223 
 1224                         if (mrtdebug & DEBUG_MFC)
 1225                                 log(LOG_DEBUG, "add_mfc o %x g %x p %x dbg %p\n",
 1226                                     ntohl(mfccp->mfcc_origin.s_addr),
 1227                                     ntohl(mfccp->mfcc_mcastgrp.s_addr),
 1228                                     mfccp->mfcc_parent, rt->mfc_stall);
 1229 
 1230                         rte = rt->mfc_stall;
 1231                         init_mfc_params(rt, mfccp);
 1232                         rt->mfc_stall = NULL;
 1233 
 1234                         rt->mfc_expire = 0; /* Don't clean this guy up */
 1235                         nexpire[hash]--;
 1236 
 1237                         /* free packets Qed at the end of this entry */
 1238                         for (; rte != NULL; rte = nrte) {
 1239                                 nrte = rte->next;
 1240                                 if (rte->ifp) {
 1241 #ifdef RSVP_ISI
 1242                                         ip_mdq(rte->m, rte->ifp, rt, -1);
 1243 #else
 1244                                         ip_mdq(rte->m, rte->ifp, rt);
 1245 #endif /* RSVP_ISI */
 1246                                 }
 1247                                 m_freem(rte->m);
 1248 #ifdef UPCALL_TIMING
 1249                                 collate(&rte->t);
 1250 #endif /* UPCALL_TIMING */
 1251                                 free(rte, M_MRTABLE);
 1252                         }
 1253                 }
 1254         }
 1255 
 1256         /*
 1257          * It is possible that an entry is being inserted without an upcall
 1258          */
 1259         if (nstl == 0) {
 1260                 /*
 1261                  * No mfc; make a new one
 1262                  */
 1263                 if (mrtdebug & DEBUG_MFC)
 1264                         log(LOG_DEBUG, "add_mfc no upcall o %x g %x p %x\n",
 1265                             ntohl(mfccp->mfcc_origin.s_addr),
 1266                             ntohl(mfccp->mfcc_mcastgrp.s_addr),
 1267                             mfccp->mfcc_parent);
 1268 
 1269                 LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
 1270                         if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
 1271                             in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp)) {
 1272                                 init_mfc_params(rt, mfccp);
 1273                                 if (rt->mfc_expire)
 1274                                         nexpire[hash]--;
 1275                                 rt->mfc_expire = 0;
 1276                                 break; /* XXX */
 1277                         }
 1278                 }
 1279                 if (rt == NULL) {       /* no upcall, so make a new entry */
 1280                         rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE,
 1281                                                   M_NOWAIT);
 1282                         if (rt == NULL) {
 1283                                 splx(s);
 1284                                 return (ENOBUFS);
 1285                         }
 1286 
 1287                         init_mfc_params(rt, mfccp);
 1288                         rt->mfc_expire  = 0;
 1289                         rt->mfc_stall   = NULL;
 1290                         rt->mfc_bw_meter = NULL;
 1291 
 1292                         /* insert new entry at head of hash chain */
 1293                         LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash);
 1294                 }
 1295         }
 1296 
 1297         splx(s);
 1298         return (0);
 1299 }
 1300 
 1301 #ifdef UPCALL_TIMING
 1302 /*
 1303  * collect delay statistics on the upcalls
 1304  */
 1305 static void
 1306 collate(struct timeval *t)
 1307 {
 1308         u_int32_t d;
 1309         struct timeval tp;
 1310         u_int32_t delta;
 1311 
 1312         microtime(&tp);
 1313 
 1314         if (timercmp(t, &tp, <)) {
 1315                 TV_DELTA(tp, *t, delta);
 1316 
 1317                 d = delta >> 10;
 1318                 if (d > 50)
 1319                         d = 50;
 1320 
 1321                 ++upcall_data[d];
 1322         }
 1323 }
 1324 #endif /* UPCALL_TIMING */
 1325 
 1326 /*
 1327  * Delete an mfc entry
 1328  */
 1329 static int
 1330 del_mfc(struct mbuf *m)
 1331 {
 1332         struct mfcctl2 mfcctl2;
 1333         struct mfcctl2 *mfccp;
 1334         struct mfc *rt;
 1335         int s;
 1336         int mfcctl_size = sizeof(struct mfcctl);
 1337         struct mfcctl *mp = mtod(m, struct mfcctl *);
 1338 
 1339         /*
 1340          * XXX: for deleting MFC entries the information in entries
 1341          * of size "struct mfcctl" is sufficient.
 1342          */
 1343 
 1344         if (m == NULL || m->m_len < mfcctl_size)
 1345                 return (EINVAL);
 1346 
 1347         bcopy(mp, (caddr_t)&mfcctl2, sizeof(*mp));
 1348         bzero((caddr_t)&mfcctl2 + sizeof(struct mfcctl),
 1349               sizeof(mfcctl2) - sizeof(struct mfcctl));
 1350 
 1351         mfccp = &mfcctl2;
 1352 
 1353         if (mrtdebug & DEBUG_MFC)
 1354                 log(LOG_DEBUG, "del_mfc origin %x mcastgrp %x\n",
 1355                     ntohl(mfccp->mfcc_origin.s_addr),
 1356                     ntohl(mfccp->mfcc_mcastgrp.s_addr));
 1357 
 1358         s = splsoftnet();
 1359 
 1360         rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp);
 1361         if (rt == NULL) {
 1362                 splx(s);
 1363                 return (EADDRNOTAVAIL);
 1364         }
 1365 
 1366         /*
 1367          * free the bw_meter entries
 1368          */
 1369         free_bw_list(rt->mfc_bw_meter);
 1370         rt->mfc_bw_meter = NULL;
 1371 
 1372         LIST_REMOVE(rt, mfc_hash);
 1373         free(rt, M_MRTABLE);
 1374 
 1375         splx(s);
 1376         return (0);
 1377 }
 1378 
 1379 static int
 1380 socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src)
 1381 {
 1382         if (s) {
 1383                 if (sbappendaddr(&s->so_rcv, sintosa(src), mm,
 1384                     (struct mbuf *)NULL) != 0) {
 1385                         sorwakeup(s);
 1386                         return (0);
 1387                 }
 1388         }
 1389         m_freem(mm);
 1390         return (-1);
 1391 }
 1392 
 1393 /*
 1394  * IP multicast forwarding function. This function assumes that the packet
 1395  * pointed to by "ip" has arrived on (or is about to be sent to) the interface
 1396  * pointed to by "ifp", and the packet is to be relayed to other networks
 1397  * that have members of the packet's destination IP multicast group.
 1398  *
 1399  * The packet is returned unscathed to the caller, unless it is
 1400  * erroneous, in which case a non-zero return value tells the caller to
 1401  * discard it.
 1402  */
 1403 
 1404 #define IP_HDR_LEN  20  /* # bytes of fixed IP header (excluding options) */
 1405 #define TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */
 1406 
 1407 int
 1408 #ifdef RSVP_ISI
 1409 ip_mforward(struct mbuf *m, struct ifnet *ifp, struct ip_moptions *imo)
 1410 #else
 1411 ip_mforward(struct mbuf *m, struct ifnet *ifp)
 1412 #endif /* RSVP_ISI */
 1413 {
 1414         struct ip *ip = mtod(m, struct ip *);
 1415         struct mfc *rt;
 1416         static int srctun = 0;
 1417         struct mbuf *mm;
 1418         int s;
 1419         vifi_t vifi;
 1420 
 1421         if (mrtdebug & DEBUG_FORWARD)
 1422                 log(LOG_DEBUG, "ip_mforward: src %x, dst %x, ifp %p\n",
 1423                     ntohl(ip->ip_src.s_addr), ntohl(ip->ip_dst.s_addr), ifp);
 1424 
 1425         if (ip->ip_hl < (IP_HDR_LEN + TUNNEL_LEN) >> 2 ||
 1426             ((u_char *)(ip + 1))[1] != IPOPT_LSRR) {
 1427                 /*
 1428                  * Packet arrived via a physical interface or
 1429                  * an encapsulated tunnel or a register_vif.
 1430                  */
 1431         } else {
 1432                 /*
 1433                  * Packet arrived through a source-route tunnel.
 1434                  * Source-route tunnels are no longer supported.
 1435                  */
 1436                 if ((srctun++ % 1000) == 0)
 1437                         log(LOG_ERR,
 1438                             "ip_mforward: received source-routed packet from %x\n",
 1439                             ntohl(ip->ip_src.s_addr));
 1440 
 1441                 return (1);
 1442         }
 1443 
 1444 #ifdef RSVP_ISI
 1445         if (imo && ((vifi = imo->imo_multicast_vif) < numvifs)) {
 1446                 if (ip->ip_ttl < 255)
 1447                         ip->ip_ttl++;   /* compensate for -1 in *_send routines */
 1448                 if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
 1449                         struct vif *vifp = viftable + vifi;
 1450                         printf("Sending IPPROTO_RSVP from %x to %x on vif %d (%s%s)\n",
 1451                             ntohl(ip->ip_src), ntohl(ip->ip_dst), vifi,
 1452                             (vifp->v_flags & VIFF_TUNNEL) ? "tunnel on " : "",
 1453                             vifp->v_ifp->if_xname);
 1454                 }
 1455                 return (ip_mdq(m, ifp, (struct mfc *)NULL, vifi));
 1456         }
 1457         if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
 1458                 printf("Warning: IPPROTO_RSVP from %x to %x without vif option\n",
 1459                     ntohl(ip->ip_src), ntohl(ip->ip_dst));
 1460         }
 1461 #endif /* RSVP_ISI */
 1462 
 1463         /*
 1464          * Don't forward a packet with time-to-live of zero or one,
 1465          * or a packet destined to a local-only group.
 1466          */
 1467         if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ip->ip_dst.s_addr))
 1468                 return (0);
 1469 
 1470         /*
 1471          * Determine forwarding vifs from the forwarding cache table
 1472          */
 1473         s = splsoftnet();
 1474         ++mrtstat.mrts_mfc_lookups;
 1475         rt = mfc_find(&ip->ip_src, &ip->ip_dst);
 1476 
 1477         /* Entry exists, so forward if necessary */
 1478         if (rt != NULL) {
 1479                 splx(s);
 1480 #ifdef RSVP_ISI
 1481                 return (ip_mdq(m, ifp, rt, -1));
 1482 #else
 1483                 return (ip_mdq(m, ifp, rt));
 1484 #endif /* RSVP_ISI */
 1485         } else {
 1486                 /*
 1487                  * If we don't have a route for packet's origin,
 1488                  * Make a copy of the packet & send message to routing daemon
 1489                  */
 1490 
 1491                 struct mbuf *mb0;
 1492                 struct rtdetq *rte;
 1493                 u_int32_t hash;
 1494                 int hlen = ip->ip_hl << 2;
 1495 #ifdef UPCALL_TIMING
 1496                 struct timeval tp;
 1497 
 1498                 microtime(&tp);
 1499 #endif /* UPCALL_TIMING */
 1500 
 1501                 ++mrtstat.mrts_mfc_misses;
 1502 
 1503                 mrtstat.mrts_no_route++;
 1504                 if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC))
 1505                         log(LOG_DEBUG, "ip_mforward: no rte s %x g %x\n",
 1506                             ntohl(ip->ip_src.s_addr),
 1507                             ntohl(ip->ip_dst.s_addr));
 1508 
 1509                 /*
 1510                  * Allocate mbufs early so that we don't do extra work if we are
 1511                  * just going to fail anyway.  Make sure to pullup the header so
 1512                  * that other people can't step on it.
 1513                  */
 1514                 rte = (struct rtdetq *)malloc(sizeof(*rte), M_MRTABLE,
 1515                                               M_NOWAIT);
 1516                 if (rte == NULL) {
 1517                         splx(s);
 1518                         return (ENOBUFS);
 1519                 }
 1520                 mb0 = m_copy(m, 0, M_COPYALL);
 1521                 M_PULLUP(mb0, hlen);
 1522                 if (mb0 == NULL) {
 1523                         free(rte, M_MRTABLE);
 1524                         splx(s);
 1525                         return (ENOBUFS);
 1526                 }
 1527 
 1528                 /* is there an upcall waiting for this flow? */
 1529                 hash = MFCHASH(ip->ip_src, ip->ip_dst);
 1530                 LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
 1531                         if (in_hosteq(ip->ip_src, rt->mfc_origin) &&
 1532                             in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) &&
 1533                             rt->mfc_stall != NULL)
 1534                                 break;
 1535                 }
 1536 
 1537                 if (rt == NULL) {
 1538                         int i;
 1539                         struct igmpmsg *im;
 1540 
 1541                         /*
 1542                          * Locate the vifi for the incoming interface for
 1543                          * this packet.
 1544                          * If none found, drop packet.
 1545                          */
 1546                         for (vifi = 0; vifi < numvifs &&
 1547                                  viftable[vifi].v_ifp != ifp; vifi++)
 1548                                 ;
 1549                         if (vifi >= numvifs) /* vif not found, drop packet */
 1550                                 goto non_fatal;
 1551 
 1552                         /* no upcall, so make a new entry */
 1553                         rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE,
 1554                                                   M_NOWAIT);
 1555                         if (rt == NULL)
 1556                                 goto fail;
 1557 
 1558                         /*
 1559                          * Make a copy of the header to send to the user level
 1560                          * process
 1561                          */
 1562                         mm = m_copy(m, 0, hlen);
 1563                         M_PULLUP(mm, hlen);
 1564                         if (mm == NULL)
 1565                                 goto fail1;
 1566 
 1567                         /*
 1568                          * Send message to routing daemon to install
 1569                          * a route into the kernel table
 1570                          */
 1571 
 1572                         im = mtod(mm, struct igmpmsg *);
 1573                         im->im_msgtype = IGMPMSG_NOCACHE;
 1574                         im->im_mbz = 0;
 1575                         im->im_vif = vifi;
 1576 
 1577                         mrtstat.mrts_upcalls++;
 1578 
 1579                         sin.sin_addr = ip->ip_src;
 1580                         if (socket_send(ip_mrouter, mm, &sin) < 0) {
 1581                                 log(LOG_WARNING,
 1582                                     "ip_mforward: ip_mrouter socket queue full\n");
 1583                                 ++mrtstat.mrts_upq_sockfull;
 1584                         fail1:
 1585                                 free(rt, M_MRTABLE);
 1586                         fail:
 1587                                 free(rte, M_MRTABLE);
 1588                                 m_freem(mb0);
 1589                                 splx(s);
 1590                                 return (ENOBUFS);
 1591                         }
 1592 
 1593                         /* insert new entry at head of hash chain */
 1594                         rt->mfc_origin = ip->ip_src;
 1595                         rt->mfc_mcastgrp = ip->ip_dst;
 1596                         rt->mfc_pkt_cnt = 0;
 1597                         rt->mfc_byte_cnt = 0;
 1598                         rt->mfc_wrong_if = 0;
 1599                         rt->mfc_expire = UPCALL_EXPIRE;
 1600                         nexpire[hash]++;
 1601                         for (i = 0; i < numvifs; i++) {
 1602                                 rt->mfc_ttls[i] = 0;
 1603                                 rt->mfc_flags[i] = 0;
 1604                         }
 1605                         rt->mfc_parent = -1;
 1606 
 1607                         /* clear the RP address */
 1608                         rt->mfc_rp = zeroin_addr;
 1609 
 1610                         rt->mfc_bw_meter = NULL;
 1611 
 1612                         /* link into table */
 1613                         LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash);
 1614                         /* Add this entry to the end of the queue */
 1615                         rt->mfc_stall = rte;
 1616                 } else {
 1617                         /* determine if q has overflowed */
 1618                         struct rtdetq **p;
 1619                         int npkts = 0;
 1620 
 1621                         /*
 1622                          * XXX ouch! we need to append to the list, but we
 1623                          * only have a pointer to the front, so we have to
 1624                          * scan the entire list every time.
 1625                          */
 1626                         for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next)
 1627                                 if (++npkts > MAX_UPQ) {
 1628                                         mrtstat.mrts_upq_ovflw++;
 1629                                 non_fatal:
 1630                                         free(rte, M_MRTABLE);
 1631                                         m_freem(mb0);
 1632                                         splx(s);
 1633                                         return (0);
 1634                                 }
 1635 
 1636                         /* Add this entry to the end of the queue */
 1637                         *p = rte;
 1638                 }
 1639 
 1640                 rte->next = NULL;
 1641                 rte->m = mb0;
 1642                 rte->ifp = ifp;
 1643 #ifdef UPCALL_TIMING
 1644                 rte->t = tp;
 1645 #endif /* UPCALL_TIMING */
 1646 
 1647                 splx(s);
 1648 
 1649                 return (0);
 1650         }
 1651 }
 1652 
 1653 
 1654 /*ARGSUSED*/
 1655 static void
 1656 expire_upcalls(void *v)
 1657 {
 1658         int i;
 1659         int s;
 1660 
 1661         s = splsoftnet();
 1662 
 1663         for (i = 0; i < MFCTBLSIZ; i++) {
 1664                 struct mfc *rt, *nrt;
 1665 
 1666                 if (nexpire[i] == 0)
 1667                         continue;
 1668 
 1669                 for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) {
 1670                         nrt = LIST_NEXT(rt, mfc_hash);
 1671 
 1672                         if (rt->mfc_expire == 0 || --rt->mfc_expire > 0)
 1673                                 continue;
 1674                         nexpire[i]--;
 1675 
 1676                         /*
 1677                          * free the bw_meter entries
 1678                          */
 1679                         while (rt->mfc_bw_meter != NULL) {
 1680                                 struct bw_meter *x = rt->mfc_bw_meter;
 1681 
 1682                                 rt->mfc_bw_meter = x->bm_mfc_next;
 1683                                 free(x, M_BWMETER);
 1684                         }
 1685 
 1686                         ++mrtstat.mrts_cache_cleanups;
 1687                         if (mrtdebug & DEBUG_EXPIRE)
 1688                                 log(LOG_DEBUG,
 1689                                     "expire_upcalls: expiring (%x %x)\n",
 1690                                     ntohl(rt->mfc_origin.s_addr),
 1691                                     ntohl(rt->mfc_mcastgrp.s_addr));
 1692 
 1693                         expire_mfc(rt);
 1694                 }
 1695         }
 1696 
 1697         splx(s);
 1698         callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
 1699             expire_upcalls, NULL);
 1700 }
 1701 
 1702 /*
 1703  * Packet forwarding routine once entry in the cache is made
 1704  */
 1705 static int
 1706 #ifdef RSVP_ISI
 1707 ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif)
 1708 #else
 1709 ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt)
 1710 #endif /* RSVP_ISI */
 1711 {
 1712         struct ip  *ip = mtod(m, struct ip *);
 1713         vifi_t vifi;
 1714         struct vif *vifp;
 1715         int plen = ntohs(ip->ip_len) - (ip->ip_hl << 2);
 1716 
 1717 /*
 1718  * Macro to send packet on vif.  Since RSVP packets don't get counted on
 1719  * input, they shouldn't get counted on output, so statistics keeping is
 1720  * separate.
 1721  */
 1722 #define MC_SEND(ip, vifp, m) do {                                       \
 1723         if ((vifp)->v_flags & VIFF_TUNNEL)                              \
 1724                 encap_send((ip), (vifp), (m));                          \
 1725         else                                                            \
 1726                 phyint_send((ip), (vifp), (m));                         \
 1727 } while (/*CONSTCOND*/ 0)
 1728 
 1729 #ifdef RSVP_ISI
 1730         /*
 1731          * If xmt_vif is not -1, send on only the requested vif.
 1732          *
 1733          * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.
 1734          */
 1735         if (xmt_vif < numvifs) {
 1736 #ifdef PIM
 1737                 if (viftable[xmt_vif].v_flags & VIFF_REGISTER)
 1738                         pim_register_send(ip, viftable + xmt_vif, m, rt);
 1739                 else
 1740 #endif
 1741                 MC_SEND(ip, viftable + xmt_vif, m);
 1742                 return (1);
 1743         }
 1744 #endif /* RSVP_ISI */
 1745 
 1746         /*
 1747          * Don't forward if it didn't arrive from the parent vif for its origin.
 1748          */
 1749         vifi = rt->mfc_parent;
 1750         if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) {
 1751                 /* came in the wrong interface */
 1752                 if (mrtdebug & DEBUG_FORWARD)
 1753                         log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n",
 1754                             ifp, vifi,
 1755                             vifi >= numvifs ? 0 : viftable[vifi].v_ifp);
 1756                 ++mrtstat.mrts_wrong_if;
 1757                 ++rt->mfc_wrong_if;
 1758                 /*
 1759                  * If we are doing PIM assert processing, send a message
 1760                  * to the routing daemon.
 1761                  *
 1762                  * XXX: A PIM-SM router needs the WRONGVIF detection so it
 1763                  * can complete the SPT switch, regardless of the type
 1764                  * of the iif (broadcast media, GRE tunnel, etc).
 1765                  */
 1766                 if (pim_assert && (vifi < numvifs) && viftable[vifi].v_ifp) {
 1767                         struct timeval now;
 1768                         u_int32_t delta;
 1769 
 1770 #ifdef PIM
 1771                         if (ifp == &multicast_register_if)
 1772                                 pimstat.pims_rcv_registers_wrongiif++;
 1773 #endif
 1774 
 1775                         /* Get vifi for the incoming packet */
 1776                         for (vifi = 0;
 1777                              vifi < numvifs && viftable[vifi].v_ifp != ifp;
 1778                              vifi++)
 1779                             ;
 1780                         if (vifi >= numvifs) {
 1781                                 /* The iif is not found: ignore the packet. */
 1782                                 return (0);
 1783                         }
 1784 
 1785                         if (rt->mfc_flags[vifi] &
 1786                             MRT_MFC_FLAGS_DISABLE_WRONGVIF) {
 1787                                 /* WRONGVIF disabled: ignore the packet */
 1788                                 return (0);
 1789                         }
 1790 
 1791                         microtime(&now);
 1792 
 1793                         TV_DELTA(rt->mfc_last_assert, now, delta);
 1794 
 1795                         if (delta > ASSERT_MSG_TIME) {
 1796                                 struct igmpmsg *im;
 1797                                 int hlen = ip->ip_hl << 2;
 1798                                 struct mbuf *mm = m_copy(m, 0, hlen);
 1799 
 1800                                 M_PULLUP(mm, hlen);
 1801                                 if (mm == NULL)
 1802                                         return (ENOBUFS);
 1803 
 1804                                 rt->mfc_last_assert = now;
 1805 
 1806                                 im = mtod(mm, struct igmpmsg *);
 1807                                 im->im_msgtype  = IGMPMSG_WRONGVIF;
 1808                                 im->im_mbz      = 0;
 1809                                 im->im_vif      = vifi;
 1810 
 1811                                 mrtstat.mrts_upcalls++;
 1812 
 1813                                 sin.sin_addr = im->im_src;
 1814                                 if (socket_send(ip_mrouter, mm, &sin) < 0) {
 1815                                         log(LOG_WARNING,
 1816                                             "ip_mforward: ip_mrouter socket queue full\n");
 1817                                         ++mrtstat.mrts_upq_sockfull;
 1818                                         return (ENOBUFS);
 1819                                 }
 1820                         }
 1821                 }
 1822                 return (0);
 1823         }
 1824 
 1825         /* If I sourced this packet, it counts as output, else it was input. */
 1826         if (in_hosteq(ip->ip_src, viftable[vifi].v_lcl_addr)) {
 1827                 viftable[vifi].v_pkt_out++;
 1828                 viftable[vifi].v_bytes_out += plen;
 1829         } else {
 1830                 viftable[vifi].v_pkt_in++;
 1831                 viftable[vifi].v_bytes_in += plen;
 1832         }
 1833         rt->mfc_pkt_cnt++;
 1834         rt->mfc_byte_cnt += plen;
 1835 
 1836         /*
 1837          * For each vif, decide if a copy of the packet should be forwarded.
 1838          * Forward if:
 1839          *              - the ttl exceeds the vif's threshold
 1840          *              - there are group members downstream on interface
 1841          */
 1842         for (vifp = viftable, vifi = 0; vifi < numvifs; vifp++, vifi++)
 1843                 if ((rt->mfc_ttls[vifi] > 0) &&
 1844                         (ip->ip_ttl > rt->mfc_ttls[vifi])) {
 1845                         vifp->v_pkt_out++;
 1846                         vifp->v_bytes_out += plen;
 1847 #ifdef PIM
 1848                         if (vifp->v_flags & VIFF_REGISTER)
 1849                                 pim_register_send(ip, vifp, m, rt);
 1850                         else
 1851 #endif
 1852                         MC_SEND(ip, vifp, m);
 1853                 }
 1854 
 1855         /*
 1856          * Perform upcall-related bw measuring.
 1857          */
 1858         if (rt->mfc_bw_meter != NULL) {
 1859                 struct bw_meter *x;
 1860                 struct timeval now;
 1861 
 1862                 microtime(&now);
 1863                 for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next)
 1864                         bw_meter_receive_packet(x, plen, &now);
 1865         }
 1866 
 1867         return (0);
 1868 }
 1869 
 1870 #ifdef RSVP_ISI
 1871 /*
 1872  * check if a vif number is legal/ok. This is used by ip_output.
 1873  */
 1874 int
 1875 legal_vif_num(int vif)
 1876 {
 1877         if (vif >= 0 && vif < numvifs)
 1878                 return (1);
 1879         else
 1880                 return (0);
 1881 }
 1882 #endif /* RSVP_ISI */
 1883 
 1884 static void
 1885 phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
 1886 {
 1887         struct mbuf *mb_copy;
 1888         int hlen = ip->ip_hl << 2;
 1889 
 1890         /*
 1891          * Make a new reference to the packet; make sure that
 1892          * the IP header is actually copied, not just referenced,
 1893          * so that ip_output() only scribbles on the copy.
 1894          */
 1895         mb_copy = m_copy(m, 0, M_COPYALL);
 1896         M_PULLUP(mb_copy, hlen);
 1897         if (mb_copy == NULL)
 1898                 return;
 1899 
 1900         if (vifp->v_rate_limit <= 0)
 1901                 tbf_send_packet(vifp, mb_copy);
 1902         else
 1903                 tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *),
 1904                     ntohs(ip->ip_len));
 1905 }
 1906 
 1907 static void
 1908 encap_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
 1909 {
 1910         struct mbuf *mb_copy;
 1911         struct ip *ip_copy;
 1912         int i, len = ntohs(ip->ip_len) + sizeof(multicast_encap_iphdr);
 1913 
 1914         /* Take care of delayed checksums */
 1915         if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
 1916                 in_delayed_cksum(m);
 1917                 m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
 1918         }
 1919 
 1920         /*
 1921          * copy the old packet & pullup it's IP header into the
 1922          * new mbuf so we can modify it.  Try to fill the new
 1923          * mbuf since if we don't the ethernet driver will.
 1924          */
 1925         MGETHDR(mb_copy, M_DONTWAIT, MT_DATA);
 1926         if (mb_copy == NULL)
 1927                 return;
 1928         mb_copy->m_data += max_linkhdr;
 1929         mb_copy->m_pkthdr.len = len;
 1930         mb_copy->m_len = sizeof(multicast_encap_iphdr);
 1931 
 1932         if ((mb_copy->m_next = m_copy(m, 0, M_COPYALL)) == NULL) {
 1933                 m_freem(mb_copy);
 1934                 return;
 1935         }
 1936         i = MHLEN - max_linkhdr;
 1937         if (i > len)
 1938                 i = len;
 1939         mb_copy = m_pullup(mb_copy, i);
 1940         if (mb_copy == NULL)
 1941                 return;
 1942 
 1943         /*
 1944          * fill in the encapsulating IP header.
 1945          */
 1946         ip_copy = mtod(mb_copy, struct ip *);
 1947         *ip_copy = multicast_encap_iphdr;
 1948         ip_copy->ip_id = ip_newid();
 1949         ip_copy->ip_len = htons(len);
 1950         ip_copy->ip_src = vifp->v_lcl_addr;
 1951         ip_copy->ip_dst = vifp->v_rmt_addr;
 1952 
 1953         /*
 1954          * turn the encapsulated IP header back into a valid one.
 1955          */
 1956         ip = (struct ip *)((caddr_t)ip_copy + sizeof(multicast_encap_iphdr));
 1957         --ip->ip_ttl;
 1958         ip->ip_sum = 0;
 1959         mb_copy->m_data += sizeof(multicast_encap_iphdr);
 1960         ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
 1961         mb_copy->m_data -= sizeof(multicast_encap_iphdr);
 1962 
 1963         if (vifp->v_rate_limit <= 0)
 1964                 tbf_send_packet(vifp, mb_copy);
 1965         else
 1966                 tbf_control(vifp, mb_copy, ip, ntohs(ip_copy->ip_len));
 1967 }
 1968 
 1969 /*
 1970  * De-encapsulate a packet and feed it back through ip input.
 1971  */
 1972 static void
 1973 vif_input(struct mbuf *m, ...)
 1974 {
 1975         int off, proto;
 1976         va_list ap;
 1977         struct vif *vifp;
 1978         int s;
 1979         struct ifqueue *ifq;
 1980 
 1981         va_start(ap, m);
 1982         off = va_arg(ap, int);
 1983         proto = va_arg(ap, int);
 1984         va_end(ap);
 1985 
 1986         vifp = (struct vif *)encap_getarg(m);
 1987         if (!vifp || proto != AF_INET) {
 1988                 m_freem(m);
 1989                 mrtstat.mrts_bad_tunnel++;
 1990                 return;
 1991         }
 1992 
 1993         m_adj(m, off);
 1994         m->m_pkthdr.rcvif = vifp->v_ifp;
 1995         ifq = &ipintrq;
 1996         s = splnet();
 1997         if (IF_QFULL(ifq)) {
 1998                 IF_DROP(ifq);
 1999                 m_freem(m);
 2000         } else {
 2001                 IF_ENQUEUE(ifq, m);
 2002                 /*
 2003                  * normally we would need a "schednetisr(NETISR_IP)"
 2004                  * here but we were called by ip_input and it is going
 2005                  * to loop back & try to dequeue the packet we just
 2006                  * queued as soon as we return so we avoid the
 2007                  * unnecessary software interrrupt.
 2008                  */
 2009         }
 2010         splx(s);
 2011 }
 2012 
 2013 /*
 2014  * Check if the packet should be grabbed by us.
 2015  */
 2016 static int
 2017 vif_encapcheck(const struct mbuf *m, int off, int proto, void *arg)
 2018 {
 2019         struct vif *vifp;
 2020         struct ip ip;
 2021 
 2022 #ifdef DIAGNOSTIC
 2023         if (!arg || proto != IPPROTO_IPV4)
 2024                 panic("unexpected arg in vif_encapcheck");
 2025 #endif
 2026 
 2027         /*
 2028          * do not grab the packet if it's not to a multicast destination or if
 2029          * we don't have an encapsulating tunnel with the source.
 2030          * Note:  This code assumes that the remote site IP address
 2031          * uniquely identifies the tunnel (i.e., that this site has
 2032          * at most one tunnel with the remote site).
 2033          */
 2034 
 2035         /* LINTED const cast */
 2036         m_copydata((struct mbuf *)m, off, sizeof(ip), (caddr_t)&ip);
 2037         if (!IN_MULTICAST(ip.ip_dst.s_addr))
 2038                 return 0;
 2039 
 2040         /* LINTED const cast */
 2041         m_copydata((struct mbuf *)m, 0, sizeof(ip), (caddr_t)&ip);
 2042         if (!in_hosteq(ip.ip_src, last_encap_src)) {
 2043                 vifp = (struct vif *)arg;
 2044                 if (vifp->v_flags & VIFF_TUNNEL &&
 2045                     in_hosteq(vifp->v_rmt_addr, ip.ip_src))
 2046                         ;
 2047                 else
 2048                         return 0;
 2049                 last_encap_vif = vifp;
 2050                 last_encap_src = ip.ip_src;
 2051         } else
 2052                 vifp = last_encap_vif;
 2053 
 2054         /* 32bit match, since we have checked ip_src only */
 2055         return 32;
 2056 }
 2057 
 2058 /*
 2059  * Token bucket filter module
 2060  */
 2061 static void
 2062 tbf_control(struct vif *vifp, struct mbuf *m, struct ip *ip, u_int32_t len)
 2063 {
 2064 
 2065         if (len > MAX_BKT_SIZE) {
 2066                 /* drop if packet is too large */
 2067                 mrtstat.mrts_pkt2large++;
 2068                 m_freem(m);
 2069                 return;
 2070         }
 2071 
 2072         tbf_update_tokens(vifp);
 2073 
 2074         /*
 2075          * If there are enough tokens, and the queue is empty, send this packet
 2076          * out immediately.  Otherwise, try to insert it on this vif's queue.
 2077          */
 2078         if (vifp->tbf_q_len == 0) {
 2079                 if (len <= vifp->tbf_n_tok) {
 2080                         vifp->tbf_n_tok -= len;
 2081                         tbf_send_packet(vifp, m);
 2082                 } else {
 2083                         /* queue packet and timeout till later */
 2084                         tbf_queue(vifp, m);
 2085                         callout_reset(&vifp->v_repq_ch, TBF_REPROCESS,
 2086                             tbf_reprocess_q, vifp);
 2087                 }
 2088         } else {
 2089                 if (vifp->tbf_q_len >= vifp->tbf_max_q_len &&
 2090                     !tbf_dq_sel(vifp, ip)) {
 2091                         /* queue full, and couldn't make room */
 2092                         mrtstat.mrts_q_overflow++;
 2093                         m_freem(m);
 2094                 } else {
 2095                         /* queue length low enough, or made room */
 2096                         tbf_queue(vifp, m);
 2097                         tbf_process_q(vifp);
 2098                 }
 2099         }
 2100 }
 2101 
 2102 /*
 2103  * adds a packet to the queue at the interface
 2104  */
 2105 static void
 2106 tbf_queue(struct vif *vifp, struct mbuf *m)
 2107 {
 2108         int s = splsoftnet();
 2109 
 2110         /* insert at tail */
 2111         *vifp->tbf_t = m;
 2112         vifp->tbf_t = &m->m_nextpkt;
 2113         vifp->tbf_q_len++;
 2114 
 2115         splx(s);
 2116 }
 2117 
 2118 
 2119 /*
 2120  * processes the queue at the interface
 2121  */
 2122 static void
 2123 tbf_process_q(struct vif *vifp)
 2124 {
 2125         struct mbuf *m;
 2126         int len;
 2127         int s = splsoftnet();
 2128 
 2129         /*
 2130          * Loop through the queue at the interface and send as many packets
 2131          * as possible.
 2132          */
 2133         for (m = vifp->tbf_q; m != NULL; m = vifp->tbf_q) {
 2134                 len = ntohs(mtod(m, struct ip *)->ip_len);
 2135 
 2136                 /* determine if the packet can be sent */
 2137                 if (len <= vifp->tbf_n_tok) {
 2138                         /* if so,
 2139                          * reduce no of tokens, dequeue the packet,
 2140                          * send the packet.
 2141                          */
 2142                         if ((vifp->tbf_q = m->m_nextpkt) == NULL)
 2143                                 vifp->tbf_t = &vifp->tbf_q;
 2144                         --vifp->tbf_q_len;
 2145 
 2146                         m->m_nextpkt = NULL;
 2147                         vifp->tbf_n_tok -= len;
 2148                         tbf_send_packet(vifp, m);
 2149                 } else
 2150                         break;
 2151         }
 2152         splx(s);
 2153 }
 2154 
 2155 static void
 2156 tbf_reprocess_q(void *arg)
 2157 {
 2158         struct vif *vifp = arg;
 2159 
 2160         if (ip_mrouter == NULL)
 2161                 return;
 2162 
 2163         tbf_update_tokens(vifp);
 2164         tbf_process_q(vifp);
 2165 
 2166         if (vifp->tbf_q_len != 0)
 2167                 callout_reset(&vifp->v_repq_ch, TBF_REPROCESS,
 2168                     tbf_reprocess_q, vifp);
 2169 }
 2170 
 2171 /* function that will selectively discard a member of the queue
 2172  * based on the precedence value and the priority
 2173  */
 2174 static int
 2175 tbf_dq_sel(struct vif *vifp, struct ip *ip)
 2176 {
 2177         u_int p;
 2178         struct mbuf **mp, *m;
 2179         int s = splsoftnet();
 2180 
 2181         p = priority(vifp, ip);
 2182 
 2183         for (mp = &vifp->tbf_q, m = *mp;
 2184             m != NULL;
 2185             mp = &m->m_nextpkt, m = *mp) {
 2186                 if (p > priority(vifp, mtod(m, struct ip *))) {
 2187                         if ((*mp = m->m_nextpkt) == NULL)
 2188                                 vifp->tbf_t = mp;
 2189                         --vifp->tbf_q_len;
 2190 
 2191                         m_freem(m);
 2192                         mrtstat.mrts_drop_sel++;
 2193                         splx(s);
 2194                         return (1);
 2195                 }
 2196         }
 2197         splx(s);
 2198         return (0);
 2199 }
 2200 
 2201 static void
 2202 tbf_send_packet(struct vif *vifp, struct mbuf *m)
 2203 {
 2204         int error;
 2205         int s = splsoftnet();
 2206 
 2207         if (vifp->v_flags & VIFF_TUNNEL) {
 2208                 /* If tunnel options */
 2209                 ip_output(m, (struct mbuf *)NULL, &vifp->v_route,
 2210                     IP_FORWARDING, (struct ip_moptions *)NULL,
 2211                     (struct socket *)NULL);
 2212         } else {
 2213                 /* if physical interface option, extract the options and then send */
 2214                 struct ip_moptions imo;
 2215 
 2216                 imo.imo_multicast_ifp = vifp->v_ifp;
 2217                 imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1;
 2218                 imo.imo_multicast_loop = 1;
 2219 #ifdef RSVP_ISI
 2220                 imo.imo_multicast_vif = -1;
 2221 #endif
 2222 
 2223                 error = ip_output(m, (struct mbuf *)NULL, (struct route *)NULL,
 2224                     IP_FORWARDING|IP_MULTICASTOPTS, &imo,
 2225                     (struct socket *)NULL);
 2226 
 2227                 if (mrtdebug & DEBUG_XMIT)
 2228                         log(LOG_DEBUG, "phyint_send on vif %ld err %d\n",
 2229                             (long)(vifp - viftable), error);
 2230         }
 2231         splx(s);
 2232 }
 2233 
 2234 /* determine the current time and then
 2235  * the elapsed time (between the last time and time now)
 2236  * in milliseconds & update the no. of tokens in the bucket
 2237  */
 2238 static void
 2239 tbf_update_tokens(struct vif *vifp)
 2240 {
 2241         struct timeval tp;
 2242         u_int32_t tm;
 2243         int s = splsoftnet();
 2244 
 2245         microtime(&tp);
 2246 
 2247         TV_DELTA(tp, vifp->tbf_last_pkt_t, tm);
 2248 
 2249         /*
 2250          * This formula is actually
 2251          * "time in seconds" * "bytes/second".
 2252          *
 2253          * (tm / 1000000) * (v_rate_limit * 1000 * (1000/1024) / 8)
 2254          *
 2255          * The (1000/1024) was introduced in add_vif to optimize
 2256          * this divide into a shift.
 2257          */
 2258         vifp->tbf_n_tok += tm * vifp->v_rate_limit / 8192;
 2259         vifp->tbf_last_pkt_t = tp;
 2260 
 2261         if (vifp->tbf_n_tok > MAX_BKT_SIZE)
 2262                 vifp->tbf_n_tok = MAX_BKT_SIZE;
 2263 
 2264         splx(s);
 2265 }
 2266 
 2267 static int
 2268 priority(struct vif *vifp, struct ip *ip)
 2269 {
 2270         int prio = 50;  /* the lowest priority -- default case */
 2271 
 2272         /* temporary hack; may add general packet classifier some day */
 2273 
 2274         /*
 2275          * The UDP port space is divided up into four priority ranges:
 2276          * [0, 16384)     : unclassified - lowest priority
 2277          * [16384, 32768) : audio - highest priority
 2278          * [32768, 49152) : whiteboard - medium priority
 2279          * [49152, 65536) : video - low priority
 2280          */
 2281         if (ip->ip_p == IPPROTO_UDP) {
 2282                 struct udphdr *udp = (struct udphdr *)(((char *)ip) + (ip->ip_hl << 2));
 2283 
 2284                 switch (ntohs(udp->uh_dport) & 0xc000) {
 2285                 case 0x4000:
 2286                         prio = 70;
 2287                         break;
 2288                 case 0x8000:
 2289                         prio = 60;
 2290                         break;
 2291                 case 0xc000:
 2292                         prio = 55;
 2293                         break;
 2294                 }
 2295 
 2296                 if (tbfdebug > 1)
 2297                         log(LOG_DEBUG, "port %x prio %d\n",
 2298                             ntohs(udp->uh_dport), prio);
 2299         }
 2300 
 2301         return (prio);
 2302 }
 2303 
 2304 /*
 2305  * End of token bucket filter modifications
 2306  */
 2307 #ifdef RSVP_ISI
 2308 int
 2309 ip_rsvp_vif_init(struct socket *so, struct mbuf *m)
 2310 {
 2311         int vifi, s;
 2312 
 2313         if (rsvpdebug)
 2314                 printf("ip_rsvp_vif_init: so_type = %d, pr_protocol = %d\n",
 2315                     so->so_type, so->so_proto->pr_protocol);
 2316 
 2317         if (so->so_type != SOCK_RAW ||
 2318             so->so_proto->pr_protocol != IPPROTO_RSVP)
 2319                 return (EOPNOTSUPP);
 2320 
 2321         /* Check mbuf. */
 2322         if (m == NULL || m->m_len != sizeof(int)) {
 2323                 return (EINVAL);
 2324         }
 2325         vifi = *(mtod(m, int *));
 2326 
 2327         if (rsvpdebug)
 2328                 printf("ip_rsvp_vif_init: vif = %d rsvp_on = %d\n",
 2329                        vifi, rsvp_on);
 2330 
 2331         s = splsoftnet();
 2332 
 2333         /* Check vif. */
 2334         if (!legal_vif_num(vifi)) {
 2335                 splx(s);
 2336                 return (EADDRNOTAVAIL);
 2337         }
 2338 
 2339         /* Check if socket is available. */
 2340         if (viftable[vifi].v_rsvpd != NULL) {
 2341                 splx(s);
 2342                 return (EADDRINUSE);
 2343         }
 2344 
 2345         viftable[vifi].v_rsvpd = so;
 2346         /*
 2347          * This may seem silly, but we need to be sure we don't over-increment
 2348          * the RSVP counter, in case something slips up.
 2349          */
 2350         if (!viftable[vifi].v_rsvp_on) {
 2351                 viftable[vifi].v_rsvp_on = 1;
 2352                 rsvp_on++;
 2353         }
 2354 
 2355         splx(s);
 2356         return (0);
 2357 }
 2358 
 2359 int
 2360 ip_rsvp_vif_done(struct socket *so, struct mbuf *m)
 2361 {
 2362         int vifi, s;
 2363 
 2364         if (rsvpdebug)
 2365                 printf("ip_rsvp_vif_done: so_type = %d, pr_protocol = %d\n",
 2366                     so->so_type, so->so_proto->pr_protocol);
 2367 
 2368         if (so->so_type != SOCK_RAW ||
 2369             so->so_proto->pr_protocol != IPPROTO_RSVP)
 2370                 return (EOPNOTSUPP);
 2371 
 2372         /* Check mbuf. */
 2373         if (m == NULL || m->m_len != sizeof(int)) {
 2374                 return (EINVAL);
 2375         }
 2376         vifi = *(mtod(m, int *));
 2377 
 2378         s = splsoftnet();
 2379 
 2380         /* Check vif. */
 2381         if (!legal_vif_num(vifi)) {
 2382                 splx(s);
 2383                 return (EADDRNOTAVAIL);
 2384         }
 2385 
 2386         if (rsvpdebug)
 2387                 printf("ip_rsvp_vif_done: v_rsvpd = %x so = %x\n",
 2388                     viftable[vifi].v_rsvpd, so);
 2389 
 2390         viftable[vifi].v_rsvpd = NULL;
 2391         /*
 2392          * This may seem silly, but we need to be sure we don't over-decrement
 2393          * the RSVP counter, in case something slips up.
 2394          */
 2395         if (viftable[vifi].v_rsvp_on) {
 2396                 viftable[vifi].v_rsvp_on = 0;
 2397                 rsvp_on--;
 2398         }
 2399 
 2400         splx(s);
 2401         return (0);
 2402 }
 2403 
 2404 void
 2405 ip_rsvp_force_done(struct socket *so)
 2406 {
 2407         int vifi, s;
 2408 
 2409         /* Don't bother if it is not the right type of socket. */
 2410         if (so->so_type != SOCK_RAW ||
 2411             so->so_proto->pr_protocol != IPPROTO_RSVP)
 2412                 return;
 2413 
 2414         s = splsoftnet();
 2415 
 2416         /*
 2417          * The socket may be attached to more than one vif...this
 2418          * is perfectly legal.
 2419          */
 2420         for (vifi = 0; vifi < numvifs; vifi++) {
 2421                 if (viftable[vifi].v_rsvpd == so) {
 2422                         viftable[vifi].v_rsvpd = NULL;
 2423                         /*
 2424                          * This may seem silly, but we need to be sure we don't
 2425                          * over-decrement the RSVP counter, in case something
 2426                          * slips up.
 2427                          */
 2428                         if (viftable[vifi].v_rsvp_on) {
 2429                                 viftable[vifi].v_rsvp_on = 0;
 2430                                 rsvp_on--;
 2431                         }
 2432                 }
 2433         }
 2434 
 2435         splx(s);
 2436         return;
 2437 }
 2438 
 2439 void
 2440 rsvp_input(struct mbuf *m, struct ifnet *ifp)
 2441 {
 2442         int vifi, s;
 2443         struct ip *ip = mtod(m, struct ip *);
 2444         static struct sockaddr_in rsvp_src = { sizeof(sin), AF_INET };
 2445 
 2446         if (rsvpdebug)
 2447                 printf("rsvp_input: rsvp_on %d\n", rsvp_on);
 2448 
 2449         /*
 2450          * Can still get packets with rsvp_on = 0 if there is a local member
 2451          * of the group to which the RSVP packet is addressed.  But in this
 2452          * case we want to throw the packet away.
 2453          */
 2454         if (!rsvp_on) {
 2455                 m_freem(m);
 2456                 return;
 2457         }
 2458 
 2459         /*
 2460          * If the old-style non-vif-associated socket is set, then use
 2461          * it and ignore the new ones.
 2462          */
 2463         if (ip_rsvpd != NULL) {
 2464                 if (rsvpdebug)
 2465                         printf("rsvp_input: "
 2466                             "Sending packet up old-style socket\n");
 2467                 rip_input(m);   /*XXX*/
 2468                 return;
 2469         }
 2470 
 2471         s = splsoftnet();
 2472 
 2473         if (rsvpdebug)
 2474                 printf("rsvp_input: check vifs\n");
 2475 
 2476         /* Find which vif the packet arrived on. */
 2477         for (vifi = 0; vifi < numvifs; vifi++) {
 2478                 if (viftable[vifi].v_ifp == ifp)
 2479                         break;
 2480         }
 2481 
 2482         if (vifi == numvifs) {
 2483                 /* Can't find vif packet arrived on. Drop packet. */
 2484                 if (rsvpdebug)
 2485                         printf("rsvp_input: "
 2486                             "Can't find vif for packet...dropping it.\n");
 2487                 m_freem(m);
 2488                 splx(s);
 2489                 return;
 2490         }
 2491 
 2492         if (rsvpdebug)
 2493                 printf("rsvp_input: check socket\n");
 2494 
 2495         if (viftable[vifi].v_rsvpd == NULL) {
 2496                 /*
 2497                  * drop packet, since there is no specific socket for this
 2498                  * interface
 2499                  */
 2500                 if (rsvpdebug)
 2501                         printf("rsvp_input: No socket defined for vif %d\n",
 2502                             vifi);
 2503                 m_freem(m);
 2504                 splx(s);
 2505                 return;
 2506         }
 2507 
 2508         rsvp_src.sin_addr = ip->ip_src;
 2509 
 2510         if (rsvpdebug && m)
 2511                 printf("rsvp_input: m->m_len = %d, sbspace() = %d\n",
 2512                     m->m_len, sbspace(&viftable[vifi].v_rsvpd->so_rcv));
 2513 
 2514         if (socket_send(viftable[vifi].v_rsvpd, m, &rsvp_src) < 0)
 2515                 if (rsvpdebug)
 2516                         printf("rsvp_input: Failed to append to socket\n");
 2517         else
 2518                 if (rsvpdebug)
 2519                         printf("rsvp_input: send packet up\n");
 2520 
 2521         splx(s);
 2522 }
 2523 #endif /* RSVP_ISI */
 2524 
 2525 /*
 2526  * Code for bandwidth monitors
 2527  */
 2528 
/*
 * Define common interface for timeval-related methods.
 *
 * These are thin wrappers over timercmp()/timersub()/timeradd() so that
 * the bandwidth monitoring code uses one naming scheme throughout.
 */
/* Compare *tvp against *uvp using the relational operator `cmp'. */
#define BW_TIMEVALCMP(tvp, uvp, cmp) timercmp((tvp), (uvp), cmp)
/* In-place subtraction: the result of (*vvp - *uvp) is stored in *vvp. */
#define BW_TIMEVALDECR(vvp, uvp) timersub((vvp), (uvp), (vvp))
/* In-place addition: the result of (*vvp + *uvp) is stored in *vvp. */
#define BW_TIMEVALADD(vvp, uvp) timeradd((vvp), (uvp), (vvp))
 2535 
 2536 static uint32_t
 2537 compute_bw_meter_flags(struct bw_upcall *req)
 2538 {
 2539     uint32_t flags = 0;
 2540 
 2541     if (req->bu_flags & BW_UPCALL_UNIT_PACKETS)
 2542         flags |= BW_METER_UNIT_PACKETS;
 2543     if (req->bu_flags & BW_UPCALL_UNIT_BYTES)
 2544         flags |= BW_METER_UNIT_BYTES;
 2545     if (req->bu_flags & BW_UPCALL_GEQ)
 2546         flags |= BW_METER_GEQ;
 2547     if (req->bu_flags & BW_UPCALL_LEQ)
 2548         flags |= BW_METER_LEQ;
 2549 
 2550     return flags;
 2551 }
 2552 
 2553 /*
 2554  * Add a bw_meter entry
 2555  */
 2556 static int
 2557 add_bw_upcall(struct mbuf *m)
 2558 {
 2559     int s;
 2560     struct mfc *mfc;
 2561     struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC,
 2562                 BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC };
 2563     struct timeval now;
 2564     struct bw_meter *x;
 2565     uint32_t flags;
 2566     struct bw_upcall *req;
 2567 
 2568     if (m == NULL || m->m_len < sizeof(struct bw_upcall))
 2569         return EINVAL;
 2570 
 2571     req = mtod(m, struct bw_upcall *);
 2572 
 2573     if (!(mrt_api_config & MRT_MFC_BW_UPCALL))
 2574         return EOPNOTSUPP;
 2575 
 2576     /* Test if the flags are valid */
 2577     if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES)))
 2578         return EINVAL;
 2579     if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)))
 2580         return EINVAL;
 2581     if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
 2582             == (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
 2583         return EINVAL;
 2584 
 2585     /* Test if the threshold time interval is valid */
 2586     if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <))
 2587         return EINVAL;
 2588 
 2589     flags = compute_bw_meter_flags(req);
 2590 
 2591     /*
 2592      * Find if we have already same bw_meter entry
 2593      */
 2594     s = splsoftnet();
 2595     mfc = mfc_find(&req->bu_src, &req->bu_dst);
 2596     if (mfc == NULL) {
 2597         splx(s);
 2598         return EADDRNOTAVAIL;
 2599     }
 2600     for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) {
 2601         if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
 2602                            &req->bu_threshold.b_time, ==)) &&
 2603             (x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
 2604             (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
 2605             (x->bm_flags & BW_METER_USER_FLAGS) == flags)  {
 2606             splx(s);
 2607             return 0;           /* XXX Already installed */
 2608         }
 2609     }
 2610 
 2611     /* Allocate the new bw_meter entry */
 2612     x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT);
 2613     if (x == NULL) {
 2614         splx(s);
 2615         return ENOBUFS;
 2616     }
 2617 
 2618     /* Set the new bw_meter entry */
 2619     x->bm_threshold.b_time = req->bu_threshold.b_time;
 2620     microtime(&now);
 2621     x->bm_start_time = now;
 2622     x->bm_threshold.b_packets = req->bu_threshold.b_packets;
 2623     x->bm_threshold.b_bytes = req->bu_threshold.b_bytes;
 2624     x->bm_measured.b_packets = 0;
 2625     x->bm_measured.b_bytes = 0;
 2626     x->bm_flags = flags;
 2627     x->bm_time_next = NULL;
 2628     x->bm_time_hash = BW_METER_BUCKETS;
 2629 
 2630     /* Add the new bw_meter entry to the front of entries for this MFC */
 2631     x->bm_mfc = mfc;
 2632     x->bm_mfc_next = mfc->mfc_bw_meter;
 2633     mfc->mfc_bw_meter = x;
 2634     schedule_bw_meter(x, &now);
 2635     splx(s);
 2636 
 2637     return 0;
 2638 }
 2639 
 2640 static void
 2641 free_bw_list(struct bw_meter *list)
 2642 {
 2643     while (list != NULL) {
 2644         struct bw_meter *x = list;
 2645 
 2646         list = list->bm_mfc_next;
 2647         unschedule_bw_meter(x);
 2648         free(x, M_BWMETER);
 2649     }
 2650 }
 2651 
 2652 /*
 2653  * Delete one or multiple bw_meter entries
 2654  */
 2655 static int
 2656 del_bw_upcall(struct mbuf *m)
 2657 {
 2658     int s;
 2659     struct mfc *mfc;
 2660     struct bw_meter *x;
 2661     struct bw_upcall *req;
 2662 
 2663     if (m == NULL || m->m_len < sizeof(struct bw_upcall))
 2664         return EINVAL;
 2665 
 2666     req = mtod(m, struct bw_upcall *);
 2667 
 2668     if (!(mrt_api_config & MRT_MFC_BW_UPCALL))
 2669         return EOPNOTSUPP;
 2670 
 2671     s = splsoftnet();
 2672     /* Find the corresponding MFC entry */
 2673     mfc = mfc_find(&req->bu_src, &req->bu_dst);
 2674     if (mfc == NULL) {
 2675         splx(s);
 2676         return EADDRNOTAVAIL;
 2677     } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) {
 2678         /*
 2679          * Delete all bw_meter entries for this mfc
 2680          */
 2681         struct bw_meter *list;
 2682 
 2683         list = mfc->mfc_bw_meter;
 2684         mfc->mfc_bw_meter = NULL;
 2685         free_bw_list(list);
 2686         splx(s);
 2687         return 0;
 2688     } else {                    /* Delete a single bw_meter entry */
 2689         struct bw_meter *prev;
 2690         uint32_t flags = 0;
 2691 
 2692         flags = compute_bw_meter_flags(req);
 2693 
 2694         /* Find the bw_meter entry to delete */
 2695         for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL;
 2696              prev = x, x = x->bm_mfc_next) {
 2697             if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
 2698                                &req->bu_threshold.b_time, ==)) &&
 2699                 (x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
 2700                 (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
 2701                 (x->bm_flags & BW_METER_USER_FLAGS) == flags)
 2702                 break;
 2703         }
 2704         if (x != NULL) { /* Delete entry from the list for this MFC */
 2705             if (prev != NULL)
 2706                 prev->bm_mfc_next = x->bm_mfc_next;     /* remove from middle*/
 2707             else
 2708                 x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */
 2709 
 2710             unschedule_bw_meter(x);
 2711             splx(s);
 2712             /* Free the bw_meter entry */
 2713             free(x, M_BWMETER);
 2714             return 0;
 2715         } else {
 2716             splx(s);
 2717             return EINVAL;
 2718         }
 2719     }
 2720     /* NOTREACHED */
 2721 }
 2722 
 2723 /*
 2724  * Perform bandwidth measurement processing that may result in an upcall
 2725  */
 2726 static void
 2727 bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp)
 2728 {
 2729     struct timeval delta;
 2730 
 2731     delta = *nowp;
 2732     BW_TIMEVALDECR(&delta, &x->bm_start_time);
 2733 
 2734     if (x->bm_flags & BW_METER_GEQ) {
 2735         /*
 2736          * Processing for ">=" type of bw_meter entry
 2737          */
 2738         if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
 2739             /* Reset the bw_meter entry */
 2740             x->bm_start_time = *nowp;
 2741             x->bm_measured.b_packets = 0;
 2742             x->bm_measured.b_bytes = 0;
 2743             x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
 2744         }
 2745 
 2746         /* Record that a packet is received */
 2747         x->bm_measured.b_packets++;
 2748         x->bm_measured.b_bytes += plen;
 2749 
 2750         /*
 2751          * Test if we should deliver an upcall
 2752          */
 2753         if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) {
 2754             if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
 2755                  (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) ||
 2756                 ((x->bm_flags & BW_METER_UNIT_BYTES) &&
 2757                  (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) {
 2758                 /* Prepare an upcall for delivery */
 2759                 bw_meter_prepare_upcall(x, nowp);
 2760                 x->bm_flags |= BW_METER_UPCALL_DELIVERED;
 2761             }
 2762         }
 2763     } else if (x->bm_flags & BW_METER_LEQ) {
 2764         /*
 2765          * Processing for "<=" type of bw_meter entry
 2766          */
 2767         if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
 2768             /*
 2769              * We are behind time with the multicast forwarding table
 2770              * scanning for "<=" type of bw_meter entries, so test now
 2771              * if we should deliver an upcall.
 2772              */
 2773             if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
 2774                  (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
 2775                 ((x->bm_flags & BW_METER_UNIT_BYTES) &&
 2776                  (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
 2777                 /* Prepare an upcall for delivery */
 2778                 bw_meter_prepare_upcall(x, nowp);
 2779             }
 2780             /* Reschedule the bw_meter entry */
 2781             unschedule_bw_meter(x);
 2782             schedule_bw_meter(x, nowp);
 2783         }
 2784 
 2785         /* Record that a packet is received */
 2786         x->bm_measured.b_packets++;
 2787         x->bm_measured.b_bytes += plen;
 2788 
 2789         /*
 2790          * Test if we should restart the measuring interval
 2791          */
 2792         if ((x->bm_flags & BW_METER_UNIT_PACKETS &&
 2793              x->bm_measured.b_packets <= x->bm_threshold.b_packets) ||
 2794             (x->bm_flags & BW_METER_UNIT_BYTES &&
 2795              x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) {
 2796             /* Don't restart the measuring interval */
 2797         } else {
 2798             /* Do restart the measuring interval */
 2799             /*
 2800              * XXX: note that we don't unschedule and schedule, because this
 2801              * might be too much overhead per packet. Instead, when we process
 2802              * all entries for a given timer hash bin, we check whether it is
 2803              * really a timeout. If not, we reschedule at that time.
 2804              */
 2805             x->bm_start_time = *nowp;
 2806             x->bm_measured.b_packets = 0;
 2807             x->bm_measured.b_bytes = 0;
 2808             x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
 2809         }
 2810     }
 2811 }
 2812 
 2813 /*
 2814  * Prepare a bandwidth-related upcall
 2815  */
 2816 static void
 2817 bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp)
 2818 {
 2819     struct timeval delta;
 2820     struct bw_upcall *u;
 2821 
 2822     /*
 2823      * Compute the measured time interval
 2824      */
 2825     delta = *nowp;
 2826     BW_TIMEVALDECR(&delta, &x->bm_start_time);
 2827 
 2828     /*
 2829      * If there are too many pending upcalls, deliver them now
 2830      */
 2831     if (bw_upcalls_n >= BW_UPCALLS_MAX)
 2832         bw_upcalls_send();
 2833 
 2834     /*
 2835      * Set the bw_upcall entry
 2836      */
 2837     u = &bw_upcalls[bw_upcalls_n++];
 2838     u->bu_src = x->bm_mfc->mfc_origin;
 2839     u->bu_dst = x->bm_mfc->mfc_mcastgrp;
 2840     u->bu_threshold.b_time = x->bm_threshold.b_time;
 2841     u->bu_threshold.b_packets = x->bm_threshold.b_packets;
 2842     u->bu_threshold.b_bytes = x->bm_threshold.b_bytes;
 2843     u->bu_measured.b_time = delta;
 2844     u->bu_measured.b_packets = x->bm_measured.b_packets;
 2845     u->bu_measured.b_bytes = x->bm_measured.b_bytes;
 2846     u->bu_flags = 0;
 2847     if (x->bm_flags & BW_METER_UNIT_PACKETS)
 2848         u->bu_flags |= BW_UPCALL_UNIT_PACKETS;
 2849     if (x->bm_flags & BW_METER_UNIT_BYTES)
 2850         u->bu_flags |= BW_UPCALL_UNIT_BYTES;
 2851     if (x->bm_flags & BW_METER_GEQ)
 2852         u->bu_flags |= BW_UPCALL_GEQ;
 2853     if (x->bm_flags & BW_METER_LEQ)
 2854         u->bu_flags |= BW_UPCALL_LEQ;
 2855 }
 2856 
 2857 /*
 2858  * Send the pending bandwidth-related upcalls
 2859  */
 2860 static void
 2861 bw_upcalls_send(void)
 2862 {
 2863     struct mbuf *m;
 2864     int len = bw_upcalls_n * sizeof(bw_upcalls[0]);
 2865     struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
 2866     static struct igmpmsg igmpmsg = { 0,                /* unused1 */
 2867                                       0,                /* unused2 */
 2868                                       IGMPMSG_BW_UPCALL,/* im_msgtype */
 2869                                       0,                /* im_mbz  */
 2870                                       0,                /* im_vif  */
 2871                                       0,                /* unused3 */
 2872                                       { 0 },            /* im_src  */
 2873                                       { 0 } };          /* im_dst  */
 2874 
 2875     if (bw_upcalls_n == 0)
 2876         return;                 /* No pending upcalls */
 2877 
 2878     bw_upcalls_n = 0;
 2879 
 2880     /*
 2881      * Allocate a new mbuf, initialize it with the header and
 2882      * the payload for the pending calls.
 2883      */
 2884     MGETHDR(m, M_DONTWAIT, MT_HEADER);
 2885     if (m == NULL) {
 2886         log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n");
 2887         return;
 2888     }
 2889 
 2890     m->m_len = m->m_pkthdr.len = 0;
 2891     m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg);
 2892     m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&bw_upcalls[0]);
 2893 
 2894     /*
 2895      * Send the upcalls
 2896      * XXX do we need to set the address in k_igmpsrc ?
 2897      */
 2898     mrtstat.mrts_upcalls++;
 2899     if (socket_send(ip_mrouter, m, &k_igmpsrc) < 0) {
 2900         log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n");
 2901         ++mrtstat.mrts_upq_sockfull;
 2902     }
 2903 }
 2904 
 2905 /*
 2906  * Compute the timeout hash value for the bw_meter entries
 2907  */
 2908 #define BW_METER_TIMEHASH(bw_meter, hash)                               \
 2909     do {                                                                \
 2910         struct timeval next_timeval = (bw_meter)->bm_start_time;        \
 2911                                                                         \
 2912         BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \
 2913         (hash) = next_timeval.tv_sec;                                   \
 2914         if (next_timeval.tv_usec)                                       \
 2915             (hash)++; /* XXX: make sure we don't timeout early */       \
 2916         (hash) %= BW_METER_BUCKETS;                                     \
 2917     } while (/*CONSTCOND*/ 0)
 2918 
 2919 /*
 2920  * Schedule a timer to process periodically bw_meter entry of type "<="
 2921  * by linking the entry in the proper hash bucket.
 2922  */
 2923 static void
 2924 schedule_bw_meter(struct bw_meter *x, struct timeval *nowp)
 2925 {
 2926     int time_hash;
 2927 
 2928     if (!(x->bm_flags & BW_METER_LEQ))
 2929         return;         /* XXX: we schedule timers only for "<=" entries */
 2930 
 2931     /*
 2932      * Reset the bw_meter entry
 2933      */
 2934     x->bm_start_time = *nowp;
 2935     x->bm_measured.b_packets = 0;
 2936     x->bm_measured.b_bytes = 0;
 2937     x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
 2938 
 2939     /*
 2940      * Compute the timeout hash value and insert the entry
 2941      */
 2942     BW_METER_TIMEHASH(x, time_hash);
 2943     x->bm_time_next = bw_meter_timers[time_hash];
 2944     bw_meter_timers[time_hash] = x;
 2945     x->bm_time_hash = time_hash;
 2946 }
 2947 
 2948 /*
 2949  * Unschedule the periodic timer that processes bw_meter entry of type "<="
 2950  * by removing the entry from the proper hash bucket.
 2951  */
 2952 static void
 2953 unschedule_bw_meter(struct bw_meter *x)
 2954 {
 2955     int time_hash;
 2956     struct bw_meter *prev, *tmp;
 2957 
 2958     if (!(x->bm_flags & BW_METER_LEQ))
 2959         return;         /* XXX: we schedule timers only for "<=" entries */
 2960 
 2961     /*
 2962      * Compute the timeout hash value and delete the entry
 2963      */
 2964     time_hash = x->bm_time_hash;
 2965     if (time_hash >= BW_METER_BUCKETS)
 2966         return;         /* Entry was not scheduled */
 2967 
 2968     for (prev = NULL, tmp = bw_meter_timers[time_hash];
 2969              tmp != NULL; prev = tmp, tmp = tmp->bm_time_next)
 2970         if (tmp == x)
 2971             break;
 2972 
 2973     if (tmp == NULL)
 2974         panic("unschedule_bw_meter: bw_meter entry not found");
 2975 
 2976     if (prev != NULL)
 2977         prev->bm_time_next = x->bm_time_next;
 2978     else
 2979         bw_meter_timers[time_hash] = x->bm_time_next;
 2980 
 2981     x->bm_time_next = NULL;
 2982     x->bm_time_hash = BW_METER_BUCKETS;
 2983 }
 2984 
 2985 /*
 2986  * Process all "<=" type of bw_meter that should be processed now,
 2987  * and for each entry prepare an upcall if necessary. Each processed
 2988  * entry is rescheduled again for the (periodic) processing.
 2989  *
 2990  * This is run periodically (once per second normally). On each round,
 2991  * all the potentially matching entries are in the hash slot that we are
 2992  * looking at.
 2993  */
 2994 static void
 2995 bw_meter_process(void)
 2996 {
 2997     int s;
 2998     static uint32_t last_tv_sec;        /* last time we processed this */
 2999 
 3000     uint32_t loops;
 3001     int i;
 3002     struct timeval now, process_endtime;
 3003 
 3004     microtime(&now);
 3005     if (last_tv_sec == now.tv_sec)
 3006         return;         /* nothing to do */
 3007 
 3008     loops = now.tv_sec - last_tv_sec;
 3009     last_tv_sec = now.tv_sec;
 3010     if (loops > BW_METER_BUCKETS)
 3011         loops = BW_METER_BUCKETS;
 3012 
 3013     s = splsoftnet();
 3014     /*
 3015      * Process all bins of bw_meter entries from the one after the last
 3016      * processed to the current one. On entry, i points to the last bucket
 3017      * visited, so we need to increment i at the beginning of the loop.
 3018      */
 3019     for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) {
 3020         struct bw_meter *x, *tmp_list;
 3021 
 3022         if (++i >= BW_METER_BUCKETS)
 3023             i = 0;
 3024 
 3025         /* Disconnect the list of bw_meter entries from the bin */
 3026         tmp_list = bw_meter_timers[i];
 3027         bw_meter_timers[i] = NULL;
 3028 
 3029         /* Process the list of bw_meter entries */
 3030         while (tmp_list != NULL) {
 3031             x = tmp_list;
 3032             tmp_list = tmp_list->bm_time_next;
 3033 
 3034             /* Test if the time interval is over */
 3035             process_endtime = x->bm_start_time;
 3036             BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time);
 3037             if (BW_TIMEVALCMP(&process_endtime, &now, >)) {
 3038                 /* Not yet: reschedule, but don't reset */
 3039                 int time_hash;
 3040 
 3041                 BW_METER_TIMEHASH(x, time_hash);
 3042                 if (time_hash == i && process_endtime.tv_sec == now.tv_sec) {
 3043                     /*
 3044                      * XXX: somehow the bin processing is a bit ahead of time.
 3045                      * Put the entry in the next bin.
 3046                      */
 3047                     if (++time_hash >= BW_METER_BUCKETS)
 3048                         time_hash = 0;
 3049                 }
 3050                 x->bm_time_next = bw_meter_timers[time_hash];
 3051                 bw_meter_timers[time_hash] = x;
 3052                 x->bm_time_hash = time_hash;
 3053 
 3054                 continue;
 3055             }
 3056 
 3057             /*
 3058              * Test if we should deliver an upcall
 3059              */
 3060             if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
 3061                  (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
 3062                 ((x->bm_flags & BW_METER_UNIT_BYTES) &&
 3063                  (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
 3064                 /* Prepare an upcall for delivery */
 3065                 bw_meter_prepare_upcall(x, &now);
 3066             }
 3067 
 3068             /*
 3069              * Reschedule for next processing
 3070              */
 3071             schedule_bw_meter(x, &now);
 3072         }
 3073     }
 3074 
 3075     /* Send all upcalls that are pending delivery */
 3076     bw_upcalls_send();
 3077 
 3078     splx(s);
 3079 }
 3080 
 3081 /*
 3082  * A periodic function for sending all upcalls that are pending delivery
 3083  */
 3084 static void
 3085 expire_bw_upcalls_send(void *unused)
 3086 {
 3087     int s;
 3088 
 3089     s = splsoftnet();
 3090     bw_upcalls_send();
 3091     splx(s);
 3092 
 3093     callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD,
 3094                   expire_bw_upcalls_send, NULL);
 3095 }
 3096 
 3097 /*
 3098  * A periodic function for periodic scanning of the multicast forwarding
 3099  * table for processing all "<=" bw_meter entries.
 3100  */
 3101 static void
 3102 expire_bw_meter_process(void *unused)
 3103 {
 3104     if (mrt_api_config & MRT_MFC_BW_UPCALL)
 3105         bw_meter_process();
 3106 
 3107     callout_reset(&bw_meter_ch, BW_METER_PERIOD,
 3108                   expire_bw_meter_process, NULL);
 3109 }
 3110 
 3111 /*
 3112  * End of bandwidth monitoring code
 3113  */
 3114 
 3115 #ifdef PIM
 3116 /*
 3117  * Send the packet up to the user daemon, or eventually do kernel encapsulation
 3118  */
 3119 static int
 3120 pim_register_send(struct ip *ip, struct vif *vifp,
 3121         struct mbuf *m, struct mfc *rt)
 3122 {
 3123     struct mbuf *mb_copy, *mm;
 3124 
 3125     if (mrtdebug & DEBUG_PIM)
 3126         log(LOG_DEBUG, "pim_register_send: ");
 3127 
 3128     mb_copy = pim_register_prepare(ip, m);
 3129     if (mb_copy == NULL)
 3130         return ENOBUFS;
 3131 
 3132     /*
 3133      * Send all the fragments. Note that the mbuf for each fragment
 3134      * is freed by the sending machinery.
 3135      */
 3136     for (mm = mb_copy; mm; mm = mb_copy) {
 3137         mb_copy = mm->m_nextpkt;
 3138         mm->m_nextpkt = NULL;
 3139         mm = m_pullup(mm, sizeof(struct ip));
 3140         if (mm != NULL) {
 3141             ip = mtod(mm, struct ip *);
 3142             if ((mrt_api_config & MRT_MFC_RP) &&
 3143                 !in_nullhost(rt->mfc_rp)) {
 3144                 pim_register_send_rp(ip, vifp, mm, rt);
 3145             } else {
 3146                 pim_register_send_upcall(ip, vifp, mm, rt);
 3147             }
 3148         }
 3149     }
 3150 
 3151     return 0;
 3152 }
 3153 
 3154 /*
 3155  * Return a copy of the data packet that is ready for PIM Register
 3156  * encapsulation.
 3157  * XXX: Note that in the returned copy the IP header is a valid one.
 3158  */
 3159 static struct mbuf *
 3160 pim_register_prepare(struct ip *ip, struct mbuf *m)
 3161 {
 3162     struct mbuf *mb_copy = NULL;
 3163     int mtu;
 3164 
 3165     /* Take care of delayed checksums */
 3166     if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
 3167         in_delayed_cksum(m);
 3168         m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
 3169     }
 3170 
 3171     /*
 3172      * Copy the old packet & pullup its IP header into the
 3173      * new mbuf so we can modify it.
 3174      */
 3175     mb_copy = m_copy(m, 0, M_COPYALL);
 3176     if (mb_copy == NULL)
 3177         return NULL;
 3178     mb_copy = m_pullup(mb_copy, ip->ip_hl << 2);
 3179     if (mb_copy == NULL)
 3180         return NULL;
 3181 
 3182     /* take care of the TTL */
 3183     ip = mtod(mb_copy, struct ip *);
 3184     --ip->ip_ttl;
 3185 
 3186     /* Compute the MTU after the PIM Register encapsulation */
 3187     mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr);
 3188 
 3189     if (ntohs(ip->ip_len) <= mtu) {
 3190         /* Turn the IP header into a valid one */
 3191         ip->ip_sum = 0;
 3192         ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
 3193     } else {
 3194         /* Fragment the packet */
 3195         if (ip_fragment(mb_copy, NULL, mtu) != 0) {
 3196             /* XXX: mb_copy was freed by ip_fragment() */
 3197             return NULL;
 3198         }
 3199     }
 3200     return mb_copy;
 3201 }
 3202 
 3203 /*
 3204  * Send an upcall with the data packet to the user-level process.
 3205  */
 3206 static int
 3207 pim_register_send_upcall(struct ip *ip, struct vif *vifp,
 3208         struct mbuf *mb_copy, struct mfc *rt)
 3209 {
 3210     struct mbuf *mb_first;
 3211     int len = ntohs(ip->ip_len);
 3212     struct igmpmsg *im;
 3213     struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
 3214 
 3215     /*
 3216      * Add a new mbuf with an upcall header
 3217      */
 3218     MGETHDR(mb_first, M_DONTWAIT, MT_HEADER);
 3219     if (mb_first == NULL) {
 3220         m_freem(mb_copy);
 3221         return ENOBUFS;
 3222     }
 3223     mb_first->m_data += max_linkhdr;
 3224     mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg);
 3225     mb_first->m_len = sizeof(struct igmpmsg);
 3226     mb_first->m_next = mb_copy;
 3227 
 3228     /* Send message to routing daemon */
 3229     im = mtod(mb_first, struct igmpmsg *);
 3230     im->im_msgtype      = IGMPMSG_WHOLEPKT;
 3231     im->im_mbz          = 0;
 3232     im->im_vif          = vifp - viftable;
 3233     im->im_src          = ip->ip_src;
 3234     im->im_dst          = ip->ip_dst;
 3235 
 3236     k_igmpsrc.sin_addr  = ip->ip_src;
 3237 
 3238     mrtstat.mrts_upcalls++;
 3239 
 3240     if (socket_send(ip_mrouter, mb_first, &k_igmpsrc) < 0) {
 3241         if (mrtdebug & DEBUG_PIM)
 3242             log(LOG_WARNING,
 3243                 "mcast: pim_register_send_upcall: ip_mrouter socket queue full");
 3244         ++mrtstat.mrts_upq_sockfull;
 3245         return ENOBUFS;
 3246     }
 3247 
 3248     /* Keep statistics */
 3249     pimstat.pims_snd_registers_msgs++;
 3250     pimstat.pims_snd_registers_bytes += len;
 3251 
 3252     return 0;
 3253 }
 3254 
 3255 /*
 3256  * Encapsulate the data packet in PIM Register message and send it to the RP.
 3257  */
 3258 static int
 3259 pim_register_send_rp(struct ip *ip, struct vif *vifp,
 3260         struct mbuf *mb_copy, struct mfc *rt)
 3261 {
 3262     struct mbuf *mb_first;
 3263     struct ip *ip_outer;
 3264     struct pim_encap_pimhdr *pimhdr;
 3265     int len = ntohs(ip->ip_len);
 3266     vifi_t vifi = rt->mfc_parent;
 3267 
 3268     if ((vifi >= numvifs) || in_nullhost(viftable[vifi].v_lcl_addr)) {
 3269         m_freem(mb_copy);
 3270         return EADDRNOTAVAIL;           /* The iif vif is invalid */
 3271     }
 3272 
 3273     /*
 3274      * Add a new mbuf with the encapsulating header
 3275      */
 3276     MGETHDR(mb_first, M_DONTWAIT, MT_HEADER);
 3277     if (mb_first == NULL) {
 3278         m_freem(mb_copy);
 3279         return ENOBUFS;
 3280     }
 3281     mb_first->m_data += max_linkhdr;
 3282     mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
 3283     mb_first->m_next = mb_copy;
 3284 
 3285     mb_first->m_pkthdr.len = len + mb_first->m_len;
 3286 
 3287     /*
 3288      * Fill in the encapsulating IP and PIM header
 3289      */
 3290     ip_outer = mtod(mb_first, struct ip *);
 3291     *ip_outer = pim_encap_iphdr;
 3292     ip_outer->ip_id = ip_newid();
 3293     ip_outer->ip_len = htons(len + sizeof(pim_encap_iphdr) +
 3294                              sizeof(pim_encap_pimhdr));
 3295     ip_outer->ip_src = viftable[vifi].v_lcl_addr;
 3296     ip_outer->ip_dst = rt->mfc_rp;
 3297     /*
 3298      * Copy the inner header TOS to the outer header, and take care of the
 3299      * IP_DF bit.
 3300      */
 3301     ip_outer->ip_tos = ip->ip_tos;
 3302     if (ntohs(ip->ip_off) & IP_DF)
 3303         ip_outer->ip_off |= IP_DF;
 3304     pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer
 3305                                          + sizeof(pim_encap_iphdr));
 3306     *pimhdr = pim_encap_pimhdr;
 3307     /* If the iif crosses a border, set the Border-bit */
 3308     if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & mrt_api_config)
 3309         pimhdr->flags |= htonl(PIM_BORDER_REGISTER);
 3310 
 3311     mb_first->m_data += sizeof(pim_encap_iphdr);
 3312     pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr));
 3313     mb_first->m_data -= sizeof(pim_encap_iphdr);
 3314 
 3315     if (vifp->v_rate_limit == 0)
 3316         tbf_send_packet(vifp, mb_first);
 3317     else
 3318         tbf_control(vifp, mb_first, ip, ntohs(ip_outer->ip_len));
 3319 
 3320     /* Keep statistics */
 3321     pimstat.pims_snd_registers_msgs++;
 3322     pimstat.pims_snd_registers_bytes += len;
 3323 
 3324     return 0;
 3325 }
 3326 
 3327 /*
 3328  * PIM-SMv2 and PIM-DM messages processing.
 3329  * Receives and verifies the PIM control messages, and passes them
 3330  * up to the listening socket, using rip_input().
 3331  * The only message with special processing is the PIM_REGISTER message
 3332  * (used by PIM-SM): the PIM header is stripped off, and the inner packet
 3333  * is passed to if_simloop().
 3334  */
 3335 void
 3336 pim_input(struct mbuf *m, ...)
 3337 {
 3338     struct ip *ip = mtod(m, struct ip *);
 3339     struct pim *pim;
 3340     int minlen;
 3341     int datalen;
 3342     int ip_tos;
 3343     int proto;
 3344     int iphlen;
 3345     va_list ap;
 3346 
 3347     va_start(ap, m);
 3348     iphlen = va_arg(ap, int);
 3349     proto = va_arg(ap, int);
 3350     va_end(ap);
 3351 
 3352     datalen = ntohs(ip->ip_len) - iphlen;
 3353 
 3354     /* Keep statistics */
 3355     pimstat.pims_rcv_total_msgs++;
 3356     pimstat.pims_rcv_total_bytes += datalen;
 3357 
 3358     /*
 3359      * Validate lengths
 3360      */
 3361     if (datalen < PIM_MINLEN) {
 3362         pimstat.pims_rcv_tooshort++;
 3363         log(LOG_ERR, "pim_input: packet size too small %d from %lx\n",
 3364             datalen, (u_long)ip->ip_src.s_addr);
 3365         m_freem(m);
 3366         return;
 3367     }
 3368 
 3369     /*
 3370      * If the packet is at least as big as a REGISTER, go ahead
 3371      * and grab the PIM REGISTER header size, to avoid another
 3372      * possible m_pullup() later.
 3373      *
 3374      * PIM_MINLEN       == pimhdr + u_int32_t == 4 + 4 = 8
 3375      * PIM_REG_MINLEN   == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28
 3376      */
 3377     minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN);
 3378     /*
 3379      * Get the IP and PIM headers in contiguous memory, and
 3380      * possibly the PIM REGISTER header.
 3381      */
 3382     if ((m->m_flags & M_EXT || m->m_len < minlen) &&
 3383         (m = m_pullup(m, minlen)) == NULL) {
 3384         log(LOG_ERR, "pim_input: m_pullup failure\n");
 3385         return;
 3386     }
 3387     /* m_pullup() may have given us a new mbuf so reset ip. */
 3388     ip = mtod(m, struct ip *);
 3389     ip_tos = ip->ip_tos;
 3390 
 3391     /* adjust mbuf to point to the PIM header */
 3392     m->m_data += iphlen;
 3393     m->m_len  -= iphlen;
 3394     pim = mtod(m, struct pim *);
 3395 
 3396     /*
 3397      * Validate checksum. If PIM REGISTER, exclude the data packet.
 3398      *
 3399      * XXX: some older PIMv2 implementations don't make this distinction,
 3400      * so for compatibility reasons perform the checksum over part of the
 3401      * message, and if error, then over the whole message.
 3402      */
 3403     if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) {
 3404         /* do nothing, checksum okay */
 3405     } else if (in_cksum(m, datalen)) {
 3406         pimstat.pims_rcv_badsum++;
 3407         if (mrtdebug & DEBUG_PIM)
 3408             log(LOG_DEBUG, "pim_input: invalid checksum");
 3409         m_freem(m);
 3410         return;
 3411     }
 3412 
 3413     /* PIM version check */
 3414     if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) {
 3415         pimstat.pims_rcv_badversion++;
 3416         log(LOG_ERR, "pim_input: incorrect version %d, expecting %d\n",
 3417             PIM_VT_V(pim->pim_vt), PIM_VERSION);
 3418         m_freem(m);
 3419         return;
 3420     }
 3421 
 3422     /* restore mbuf back to the outer IP */
 3423     m->m_data -= iphlen;
 3424     m->m_len  += iphlen;
 3425 
 3426     if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) {
 3427         /*
 3428          * Since this is a REGISTER, we'll make a copy of the register
 3429          * headers ip + pim + u_int32 + encap_ip, to be passed up to the
 3430          * routing daemon.
 3431          */
 3432         int s;
 3433         struct sockaddr_in dst = { sizeof(dst), AF_INET };
 3434         struct mbuf *mcp;
 3435         struct ip *encap_ip;
 3436         u_int32_t *reghdr;
 3437         struct ifnet *vifp;
 3438 
 3439         s = splsoftnet();
 3440         if ((reg_vif_num >= numvifs) || (reg_vif_num == VIFI_INVALID)) {
 3441             splx(s);
 3442             if (mrtdebug & DEBUG_PIM)
 3443                 log(LOG_DEBUG,
 3444                     "pim_input: register vif not set: %d\n", reg_vif_num);
 3445             m_freem(m);
 3446             return;
 3447         }
 3448         /* XXX need refcnt? */
 3449         vifp = viftable[reg_vif_num].v_ifp;
 3450         splx(s);
 3451 
 3452         /*
 3453          * Validate length
 3454          */
 3455         if (datalen < PIM_REG_MINLEN) {
 3456             pimstat.pims_rcv_tooshort++;
 3457             pimstat.pims_rcv_badregisters++;
 3458             log(LOG_ERR,
 3459                 "pim_input: register packet size too small %d from %lx\n",
 3460                 datalen, (u_long)ip->ip_src.s_addr);
 3461             m_freem(m);
 3462             return;
 3463         }
 3464 
 3465         reghdr = (u_int32_t *)(pim + 1);
 3466         encap_ip = (struct ip *)(reghdr + 1);
 3467 
 3468         if (mrtdebug & DEBUG_PIM) {
 3469             log(LOG_DEBUG,
 3470                 "pim_input[register], encap_ip: %lx -> %lx, encap_ip len %d\n",
 3471                 (u_long)ntohl(encap_ip->ip_src.s_addr),
 3472                 (u_long)ntohl(encap_ip->ip_dst.s_addr),
 3473                 ntohs(encap_ip->ip_len));
 3474         }
 3475 
 3476         /* verify the version number of the inner packet */
 3477         if (encap_ip->ip_v != IPVERSION) {
 3478             pimstat.pims_rcv_badregisters++;
 3479             if (mrtdebug & DEBUG_PIM) {
 3480                 log(LOG_DEBUG, "pim_input: invalid IP version (%d) "
 3481                     "of the inner packet\n", encap_ip->ip_v);
 3482             }
 3483             m_freem(m);
 3484             return;
 3485         }
 3486 
 3487         /* verify the inner packet is destined to a mcast group */
 3488         if (!IN_MULTICAST(encap_ip->ip_dst.s_addr)) {
 3489             pimstat.pims_rcv_badregisters++;
 3490             if (mrtdebug & DEBUG_PIM)
 3491                 log(LOG_DEBUG,
 3492                     "pim_input: inner packet of register is not "
 3493                     "multicast %lx\n",
 3494                     (u_long)ntohl(encap_ip->ip_dst.s_addr));
 3495             m_freem(m);
 3496             return;
 3497         }
 3498 
 3499         /* If a NULL_REGISTER, pass it to the daemon */
 3500         if ((ntohl(*reghdr) & PIM_NULL_REGISTER))
 3501             goto pim_input_to_daemon;
 3502 
 3503         /*
 3504          * Copy the TOS from the outer IP header to the inner IP header.
 3505          */
 3506         if (encap_ip->ip_tos != ip_tos) {
 3507             /* Outer TOS -> inner TOS */
 3508             encap_ip->ip_tos = ip_tos;
 3509             /* Recompute the inner header checksum. Sigh... */
 3510 
 3511             /* adjust mbuf to point to the inner IP header */
 3512             m->m_data += (iphlen + PIM_MINLEN);
 3513             m->m_len  -= (iphlen + PIM_MINLEN);
 3514 
 3515             encap_ip->ip_sum = 0;
 3516             encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2);
 3517 
 3518             /* restore mbuf to point back to the outer IP header */
 3519             m->m_data -= (iphlen + PIM_MINLEN);
 3520             m->m_len  += (iphlen + PIM_MINLEN);
 3521         }
 3522 
 3523         /*
 3524          * Decapsulate the inner IP packet and loopback to forward it
 3525          * as a normal multicast packet. Also, make a copy of the
 3526          *     outer_iphdr + pimhdr + reghdr + encap_iphdr
 3527          * to pass to the daemon later, so it can take the appropriate
 3528          * actions (e.g., send back PIM_REGISTER_STOP).
 3529          * XXX: here m->m_data points to the outer IP header.
 3530          */
 3531         mcp = m_copy(m, 0, iphlen + PIM_REG_MINLEN);
 3532         if (mcp == NULL) {
 3533             log(LOG_ERR,
 3534                 "pim_input: pim register: could not copy register head\n");
 3535             m_freem(m);
 3536             return;
 3537         }
 3538 
 3539         /* Keep statistics */
 3540         /* XXX: registers_bytes include only the encap. mcast pkt */
 3541         pimstat.pims_rcv_registers_msgs++;
 3542         pimstat.pims_rcv_registers_bytes += ntohs(encap_ip->ip_len);
 3543 
 3544         /*
 3545          * forward the inner ip packet; point m_data at the inner ip.
 3546          */
 3547         m_adj(m, iphlen + PIM_MINLEN);
 3548 
 3549         if (mrtdebug & DEBUG_PIM) {
 3550             log(LOG_DEBUG,
 3551                 "pim_input: forwarding decapsulated register: "
 3552                 "src %lx, dst %lx, vif %d\n",
 3553                 (u_long)ntohl(encap_ip->ip_src.s_addr),
 3554                 (u_long)ntohl(encap_ip->ip_dst.s_addr),
 3555                 reg_vif_num);
 3556         }
 3557         /* NB: vifp was collected above; can it change on us? */
 3558         looutput(vifp, m, (struct sockaddr *)&dst, (struct rtentry *)NULL);
 3559 
 3560         /* prepare the register head to send to the mrouting daemon */
 3561         m = mcp;
 3562     }
 3563 
 3564 pim_input_to_daemon:
 3565     /*
 3566      * Pass the PIM message up to the daemon; if it is a Register message,
 3567      * pass the 'head' only up to the daemon. This includes the
 3568      * outer IP header, PIM header, PIM-Register header and the
 3569      * inner IP header.
 3570      * XXX: the outer IP header pkt size of a Register is not adjusted to
 3571      * reflect the fact that the inner multicast data is truncated.
 3572      */
 3573     rip_input(m, iphlen, proto);
 3574 
 3575     return;
 3576 }
 3577 #endif /* PIM */

Cache object: 96ba10bf00d88440cc673d72e5d3ccf4


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.