The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/netinet/ip_mroute.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*      $NetBSD: ip_mroute.c,v 1.100.2.1 2008/05/17 16:11:40 bouyer Exp $       */
    2 
    3 /*
    4  * Copyright (c) 1992, 1993
    5  *      The Regents of the University of California.  All rights reserved.
    6  *
    7  * This code is derived from software contributed to Berkeley by
    8  * Stephen Deering of Stanford University.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 3. Neither the name of the University nor the names of its contributors
   19  *    may be used to endorse or promote products derived from this software
   20  *    without specific prior written permission.
   21  *
   22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   32  * SUCH DAMAGE.
   33  *
   34  *      @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
   35  */
   36 
   37 /*
   38  * Copyright (c) 1989 Stephen Deering
   39  *
   40  * This code is derived from software contributed to Berkeley by
   41  * Stephen Deering of Stanford University.
   42  *
   43  * Redistribution and use in source and binary forms, with or without
   44  * modification, are permitted provided that the following conditions
   45  * are met:
   46  * 1. Redistributions of source code must retain the above copyright
   47  *    notice, this list of conditions and the following disclaimer.
   48  * 2. Redistributions in binary form must reproduce the above copyright
   49  *    notice, this list of conditions and the following disclaimer in the
   50  *    documentation and/or other materials provided with the distribution.
   51  * 3. All advertising materials mentioning features or use of this software
   52  *    must display the following acknowledgement:
   53  *      This product includes software developed by the University of
   54  *      California, Berkeley and its contributors.
   55  * 4. Neither the name of the University nor the names of its contributors
   56  *    may be used to endorse or promote products derived from this software
   57  *    without specific prior written permission.
   58  *
   59  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   60  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   61  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   62  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   63  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   64  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   65  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   66  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   67  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   68  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   69  * SUCH DAMAGE.
   70  *
   71  *      @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
   72  */
   73 
   74 /*
   75  * IP multicast forwarding procedures
   76  *
   77  * Written by David Waitzman, BBN Labs, August 1988.
   78  * Modified by Steve Deering, Stanford, February 1989.
   79  * Modified by Mark J. Steiglitz, Stanford, May, 1991
   80  * Modified by Van Jacobson, LBL, January 1993
   81  * Modified by Ajit Thyagarajan, PARC, August 1993
   82  * Modified by Bill Fenner, PARC, April 1994
   83  * Modified by Charles M. Hannum, NetBSD, May 1995.
   84  * Modified by Ahmed Helmy, SGI, June 1996
   85  * Modified by George Edmond Eddy (Rusty), ISI, February 1998
   86  * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000
   87  * Modified by Hitoshi Asaeda, WIDE, August 2000
   88  * Modified by Pavlin Radoslavov, ICSI, October 2002
   89  *
   90  * MROUTING Revision: 1.2
   91  * and PIM-SMv2 and PIM-DM support, advanced API support,
   92  * bandwidth metering and signaling
   93  */
   94 
   95 #include <sys/cdefs.h>
   96 __KERNEL_RCSID(0, "$NetBSD: ip_mroute.c,v 1.100.2.1 2008/05/17 16:11:40 bouyer Exp $");
   97 
   98 #include "opt_inet.h"
   99 #include "opt_ipsec.h"
  100 #include "opt_pim.h"
  101 
  102 #ifdef PIM
  103 #define _PIM_VT 1
  104 #endif
  105 
  106 #include <sys/param.h>
  107 #include <sys/systm.h>
  108 #include <sys/callout.h>
  109 #include <sys/mbuf.h>
  110 #include <sys/socket.h>
  111 #include <sys/socketvar.h>
  112 #include <sys/protosw.h>
  113 #include <sys/errno.h>
  114 #include <sys/time.h>
  115 #include <sys/kernel.h>
  116 #include <sys/ioctl.h>
  117 #include <sys/syslog.h>
  118 
  119 #include <net/if.h>
  120 #include <net/route.h>
  121 #include <net/raw_cb.h>
  122 
  123 #include <netinet/in.h>
  124 #include <netinet/in_var.h>
  125 #include <netinet/in_systm.h>
  126 #include <netinet/ip.h>
  127 #include <netinet/ip_var.h>
  128 #include <netinet/in_pcb.h>
  129 #include <netinet/udp.h>
  130 #include <netinet/igmp.h>
  131 #include <netinet/igmp_var.h>
  132 #include <netinet/ip_mroute.h>
  133 #ifdef PIM
  134 #include <netinet/pim.h>
  135 #include <netinet/pim_var.h>
  136 #endif
  137 #include <netinet/ip_encap.h>
  138 
  139 #ifdef IPSEC
  140 #include <netinet6/ipsec.h>
  141 #include <netkey/key.h>
  142 #endif
  143 
  144 #ifdef FAST_IPSEC
  145 #include <netipsec/ipsec.h>
  146 #include <netipsec/key.h>
  147 #endif
  148 
  149 #include <machine/stdarg.h>
  150 
  151 #define IP_MULTICASTOPTS 0
      /*
       * M_PULLUP(m, len): when m is non-NULL and either has M_EXT set in
       * m_flags or has m_len smaller than len, replace m with the result of
       * m_pullup(m, len).  Otherwise m is left untouched.
       * Both arguments are expanded more than once, so pass expressions
       * without side effects.
       * NOTE(review): m is overwritten with whatever m_pullup() returns;
       * callers presumably have to re-check m for NULL afterwards -- confirm.
       */
  152 #define M_PULLUP(m, len)                                                 \
  153         do {                                                             \
  154                 if ((m) && ((m)->m_flags & M_EXT || (m)->m_len < (len))) \
  155                         (m) = m_pullup((m), (len));                      \
  156         } while (/*CONSTCOND*/ 0)
  157 
  158 /*
  159  * Globals.  All but ip_mrouter and ip_mrtproto could be static,
  160  * except for netstat or debugging purposes.
  161  */
      /*
       * ip_mrouter is the socket that issued MRT_INIT: ip_mrouter_init()
       * stores it and ip_mrouter_done() resets it to NULL.
       */
  162 struct socket  *ip_mrouter  = NULL;
  163 int             ip_mrtproto = IGMP_DVMRP;    /* for netstat only */
  164 
  165 #define NO_RTE_FOUND    0x1
  166 #define RTE_FOUND       0x2
  167 
      /*
       * MFCHASH(a, g): bucket index for origin a and group g (both
       * struct in_addr).  XORs each address with itself shifted right by
       * 10 and by 20 bits, then masks the result with mfchash.
       * mfchashtbl and mfchash are both filled in by the hashinit() call
       * in ip_mrouter_init(); mfchashtbl is freed and set to NULL again in
       * ip_mrouter_done().
       */
  168 #define MFCHASH(a, g)                                                   \
  169         ((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^        \
  170           ((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & mfchash)
  171 LIST_HEAD(mfchashhdr, mfc) *mfchashtbl;
  172 u_long  mfchash;
  173 
      /*
       * nexpire[] has one counter per hash bucket; ip_mrouter_detach() skips
       * buckets whose counter is zero.
       * NOTE(review): presumably the number of entries in the bucket that
       * still have packets queued on mfc_stall -- confirm against the code
       * that increments it.
       * viftable[] is indexed by vifi_t; add_vif() treats a slot whose
       * v_lcl_addr is the null address as free.
       */
  174 u_char          nexpire[MFCTBLSIZ];
  175 struct vif      viftable[MAXVIFS];
  176 struct mrtstat  mrtstat;
  177 u_int           mrtdebug = 0;     /* debug level        */
      /* Bit flags, presumably tested against mrtdebug -- confirm at use sites. */
  178 #define         DEBUG_MFC       0x02
  179 #define         DEBUG_FORWARD   0x04
  180 #define         DEBUG_EXPIRE    0x08
  181 #define         DEBUG_XMIT      0x10
  182 #define         DEBUG_PIM       0x20
  183 
      /* Sentinel vif index: all bits set in a vifi_t. */
  184 #define         VIFI_INVALID    ((vifi_t) -1)
  185 
  186 u_int           tbfdebug = 0;     /* tbf debug level    */
  187 #ifdef RSVP_ISI
  188 u_int           rsvpdebug = 0;    /* rsvp debug level   */
  189 extern struct socket *ip_rsvpd;
  190 extern int rsvp_on;
  191 #endif /* RSVP_ISI */
  192 
  193 /* vif attachment using sys/netinet/ip_encap.c */
  194 static void vif_input(struct mbuf *, ...);
  195 static int vif_encapcheck(struct mbuf *, int, int, void *);
  196 
      /*
       * Protocol switch entry for the tunnel vifs: a raw (SOCK_RAW) entry in
       * inetdomain for IPPROTO_IPV4 with PR_ATOMIC|PR_ADDR set.  vif_input is
       * the only handler defined in this file; the remaining non-zero slots
       * reuse the raw-IP handlers rip_output, rip_ctloutput and rip_usrreq.
       * NOTE(review): the initializer is positional, so the meaning of each
       * slot depends on the field order of struct protosw -- confirm against
       * <sys/protosw.h>.
       */
  197 static const struct protosw vif_protosw =
  198 { SOCK_RAW,     &inetdomain,    IPPROTO_IPV4,   PR_ATOMIC|PR_ADDR,
  199   vif_input,    rip_output,     0,              rip_ctloutput,
  200   rip_usrreq,
  201   0,            0,              0,              0,
  202 };
  203 
  204 #define         EXPIRE_TIMEOUT  (hz / 4)        /* 4x / second */
  205 #define         UPCALL_EXPIRE   6               /* number of timeouts */
      /*
       * EXPIRE_TIMEOUT is the tick count with which ip_mrouter_init() arms
       * expire_upcalls_ch to run expire_upcalls().
       */
  206 
  207 /*
  208  * Token bucket filter: reprocessing interval, in ticks
  209  */
  210 
  211 #define         TBF_REPROCESS   (hz / 100)      /* 100x / second */
  212 
  213 static int get_sg_cnt(struct sioc_sg_req *);
  214 static int get_vif_cnt(struct sioc_vif_req *);
  215 static int ip_mrouter_init(struct socket *, struct mbuf *);
  216 static int get_version(struct mbuf *);
  217 static int set_assert(struct mbuf *);
  218 static int get_assert(struct mbuf *);
  219 static int add_vif(struct mbuf *);
  220 static int del_vif(struct mbuf *);
  221 static void update_mfc_params(struct mfc *, struct mfcctl2 *);
  222 static void init_mfc_params(struct mfc *, struct mfcctl2 *);
  223 static void expire_mfc(struct mfc *);
  224 static int add_mfc(struct mbuf *);
  225 #ifdef UPCALL_TIMING
  226 static void collate(struct timeval *);
  227 #endif
  228 static int del_mfc(struct mbuf *);
  229 static int set_api_config(struct mbuf *); /* choose API capabilities */
  230 static int get_api_support(struct mbuf *);
  231 static int get_api_config(struct mbuf *);
  232 static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *);
  233 static void expire_upcalls(void *);
  234 #ifdef RSVP_ISI
  235 static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t);
  236 #else
  237 static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *);
  238 #endif
  239 static void phyint_send(struct ip *, struct vif *, struct mbuf *);
  240 static void encap_send(struct ip *, struct vif *, struct mbuf *);
  241 static void tbf_control(struct vif *, struct mbuf *, struct ip *, u_int32_t);
  242 static void tbf_queue(struct vif *, struct mbuf *);
  243 static void tbf_process_q(struct vif *);
  244 static void tbf_reprocess_q(void *);
  245 static int tbf_dq_sel(struct vif *, struct ip *);
  246 static void tbf_send_packet(struct vif *, struct mbuf *);
  247 static void tbf_update_tokens(struct vif *);
  248 static int priority(struct vif *, struct ip *);
  249 
  250 /*
  251  * Bandwidth monitoring
  252  */
  253 static void free_bw_list(struct bw_meter *);
  254 static int add_bw_upcall(struct mbuf *);
  255 static int del_bw_upcall(struct mbuf *);
  256 static void bw_meter_receive_packet(struct bw_meter *, int , struct timeval *);
  257 static void bw_meter_prepare_upcall(struct bw_meter *, struct timeval *);
  258 static void bw_upcalls_send(void);
  259 static void schedule_bw_meter(struct bw_meter *, struct timeval *);
  260 static void unschedule_bw_meter(struct bw_meter *);
  261 static void bw_meter_process(void);
  262 static void expire_bw_upcalls_send(void *);
  263 static void expire_bw_meter_process(void *);
  264 
  265 #ifdef PIM
  266 static int pim_register_send(struct ip *, struct vif *,
  267                 struct mbuf *, struct mfc *);
  268 static int pim_register_send_rp(struct ip *, struct vif *,
  269                 struct mbuf *, struct mfc *);
  270 static int pim_register_send_upcall(struct ip *, struct vif *,
  271                 struct mbuf *, struct mfc *);
  272 static struct mbuf *pim_register_prepare(struct ip *, struct mbuf *);
  273 #endif
  274 
  275 /*
  276  * 'Interfaces' associated with decapsulator (so we can tell
  277  * packets that went through it from ones that get reflected
  278  * by a broken gateway).  These interfaces are never linked into
  279  * the system ifnet list & no routes point to them.  I.e., packets
  280  * can't be sent this way.  They only exist as a placeholder for
  281  * multicast source verification.
  282  */
      /* NOTE(review): the array below is compiled out, so nothing here is built. */
  283 #if 0
  284 struct ifnet multicast_decap_if[MAXVIFS];
  285 #endif
  286 
  287 #define ENCAP_TTL       64
  288 #define ENCAP_PROTO     IPPROTO_IPIP    /* 4 */
  289 
  290 /* prototype IP hdr for encapsulated packets */
      /*
       * Template for the outer header: header length is sizeof(struct ip) in
       * 32-bit words (no options), ip_len starts out as the bare header size,
       * TTL is ENCAP_TTL and the protocol is ENCAP_PROTO.  Every field not
       * named here is zero-initialised.
       * NOTE(review): presumably copied and completed (addresses, total
       * length) per packet by encap_send() -- confirm.
       */
  291 struct ip multicast_encap_iphdr = {
  292         .ip_hl = sizeof(struct ip) >> 2,
  293         .ip_v = IPVERSION,
  294         .ip_len = sizeof(struct ip),
  295         .ip_ttl = ENCAP_TTL,
  296         .ip_p = ENCAP_PROTO,
  297 };
  298 
  299 /*
  300  * Bandwidth meter variables and constants
  301  */
  302 
  303 /*
  304  * Pending timeouts are stored in a hash table, the key being the
  305  * expiration time. Periodically, the entries are analysed and processed.
  306  */
      /*
       * bw_meter_timers[] is cleared in ip_mrouter_done().  bw_meter_ch is
       * armed in ip_mrouter_init() with BW_METER_PERIOD ticks to run
       * expire_bw_meter_process() and is stopped in ip_mrouter_done().
       */
  307 #define BW_METER_BUCKETS        1024
  308 static struct bw_meter *bw_meter_timers[BW_METER_BUCKETS];
  309 struct callout bw_meter_ch;
  310 #define BW_METER_PERIOD (hz)            /* periodical handling of bw meters */
  311 
  312 /*
  313  * Pending upcalls are stored in a vector which is flushed when
  314  * full, or periodically
  315  */
      /*
       * bw_upcalls_n is reset to 0 in ip_mrouter_done().  bw_upcalls_ch is
       * armed in ip_mrouter_init() with BW_UPCALLS_PERIOD ticks to run
       * expire_bw_upcalls_send() and is stopped in ip_mrouter_done().
       */
  316 static struct bw_upcall bw_upcalls[BW_UPCALLS_MAX];
  317 static u_int    bw_upcalls_n; /* # of pending upcalls */
  318 struct callout  bw_upcalls_ch;
  319 #define BW_UPCALLS_PERIOD (hz)          /* periodical flush of bw upcalls */
  320 
  321 #ifdef PIM
  322 struct pimstat pimstat;
  323 
  324 /*
  325  * Note: the PIM Register encapsulation adds the following in front of a
  326  * data packet:
  327  *
  328  * struct pim_encap_hdr {
  329  *    struct ip ip;
  330  *    struct pim_encap_pimhdr  pim;
  331  * }
  332  *
  333  */
  334 
      /* PIM header followed by a 32-bit flags word. */
  335 struct pim_encap_pimhdr {
  336         struct pim pim;
  337         uint32_t   flags;
  338 };
  339 
      /*
       * Template outer IP header for PIM Register packets: no options,
       * ip_len preset to the bare header size, TTL ENCAP_TTL, protocol
       * IPPROTO_PIM; all other fields are zero-initialised.
       */
  340 static struct ip pim_encap_iphdr = {
  341         .ip_v = IPVERSION,
  342         .ip_hl = sizeof(struct ip) >> 2,
  343         .ip_len = sizeof(struct ip),
  344         .ip_ttl = ENCAP_TTL,
  345         .ip_p = IPPROTO_PIM,
  346 };
  347 
      /*
       * Template PIM Register header: version/type byte built with
       * PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), zero reserved and checksum
       * fields, zero flags.
       */
  348 static struct pim_encap_pimhdr pim_encap_pimhdr = {
  349     {
  350         PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */
  351         0,                      /* reserved */
  352         0,                      /* checksum */
  353     },
  354     0                           /* flags */
  355 };
  356 
      /*
       * reg_vif_num starts as VIFI_INVALID.
       * NOTE(review): presumably the index of the VIFF_REGISTER vif once one
       * has been added, with multicast_register_if as its interface --
       * confirm in add_vif().
       */
  357 static struct ifnet multicast_register_if;
  358 static vifi_t reg_vif_num = VIFI_INVALID;
  359 #endif /* PIM */
  360 
  361 
  362 /*
  363  * Private variables.
  364  */
      /*
       * numvifs bounds every scan of viftable[] in this file (loops run
       * while vifi < numvifs) and is reset to 0 by ip_mrouter_done().
       * NOTE(review): presumably one more than the highest vif index in
       * use -- confirm in add_vif()/del_vif().
       */
  365 static vifi_t      numvifs = 0;
  366 
      /* Armed by ip_mrouter_init(), stopped by ip_mrouter_done(). */
  367 static struct callout expire_upcalls_ch;
  368 
  369 /*
  370  * whether or not special PIM assert processing is enabled.
  371  */
      /* Set to 0 or 1 by set_assert(); cleared by init and done. */
  372 static int pim_assert;
  373 /*
  374  * Rate limit for assert notification messages, in usec
  375  */
  376 #define ASSERT_MSG_TIME         3000000
  377 
  378 /*
  379  * Kernel multicast routing API capabilities and setup.
  380  * If more API capabilities are added to the kernel, they should be
  381  * recorded in `mrt_api_support'.
  382  */
  383 static const u_int32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF |
  384                                           MRT_MFC_FLAGS_BORDER_VIF |
  385                                           MRT_MFC_RP |
  386                                           MRT_MFC_BW_UPCALL);
      /*
       * Capabilities currently enabled: set_api_config() stores the requested
       * bits masked with mrt_api_support; ip_mrouter_done() resets it to 0.
       */
  387 static u_int32_t mrt_api_config = 0;
  388 
  389 /*
  390  * Find a route for a given origin IP address and Multicast group address
  391  * Type of service parameter to be added in the future!!!
  392  * Statistics are updated by the caller if needed
  393  * (mrtstat.mrts_mfc_lookups and mrtstat.mrts_mfc_misses)
  394  */
  395 static struct mfc *
  396 mfc_find(struct in_addr *o, struct in_addr *g)
  397 {
  398         struct mfc *rt;
  399 
  400         LIST_FOREACH(rt, &mfchashtbl[MFCHASH(*o, *g)], mfc_hash) {
  401                 if (in_hosteq(rt->mfc_origin, *o) &&
  402                     in_hosteq(rt->mfc_mcastgrp, *g) &&
  403                     (rt->mfc_stall == NULL))
  404                         break;
  405         }
  406 
  407         return (rt);
  408 }
  409 
  /*
   * TV_DELTA(a, b, delta): store in `delta' the difference a - b, expressed
   * in microseconds.  a and b are objects with tv_sec and tv_usec members.
   *
   * `delta' is first set to the tv_usec difference and then adjusted by
   * 1000000 for every second of tv_sec difference.  The seconds difference
   * is held in an int, so the macro is meant for short intervals.
   * All three arguments are expanded more than once.
   */
  #define TV_DELTA(a, b, delta) do {                                      \
          int xxs;                                                        \
          delta = (a).tv_usec - (b).tv_usec;                              \
          xxs = (a).tv_sec - (b).tv_sec;                                  \
          if (xxs == 1)                                                   \
                  delta += 1000000;                                       \
          else if (xxs == 2)                                              \
                  delta += 2 * 1000000;                                   \
          else if (xxs != 0)                                              \
                  delta += (1000000 * xxs);                               \
  } while (/*CONSTCOND*/ 0)
  432 
  433 #ifdef UPCALL_TIMING
      /*
       * 51 counters, built only with UPCALL_TIMING.
       * NOTE(review): presumably a histogram filled by collate(), which is
       * declared under the same option -- confirm.
       */
  434 u_int32_t upcall_data[51];
  435 #endif /* UPCALL_TIMING */
  436 
  437 /*
  438  * Handle MRT setsockopt commands to modify the multicast routing tables.
  439  */
  440 int
  441 ip_mrouter_set(struct socket *so, int optname, struct mbuf **m)
  442 {
  443         int error;
  444 
  445         if (optname != MRT_INIT && so != ip_mrouter)
  446                 error = ENOPROTOOPT;
  447         else
  448                 switch (optname) {
  449                 case MRT_INIT:
  450                         error = ip_mrouter_init(so, *m);
  451                         break;
  452                 case MRT_DONE:
  453                         error = ip_mrouter_done();
  454                         break;
  455                 case MRT_ADD_VIF:
  456                         error = add_vif(*m);
  457                         break;
  458                 case MRT_DEL_VIF:
  459                         error = del_vif(*m);
  460                         break;
  461                 case MRT_ADD_MFC:
  462                         error = add_mfc(*m);
  463                         break;
  464                 case MRT_DEL_MFC:
  465                         error = del_mfc(*m);
  466                         break;
  467                 case MRT_ASSERT:
  468                         error = set_assert(*m);
  469                         break;
  470                 case MRT_API_CONFIG:
  471                         error = set_api_config(*m);
  472                         break;
  473                 case MRT_ADD_BW_UPCALL:
  474                         error = add_bw_upcall(*m);
  475                         break;
  476                 case MRT_DEL_BW_UPCALL:
  477                         error = del_bw_upcall(*m);
  478                         break;
  479                 default:
  480                         error = ENOPROTOOPT;
  481                         break;
  482                 }
  483 
  484         if (*m)
  485                 m_free(*m);
  486         return (error);
  487 }
  488 
  489 /*
  490  * Handle MRT getsockopt commands
  491  */
  492 int
  493 ip_mrouter_get(struct socket *so, int optname, struct mbuf **m)
  494 {
  495         int error;
  496 
  497         if (so != ip_mrouter)
  498                 error = ENOPROTOOPT;
  499         else {
  500                 *m = m_get(M_WAIT, MT_SOOPTS);
  501                 MCLAIM(*m, so->so_mowner);
  502 
  503                 switch (optname) {
  504                 case MRT_VERSION:
  505                         error = get_version(*m);
  506                         break;
  507                 case MRT_ASSERT:
  508                         error = get_assert(*m);
  509                         break;
  510                 case MRT_API_SUPPORT:
  511                         error = get_api_support(*m);
  512                         break;
  513                 case MRT_API_CONFIG:
  514                         error = get_api_config(*m);
  515                         break;
  516                 default:
  517                         error = ENOPROTOOPT;
  518                         break;
  519                 }
  520 
  521                 if (error)
  522                         m_free(*m);
  523         }
  524 
  525         return (error);
  526 }
  527 
  528 /*
  529  * Handle ioctl commands to obtain information from the cache
  530  */
  531 int
  532 mrt_ioctl(struct socket *so, u_long cmd, caddr_t data)
  533 {
  534         int error;
  535 
  536         if (so != ip_mrouter)
  537                 error = EINVAL;
  538         else
  539                 switch (cmd) {
  540                 case SIOCGETVIFCNT:
  541                         error = get_vif_cnt((struct sioc_vif_req *)data);
  542                         break;
  543                 case SIOCGETSGCNT:
  544                         error = get_sg_cnt((struct sioc_sg_req *)data);
  545                         break;
  546                 default:
  547                         error = EINVAL;
  548                         break;
  549                 }
  550 
  551         return (error);
  552 }
  553 
  554 /*
  555  * returns the packet, byte, rpf-failure count for the source group provided
  556  */
  557 static int
  558 get_sg_cnt(struct sioc_sg_req *req)
  559 {
  560         int s;
  561         struct mfc *rt;
  562 
  563         s = splsoftnet();
  564         rt = mfc_find(&req->src, &req->grp);
  565         if (rt == NULL) {
  566                 splx(s);
  567                 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
  568                 return (EADDRNOTAVAIL);
  569         }
  570         req->pktcnt = rt->mfc_pkt_cnt;
  571         req->bytecnt = rt->mfc_byte_cnt;
  572         req->wrong_if = rt->mfc_wrong_if;
  573         splx(s);
  574 
  575         return (0);
  576 }
  577 
  578 /*
  579  * returns the input and output packet and byte counts on the vif provided
  580  */
  581 static int
  582 get_vif_cnt(struct sioc_vif_req *req)
  583 {
  584         vifi_t vifi = req->vifi;
  585 
  586         if (vifi >= numvifs)
  587                 return (EINVAL);
  588 
  589         req->icount = viftable[vifi].v_pkt_in;
  590         req->ocount = viftable[vifi].v_pkt_out;
  591         req->ibytes = viftable[vifi].v_bytes_in;
  592         req->obytes = viftable[vifi].v_bytes_out;
  593 
  594         return (0);
  595 }
  596 
  597 /*
  598  * Enable multicast routing
  599  */
  600 static int
  601 ip_mrouter_init(struct socket *so, struct mbuf *m)
  602 {
  603         int *v;
  604 
  605         if (mrtdebug)
  606                 log(LOG_DEBUG,
  607                     "ip_mrouter_init: so_type = %d, pr_protocol = %d\n",
  608                     so->so_type, so->so_proto->pr_protocol);
  609 
  610         if (so->so_type != SOCK_RAW ||
  611             so->so_proto->pr_protocol != IPPROTO_IGMP)
  612                 return (EOPNOTSUPP);
  613 
  614         if (m == NULL || m->m_len < sizeof(int))
  615                 return (EINVAL);
  616 
  617         v = mtod(m, int *);
  618         if (*v != 1)
  619                 return (EINVAL);
  620 
  621         if (ip_mrouter != NULL)
  622                 return (EADDRINUSE);
  623 
  624         ip_mrouter = so;
  625 
  626         mfchashtbl =
  627             hashinit(MFCTBLSIZ, HASH_LIST, M_MRTABLE, M_WAITOK, &mfchash);
  628         bzero((caddr_t)nexpire, sizeof(nexpire));
  629 
  630         pim_assert = 0;
  631 
  632         callout_init(&expire_upcalls_ch);
  633         callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
  634                       expire_upcalls, NULL);
  635 
  636         callout_init(&bw_upcalls_ch);
  637         callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD,
  638                       expire_bw_upcalls_send, NULL);
  639 
  640         callout_init(&bw_meter_ch);
  641         callout_reset(&bw_meter_ch, BW_METER_PERIOD,
  642                       expire_bw_meter_process, NULL);
  643 
  644         if (mrtdebug)
  645                 log(LOG_DEBUG, "ip_mrouter_init\n");
  646 
  647         return (0);
  648 }
  649 
  650 /*
  651  * Disable multicast routing
  652  */
  653 int
  654 ip_mrouter_done(void)
  655 {
  656         vifi_t vifi;
  657         struct vif *vifp;
  658         int i;
  659         int s;
  660 
  661         s = splsoftnet();
  662 
  663         /* Clear out all the vifs currently in use. */
  664         for (vifi = 0; vifi < numvifs; vifi++) {
  665                 vifp = &viftable[vifi];
  666                 if (!in_nullhost(vifp->v_lcl_addr))
  667                         reset_vif(vifp);
  668         }
  669 
  670         numvifs = 0;
  671         pim_assert = 0;
  672         mrt_api_config = 0;
  673 
  674         callout_stop(&expire_upcalls_ch);
  675         callout_stop(&bw_upcalls_ch);
  676         callout_stop(&bw_meter_ch);
  677 
  678         /*
  679          * Free all multicast forwarding cache entries.
  680          */
  681         for (i = 0; i < MFCTBLSIZ; i++) {
  682                 struct mfc *rt, *nrt;
  683 
  684                 for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) {
  685                         nrt = LIST_NEXT(rt, mfc_hash);
  686 
  687                         expire_mfc(rt);
  688                 }
  689         }
  690 
  691         bzero((caddr_t)nexpire, sizeof(nexpire));
  692         free(mfchashtbl, M_MRTABLE);
  693         mfchashtbl = NULL;
  694 
  695         bw_upcalls_n = 0;
  696         bzero(bw_meter_timers, sizeof(bw_meter_timers));
  697 
  698         /* Reset de-encapsulation cache. */
  699 
  700         ip_mrouter = NULL;
  701 
  702         splx(s);
  703 
  704         if (mrtdebug)
  705                 log(LOG_DEBUG, "ip_mrouter_done\n");
  706 
  707         return (0);
  708 }
  709 
  710 void
  711 ip_mrouter_detach(struct ifnet *ifp)
  712 {
  713         int vifi, i;
  714         struct vif *vifp;
  715         struct mfc *rt;
  716         struct rtdetq *rte;
  717 
  718         /* XXX not sure about side effect to userland routing daemon */
  719         for (vifi = 0; vifi < numvifs; vifi++) {
  720                 vifp = &viftable[vifi];
  721                 if (vifp->v_ifp == ifp)
  722                         reset_vif(vifp);
  723         }
  724         for (i = 0; i < MFCTBLSIZ; i++) {
  725                 if (nexpire[i] == 0)
  726                         continue;
  727                 LIST_FOREACH(rt, &mfchashtbl[i], mfc_hash) {
  728                         for (rte = rt->mfc_stall; rte; rte = rte->next) {
  729                                 if (rte->ifp == ifp)
  730                                         rte->ifp = NULL;
  731                         }
  732                 }
  733         }
  734 }
  735 
  736 static int
  737 get_version(struct mbuf *m)
  738 {
  739         int *v = mtod(m, int *);
  740 
  741         *v = 0x0305;    /* XXX !!!! */
  742         m->m_len = sizeof(int);
  743         return (0);
  744 }
  745 
  746 /*
  747  * Set PIM assert processing global
  748  */
  749 static int
  750 set_assert(struct mbuf *m)
  751 {
  752         int *i;
  753 
  754         if (m == NULL || m->m_len < sizeof(int))
  755                 return (EINVAL);
  756 
  757         i = mtod(m, int *);
  758         pim_assert = !!*i;
  759         return (0);
  760 }
  761 
  762 /*
  763  * Get PIM assert processing global
  764  */
  765 static int
  766 get_assert(struct mbuf *m)
  767 {
  768         int *i = mtod(m, int *);
  769 
  770         *i = pim_assert;
  771         m->m_len = sizeof(int);
  772         return (0);
  773 }
  774 
  775 /*
  776  * Configure API capabilities
  777  */
  778 static int
  779 set_api_config(struct mbuf *m)
  780 {
  781         int i;
  782         u_int32_t *apival;
  783 
  784         if (m == NULL || m->m_len < sizeof(u_int32_t))
  785                 return (EINVAL);
  786 
  787         apival = mtod(m, u_int32_t *);
  788 
  789         /*
  790          * We can set the API capabilities only if it is the first operation
  791          * after MRT_INIT. I.e.:
  792          *  - there are no vifs installed
  793          *  - pim_assert is not enabled
  794          *  - the MFC table is empty
  795          */
  796         if (numvifs > 0) {
  797                 *apival = 0;
  798                 return (EPERM);
  799         }
  800         if (pim_assert) {
  801                 *apival = 0;
  802                 return (EPERM);
  803         }
  804         for (i = 0; i < MFCTBLSIZ; i++) {
  805                 if (LIST_FIRST(&mfchashtbl[i]) != NULL) {
  806                         *apival = 0;
  807                         return (EPERM);
  808                 }
  809         }
  810 
  811         mrt_api_config = *apival & mrt_api_support;
  812         *apival = mrt_api_config;
  813 
  814         return (0);
  815 }
  816 
  817 /*
  818  * Get API capabilities
  819  */
  820 static int
  821 get_api_support(struct mbuf *m)
  822 {
  823         u_int32_t *apival;
  824 
  825         if (m == NULL || m->m_len < sizeof(u_int32_t))
  826                 return (EINVAL);
  827 
  828         apival = mtod(m, u_int32_t *);
  829 
  830         *apival = mrt_api_support;
  831 
  832         return (0);
  833 }
  834 
  835 /*
  836  * Get API configured capabilities
  837  */
  838 static int
  839 get_api_config(struct mbuf *m)
  840 {
  841         u_int32_t *apival;
  842 
  843         if (m == NULL || m->m_len < sizeof(u_int32_t))
  844                 return (EINVAL);
  845 
  846         apival = mtod(m, u_int32_t *);
  847 
  848         *apival = mrt_api_config;
  849 
  850         return (0);
  851 }
  852 
  853 static struct sockaddr_in sin = {
              /*
               * File-scope AF_INET address template.  Only sin_addr is
               * rewritten before each use: add_vif() fills it for the
               * ifa_ifwithaddr() lookup and ip_mforward() fills it with the
               * packet source before calling socket_send().
               */
  854         .sin_len = sizeof(sin),
  855         .sin_family = AF_INET
  856 };
  857 
  858 /*
  859  * Add a vif to the vif table
       *
       * m carries a struct vifctl describing the vif to install in
       * viftable[vifc_vifi].  Returns 0 on success, otherwise:
       *   EINVAL         m missing or too short, vifc_vifi >= MAXVIFS, or
       *                  encap_attach_func() returned no cookie
       *   EADDRNOTAVAIL  null local address, or ifa_ifwithaddr() found no
       *                  interface for it
       *   EADDRINUSE     the slot already has a non-null local address
       *   EOPNOTSUPP     VIFF_SRCRT tunnel requested, or the interface does
       *                  not have IFF_MULTICAST set
       *   (other)        whatever the interface's SIOCADDMULTI ioctl returned
  860  */
  861 static int
  862 add_vif(struct mbuf *m)
  863 {
  864         struct vifctl *vifcp;
  865         struct vif *vifp;
  866         struct ifaddr *ifa;
  867         struct ifnet *ifp;
  868         struct ifreq ifr;
  869         int error, s;
  870
  871         if (m == NULL || m->m_len < sizeof(struct vifctl))
  872                 return (EINVAL);
  873
  874         vifcp = mtod(m, struct vifctl *);
  875         if (vifcp->vifc_vifi >= MAXVIFS)
  876                 return (EINVAL);
  877         if (in_nullhost(vifcp->vifc_lcl_addr))
  878                 return (EADDRNOTAVAIL);
  879
              /* A slot whose v_lcl_addr is still null is treated as unused. */
  880         vifp = &viftable[vifcp->vifc_vifi];
  881         if (!in_nullhost(vifp->v_lcl_addr))
  882                 return (EADDRINUSE);
  883
  884         /* Find the interface with an address in AF_INET family. */
  885 #ifdef PIM
  886         if (vifcp->vifc_flags & VIFF_REGISTER) {
  887                 /*
  888                  * XXX: Because VIFF_REGISTER does not really need a valid
  889                  * local interface (e.g. it could be 127.0.0.2), we don't
  890                  * check its address.
  891                  */
  892             ifp = NULL;
  893         } else
  894 #endif
  895         {
                      /* The lookup goes through the file-scope scratch "sin". */
  896                 sin.sin_addr = vifcp->vifc_lcl_addr;
  897                 ifa = ifa_ifwithaddr(sintosa(&sin));
  898                 if (ifa == NULL)
  899                         return (EADDRNOTAVAIL);
  900                 ifp = ifa->ifa_ifp;
  901         }
  902
  903         if (vifcp->vifc_flags & VIFF_TUNNEL) {
  904                 if (vifcp->vifc_flags & VIFF_SRCRT) {
  905                         log(LOG_ERR, "source routed tunnels not supported\n");
  906                         return (EOPNOTSUPP);
  907                 }
  908
  909                 /* attach this vif to decapsulator dispatch table */
  910                 /*
  911                  * XXX Use addresses in registration so that matching
  912                  * can be done with radix tree in decapsulator.  But,
  913                  * we need to check inner header for multicast, so
  914                  * this requires both radix tree lookup and then a
  915                  * function to check, and this is not supported yet.
  916                  */
  917                 vifp->v_encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV4,
  918                     vif_encapcheck, &vif_protosw, vifp);
  919                 if (!vifp->v_encap_cookie)
  920                         return (EINVAL);
  921
  922                 /* Create a fake encapsulation interface. */
                      /*
                       * This replaces the ifp found by the address lookup above;
                       * reset_vif() frees it again for VIFF_TUNNEL vifs.
                       */
  923                 ifp = (struct ifnet *)malloc(sizeof(*ifp), M_MRTABLE, M_WAITOK);
  924                 bzero(ifp, sizeof(*ifp));
  925                 snprintf(ifp->if_xname, sizeof(ifp->if_xname),
  926                          "mdecap%d", vifcp->vifc_vifi);
  927
  928                 /* Prepare cached route entry. */
  929                 bzero(&vifp->v_route, sizeof(vifp->v_route));
  930 #ifdef PIM
  931         } else if (vifcp->vifc_flags & VIFF_REGISTER) {
                      /*
                       * All register vifs share the static multicast_register_if;
                       * it is (re)initialized only while no register vif is
                       * recorded in reg_vif_num.
                       */
  932                 ifp = &multicast_register_if;
  933                 if (mrtdebug)
  934                         log(LOG_DEBUG, "Adding a register vif, ifp: %p\n",
  935                             (void *)ifp);
  936                 if (reg_vif_num == VIFI_INVALID) {
  937                         bzero(ifp, sizeof(*ifp));
  938                         snprintf(ifp->if_xname, sizeof(ifp->if_xname),
  939                                  "register_vif");
  940                         ifp->if_flags = IFF_LOOPBACK;
  941                         bzero(&vifp->v_route, sizeof(vifp->v_route));
  942                         reg_vif_num = vifcp->vifc_vifi;
  943                 }
  944 #endif
  945         } else {
  946                 /* Make sure the interface supports multicast. */
  947                 if ((ifp->if_flags & IFF_MULTICAST) == 0)
  948                         return (EOPNOTSUPP);
  949
  950                 /* Enable promiscuous reception of all IP multicasts. */
                      /*
                       * NOTE(review): only ifr.ifr_addr is filled in; the rest of
                       * ifr reaches the driver uninitialized -- confirm drivers
                       * ignore it for SIOCADDMULTI.
                       * NOTE(review): without PIM, a vif flagged VIFF_REGISTER
                       * takes this branch, yet reset_vif() skips SIOCDELMULTI for
                       * VIFF_REGISTER vifs -- confirm this asymmetry is intended.
                       */
  951                 satosin(&ifr.ifr_addr)->sin_len = sizeof(struct sockaddr_in);
  952                 satosin(&ifr.ifr_addr)->sin_family = AF_INET;
  953                 satosin(&ifr.ifr_addr)->sin_addr = zeroin_addr;
  954                 error = (*ifp->if_ioctl)(ifp, SIOCADDMULTI, (caddr_t)&ifr);
  955                 if (error)
  956                         return (error);
  957         }
  958
              /* The vif slot itself is populated between splsoftnet()/splx(). */
  959         s = splsoftnet();
  960
  961         /* Define parameters for the tbf structure. */
  962         vifp->tbf_q = NULL;
  963         vifp->tbf_t = &vifp->tbf_q;
  964         microtime(&vifp->tbf_last_pkt_t);
  965         vifp->tbf_n_tok = 0;
  966         vifp->tbf_q_len = 0;
  967         vifp->tbf_max_q_len = MAXQSIZE;
  968
  969         vifp->v_flags = vifcp->vifc_flags;
  970         vifp->v_threshold = vifcp->vifc_threshold;
  971         /* scaling up here allows division by 1024 in critical code */
  972         vifp->v_rate_limit = vifcp->vifc_rate_limit * 1024 / 1000;
              /* Storing a non-null v_lcl_addr is what marks the slot as in use. */
  973         vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
  974         vifp->v_rmt_addr = vifcp->vifc_rmt_addr;
  975         vifp->v_ifp = ifp;
  976         /* Initialize per vif pkt counters. */
  977         vifp->v_pkt_in = 0;
  978         vifp->v_pkt_out = 0;
  979         vifp->v_bytes_in = 0;
  980         vifp->v_bytes_out = 0;
  981
  982         callout_init(&vifp->v_repq_ch);
  983
  984 #ifdef RSVP_ISI
  985         vifp->v_rsvp_on = 0;
  986         vifp->v_rsvpd = NULL;
  987 #endif /* RSVP_ISI */
  988
  989         splx(s);
  990
  991         /* Adjust numvifs up if the vifi is higher than numvifs. */
  992         if (numvifs <= vifcp->vifc_vifi)
  993                 numvifs = vifcp->vifc_vifi + 1;
  994
  995         if (mrtdebug)
  996                 log(LOG_DEBUG, "add_vif #%d, lcladdr %x, %s %x, thresh %x, rate %d\n",
  997                     vifcp->vifc_vifi,
  998                     ntohl(vifcp->vifc_lcl_addr.s_addr),
  999                     (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
 1000                     ntohl(vifcp->vifc_rmt_addr.s_addr),
 1001                     vifcp->vifc_threshold,
 1002                     vifcp->vifc_rate_limit);
 1003
 1004         return (0);
 1005 }
 1006 
 1007 void
 1008 reset_vif(struct vif *vifp)
 1009 {
 1010         struct mbuf *pkt, *nextpkt;
 1011         struct sockaddr_in *sa;
 1012         struct ifreq ifr;
 1013
 1014         /* Stop the callout, then leave the decapsulator dispatch table. */
 1015         callout_stop(&vifp->v_repq_ch);
 1016         encap_detach(vifp->v_encap_cookie);
 1017         vifp->v_encap_cookie = NULL;
 1018
 1019         /* Discard every packet still held on the tbf queue. */
 1020         pkt = vifp->tbf_q;
 1021         while (pkt != NULL) {
 1022                 nextpkt = pkt->m_nextpkt;
 1023                 m_freem(pkt);
 1024                 pkt = nextpkt;
 1025         }
 1026
 1027         if (vifp->v_flags & VIFF_TUNNEL) {
 1028                 free(vifp->v_ifp, M_MRTABLE);
 1029         } else if (vifp->v_flags & VIFF_REGISTER) {
 1030 #ifdef PIM
 1031                 reg_vif_num = VIFI_INVALID;
 1032 #endif
 1033         } else {
 1034                 sa = satosin(&ifr.ifr_addr);
 1035                 sa->sin_len = sizeof(struct sockaddr_in);
 1036                 sa->sin_family = AF_INET;
 1037                 sa->sin_addr = zeroin_addr;
 1038                 (*vifp->v_ifp->if_ioctl)(vifp->v_ifp, SIOCDELMULTI,
 1039                     (caddr_t)&ifr);
 1040         }
 1041         bzero((caddr_t)vifp, sizeof(*vifp));
 1042 }
 1043 
 1044 /*
 1045  * Delete a vif from the vif table.  m holds the vifi_t index of the
 1046  * vif; the slot is cleared by reset_vif() and numvifs is lowered to
 1047  * one past the highest slot that still has a local address.
 1048  */
 1049 static int
 1050 del_vif(struct mbuf *m)
 1051 {
 1052         struct vif *slot;
 1053         vifi_t *idxp, top;
 1054         int s;
 1055
 1056         if (m == NULL)
 1057                 return (EINVAL);
 1058         if (m->m_len < sizeof(vifi_t))
 1059                 return (EINVAL);
 1060
 1061         idxp = mtod(m, vifi_t *);
 1062         if (*idxp >= numvifs)
 1063                 return (EINVAL);
 1064
 1065         /* A null local address marks a slot that is not in use. */
 1066         slot = &viftable[*idxp];
 1067         if (in_nullhost(slot->v_lcl_addr))
 1068                 return (EADDRNOTAVAIL);
 1069
 1070         s = splsoftnet();
 1071         reset_vif(slot);
 1072         top = numvifs;
 1073         while (top > 0 && in_nullhost(viftable[top - 1].v_lcl_addr))
 1074                 top--;
 1075         numvifs = top;
 1076         splx(s);
 1077
 1078         if (mrtdebug)
 1079                 log(LOG_DEBUG, "del_vif %d, numvifs %d\n", *idxp, numvifs);
 1080
 1081         return (0);
 1082 }
 1083 
 1084 /*
 1085  * Refresh an mfc entry's parent vif, per-vif TTLs/flags and RP address
 1086  * from mfccp without resetting its counters and S,G addresses.
 1087  */
 1088 static void
 1089 update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
 1090 {
 1091         u_int32_t allowed = mrt_api_config & MRT_MFC_FLAGS_ALL;
 1092         int vif;
 1093
 1094         rt->mfc_parent = mfccp->mfcc_parent;
 1095         for (vif = 0; vif < numvifs; vif++) {
 1096                 rt->mfc_ttls[vif] = mfccp->mfcc_ttls[vif];
 1097                 rt->mfc_flags[vif] = mfccp->mfcc_flags[vif] & allowed;
 1098         }
 1099         if (mrt_api_config & MRT_MFC_RP)
 1100                 rt->mfc_rp = mfccp->mfcc_rp;
 1101         else
 1102                 rt->mfc_rp = zeroin_addr;
 1103 }
 1104 
 1105 /*
 1106  * Fully initialize an mfc entry from mfccp: record the S,G pair, load
 1107  * the forwarding parameters via update_mfc_params(), and clear the
 1108  * per-entry counters and the last-assert timestamp.
 1109  */
 1110 static void
 1111 init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
 1112 {
 1113         /* (S,G) identity of the entry */
 1114         rt->mfc_origin = mfccp->mfcc_origin;
 1115         rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp;
 1116         update_mfc_params(rt, mfccp);
 1117
 1118         /* fresh pkt counters per src-grp */
 1119         rt->mfc_pkt_cnt = rt->mfc_byte_cnt = rt->mfc_wrong_if = 0;
 1120         timerclear(&rt->mfc_last_assert);
 1121 }
 1122 
 1123 static void
 1124 expire_mfc(struct mfc *rt)
 1125 {
 1126         struct rtdetq *q;
 1127
 1128         /* Release bw meters, stalled packets, then the entry itself. */
 1129         free_bw_list(rt->mfc_bw_meter);
 1130         while ((q = rt->mfc_stall) != NULL) {
 1131                 rt->mfc_stall = q->next;
 1132                 m_freem(q->m);
 1133                 free(q, M_MRTABLE);
 1134         }
 1135
 1136         LIST_REMOVE(rt, mfc_hash);
 1137         free(rt, M_MRTABLE);
 1138 }
 1139 
 1140 /*
 1141  * Add an mfc entry
       *
       * m carries a struct mfcctl, or a struct mfcctl2 when any
       * MRT_API_FLAGS_ALL bit is set in mrt_api_config; either way the
       * request is normalized into a local struct mfcctl2.
       *
       * Three cases are handled, all between splsoftnet()/splx():
       *  1. mfc_find() already has the (origin, group) entry: only its
       *     forwarding parameters are updated.
       *  2. the hash chain holds matching entries with queued (stalled)
       *     packets: each is fully initialized, taken off the expiry
       *     count, and its queue is drained through ip_mdq().
       *  3. neither: a matching chain entry is re-initialized, or a new
       *     entry is allocated and inserted at the head of the chain.
       *
       * Returns EINVAL if m is missing or too short, ENOBUFS if a new
       * entry cannot be allocated, and 0 otherwise.
 1142  */
 1143 static int
 1144 add_mfc(struct mbuf *m)
 1145 {
 1146         struct mfcctl2 mfcctl2;
 1147         struct mfcctl2 *mfccp;
 1148         struct mfc *rt;
 1149         u_int32_t hash = 0;
 1150         struct rtdetq *rte, *nrte;
 1151         u_short nstl;
 1152         int s;
 1153         int mfcctl_size = sizeof(struct mfcctl);
 1154
 1155         if (mrt_api_config & MRT_API_FLAGS_ALL)
 1156                 mfcctl_size = sizeof(struct mfcctl2);
 1157
 1158         if (m == NULL || m->m_len < mfcctl_size)
 1159                 return (EINVAL);
 1160
 1161         /*
 1162          * select data size depending on API version.
 1163          */
 1164         if (mrt_api_config & MRT_API_FLAGS_ALL) {
 1165                 struct mfcctl2 *mp2 = mtod(m, struct mfcctl2 *);
 1166                 bcopy(mp2, (caddr_t)&mfcctl2, sizeof(*mp2));
 1167         } else {
                      /* Old-style request: zero the mfcctl2-only tail. */
 1168                 struct mfcctl *mp = mtod(m, struct mfcctl *);
 1169                 bcopy(mp, (caddr_t)&mfcctl2, sizeof(*mp));
 1170                 bzero((caddr_t)&mfcctl2 + sizeof(struct mfcctl),
 1171                       sizeof(mfcctl2) - sizeof(struct mfcctl));
 1172         }
 1173         mfccp = &mfcctl2;
 1174
 1175         s = splsoftnet();
 1176         rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp);
 1177
 1178         /* If an entry already exists, just update the fields */
 1179         if (rt) {
 1180                 if (mrtdebug & DEBUG_MFC)
 1181                         log(LOG_DEBUG, "add_mfc update o %x g %x p %x\n",
 1182                             ntohl(mfccp->mfcc_origin.s_addr),
 1183                             ntohl(mfccp->mfcc_mcastgrp.s_addr),
 1184                             mfccp->mfcc_parent);
 1185
 1186                 update_mfc_params(rt, mfccp);
 1187
 1188                 splx(s);
 1189                 return (0);
 1190         }
 1191
 1192         /*
 1193          * Find the entry for which the upcall was made and update
 1194          */
              /* nstl counts the matching entries that had stalled packets. */
 1195         nstl = 0;
 1196         hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp);
 1197         LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
 1198                 if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
 1199                     in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) &&
 1200                     rt->mfc_stall != NULL) {
                              /* More than one stalled match is logged as an error. */
 1201                         if (nstl++)
 1202                                 log(LOG_ERR, "add_mfc %s o %x g %x p %x dbx %p\n",
 1203                                     "multiple kernel entries",
 1204                                     ntohl(mfccp->mfcc_origin.s_addr),
 1205                                     ntohl(mfccp->mfcc_mcastgrp.s_addr),
 1206                                     mfccp->mfcc_parent, rt->mfc_stall);
 1207
 1208                         if (mrtdebug & DEBUG_MFC)
 1209                                 log(LOG_DEBUG, "add_mfc o %x g %x p %x dbg %p\n",
 1210                                     ntohl(mfccp->mfcc_origin.s_addr),
 1211                                     ntohl(mfccp->mfcc_mcastgrp.s_addr),
 1212                                     mfccp->mfcc_parent, rt->mfc_stall);
 1213
                              /*
                               * Detach the stall queue before draining it so the
                               * entry no longer holds any queued packets.
                               */
 1214                         rte = rt->mfc_stall;
 1215                         init_mfc_params(rt, mfccp);
 1216                         rt->mfc_stall = NULL;
 1217
 1218                         rt->mfc_expire = 0; /* Don't clean this guy up */
 1219                         nexpire[hash]--;
 1220
 1221                         /* free packets Qed at the end of this entry */
                              /*
                               * Packets that recorded an arrival interface are
                               * first passed to ip_mdq() with the now-complete
                               * entry; every packet and queue node is then freed.
                               */
 1222                         for (; rte != NULL; rte = nrte) {
 1223                                 nrte = rte->next;
 1224                                 if (rte->ifp) {
 1225 #ifdef RSVP_ISI
 1226                                         ip_mdq(rte->m, rte->ifp, rt, -1);
 1227 #else
 1228                                         ip_mdq(rte->m, rte->ifp, rt);
 1229 #endif /* RSVP_ISI */
 1230                                 }
 1231                                 m_freem(rte->m);
 1232 #ifdef UPCALL_TIMING
 1233                                 collate(&rte->t);
 1234 #endif /* UPCALL_TIMING */
 1235                                 free(rte, M_MRTABLE);
 1236                         }
 1237                 }
 1238         }
 1239
 1240         /*
 1241          * It is possible that an entry is being inserted without an upcall
 1242          */
 1243         if (nstl == 0) {
 1244                 /*
 1245                  * No mfc; make a new one
 1246                  */
 1247                 if (mrtdebug & DEBUG_MFC)
 1248                         log(LOG_DEBUG, "add_mfc no upcall o %x g %x p %x\n",
 1249                             ntohl(mfccp->mfcc_origin.s_addr),
 1250                             ntohl(mfccp->mfcc_mcastgrp.s_addr),
 1251                             mfccp->mfcc_parent);
 1252
                      /*
                       * Reuse a matching chain entry if one exists; rt is left
                       * NULL by LIST_FOREACH when the chain has no match.
                       */
 1253                 LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
 1254                         if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
 1255                             in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp)) {
 1256                                 init_mfc_params(rt, mfccp);
 1257                                 if (rt->mfc_expire)
 1258                                         nexpire[hash]--;
 1259                                 rt->mfc_expire = 0;
 1260                                 break; /* XXX */
 1261                         }
 1262                 }
 1263                 if (rt == NULL) {       /* no upcall, so make a new entry */
 1264                         rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE,
 1265                                                   M_NOWAIT);
 1266                         if (rt == NULL) {
 1267                                 splx(s);
 1268                                 return (ENOBUFS);
 1269                         }
 1270
 1271                         init_mfc_params(rt, mfccp);
 1272                         rt->mfc_expire  = 0;
 1273                         rt->mfc_stall   = NULL;
 1274                         rt->mfc_bw_meter = NULL;
 1275
 1276                         /* insert new entry at head of hash chain */
 1277                         LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash);
 1278                 }
 1279         }
 1280
 1281         splx(s);
 1282         return (0);
 1283 }
 1284 
 1285 #ifdef UPCALL_TIMING
 1286 /*
 1287  * Collect delay statistics on the upcalls: bump the upcall_data[]
 1288  * slot chosen by the TV_DELTA() between now and *t, shifted right by
 1289  * 10 and capped at 50.  Nothing is recorded unless *t is before now.
 1290  */
 1291 static void
 1292 collate(struct timeval *t)
 1293 {
 1294         struct timeval now;
 1295         u_int32_t delta, slot;
 1296
 1297         microtime(&now);
 1298         if (!timercmp(t, &now, <))
 1299                 return;
 1300
 1301         TV_DELTA(now, *t, delta);
 1302         slot = delta >> 10;
 1303         if (slot > 50)
 1304                 slot = 50;
 1305
 1306         ++upcall_data[slot];
 1307 }
 1308 #endif /* UPCALL_TIMING */
 1309 
 1310 /*
 1311  * Delete an mfc entry.  m carries a struct mfcctl naming the (origin,
 1312  * group) pair.  Returns EINVAL if m is missing or too short,
 1313  * EADDRNOTAVAIL if mfc_find() has no such entry, and 0 on success.
 1314  */
 1315 static int
 1316 del_mfc(struct mbuf *m)
 1317 {
 1318         struct mfcctl2 mfcctl2, *mfccp = &mfcctl2;
 1319         struct mfcctl *mp;
 1320         struct mfc *rt;
 1321         int s;
 1322         int mfcctl_size = sizeof(struct mfcctl);
 1323
 1324         /*
 1325          * XXX: for deleting MFC entries the information in entries
 1326          * of size "struct mfcctl" is sufficient.
 1327          */
 1328         if (m == NULL || m->m_len < mfcctl_size)
 1329                 return (EINVAL);
 1330
 1331         /* Only touch the mbuf data once m is known to be non-NULL. */
 1332         mp = mtod(m, struct mfcctl *);
 1333         bcopy(mp, (caddr_t)&mfcctl2, sizeof(*mp));
 1334         bzero((caddr_t)&mfcctl2 + sizeof(struct mfcctl),
 1335               sizeof(mfcctl2) - sizeof(struct mfcctl));
 1336
 1337         if (mrtdebug & DEBUG_MFC)
 1338                 log(LOG_DEBUG, "del_mfc origin %x mcastgrp %x\n",
 1339                     ntohl(mfccp->mfcc_origin.s_addr),
 1340                     ntohl(mfccp->mfcc_mcastgrp.s_addr));
 1341
 1342         s = splsoftnet();
 1343
 1344         rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp);
 1345         if (rt == NULL) {
 1346                 splx(s);
 1347                 return (EADDRNOTAVAIL);
 1348         }
 1349
 1350         /*
 1351          * free the bw_meter entries
 1352          */
 1353         free_bw_list(rt->mfc_bw_meter);
 1354         rt->mfc_bw_meter = NULL;
 1355
 1356         LIST_REMOVE(rt, mfc_hash);
 1357         free(rt, M_MRTABLE);
 1358
 1359         splx(s);
 1360         return (0);
 1361 }
 1362 
 1363 static int
 1364 socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src)
 1365 {
 1366         /* Queue mm on s's receive buffer; on failure mm is freed here. */
 1367         if (s == NULL || sbappendaddr(&s->so_rcv, sintosa(src), mm,
 1368             (struct mbuf *)NULL) == 0) {
 1369                 m_freem(mm);
 1370                 return (-1);
 1371         }
 1372
 1373         sorwakeup(s);
 1374         return (0);
 1375 }
 1376 
 1377 /*
 1378  * IP multicast forwarding function. This function assumes that the packet
 1379  * pointed to by "ip" has arrived on (or is about to be sent to) the interface
 1380  * pointed to by "ifp", and the packet is to be relayed to other networks
 1381  * that have members of the packet's destination IP multicast group.
 1382  *
 1383  * The packet is returned unscathed to the caller, unless it is
 1384  * erroneous, in which case a non-zero return value tells the caller to
 1385  * discard it.
       *
       * Return values as coded below: 1 for a source-routed tunnel packet;
       * the ip_mdq() result when a forwarding cache entry exists; ENOBUFS
       * when an allocation or copy fails or the upcall cannot be queued on
       * the ip_mrouter socket; 0 in every other case (TTL <= 1, local-only
       * group, no matching vif, stall queue overflow, or packet queued
       * awaiting the routing daemon).
 1386  */
 1387
 1388 #define IP_HDR_LEN  20  /* # bytes of fixed IP header (excluding options) */
 1389 #define TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */
 1390
 1391 int
 1392 #ifdef RSVP_ISI
 1393 ip_mforward(struct mbuf *m, struct ifnet *ifp, struct ip_moptions *imo)
 1394 #else
 1395 ip_mforward(struct mbuf *m, struct ifnet *ifp)
 1396 #endif /* RSVP_ISI */
 1397 {
 1398         struct ip *ip = mtod(m, struct ip *);
 1399         struct mfc *rt;
              /* srctun rate-limits the source-route log message below. */
 1400         static int srctun = 0;
 1401         struct mbuf *mm;
 1402         int s;
 1403         vifi_t vifi;
 1404
 1405         if (mrtdebug & DEBUG_FORWARD)
 1406                 log(LOG_DEBUG, "ip_mforward: src %x, dst %x, ifp %p\n",
 1407                     ntohl(ip->ip_src.s_addr), ntohl(ip->ip_dst.s_addr), ifp);
 1408
              /*
               * A header too short to hold the tunnel option, or whose second
               * option byte is not IPOPT_LSRR, is not a source-route tunnel.
               */
 1409         if (ip->ip_hl < (IP_HDR_LEN + TUNNEL_LEN) >> 2 ||
 1410             ((u_char *)(ip + 1))[1] != IPOPT_LSRR) {
 1411                 /*
 1412                  * Packet arrived via a physical interface or
 1413                  * an encapsulated tunnel or a register_vif.
 1414                  */
 1415         } else {
 1416                 /*
 1417                  * Packet arrived through a source-route tunnel.
 1418                  * Source-route tunnels are no longer supported.
 1419                  */
                      /* Log only every 1000th such packet. */
 1420                 if ((srctun++ % 1000) == 0)
 1421                         log(LOG_ERR,
 1422                             "ip_mforward: received source-routed packet from %x\n",
 1423                             ntohl(ip->ip_src.s_addr));
 1424
 1425                 return (1);
 1426         }
 1427
 1428         /*
 1429          * Clear any in-bound checksum flags for this packet.
 1430          */
 1431         m->m_pkthdr.csum_flags = 0;
 1432
 1433 #ifdef RSVP_ISI
 1434         if (imo && ((vifi = imo->imo_multicast_vif) < numvifs)) {
 1435                 if (ip->ip_ttl < MAXTTL)
 1436                         ip->ip_ttl++;   /* compensate for -1 in *_send routines */
 1437                 if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
 1438                         struct vif *vifp = viftable + vifi;
 1439                         printf("Sending IPPROTO_RSVP from %x to %x on vif %d (%s%s)\n",
 1440                             ntohl(ip->ip_src), ntohl(ip->ip_dst), vifi,
 1441                             (vifp->v_flags & VIFF_TUNNEL) ? "tunnel on " : "",
 1442                             vifp->v_ifp->if_xname);
 1443                 }
 1444                 return (ip_mdq(m, ifp, (struct mfc *)NULL, vifi));
 1445         }
 1446         if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
 1447                 printf("Warning: IPPROTO_RSVP from %x to %x without vif option\n",
 1448                     ntohl(ip->ip_src), ntohl(ip->ip_dst));
 1449         }
 1450 #endif /* RSVP_ISI */
 1451
 1452         /*
 1453          * Don't forward a packet with time-to-live of zero or one,
 1454          * or a packet destined to a local-only group.
 1455          */
 1456         if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ip->ip_dst.s_addr))
 1457                 return (0);
 1458
 1459         /*
 1460          * Determine forwarding vifs from the forwarding cache table
 1461          */
 1462         s = splsoftnet();
 1463         ++mrtstat.mrts_mfc_lookups;
 1464         rt = mfc_find(&ip->ip_src, &ip->ip_dst);
 1465
 1466         /* Entry exists, so forward if necessary */
 1467         if (rt != NULL) {
 1468                 splx(s);
 1469 #ifdef RSVP_ISI
 1470                 return (ip_mdq(m, ifp, rt, -1));
 1471 #else
 1472                 return (ip_mdq(m, ifp, rt));
 1473 #endif /* RSVP_ISI */
 1474         } else {
 1475                 /*
 1476                  * If we don't have a route for packet's origin,
 1477                  * Make a copy of the packet & send message to routing daemon
 1478                  */
 1479
 1480                 struct mbuf *mb0;
 1481                 struct rtdetq *rte;
 1482                 u_int32_t hash;
 1483                 int hlen = ip->ip_hl << 2;
 1484 #ifdef UPCALL_TIMING
 1485                 struct timeval tp;
 1486
 1487                 microtime(&tp);
 1488 #endif /* UPCALL_TIMING */
 1489
 1490                 ++mrtstat.mrts_mfc_misses;
 1491
 1492                 mrtstat.mrts_no_route++;
 1493                 if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC))
 1494                         log(LOG_DEBUG, "ip_mforward: no rte s %x g %x\n",
 1495                             ntohl(ip->ip_src.s_addr),
 1496                             ntohl(ip->ip_dst.s_addr));
 1497
 1498                 /*
 1499                  * Allocate mbufs early so that we don't do extra work if we are
 1500                  * just going to fail anyway.  Make sure to pullup the header so
 1501                  * that other people can't step on it.
 1502                  */
 1503                 rte = (struct rtdetq *)malloc(sizeof(*rte), M_MRTABLE,
 1504                                               M_NOWAIT);
 1505                 if (rte == NULL) {
 1506                         splx(s);
 1507                         return (ENOBUFS);
 1508                 }
                      /*
                       * NOTE(review): M_PULLUP runs on the m_copy() result before
                       * the NULL test; presumably the macro tolerates a NULL
                       * mbuf -- confirm against its definition.
                       */
 1509                 mb0 = m_copy(m, 0, M_COPYALL);
 1510                 M_PULLUP(mb0, hlen);
 1511                 if (mb0 == NULL) {
 1512                         free(rte, M_MRTABLE);
 1513                         splx(s);
 1514                         return (ENOBUFS);
 1515                 }
 1516
 1517                 /* is there an upcall waiting for this flow? */
 1518                 hash = MFCHASH(ip->ip_src, ip->ip_dst);
 1519                 LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
 1520                         if (in_hosteq(ip->ip_src, rt->mfc_origin) &&
 1521                             in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) &&
 1522                             rt->mfc_stall != NULL)
 1523                                 break;
 1524                 }
 1525
                      /* rt == NULL: first packet of this flow, so raise an upcall. */
 1526                 if (rt == NULL) {
 1527                         int i;
 1528                         struct igmpmsg *im;
 1529
 1530                         /*
 1531                          * Locate the vifi for the incoming interface for
 1532                          * this packet.
 1533                          * If none found, drop packet.
 1534                          */
 1535                         for (vifi = 0; vifi < numvifs &&
 1536                                  viftable[vifi].v_ifp != ifp; vifi++)
 1537                                 ;
                              /* non_fatal is a label inside the else branch below. */
 1538                         if (vifi >= numvifs) /* vif not found, drop packet */
 1539                                 goto non_fatal;
 1540
 1541                         /* no upcall, so make a new entry */
 1542                         rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE,
 1543                                                   M_NOWAIT);
 1544                         if (rt == NULL)
 1545                                 goto fail;
 1546
 1547                         /*
 1548                          * Make a copy of the header to send to the user level
 1549                          * process
 1550                          */
 1551                         mm = m_copy(m, 0, hlen);
 1552                         M_PULLUP(mm, hlen);
 1553                         if (mm == NULL)
 1554                                 goto fail1;
 1555
 1556                         /*
 1557                          * Send message to routing daemon to install
 1558                          * a route into the kernel table
 1559                          */
 1560
                              /* The igmpmsg is written over the copied IP header. */
 1561                         im = mtod(mm, struct igmpmsg *);
 1562                         im->im_msgtype = IGMPMSG_NOCACHE;
 1563                         im->im_mbz = 0;
 1564                         im->im_vif = vifi;
 1565
 1566                         mrtstat.mrts_upcalls++;
 1567
                              /*
                               * socket_send() frees mm itself when it fails, so the
                               * cleanup labels below only release rt, rte and mb0.
                               * fail1 and fail sit inside this if-body and are also
                               * reached by the gotos above.
                               */
 1568                         sin.sin_addr = ip->ip_src;
 1569                         if (socket_send(ip_mrouter, mm, &sin) < 0) {
 1570                                 log(LOG_WARNING,
 1571                                     "ip_mforward: ip_mrouter socket queue full\n");
 1572                                 ++mrtstat.mrts_upq_sockfull;
 1573                         fail1:
 1574                                 free(rt, M_MRTABLE);
 1575                         fail:
 1576                                 free(rte, M_MRTABLE);
 1577                                 m_freem(mb0);
 1578                                 splx(s);
 1579                                 return (ENOBUFS);
 1580                         }
 1581
 1582                         /* insert new entry at head of hash chain */
 1583                         rt->mfc_origin = ip->ip_src;
 1584                         rt->mfc_mcastgrp = ip->ip_dst;
 1585                         rt->mfc_pkt_cnt = 0;
 1586                         rt->mfc_byte_cnt = 0;
 1587                         rt->mfc_wrong_if = 0;
                              /* expire_upcalls() counts mfc_expire down to zero. */
 1588                         rt->mfc_expire = UPCALL_EXPIRE;
 1589                         nexpire[hash]++;
 1590                         for (i = 0; i < numvifs; i++) {
 1591                                 rt->mfc_ttls[i] = 0;
 1592                                 rt->mfc_flags[i] = 0;
 1593                         }
 1594                         rt->mfc_parent = -1;
 1595
 1596                         /* clear the RP address */
 1597                         rt->mfc_rp = zeroin_addr;
 1598
 1599                         rt->mfc_bw_meter = NULL;
 1600
 1601                         /* link into table */
 1602                         LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash);
 1603                         /* Add this entry to the end of the queue */
 1604                         rt->mfc_stall = rte;
 1605                 } else {
 1606                         /* determine if q has overflowed */
 1607                         struct rtdetq **p;
 1608                         int npkts = 0;
 1609
 1610                         /*
 1611                          * XXX ouch! we need to append to the list, but we
 1612                          * only have a pointer to the front, so we have to
 1613                          * scan the entire list every time.
 1614                          */
                              /*
                               * More than MAX_UPQ queued packets: count the overflow
                               * and drop this copy.  The non_fatal label is also the
                               * target of the "vif not found" goto above.
                               */
 1615                         for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next)
 1616                                 if (++npkts > MAX_UPQ) {
 1617                                         mrtstat.mrts_upq_ovflw++;
 1618                                 non_fatal:
 1619                                         free(rte, M_MRTABLE);
 1620                                         m_freem(mb0);
 1621                                         splx(s);
 1622                                         return (0);
 1623                                 }
 1624
 1625                         /* Add this entry to the end of the queue */
 1626                         *p = rte;
 1627                 }
 1628
                      /* Both paths end here: fill in the newly queued node. */
 1629                 rte->next = NULL;
 1630                 rte->m = mb0;
 1631                 rte->ifp = ifp;
 1632 #ifdef UPCALL_TIMING
 1633                 rte->t = tp;
 1634 #endif /* UPCALL_TIMING */
 1635
 1636                 splx(s);
 1637
 1638                 return (0);
 1639         }
 1640 }
 1641 
 1642 
 1643 /*ARGSUSED*/
 1644 static void
 1645 expire_upcalls(void *v)
 1646 {
 1647         struct mfc *rt, *nrt;
 1648         struct bw_meter *bm;
 1649         int bucket, s;
 1650
 1651         /*
 1652          * Age entries with a non-zero mfc_expire; tear down those that hit 0.
 1653          */
 1654         s = splsoftnet();
 1655
 1656         for (bucket = 0; bucket < MFCTBLSIZ; bucket++) {
 1657                 /* nexpire[] counts the expiring entries per bucket. */
 1658                 if (nexpire[bucket] == 0)
 1659                         continue;
 1660
 1661                 for (rt = LIST_FIRST(&mfchashtbl[bucket]); rt != NULL; rt = nrt) {
 1662                         nrt = LIST_NEXT(rt, mfc_hash);
 1663                         if (rt->mfc_expire == 0)
 1664                                 continue;
 1665                         if (--rt->mfc_expire > 0)
 1666                                 continue;
 1667                         nexpire[bucket]--;
 1668
 1669                         /* free the bw_meter entries */
 1670                         while ((bm = rt->mfc_bw_meter) != NULL) {
 1671                                 rt->mfc_bw_meter = bm->bm_mfc_next;
 1672                                 free(bm, M_BWMETER);
 1673                         }
 1674
 1675                         ++mrtstat.mrts_cache_cleanups;
 1676                         if (mrtdebug & DEBUG_EXPIRE)
 1677                                 log(LOG_DEBUG,
 1678                                     "expire_upcalls: expiring (%x %x)\n",
 1679                                     ntohl(rt->mfc_origin.s_addr),
 1680                                     ntohl(rt->mfc_mcastgrp.s_addr));
 1681
 1682                         expire_mfc(rt);
 1683                 }
 1684         }
 1685
 1686         splx(s);
 1687         callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
 1688             expire_upcalls, NULL);
 1689 }
 1690 
 1691 /*
 1692  * Packet forwarding routine once entry in the cache is made
 1693  */
 1694 static int
 1695 #ifdef RSVP_ISI
 1696 ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif)
 1697 #else
 1698 ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt)
 1699 #endif /* RSVP_ISI */
 1700 {
 1701         struct ip  *ip = mtod(m, struct ip *);
 1702         vifi_t vifi;
 1703         struct vif *vifp;
 1704         int plen = ntohs(ip->ip_len) - (ip->ip_hl << 2);
 1705 
 1706 /*
 1707  * Macro to send packet on vif.  Since RSVP packets don't get counted on
 1708  * input, they shouldn't get counted on output, so statistics keeping is
 1709  * separate.
 1710  */
 1711 #define MC_SEND(ip, vifp, m) do {                                       \
 1712         if ((vifp)->v_flags & VIFF_TUNNEL)                              \
 1713                 encap_send((ip), (vifp), (m));                          \
 1714         else                                                            \
 1715                 phyint_send((ip), (vifp), (m));                         \
 1716 } while (/*CONSTCOND*/ 0)
 1717 
 1718 #ifdef RSVP_ISI
 1719         /*
 1720          * If xmt_vif is not -1, send on only the requested vif.
 1721          *
 1722          * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.
 1723          */
 1724         if (xmt_vif < numvifs) {
 1725 #ifdef PIM
 1726                 if (viftable[xmt_vif].v_flags & VIFF_REGISTER)
 1727                         pim_register_send(ip, viftable + xmt_vif, m, rt);
 1728                 else
 1729 #endif
 1730                 MC_SEND(ip, viftable + xmt_vif, m);
 1731                 return (1);
 1732         }
 1733 #endif /* RSVP_ISI */
 1734 
 1735         /*
 1736          * Don't forward if it didn't arrive from the parent vif for its origin.
 1737          */
 1738         vifi = rt->mfc_parent;
 1739         if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) {
 1740                 /* came in the wrong interface */
 1741                 if (mrtdebug & DEBUG_FORWARD)
 1742                         log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n",
 1743                             ifp, vifi,
 1744                             vifi >= numvifs ? 0 : viftable[vifi].v_ifp);
 1745                 ++mrtstat.mrts_wrong_if;
 1746                 ++rt->mfc_wrong_if;
 1747                 /*
 1748                  * If we are doing PIM assert processing, send a message
 1749                  * to the routing daemon.
 1750                  *
 1751                  * XXX: A PIM-SM router needs the WRONGVIF detection so it
 1752                  * can complete the SPT switch, regardless of the type
 1753                  * of the iif (broadcast media, GRE tunnel, etc).
 1754                  */
 1755                 if (pim_assert && (vifi < numvifs) && viftable[vifi].v_ifp) {
 1756                         struct timeval now;
 1757                         u_int32_t delta;
 1758 
 1759 #ifdef PIM
 1760                         if (ifp == &multicast_register_if)
 1761                                 pimstat.pims_rcv_registers_wrongiif++;
 1762 #endif
 1763 
 1764                         /* Get vifi for the incoming packet */
 1765                         for (vifi = 0;
 1766                              vifi < numvifs && viftable[vifi].v_ifp != ifp;
 1767                              vifi++)
 1768                             ;
 1769                         if (vifi >= numvifs) {
 1770                                 /* The iif is not found: ignore the packet. */
 1771                                 return (0);
 1772                         }
 1773 
 1774                         if (rt->mfc_flags[vifi] &
 1775                             MRT_MFC_FLAGS_DISABLE_WRONGVIF) {
 1776                                 /* WRONGVIF disabled: ignore the packet */
 1777                                 return (0);
 1778                         }
 1779 
 1780                         microtime(&now);
 1781 
 1782                         TV_DELTA(rt->mfc_last_assert, now, delta);
 1783 
 1784                         if (delta > ASSERT_MSG_TIME) {
 1785                                 struct igmpmsg *im;
 1786                                 int hlen = ip->ip_hl << 2;
 1787                                 struct mbuf *mm = m_copy(m, 0, hlen);
 1788 
 1789                                 M_PULLUP(mm, hlen);
 1790                                 if (mm == NULL)
 1791                                         return (ENOBUFS);
 1792 
 1793                                 rt->mfc_last_assert = now;
 1794 
 1795                                 im = mtod(mm, struct igmpmsg *);
 1796                                 im->im_msgtype  = IGMPMSG_WRONGVIF;
 1797                                 im->im_mbz      = 0;
 1798                                 im->im_vif      = vifi;
 1799 
 1800                                 mrtstat.mrts_upcalls++;
 1801 
 1802                                 sin.sin_addr = im->im_src;
 1803                                 if (socket_send(ip_mrouter, mm, &sin) < 0) {
 1804                                         log(LOG_WARNING,
 1805                                             "ip_mforward: ip_mrouter socket queue full\n");
 1806                                         ++mrtstat.mrts_upq_sockfull;
 1807                                         return (ENOBUFS);
 1808                                 }
 1809                         }
 1810                 }
 1811                 return (0);
 1812         }
 1813 
 1814         /* If I sourced this packet, it counts as output, else it was input. */
 1815         if (in_hosteq(ip->ip_src, viftable[vifi].v_lcl_addr)) {
 1816                 viftable[vifi].v_pkt_out++;
 1817                 viftable[vifi].v_bytes_out += plen;
 1818         } else {
 1819                 viftable[vifi].v_pkt_in++;
 1820                 viftable[vifi].v_bytes_in += plen;
 1821         }
 1822         rt->mfc_pkt_cnt++;
 1823         rt->mfc_byte_cnt += plen;
 1824 
 1825         /*
 1826          * For each vif, decide if a copy of the packet should be forwarded.
 1827          * Forward if:
 1828          *              - the ttl exceeds the vif's threshold
 1829          *              - there are group members downstream on interface
 1830          */
 1831         for (vifp = viftable, vifi = 0; vifi < numvifs; vifp++, vifi++)
 1832                 if ((rt->mfc_ttls[vifi] > 0) &&
 1833                         (ip->ip_ttl > rt->mfc_ttls[vifi])) {
 1834                         vifp->v_pkt_out++;
 1835                         vifp->v_bytes_out += plen;
 1836 #ifdef PIM
 1837                         if (vifp->v_flags & VIFF_REGISTER)
 1838                                 pim_register_send(ip, vifp, m, rt);
 1839                         else
 1840 #endif
 1841                         MC_SEND(ip, vifp, m);
 1842                 }
 1843 
 1844         /*
 1845          * Perform upcall-related bw measuring.
 1846          */
 1847         if (rt->mfc_bw_meter != NULL) {
 1848                 struct bw_meter *x;
 1849                 struct timeval now;
 1850 
 1851                 microtime(&now);
 1852                 for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next)
 1853                         bw_meter_receive_packet(x, plen, &now);
 1854         }
 1855 
 1856         return (0);
 1857 }
 1858 
 1859 #ifdef RSVP_ISI
 1860 /*
 1861  * check if a vif number is legal/ok. This is used by ip_output.
 1862  */
 1863 int
 1864 legal_vif_num(int vif)
 1865 {
 1866         if (vif >= 0 && vif < numvifs)
 1867                 return (1);
 1868         else
 1869                 return (0);
 1870 }
 1871 #endif /* RSVP_ISI */
 1872 
 1873 static void
 1874 phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
 1875 {
 1876         struct mbuf *mb_copy;
 1877         int hlen = ip->ip_hl << 2;
 1878 
 1879         /*
 1880          * Make a new reference to the packet; make sure that
 1881          * the IP header is actually copied, not just referenced,
 1882          * so that ip_output() only scribbles on the copy.
 1883          */
 1884         mb_copy = m_copy(m, 0, M_COPYALL);
 1885         M_PULLUP(mb_copy, hlen);
 1886         if (mb_copy == NULL)
 1887                 return;
 1888 
 1889         if (vifp->v_rate_limit <= 0)
 1890                 tbf_send_packet(vifp, mb_copy);
 1891         else
 1892                 tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *),
 1893                     ntohs(ip->ip_len));
 1894 }
 1895 
 1896 static void
 1897 encap_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
 1898 {
 1899         struct mbuf *mb_copy;
 1900         struct ip *ip_copy;
 1901         int i, len = ntohs(ip->ip_len) + sizeof(multicast_encap_iphdr);
 1902 
 1903         /* Take care of delayed checksums */
 1904         if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
 1905                 in_delayed_cksum(m);
 1906                 m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
 1907         }
 1908 
 1909         /*
 1910          * copy the old packet & pullup it's IP header into the
 1911          * new mbuf so we can modify it.  Try to fill the new
 1912          * mbuf since if we don't the ethernet driver will.
 1913          */
 1914         MGETHDR(mb_copy, M_DONTWAIT, MT_DATA);
 1915         if (mb_copy == NULL)
 1916                 return;
 1917         mb_copy->m_data += max_linkhdr;
 1918         mb_copy->m_pkthdr.len = len;
 1919         mb_copy->m_len = sizeof(multicast_encap_iphdr);
 1920 
 1921         if ((mb_copy->m_next = m_copy(m, 0, M_COPYALL)) == NULL) {
 1922                 m_freem(mb_copy);
 1923                 return;
 1924         }
 1925         i = MHLEN - max_linkhdr;
 1926         if (i > len)
 1927                 i = len;
 1928         mb_copy = m_pullup(mb_copy, i);
 1929         if (mb_copy == NULL)
 1930                 return;
 1931 
 1932         /*
 1933          * fill in the encapsulating IP header.
 1934          */
 1935         ip_copy = mtod(mb_copy, struct ip *);
 1936         *ip_copy = multicast_encap_iphdr;
 1937         ip_copy->ip_id = ip_newid();
 1938         ip_copy->ip_len = htons(len);
 1939         ip_copy->ip_src = vifp->v_lcl_addr;
 1940         ip_copy->ip_dst = vifp->v_rmt_addr;
 1941 
 1942         /*
 1943          * turn the encapsulated IP header back into a valid one.
 1944          */
 1945         ip = (struct ip *)((caddr_t)ip_copy + sizeof(multicast_encap_iphdr));
 1946         --ip->ip_ttl;
 1947         ip->ip_sum = 0;
 1948         mb_copy->m_data += sizeof(multicast_encap_iphdr);
 1949         ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
 1950         mb_copy->m_data -= sizeof(multicast_encap_iphdr);
 1951 
 1952         if (vifp->v_rate_limit <= 0)
 1953                 tbf_send_packet(vifp, mb_copy);
 1954         else
 1955                 tbf_control(vifp, mb_copy, ip, ntohs(ip_copy->ip_len));
 1956 }
 1957 
 1958 /*
 1959  * De-encapsulate a packet and feed it back through ip input.
 1960  */
 1961 static void
 1962 vif_input(struct mbuf *m, ...)
 1963 {
 1964         int off, proto;
 1965         va_list ap;
 1966         struct vif *vifp;
 1967         int s;
 1968         struct ifqueue *ifq;
 1969 
 1970         va_start(ap, m);
 1971         off = va_arg(ap, int);
 1972         proto = va_arg(ap, int);
 1973         va_end(ap);
 1974 
 1975         vifp = (struct vif *)encap_getarg(m);
 1976         if (!vifp || proto != ENCAP_PROTO) {
 1977                 m_freem(m);
 1978                 mrtstat.mrts_bad_tunnel++;
 1979                 return;
 1980         }
 1981 
 1982         m_adj(m, off);
 1983         m->m_pkthdr.rcvif = vifp->v_ifp;
 1984         ifq = &ipintrq;
 1985         s = splnet();
 1986         if (IF_QFULL(ifq)) {
 1987                 IF_DROP(ifq);
 1988                 m_freem(m);
 1989         } else {
 1990                 IF_ENQUEUE(ifq, m);
 1991                 /*
 1992                  * normally we would need a "schednetisr(NETISR_IP)"
 1993                  * here but we were called by ip_input and it is going
 1994                  * to loop back & try to dequeue the packet we just
 1995                  * queued as soon as we return so we avoid the
 1996                  * unnecessary software interrrupt.
 1997                  */
 1998         }
 1999         splx(s);
 2000 }
 2001 
 2002 /*
 2003  * Check if the packet should be received on the vif denoted by arg.
 2004  * (The encap selection code will call this once per vif since each is
 2005  * registered separately.)
 2006  */
 2007 static int
 2008 vif_encapcheck(struct mbuf *m, int off, int proto, void *arg)
 2009 {
 2010         struct vif *vifp;
 2011         struct ip ip;
 2012 
 2013 #ifdef DIAGNOSTIC
 2014         if (!arg || proto != IPPROTO_IPV4)
 2015                 panic("unexpected arg in vif_encapcheck");
 2016 #endif
 2017 
 2018         /*
 2019          * Accept the packet only if the inner heaader is multicast
 2020          * and the outer header matches a tunnel-mode vif.  Order
 2021          * checks in the hope that common non-matching packets will be
 2022          * rejected quickly.  Assume that unicast IPv4 traffic in a
 2023          * parallel tunnel (e.g. gif(4)) is unlikely.
 2024          */
 2025 
 2026         /* Obtain the outer IP header and the vif pointer. */
 2027         m_copydata((struct mbuf *)m, 0, sizeof(ip), (caddr_t)&ip);
 2028         vifp = (struct vif *)arg;
 2029 
 2030         /*
 2031          * The outer source must match the vif's remote peer address.
 2032          * For a multicast router with several tunnels, this is the
 2033          * only check that will fail on packets in other tunnels,
 2034          * assuming the local address is the same.         
 2035          */
 2036         if (!in_hosteq(vifp->v_rmt_addr, ip.ip_src))
 2037                 return 0;
 2038 
 2039         /* The outer destination must match the vif's local address. */
 2040         if (!in_hosteq(vifp->v_lcl_addr, ip.ip_dst))
 2041                 return 0;
 2042 
 2043         /* The vif must be of tunnel type. */
 2044         if ((vifp->v_flags & VIFF_TUNNEL) == 0)
 2045                 return 0;
 2046 
 2047         /* Check that the inner destination is multicast. */
 2048         m_copydata((struct mbuf *)m, off, sizeof(ip), (caddr_t)&ip);
 2049         if (!IN_MULTICAST(ip.ip_dst.s_addr))
 2050                 return 0;
 2051 
 2052         /*
 2053          * We have checked that both the outer src and dst addresses
 2054          * match the vif, and that the inner destination is multicast
 2055          * (224/5).  By claiming more than 64, we intend to
 2056          * preferentially take packets that also match a parallel
 2057          * gif(4).
 2058          */
 2059         return 32 + 32 + 5;
 2060 }
 2061 
 2062 /*
 2063  * Token bucket filter module
 2064  */
 2065 static void
 2066 tbf_control(struct vif *vifp, struct mbuf *m, struct ip *ip, u_int32_t len)
 2067 {
 2068 
 2069         if (len > MAX_BKT_SIZE) {
 2070                 /* drop if packet is too large */
 2071                 mrtstat.mrts_pkt2large++;
 2072                 m_freem(m);
 2073                 return;
 2074         }
 2075 
 2076         tbf_update_tokens(vifp);
 2077 
 2078         /*
 2079          * If there are enough tokens, and the queue is empty, send this packet
 2080          * out immediately.  Otherwise, try to insert it on this vif's queue.
 2081          */
 2082         if (vifp->tbf_q_len == 0) {
 2083                 if (len <= vifp->tbf_n_tok) {
 2084                         vifp->tbf_n_tok -= len;
 2085                         tbf_send_packet(vifp, m);
 2086                 } else {
 2087                         /* queue packet and timeout till later */
 2088                         tbf_queue(vifp, m);
 2089                         callout_reset(&vifp->v_repq_ch, TBF_REPROCESS,
 2090                             tbf_reprocess_q, vifp);
 2091                 }
 2092         } else {
 2093                 if (vifp->tbf_q_len >= vifp->tbf_max_q_len &&
 2094                     !tbf_dq_sel(vifp, ip)) {
 2095                         /* queue full, and couldn't make room */
 2096                         mrtstat.mrts_q_overflow++;
 2097                         m_freem(m);
 2098                 } else {
 2099                         /* queue length low enough, or made room */
 2100                         tbf_queue(vifp, m);
 2101                         tbf_process_q(vifp);
 2102                 }
 2103         }
 2104 }
 2105 
 2106 /*
 2107  * adds a packet to the queue at the interface
 2108  */
 2109 static void
 2110 tbf_queue(struct vif *vifp, struct mbuf *m)
 2111 {
 2112         int s = splsoftnet();
 2113 
 2114         /* insert at tail */
 2115         *vifp->tbf_t = m;
 2116         vifp->tbf_t = &m->m_nextpkt;
 2117         vifp->tbf_q_len++;
 2118 
 2119         splx(s);
 2120 }
 2121 
 2122 
 2123 /*
 2124  * processes the queue at the interface
 2125  */
 2126 static void
 2127 tbf_process_q(struct vif *vifp)
 2128 {
 2129         struct mbuf *m;
 2130         int len;
 2131         int s = splsoftnet();
 2132 
 2133         /*
 2134          * Loop through the queue at the interface and send as many packets
 2135          * as possible.
 2136          */
 2137         for (m = vifp->tbf_q; m != NULL; m = vifp->tbf_q) {
 2138                 len = ntohs(mtod(m, struct ip *)->ip_len);
 2139 
 2140                 /* determine if the packet can be sent */
 2141                 if (len <= vifp->tbf_n_tok) {
 2142                         /* if so,
 2143                          * reduce no of tokens, dequeue the packet,
 2144                          * send the packet.
 2145                          */
 2146                         if ((vifp->tbf_q = m->m_nextpkt) == NULL)
 2147                                 vifp->tbf_t = &vifp->tbf_q;
 2148                         --vifp->tbf_q_len;
 2149 
 2150                         m->m_nextpkt = NULL;
 2151                         vifp->tbf_n_tok -= len;
 2152                         tbf_send_packet(vifp, m);
 2153                 } else
 2154                         break;
 2155         }
 2156         splx(s);
 2157 }
 2158 
 2159 static void
 2160 tbf_reprocess_q(void *arg)
 2161 {
 2162         struct vif *vifp = arg;
 2163 
 2164         if (ip_mrouter == NULL)
 2165                 return;
 2166 
 2167         tbf_update_tokens(vifp);
 2168         tbf_process_q(vifp);
 2169 
 2170         if (vifp->tbf_q_len != 0)
 2171                 callout_reset(&vifp->v_repq_ch, TBF_REPROCESS,
 2172                     tbf_reprocess_q, vifp);
 2173 }
 2174 
 2175 /* function that will selectively discard a member of the queue
 2176  * based on the precedence value and the priority
 2177  */
 2178 static int
 2179 tbf_dq_sel(struct vif *vifp, struct ip *ip)
 2180 {
 2181         u_int p;
 2182         struct mbuf **mp, *m;
 2183         int s = splsoftnet();
 2184 
 2185         p = priority(vifp, ip);
 2186 
 2187         for (mp = &vifp->tbf_q, m = *mp;
 2188             m != NULL;
 2189             mp = &m->m_nextpkt, m = *mp) {
 2190                 if (p > priority(vifp, mtod(m, struct ip *))) {
 2191                         if ((*mp = m->m_nextpkt) == NULL)
 2192                                 vifp->tbf_t = mp;
 2193                         --vifp->tbf_q_len;
 2194 
 2195                         m_freem(m);
 2196                         mrtstat.mrts_drop_sel++;
 2197                         splx(s);
 2198                         return (1);
 2199                 }
 2200         }
 2201         splx(s);
 2202         return (0);
 2203 }
 2204 
 2205 static void
 2206 tbf_send_packet(struct vif *vifp, struct mbuf *m)
 2207 {
 2208         int error;
 2209         int s = splsoftnet();
 2210 
 2211         if (vifp->v_flags & VIFF_TUNNEL) {
 2212                 /* If tunnel options */
 2213                 ip_output(m, (struct mbuf *)NULL, &vifp->v_route,
 2214                     IP_FORWARDING, (struct ip_moptions *)NULL,
 2215                     (struct socket *)NULL);
 2216         } else {
 2217                 /* if physical interface option, extract the options and then send */
 2218                 struct ip_moptions imo;
 2219 
 2220                 imo.imo_multicast_ifp = vifp->v_ifp;
 2221                 imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1;
 2222                 imo.imo_multicast_loop = 1;
 2223 #ifdef RSVP_ISI
 2224                 imo.imo_multicast_vif = -1;
 2225 #endif
 2226 
 2227                 error = ip_output(m, (struct mbuf *)NULL, (struct route *)NULL,
 2228                     IP_FORWARDING|IP_MULTICASTOPTS, &imo,
 2229                     (struct socket *)NULL);
 2230 
 2231                 if (mrtdebug & DEBUG_XMIT)
 2232                         log(LOG_DEBUG, "phyint_send on vif %ld err %d\n",
 2233                             (long)(vifp - viftable), error);
 2234         }
 2235         splx(s);
 2236 }
 2237 
 2238 /* determine the current time and then
 2239  * the elapsed time (between the last time and time now)
 2240  * in milliseconds & update the no. of tokens in the bucket
 2241  */
 2242 static void
 2243 tbf_update_tokens(struct vif *vifp)
 2244 {
 2245         struct timeval tp;
 2246         u_int32_t tm;
 2247         int s = splsoftnet();
 2248 
 2249         microtime(&tp);
 2250 
 2251         TV_DELTA(tp, vifp->tbf_last_pkt_t, tm);
 2252 
 2253         /*
 2254          * This formula is actually
 2255          * "time in seconds" * "bytes/second".
 2256          *
 2257          * (tm / 1000000) * (v_rate_limit * 1000 * (1000/1024) / 8)
 2258          *
 2259          * The (1000/1024) was introduced in add_vif to optimize
 2260          * this divide into a shift.
 2261          */
 2262         vifp->tbf_n_tok += tm * vifp->v_rate_limit / 8192;
 2263         vifp->tbf_last_pkt_t = tp;
 2264 
 2265         if (vifp->tbf_n_tok > MAX_BKT_SIZE)
 2266                 vifp->tbf_n_tok = MAX_BKT_SIZE;
 2267 
 2268         splx(s);
 2269 }
 2270 
 2271 static int
 2272 priority(struct vif *vifp, struct ip *ip)
 2273 {
 2274         int prio = 50;  /* the lowest priority -- default case */
 2275 
 2276         /* temporary hack; may add general packet classifier some day */
 2277 
 2278         /*
 2279          * The UDP port space is divided up into four priority ranges:
 2280          * [0, 16384)     : unclassified - lowest priority
 2281          * [16384, 32768) : audio - highest priority
 2282          * [32768, 49152) : whiteboard - medium priority
 2283          * [49152, 65536) : video - low priority
 2284          */
 2285         if (ip->ip_p == IPPROTO_UDP) {
 2286                 struct udphdr *udp = (struct udphdr *)(((char *)ip) + (ip->ip_hl << 2));
 2287 
 2288                 switch (ntohs(udp->uh_dport) & 0xc000) {
 2289                 case 0x4000:
 2290                         prio = 70;
 2291                         break;
 2292                 case 0x8000:
 2293                         prio = 60;
 2294                         break;
 2295                 case 0xc000:
 2296                         prio = 55;
 2297                         break;
 2298                 }
 2299 
 2300                 if (tbfdebug > 1)
 2301                         log(LOG_DEBUG, "port %x prio %d\n",
 2302                             ntohs(udp->uh_dport), prio);
 2303         }
 2304 
 2305         return (prio);
 2306 }
 2307 
 2308 /*
 2309  * End of token bucket filter modifications
 2310  */
 2311 #ifdef RSVP_ISI
 2312 int
 2313 ip_rsvp_vif_init(struct socket *so, struct mbuf *m)
 2314 {
 2315         int vifi, s;
 2316 
 2317         if (rsvpdebug)
 2318                 printf("ip_rsvp_vif_init: so_type = %d, pr_protocol = %d\n",
 2319                     so->so_type, so->so_proto->pr_protocol);
 2320 
 2321         if (so->so_type != SOCK_RAW ||
 2322             so->so_proto->pr_protocol != IPPROTO_RSVP)
 2323                 return (EOPNOTSUPP);
 2324 
 2325         /* Check mbuf. */
 2326         if (m == NULL || m->m_len != sizeof(int)) {
 2327                 return (EINVAL);
 2328         }
 2329         vifi = *(mtod(m, int *));
 2330 
 2331         if (rsvpdebug)
 2332                 printf("ip_rsvp_vif_init: vif = %d rsvp_on = %d\n",
 2333                        vifi, rsvp_on);
 2334 
 2335         s = splsoftnet();
 2336 
 2337         /* Check vif. */
 2338         if (!legal_vif_num(vifi)) {
 2339                 splx(s);
 2340                 return (EADDRNOTAVAIL);
 2341         }
 2342 
 2343         /* Check if socket is available. */
 2344         if (viftable[vifi].v_rsvpd != NULL) {
 2345                 splx(s);
 2346                 return (EADDRINUSE);
 2347         }
 2348 
 2349         viftable[vifi].v_rsvpd = so;
 2350         /*
 2351          * This may seem silly, but we need to be sure we don't over-increment
 2352          * the RSVP counter, in case something slips up.
 2353          */
 2354         if (!viftable[vifi].v_rsvp_on) {
 2355                 viftable[vifi].v_rsvp_on = 1;
 2356                 rsvp_on++;
 2357         }
 2358 
 2359         splx(s);
 2360         return (0);
 2361 }
 2362 
 2363 int
 2364 ip_rsvp_vif_done(struct socket *so, struct mbuf *m)
 2365 {
 2366         int vifi, s;
 2367 
 2368         if (rsvpdebug)
 2369                 printf("ip_rsvp_vif_done: so_type = %d, pr_protocol = %d\n",
 2370                     so->so_type, so->so_proto->pr_protocol);
 2371 
 2372         if (so->so_type != SOCK_RAW ||
 2373             so->so_proto->pr_protocol != IPPROTO_RSVP)
 2374                 return (EOPNOTSUPP);
 2375 
 2376         /* Check mbuf. */
 2377         if (m == NULL || m->m_len != sizeof(int)) {
 2378                 return (EINVAL);
 2379         }
 2380         vifi = *(mtod(m, int *));
 2381 
 2382         s = splsoftnet();
 2383 
 2384         /* Check vif. */
 2385         if (!legal_vif_num(vifi)) {
 2386                 splx(s);
 2387                 return (EADDRNOTAVAIL);
 2388         }
 2389 
 2390         if (rsvpdebug)
 2391                 printf("ip_rsvp_vif_done: v_rsvpd = %x so = %x\n",
 2392                     viftable[vifi].v_rsvpd, so);
 2393 
 2394         viftable[vifi].v_rsvpd = NULL;
 2395         /*
 2396          * This may seem silly, but we need to be sure we don't over-decrement
 2397          * the RSVP counter, in case something slips up.
 2398          */
 2399         if (viftable[vifi].v_rsvp_on) {
 2400                 viftable[vifi].v_rsvp_on = 0;
 2401                 rsvp_on--;
 2402         }
 2403 
 2404         splx(s);
 2405         return (0);
 2406 }
 2407 
 2408 void
 2409 ip_rsvp_force_done(struct socket *so)
 2410 {
 2411         int vifi, s;
 2412 
 2413         /* Don't bother if it is not the right type of socket. */
 2414         if (so->so_type != SOCK_RAW ||
 2415             so->so_proto->pr_protocol != IPPROTO_RSVP)
 2416                 return;
 2417 
 2418         s = splsoftnet();
 2419 
 2420         /*
 2421          * The socket may be attached to more than one vif...this
 2422          * is perfectly legal.
 2423          */
 2424         for (vifi = 0; vifi < numvifs; vifi++) {
 2425                 if (viftable[vifi].v_rsvpd == so) {
 2426                         viftable[vifi].v_rsvpd = NULL;
 2427                         /*
 2428                          * This may seem silly, but we need to be sure we don't
 2429                          * over-decrement the RSVP counter, in case something
 2430                          * slips up.
 2431                          */
 2432                         if (viftable[vifi].v_rsvp_on) {
 2433                                 viftable[vifi].v_rsvp_on = 0;
 2434                                 rsvp_on--;
 2435                         }
 2436                 }
 2437         }
 2438 
 2439         splx(s);
 2440         return;
 2441 }
 2442 
 2443 void
 2444 rsvp_input(struct mbuf *m, struct ifnet *ifp)
 2445 {
 2446         int vifi, s;
 2447         struct ip *ip = mtod(m, struct ip *);
 2448         static struct sockaddr_in rsvp_src = { sizeof(sin), AF_INET };
 2449 
 2450         if (rsvpdebug)
 2451                 printf("rsvp_input: rsvp_on %d\n", rsvp_on);
 2452 
 2453         /*
 2454          * Can still get packets with rsvp_on = 0 if there is a local member
 2455          * of the group to which the RSVP packet is addressed.  But in this
 2456          * case we want to throw the packet away.
 2457          */
 2458         if (!rsvp_on) {
 2459                 m_freem(m);
 2460                 return;
 2461         }
 2462 
 2463         /*
 2464          * If the old-style non-vif-associated socket is set, then use
 2465          * it and ignore the new ones.
 2466          */
 2467         if (ip_rsvpd != NULL) {
 2468                 if (rsvpdebug)
 2469                         printf("rsvp_input: "
 2470                             "Sending packet up old-style socket\n");
 2471                 rip_input(m);   /*XXX*/
 2472                 return;
 2473         }
 2474 
 2475         s = splsoftnet();
 2476 
 2477         if (rsvpdebug)
 2478                 printf("rsvp_input: check vifs\n");
 2479 
 2480         /* Find which vif the packet arrived on. */
 2481         for (vifi = 0; vifi < numvifs; vifi++) {
 2482                 if (viftable[vifi].v_ifp == ifp)
 2483                         break;
 2484         }
 2485 
 2486         if (vifi == numvifs) {
 2487                 /* Can't find vif packet arrived on. Drop packet. */
 2488                 if (rsvpdebug)
 2489                         printf("rsvp_input: "
 2490                             "Can't find vif for packet...dropping it.\n");
 2491                 m_freem(m);
 2492                 splx(s);
 2493                 return;
 2494         }
 2495 
 2496         if (rsvpdebug)
 2497                 printf("rsvp_input: check socket\n");
 2498 
 2499         if (viftable[vifi].v_rsvpd == NULL) {
 2500                 /*
 2501                  * drop packet, since there is no specific socket for this
 2502                  * interface
 2503                  */
 2504                 if (rsvpdebug)
 2505                         printf("rsvp_input: No socket defined for vif %d\n",
 2506                             vifi);
 2507                 m_freem(m);
 2508                 splx(s);
 2509                 return;
 2510         }
 2511 
 2512         rsvp_src.sin_addr = ip->ip_src;
 2513 
 2514         if (rsvpdebug && m)
 2515                 printf("rsvp_input: m->m_len = %d, sbspace() = %d\n",
 2516                     m->m_len, sbspace(&viftable[vifi].v_rsvpd->so_rcv));
 2517 
 2518         if (socket_send(viftable[vifi].v_rsvpd, m, &rsvp_src) < 0)
 2519                 if (rsvpdebug)
 2520                         printf("rsvp_input: Failed to append to socket\n");
 2521         else
 2522                 if (rsvpdebug)
 2523                         printf("rsvp_input: send packet up\n");
 2524 
 2525         splx(s);
 2526 }
 2527 #endif /* RSVP_ISI */
 2528 
 2529 /*
 2530  * Code for bandwidth monitors
 2531  */
 2532 
/*
 * Define common interface for timeval-related methods
 *
 * These are thin wrappers over timercmp()/timersub()/timeradd():
 *   BW_TIMEVALCMP(tvp, uvp, cmp)  compares *tvp with *uvp using operator cmp
 *   BW_TIMEVALDECR(vvp, uvp)      subtracts *uvp from *vvp, result in *vvp
 *   BW_TIMEVALADD(vvp, uvp)       adds *uvp to *vvp, result in *vvp
 */
#define BW_TIMEVALCMP(tvp, uvp, cmp) timercmp((tvp), (uvp), cmp)
#define BW_TIMEVALDECR(vvp, uvp) timersub((vvp), (uvp), (vvp))
#define BW_TIMEVALADD(vvp, uvp) timeradd((vvp), (uvp), (vvp))
 2539 
 2540 static uint32_t
 2541 compute_bw_meter_flags(struct bw_upcall *req)
 2542 {
 2543     uint32_t flags = 0;
 2544 
 2545     if (req->bu_flags & BW_UPCALL_UNIT_PACKETS)
 2546         flags |= BW_METER_UNIT_PACKETS;
 2547     if (req->bu_flags & BW_UPCALL_UNIT_BYTES)
 2548         flags |= BW_METER_UNIT_BYTES;
 2549     if (req->bu_flags & BW_UPCALL_GEQ)
 2550         flags |= BW_METER_GEQ;
 2551     if (req->bu_flags & BW_UPCALL_LEQ)
 2552         flags |= BW_METER_LEQ;
 2553 
 2554     return flags;
 2555 }
 2556 
 2557 /*
 2558  * Add a bw_meter entry
 2559  */
 2560 static int
 2561 add_bw_upcall(struct mbuf *m)
 2562 {
 2563     int s;
 2564     struct mfc *mfc;
 2565     struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC,
 2566                 BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC };
 2567     struct timeval now;
 2568     struct bw_meter *x;
 2569     uint32_t flags;
 2570     struct bw_upcall *req;
 2571 
 2572     if (m == NULL || m->m_len < sizeof(struct bw_upcall))
 2573         return EINVAL;
 2574 
 2575     req = mtod(m, struct bw_upcall *);
 2576 
 2577     if (!(mrt_api_config & MRT_MFC_BW_UPCALL))
 2578         return EOPNOTSUPP;
 2579 
 2580     /* Test if the flags are valid */
 2581     if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES)))
 2582         return EINVAL;
 2583     if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)))
 2584         return EINVAL;
 2585     if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
 2586             == (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
 2587         return EINVAL;
 2588 
 2589     /* Test if the threshold time interval is valid */
 2590     if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <))
 2591         return EINVAL;
 2592 
 2593     flags = compute_bw_meter_flags(req);
 2594 
 2595     /*
 2596      * Find if we have already same bw_meter entry
 2597      */
 2598     s = splsoftnet();
 2599     mfc = mfc_find(&req->bu_src, &req->bu_dst);
 2600     if (mfc == NULL) {
 2601         splx(s);
 2602         return EADDRNOTAVAIL;
 2603     }
 2604     for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) {
 2605         if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
 2606                            &req->bu_threshold.b_time, ==)) &&
 2607             (x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
 2608             (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
 2609             (x->bm_flags & BW_METER_USER_FLAGS) == flags)  {
 2610             splx(s);
 2611             return 0;           /* XXX Already installed */
 2612         }
 2613     }
 2614 
 2615     /* Allocate the new bw_meter entry */
 2616     x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT);
 2617     if (x == NULL) {
 2618         splx(s);
 2619         return ENOBUFS;
 2620     }
 2621 
 2622     /* Set the new bw_meter entry */
 2623     x->bm_threshold.b_time = req->bu_threshold.b_time;
 2624     microtime(&now);
 2625     x->bm_start_time = now;
 2626     x->bm_threshold.b_packets = req->bu_threshold.b_packets;
 2627     x->bm_threshold.b_bytes = req->bu_threshold.b_bytes;
 2628     x->bm_measured.b_packets = 0;
 2629     x->bm_measured.b_bytes = 0;
 2630     x->bm_flags = flags;
 2631     x->bm_time_next = NULL;
 2632     x->bm_time_hash = BW_METER_BUCKETS;
 2633 
 2634     /* Add the new bw_meter entry to the front of entries for this MFC */
 2635     x->bm_mfc = mfc;
 2636     x->bm_mfc_next = mfc->mfc_bw_meter;
 2637     mfc->mfc_bw_meter = x;
 2638     schedule_bw_meter(x, &now);
 2639     splx(s);
 2640 
 2641     return 0;
 2642 }
 2643 
 2644 static void
 2645 free_bw_list(struct bw_meter *list)
 2646 {
 2647     while (list != NULL) {
 2648         struct bw_meter *x = list;
 2649 
 2650         list = list->bm_mfc_next;
 2651         unschedule_bw_meter(x);
 2652         free(x, M_BWMETER);
 2653     }
 2654 }
 2655 
 2656 /*
 2657  * Delete one or multiple bw_meter entries
 2658  */
 2659 static int
 2660 del_bw_upcall(struct mbuf *m)
 2661 {
 2662     int s;
 2663     struct mfc *mfc;
 2664     struct bw_meter *x;
 2665     struct bw_upcall *req;
 2666 
 2667     if (m == NULL || m->m_len < sizeof(struct bw_upcall))
 2668         return EINVAL;
 2669 
 2670     req = mtod(m, struct bw_upcall *);
 2671 
 2672     if (!(mrt_api_config & MRT_MFC_BW_UPCALL))
 2673         return EOPNOTSUPP;
 2674 
 2675     s = splsoftnet();
 2676     /* Find the corresponding MFC entry */
 2677     mfc = mfc_find(&req->bu_src, &req->bu_dst);
 2678     if (mfc == NULL) {
 2679         splx(s);
 2680         return EADDRNOTAVAIL;
 2681     } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) {
 2682         /*
 2683          * Delete all bw_meter entries for this mfc
 2684          */
 2685         struct bw_meter *list;
 2686 
 2687         list = mfc->mfc_bw_meter;
 2688         mfc->mfc_bw_meter = NULL;
 2689         free_bw_list(list);
 2690         splx(s);
 2691         return 0;
 2692     } else {                    /* Delete a single bw_meter entry */
 2693         struct bw_meter *prev;
 2694         uint32_t flags = 0;
 2695 
 2696         flags = compute_bw_meter_flags(req);
 2697 
 2698         /* Find the bw_meter entry to delete */
 2699         for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL;
 2700              prev = x, x = x->bm_mfc_next) {
 2701             if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
 2702                                &req->bu_threshold.b_time, ==)) &&
 2703                 (x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
 2704                 (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
 2705                 (x->bm_flags & BW_METER_USER_FLAGS) == flags)
 2706                 break;
 2707         }
 2708         if (x != NULL) { /* Delete entry from the list for this MFC */
 2709             if (prev != NULL)
 2710                 prev->bm_mfc_next = x->bm_mfc_next;     /* remove from middle*/
 2711             else
 2712                 x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */
 2713 
 2714             unschedule_bw_meter(x);
 2715             splx(s);
 2716             /* Free the bw_meter entry */
 2717             free(x, M_BWMETER);
 2718             return 0;
 2719         } else {
 2720             splx(s);
 2721             return EINVAL;
 2722         }
 2723     }
 2724     /* NOTREACHED */
 2725 }
 2726 
 2727 /*
 2728  * Perform bandwidth measurement processing that may result in an upcall
 2729  */
 2730 static void
 2731 bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp)
 2732 {
 2733     struct timeval delta;
 2734 
 2735     delta = *nowp;
 2736     BW_TIMEVALDECR(&delta, &x->bm_start_time);
 2737 
 2738     if (x->bm_flags & BW_METER_GEQ) {
 2739         /*
 2740          * Processing for ">=" type of bw_meter entry
 2741          */
 2742         if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
 2743             /* Reset the bw_meter entry */
 2744             x->bm_start_time = *nowp;
 2745             x->bm_measured.b_packets = 0;
 2746             x->bm_measured.b_bytes = 0;
 2747             x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
 2748         }
 2749 
 2750         /* Record that a packet is received */
 2751         x->bm_measured.b_packets++;
 2752         x->bm_measured.b_bytes += plen;
 2753 
 2754         /*
 2755          * Test if we should deliver an upcall
 2756          */
 2757         if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) {
 2758             if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
 2759                  (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) ||
 2760                 ((x->bm_flags & BW_METER_UNIT_BYTES) &&
 2761                  (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) {
 2762                 /* Prepare an upcall for delivery */
 2763                 bw_meter_prepare_upcall(x, nowp);
 2764                 x->bm_flags |= BW_METER_UPCALL_DELIVERED;
 2765             }
 2766         }
 2767     } else if (x->bm_flags & BW_METER_LEQ) {
 2768         /*
 2769          * Processing for "<=" type of bw_meter entry
 2770          */
 2771         if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
 2772             /*
 2773              * We are behind time with the multicast forwarding table
 2774              * scanning for "<=" type of bw_meter entries, so test now
 2775              * if we should deliver an upcall.
 2776              */
 2777             if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
 2778                  (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
 2779                 ((x->bm_flags & BW_METER_UNIT_BYTES) &&
 2780                  (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
 2781                 /* Prepare an upcall for delivery */
 2782                 bw_meter_prepare_upcall(x, nowp);
 2783             }
 2784             /* Reschedule the bw_meter entry */
 2785             unschedule_bw_meter(x);
 2786             schedule_bw_meter(x, nowp);
 2787         }
 2788 
 2789         /* Record that a packet is received */
 2790         x->bm_measured.b_packets++;
 2791         x->bm_measured.b_bytes += plen;
 2792 
 2793         /*
 2794          * Test if we should restart the measuring interval
 2795          */
 2796         if ((x->bm_flags & BW_METER_UNIT_PACKETS &&
 2797              x->bm_measured.b_packets <= x->bm_threshold.b_packets) ||
 2798             (x->bm_flags & BW_METER_UNIT_BYTES &&
 2799              x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) {
 2800             /* Don't restart the measuring interval */
 2801         } else {
 2802             /* Do restart the measuring interval */
 2803             /*
 2804              * XXX: note that we don't unschedule and schedule, because this
 2805              * might be too much overhead per packet. Instead, when we process
 2806              * all entries for a given timer hash bin, we check whether it is
 2807              * really a timeout. If not, we reschedule at that time.
 2808              */
 2809             x->bm_start_time = *nowp;
 2810             x->bm_measured.b_packets = 0;
 2811             x->bm_measured.b_bytes = 0;
 2812             x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
 2813         }
 2814     }
 2815 }
 2816 
 2817 /*
 2818  * Prepare a bandwidth-related upcall
 2819  */
 2820 static void
 2821 bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp)
 2822 {
 2823     struct timeval delta;
 2824     struct bw_upcall *u;
 2825 
 2826     /*
 2827      * Compute the measured time interval
 2828      */
 2829     delta = *nowp;
 2830     BW_TIMEVALDECR(&delta, &x->bm_start_time);
 2831 
 2832     /*
 2833      * If there are too many pending upcalls, deliver them now
 2834      */
 2835     if (bw_upcalls_n >= BW_UPCALLS_MAX)
 2836         bw_upcalls_send();
 2837 
 2838     /*
 2839      * Set the bw_upcall entry
 2840      */
 2841     u = &bw_upcalls[bw_upcalls_n++];
 2842     u->bu_src = x->bm_mfc->mfc_origin;
 2843     u->bu_dst = x->bm_mfc->mfc_mcastgrp;
 2844     u->bu_threshold.b_time = x->bm_threshold.b_time;
 2845     u->bu_threshold.b_packets = x->bm_threshold.b_packets;
 2846     u->bu_threshold.b_bytes = x->bm_threshold.b_bytes;
 2847     u->bu_measured.b_time = delta;
 2848     u->bu_measured.b_packets = x->bm_measured.b_packets;
 2849     u->bu_measured.b_bytes = x->bm_measured.b_bytes;
 2850     u->bu_flags = 0;
 2851     if (x->bm_flags & BW_METER_UNIT_PACKETS)
 2852         u->bu_flags |= BW_UPCALL_UNIT_PACKETS;
 2853     if (x->bm_flags & BW_METER_UNIT_BYTES)
 2854         u->bu_flags |= BW_UPCALL_UNIT_BYTES;
 2855     if (x->bm_flags & BW_METER_GEQ)
 2856         u->bu_flags |= BW_UPCALL_GEQ;
 2857     if (x->bm_flags & BW_METER_LEQ)
 2858         u->bu_flags |= BW_UPCALL_LEQ;
 2859 }
 2860 
 2861 /*
 2862  * Send the pending bandwidth-related upcalls
 2863  */
 2864 static void
 2865 bw_upcalls_send(void)
 2866 {
 2867     struct mbuf *m;
 2868     int len = bw_upcalls_n * sizeof(bw_upcalls[0]);
 2869     struct sockaddr_in k_igmpsrc = { 
 2870             .sin_len = sizeof(k_igmpsrc),
 2871             .sin_family = AF_INET,
 2872     };
 2873     static struct igmpmsg igmpmsg = { 0,                /* unused1 */
 2874                                       0,                /* unused2 */
 2875                                       IGMPMSG_BW_UPCALL,/* im_msgtype */
 2876                                       0,                /* im_mbz  */
 2877                                       0,                /* im_vif  */
 2878                                       0,                /* unused3 */
 2879                                       { 0 },            /* im_src  */
 2880                                       { 0 } };          /* im_dst  */
 2881 
 2882     if (bw_upcalls_n == 0)
 2883         return;                 /* No pending upcalls */
 2884 
 2885     bw_upcalls_n = 0;
 2886 
 2887     /*
 2888      * Allocate a new mbuf, initialize it with the header and
 2889      * the payload for the pending calls.
 2890      */
 2891     MGETHDR(m, M_DONTWAIT, MT_HEADER);
 2892     if (m == NULL) {
 2893         log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n");
 2894         return;
 2895     }
 2896 
 2897     m->m_len = m->m_pkthdr.len = 0;
 2898     m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg);
 2899     m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&bw_upcalls[0]);
 2900 
 2901     /*
 2902      * Send the upcalls
 2903      * XXX do we need to set the address in k_igmpsrc ?
 2904      */
 2905     mrtstat.mrts_upcalls++;
 2906     if (socket_send(ip_mrouter, m, &k_igmpsrc) < 0) {
 2907         log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n");
 2908         ++mrtstat.mrts_upq_sockfull;
 2909     }
 2910 }
 2911 
/*
 * Compute the timeout hash value for the bw_meter entries
 *
 * The expiry time is bm_start_time + bm_threshold.b_time.  Its seconds
 * part, bumped by one when the microseconds part is non-zero, is reduced
 * modulo BW_METER_BUCKETS and stored in (hash).
 *
 * (bw_meter) is evaluated more than once and (hash) must be a modifiable
 * integer lvalue.
 */
#define BW_METER_TIMEHASH(bw_meter, hash)                               \
    do {                                                                \
        struct timeval next_timeval = (bw_meter)->bm_start_time;        \
                                                                        \
        BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \
        (hash) = next_timeval.tv_sec;                                   \
        if (next_timeval.tv_usec)                                       \
            (hash)++; /* XXX: make sure we don't timeout early */       \
        (hash) %= BW_METER_BUCKETS;                                     \
    } while (/*CONSTCOND*/ 0)
 2925 
 2926 /*
 2927  * Schedule a timer to process periodically bw_meter entry of type "<="
 2928  * by linking the entry in the proper hash bucket.
 2929  */
 2930 static void
 2931 schedule_bw_meter(struct bw_meter *x, struct timeval *nowp)
 2932 {
 2933     int time_hash;
 2934 
 2935     if (!(x->bm_flags & BW_METER_LEQ))
 2936         return;         /* XXX: we schedule timers only for "<=" entries */
 2937 
 2938     /*
 2939      * Reset the bw_meter entry
 2940      */
 2941     x->bm_start_time = *nowp;
 2942     x->bm_measured.b_packets = 0;
 2943     x->bm_measured.b_bytes = 0;
 2944     x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
 2945 
 2946     /*
 2947      * Compute the timeout hash value and insert the entry
 2948      */
 2949     BW_METER_TIMEHASH(x, time_hash);
 2950     x->bm_time_next = bw_meter_timers[time_hash];
 2951     bw_meter_timers[time_hash] = x;
 2952     x->bm_time_hash = time_hash;
 2953 }
 2954 
 2955 /*
 2956  * Unschedule the periodic timer that processes bw_meter entry of type "<="
 2957  * by removing the entry from the proper hash bucket.
 2958  */
 2959 static void
 2960 unschedule_bw_meter(struct bw_meter *x)
 2961 {
 2962     int time_hash;
 2963     struct bw_meter *prev, *tmp;
 2964 
 2965     if (!(x->bm_flags & BW_METER_LEQ))
 2966         return;         /* XXX: we schedule timers only for "<=" entries */
 2967 
 2968     /*
 2969      * Compute the timeout hash value and delete the entry
 2970      */
 2971     time_hash = x->bm_time_hash;
 2972     if (time_hash >= BW_METER_BUCKETS)
 2973         return;         /* Entry was not scheduled */
 2974 
 2975     for (prev = NULL, tmp = bw_meter_timers[time_hash];
 2976              tmp != NULL; prev = tmp, tmp = tmp->bm_time_next)
 2977         if (tmp == x)
 2978             break;
 2979 
 2980     if (tmp == NULL)
 2981         panic("unschedule_bw_meter: bw_meter entry not found");
 2982 
 2983     if (prev != NULL)
 2984         prev->bm_time_next = x->bm_time_next;
 2985     else
 2986         bw_meter_timers[time_hash] = x->bm_time_next;
 2987 
 2988     x->bm_time_next = NULL;
 2989     x->bm_time_hash = BW_METER_BUCKETS;
 2990 }
 2991 
 2992 /*
 2993  * Process all "<=" type of bw_meter that should be processed now,
 2994  * and for each entry prepare an upcall if necessary. Each processed
 2995  * entry is rescheduled again for the (periodic) processing.
 2996  *
 2997  * This is run periodically (once per second normally). On each round,
 2998  * all the potentially matching entries are in the hash slot that we are
 2999  * looking at.
 3000  */
 3001 static void
 3002 bw_meter_process(void)
 3003 {
 3004     int s;
 3005     static uint32_t last_tv_sec;        /* last time we processed this */
 3006 
 3007     uint32_t loops;
 3008     int i;
 3009     struct timeval now, process_endtime;
 3010 
 3011     microtime(&now);
 3012     if (last_tv_sec == now.tv_sec)
 3013         return;         /* nothing to do */
 3014 
 3015     loops = now.tv_sec - last_tv_sec;
 3016     last_tv_sec = now.tv_sec;
 3017     if (loops > BW_METER_BUCKETS)
 3018         loops = BW_METER_BUCKETS;
 3019 
 3020     s = splsoftnet();
 3021     /*
 3022      * Process all bins of bw_meter entries from the one after the last
 3023      * processed to the current one. On entry, i points to the last bucket
 3024      * visited, so we need to increment i at the beginning of the loop.
 3025      */
 3026     for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) {
 3027         struct bw_meter *x, *tmp_list;
 3028 
 3029         if (++i >= BW_METER_BUCKETS)
 3030             i = 0;
 3031 
 3032         /* Disconnect the list of bw_meter entries from the bin */
 3033         tmp_list = bw_meter_timers[i];
 3034         bw_meter_timers[i] = NULL;
 3035 
 3036         /* Process the list of bw_meter entries */
 3037         while (tmp_list != NULL) {
 3038             x = tmp_list;
 3039             tmp_list = tmp_list->bm_time_next;
 3040 
 3041             /* Test if the time interval is over */
 3042             process_endtime = x->bm_start_time;
 3043             BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time);
 3044             if (BW_TIMEVALCMP(&process_endtime, &now, >)) {
 3045                 /* Not yet: reschedule, but don't reset */
 3046                 int time_hash;
 3047 
 3048                 BW_METER_TIMEHASH(x, time_hash);
 3049                 if (time_hash == i && process_endtime.tv_sec == now.tv_sec) {
 3050                     /*
 3051                      * XXX: somehow the bin processing is a bit ahead of time.
 3052                      * Put the entry in the next bin.
 3053                      */
 3054                     if (++time_hash >= BW_METER_BUCKETS)
 3055                         time_hash = 0;
 3056                 }
 3057                 x->bm_time_next = bw_meter_timers[time_hash];
 3058                 bw_meter_timers[time_hash] = x;
 3059                 x->bm_time_hash = time_hash;
 3060 
 3061                 continue;
 3062             }
 3063 
 3064             /*
 3065              * Test if we should deliver an upcall
 3066              */
 3067             if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
 3068                  (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
 3069                 ((x->bm_flags & BW_METER_UNIT_BYTES) &&
 3070                  (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
 3071                 /* Prepare an upcall for delivery */
 3072                 bw_meter_prepare_upcall(x, &now);
 3073             }
 3074 
 3075             /*
 3076              * Reschedule for next processing
 3077              */
 3078             schedule_bw_meter(x, &now);
 3079         }
 3080     }
 3081 
 3082     /* Send all upcalls that are pending delivery */
 3083     bw_upcalls_send();
 3084 
 3085     splx(s);
 3086 }
 3087 
 3088 /*
 3089  * A periodic function for sending all upcalls that are pending delivery
 3090  */
 3091 static void
 3092 expire_bw_upcalls_send(void *unused)
 3093 {
 3094     int s;
 3095 
 3096     s = splsoftnet();
 3097     bw_upcalls_send();
 3098     splx(s);
 3099 
 3100     callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD,
 3101                   expire_bw_upcalls_send, NULL);
 3102 }
 3103 
 3104 /*
 3105  * A periodic function for periodic scanning of the multicast forwarding
 3106  * table for processing all "<=" bw_meter entries.
 3107  */
 3108 static void
 3109 expire_bw_meter_process(void *unused)
 3110 {
 3111     if (mrt_api_config & MRT_MFC_BW_UPCALL)
 3112         bw_meter_process();
 3113 
 3114     callout_reset(&bw_meter_ch, BW_METER_PERIOD,
 3115                   expire_bw_meter_process, NULL);
 3116 }
 3117 
 3118 /*
 3119  * End of bandwidth monitoring code
 3120  */
 3121 
 3122 #ifdef PIM
 3123 /*
 3124  * Send the packet up to the user daemon, or eventually do kernel encapsulation
 3125  */
 3126 static int
 3127 pim_register_send(struct ip *ip, struct vif *vifp,
 3128         struct mbuf *m, struct mfc *rt)
 3129 {
 3130     struct mbuf *mb_copy, *mm;
 3131 
 3132     if (mrtdebug & DEBUG_PIM)
 3133         log(LOG_DEBUG, "pim_register_send: ");
 3134 
 3135     mb_copy = pim_register_prepare(ip, m);
 3136     if (mb_copy == NULL)
 3137         return ENOBUFS;
 3138 
 3139     /*
 3140      * Send all the fragments. Note that the mbuf for each fragment
 3141      * is freed by the sending machinery.
 3142      */
 3143     for (mm = mb_copy; mm; mm = mb_copy) {
 3144         mb_copy = mm->m_nextpkt;
 3145         mm->m_nextpkt = NULL;
 3146         mm = m_pullup(mm, sizeof(struct ip));
 3147         if (mm != NULL) {
 3148             ip = mtod(mm, struct ip *);
 3149             if ((mrt_api_config & MRT_MFC_RP) &&
 3150                 !in_nullhost(rt->mfc_rp)) {
 3151                 pim_register_send_rp(ip, vifp, mm, rt);
 3152             } else {
 3153                 pim_register_send_upcall(ip, vifp, mm, rt);
 3154             }
 3155         }
 3156     }
 3157 
 3158     return 0;
 3159 }
 3160 
 3161 /*
 3162  * Return a copy of the data packet that is ready for PIM Register
 3163  * encapsulation.
 3164  * XXX: Note that in the returned copy the IP header is a valid one.
 3165  */
 3166 static struct mbuf *
 3167 pim_register_prepare(struct ip *ip, struct mbuf *m)
 3168 {
 3169     struct mbuf *mb_copy = NULL;
 3170     int mtu;
 3171 
 3172     /* Take care of delayed checksums */
 3173     if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
 3174         in_delayed_cksum(m);
 3175         m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
 3176     }
 3177 
 3178     /*
 3179      * Copy the old packet & pullup its IP header into the
 3180      * new mbuf so we can modify it.
 3181      */
 3182     mb_copy = m_copy(m, 0, M_COPYALL);
 3183     if (mb_copy == NULL)
 3184         return NULL;
 3185     mb_copy = m_pullup(mb_copy, ip->ip_hl << 2);
 3186     if (mb_copy == NULL)
 3187         return NULL;
 3188 
 3189     /* take care of the TTL */
 3190     ip = mtod(mb_copy, struct ip *);
 3191     --ip->ip_ttl;
 3192 
 3193     /* Compute the MTU after the PIM Register encapsulation */
 3194     mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr);
 3195 
 3196     if (ntohs(ip->ip_len) <= mtu) {
 3197         /* Turn the IP header into a valid one */
 3198         ip->ip_sum = 0;
 3199         ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
 3200     } else {
 3201         /* Fragment the packet */
 3202         if (ip_fragment(mb_copy, NULL, mtu) != 0) {
 3203             /* XXX: mb_copy was freed by ip_fragment() */
 3204             return NULL;
 3205         }
 3206     }
 3207     return mb_copy;
 3208 }
 3209 
 3210 /*
 3211  * Send an upcall with the data packet to the user-level process.
 3212  */
 3213 static int
 3214 pim_register_send_upcall(struct ip *ip, struct vif *vifp,
 3215     struct mbuf *mb_copy, struct mfc *rt)
 3216 {
 3217     struct mbuf *mb_first;
 3218     int len = ntohs(ip->ip_len);
 3219     struct igmpmsg *im;
 3220     struct sockaddr_in k_igmpsrc = {
 3221             .sin_len = sizeof(k_igmpsrc),
 3222             .sin_family = AF_INET,
 3223     };
 3224 
 3225     /*
 3226      * Add a new mbuf with an upcall header
 3227      */
 3228     MGETHDR(mb_first, M_DONTWAIT, MT_HEADER);
 3229     if (mb_first == NULL) {
 3230         m_freem(mb_copy);
 3231         return ENOBUFS;
 3232     }
 3233     mb_first->m_data += max_linkhdr;
 3234     mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg);
 3235     mb_first->m_len = sizeof(struct igmpmsg);
 3236     mb_first->m_next = mb_copy;
 3237 
 3238     /* Send message to routing daemon */
 3239     im = mtod(mb_first, struct igmpmsg *);
 3240     im->im_msgtype      = IGMPMSG_WHOLEPKT;
 3241     im->im_mbz          = 0;
 3242     im->im_vif          = vifp - viftable;
 3243     im->im_src          = ip->ip_src;
 3244     im->im_dst          = ip->ip_dst;
 3245 
 3246     k_igmpsrc.sin_addr  = ip->ip_src;
 3247 
 3248     mrtstat.mrts_upcalls++;
 3249 
 3250     if (socket_send(ip_mrouter, mb_first, &k_igmpsrc) < 0) {
 3251         if (mrtdebug & DEBUG_PIM)
 3252             log(LOG_WARNING,
 3253                 "mcast: pim_register_send_upcall: ip_mrouter socket queue full");
 3254         ++mrtstat.mrts_upq_sockfull;
 3255         return ENOBUFS;
 3256     }
 3257 
 3258     /* Keep statistics */
 3259     pimstat.pims_snd_registers_msgs++;
 3260     pimstat.pims_snd_registers_bytes += len;
 3261 
 3262     return 0;
 3263 }
 3264 
 3265 /*
 3266  * Encapsulate the data packet in PIM Register message and send it to the RP.
 3267  */
 3268 static int
 3269 pim_register_send_rp(struct ip *ip, struct vif *vifp,
 3270         struct mbuf *mb_copy, struct mfc *rt)
 3271 {
 3272     struct mbuf *mb_first;
 3273     struct ip *ip_outer;
 3274     struct pim_encap_pimhdr *pimhdr;
 3275     int len = ntohs(ip->ip_len);
 3276     vifi_t vifi = rt->mfc_parent;
 3277 
 3278     if ((vifi >= numvifs) || in_nullhost(viftable[vifi].v_lcl_addr)) {
 3279         m_freem(mb_copy);
 3280         return EADDRNOTAVAIL;           /* The iif vif is invalid */
 3281     }
 3282 
 3283     /*
 3284      * Add a new mbuf with the encapsulating header
 3285      */
 3286     MGETHDR(mb_first, M_DONTWAIT, MT_HEADER);
 3287     if (mb_first == NULL) {
 3288         m_freem(mb_copy);
 3289         return ENOBUFS;
 3290     }
 3291     mb_first->m_data += max_linkhdr;
 3292     mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
 3293     mb_first->m_next = mb_copy;
 3294 
 3295     mb_first->m_pkthdr.len = len + mb_first->m_len;
 3296 
 3297     /*
 3298      * Fill in the encapsulating IP and PIM header
 3299      */
 3300     ip_outer = mtod(mb_first, struct ip *);
 3301     *ip_outer = pim_encap_iphdr;
 3302     ip_outer->ip_id = ip_newid();
 3303     ip_outer->ip_len = htons(len + sizeof(pim_encap_iphdr) +
 3304                              sizeof(pim_encap_pimhdr));
 3305     ip_outer->ip_src = viftable[vifi].v_lcl_addr;
 3306     ip_outer->ip_dst = rt->mfc_rp;
 3307     /*
 3308      * Copy the inner header TOS to the outer header, and take care of the
 3309      * IP_DF bit.
 3310      */
 3311     ip_outer->ip_tos = ip->ip_tos;
 3312     if (ntohs(ip->ip_off) & IP_DF)
 3313         ip_outer->ip_off |= IP_DF;
 3314     pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer
 3315                                          + sizeof(pim_encap_iphdr));
 3316     *pimhdr = pim_encap_pimhdr;
 3317     /* If the iif crosses a border, set the Border-bit */
 3318     if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & mrt_api_config)
 3319         pimhdr->flags |= htonl(PIM_BORDER_REGISTER);
 3320 
 3321     mb_first->m_data += sizeof(pim_encap_iphdr);
 3322     pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr));
 3323     mb_first->m_data -= sizeof(pim_encap_iphdr);
 3324 
 3325     if (vifp->v_rate_limit == 0)
 3326         tbf_send_packet(vifp, mb_first);
 3327     else
 3328         tbf_control(vifp, mb_first, ip, ntohs(ip_outer->ip_len));
 3329 
 3330     /* Keep statistics */
 3331     pimstat.pims_snd_registers_msgs++;
 3332     pimstat.pims_snd_registers_bytes += len;
 3333 
 3334     return 0;
 3335 }
 3336 
 3337 /*
 3338  * PIM-SMv2 and PIM-DM messages processing.
 3339  * Receives and verifies the PIM control messages, and passes them
 3340  * up to the listening socket, using rip_input().
 3341  * The only message with special processing is the PIM_REGISTER message
 3342  * (used by PIM-SM): the PIM header is stripped off, and the inner packet
 3343  * is passed to if_simloop().
 3344  */
 3345 void
 3346 pim_input(struct mbuf *m, ...)
 3347 {
 3348     struct ip *ip = mtod(m, struct ip *);
 3349     struct pim *pim;
 3350     int minlen;
 3351     int datalen;
 3352     int ip_tos;
 3353     int proto;
 3354     int iphlen;
 3355     va_list ap;
 3356 
 3357     va_start(ap, m);
 3358     iphlen = va_arg(ap, int);
 3359     proto = va_arg(ap, int);
 3360     va_end(ap);
 3361 
 3362     datalen = ntohs(ip->ip_len) - iphlen;
 3363 
 3364     /* Keep statistics */
 3365     pimstat.pims_rcv_total_msgs++;
 3366     pimstat.pims_rcv_total_bytes += datalen;
 3367 
 3368     /*
 3369      * Validate lengths
 3370      */
 3371     if (datalen < PIM_MINLEN) {
 3372         pimstat.pims_rcv_tooshort++;
 3373         log(LOG_ERR, "pim_input: packet size too small %d from %lx\n",
 3374             datalen, (u_long)ip->ip_src.s_addr);
 3375         m_freem(m);
 3376         return;
 3377     }
 3378 
 3379     /*
 3380      * If the packet is at least as big as a REGISTER, go ahead
 3381      * and grab the PIM REGISTER header size, to avoid another
 3382      * possible m_pullup() later.
 3383      *
 3384      * PIM_MINLEN       == pimhdr + u_int32_t == 4 + 4 = 8
 3385      * PIM_REG_MINLEN   == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28
 3386      */
 3387     minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN);
 3388     /*
 3389      * Get the IP and PIM headers in contiguous memory, and
 3390      * possibly the PIM REGISTER header.
 3391      */
 3392     if ((m->m_flags & M_EXT || m->m_len < minlen) &&
 3393         (m = m_pullup(m, minlen)) == NULL) {
 3394         log(LOG_ERR, "pim_input: m_pullup failure\n");
 3395         return;
 3396     }
 3397     /* m_pullup() may have given us a new mbuf so reset ip. */
 3398     ip = mtod(m, struct ip *);
 3399     ip_tos = ip->ip_tos;
 3400 
 3401     /* adjust mbuf to point to the PIM header */
 3402     m->m_data += iphlen;
 3403     m->m_len  -= iphlen;
 3404     pim = mtod(m, struct pim *);
 3405 
 3406     /*
 3407      * Validate checksum. If PIM REGISTER, exclude the data packet.
 3408      *
 3409      * XXX: some older PIMv2 implementations don't make this distinction,
 3410      * so for compatibility reason perform the checksum over part of the
 3411      * message, and if error, then over the whole message.
 3412      */
 3413     if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) {
 3414         /* do nothing, checksum okay */
 3415     } else if (in_cksum(m, datalen)) {
 3416         pimstat.pims_rcv_badsum++;
 3417         if (mrtdebug & DEBUG_PIM)
 3418             log(LOG_DEBUG, "pim_input: invalid checksum");
 3419         m_freem(m);
 3420         return;
 3421     }
 3422 
 3423     /* PIM version check */
 3424     if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) {
 3425         pimstat.pims_rcv_badversion++;
 3426         log(LOG_ERR, "pim_input: incorrect version %d, expecting %d\n",
 3427             PIM_VT_V(pim->pim_vt), PIM_VERSION);
 3428         m_freem(m);
 3429         return;
 3430     }
 3431 
 3432     /* restore mbuf back to the outer IP */
 3433     m->m_data -= iphlen;
 3434     m->m_len  += iphlen;
 3435 
 3436     if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) {
 3437         /*
 3438          * Since this is a REGISTER, we'll make a copy of the register
 3439          * headers ip + pim + u_int32 + encap_ip, to be passed up to the
 3440          * routing daemon.
 3441          */
 3442         int s;
 3443         struct sockaddr_in dst = {
 3444                 .sin_len = sizeof(dst),
 3445                 .sin_family = AF_INET,
 3446         };
 3447         struct mbuf *mcp;
 3448         struct ip *encap_ip;
 3449         u_int32_t *reghdr;
 3450         struct ifnet *vifp;
 3451 
 3452         s = splsoftnet();
 3453         if ((reg_vif_num >= numvifs) || (reg_vif_num == VIFI_INVALID)) {
 3454             splx(s);
 3455             if (mrtdebug & DEBUG_PIM)
 3456                 log(LOG_DEBUG,
 3457                     "pim_input: register vif not set: %d\n", reg_vif_num);
 3458             m_freem(m);
 3459             return;
 3460         }
 3461         /* XXX need refcnt? */
 3462         vifp = viftable[reg_vif_num].v_ifp;
 3463         splx(s);
 3464 
 3465         /*
 3466          * Validate length
 3467          */
 3468         if (datalen < PIM_REG_MINLEN) {
 3469             pimstat.pims_rcv_tooshort++;
 3470             pimstat.pims_rcv_badregisters++;
 3471             log(LOG_ERR,
 3472                 "pim_input: register packet size too small %d from %lx\n",
 3473                 datalen, (u_long)ip->ip_src.s_addr);
 3474             m_freem(m);
 3475             return;
 3476         }
 3477 
 3478         reghdr = (u_int32_t *)(pim + 1);
 3479         encap_ip = (struct ip *)(reghdr + 1);
 3480 
 3481         if (mrtdebug & DEBUG_PIM) {
 3482             log(LOG_DEBUG,
 3483                 "pim_input[register], encap_ip: %lx -> %lx, encap_ip len %d\n",
 3484                 (u_long)ntohl(encap_ip->ip_src.s_addr),
 3485                 (u_long)ntohl(encap_ip->ip_dst.s_addr),
 3486                 ntohs(encap_ip->ip_len));
 3487         }
 3488 
 3489         /* verify the version number of the inner packet */
 3490         if (encap_ip->ip_v != IPVERSION) {
 3491             pimstat.pims_rcv_badregisters++;
 3492             if (mrtdebug & DEBUG_PIM) {
 3493                 log(LOG_DEBUG, "pim_input: invalid IP version (%d) "
 3494                     "of the inner packet\n", encap_ip->ip_v);
 3495             }
 3496             m_freem(m);
 3497             return;
 3498         }
 3499 
 3500         /* verify the inner packet is destined to a mcast group */
 3501         if (!IN_MULTICAST(encap_ip->ip_dst.s_addr)) {
 3502             pimstat.pims_rcv_badregisters++;
 3503             if (mrtdebug & DEBUG_PIM)
 3504                 log(LOG_DEBUG,
 3505                     "pim_input: inner packet of register is not "
 3506                     "multicast %lx\n",
 3507                     (u_long)ntohl(encap_ip->ip_dst.s_addr));
 3508             m_freem(m);
 3509             return;
 3510         }
 3511 
 3512         /* If a NULL_REGISTER, pass it to the daemon */
 3513         if ((ntohl(*reghdr) & PIM_NULL_REGISTER))
 3514             goto pim_input_to_daemon;
 3515 
 3516         /*
 3517          * Copy the TOS from the outer IP header to the inner IP header.
 3518          */
 3519         if (encap_ip->ip_tos != ip_tos) {
 3520             /* Outer TOS -> inner TOS */
 3521             encap_ip->ip_tos = ip_tos;
 3522             /* Recompute the inner header checksum. Sigh... */
 3523 
 3524             /* adjust mbuf to point to the inner IP header */
 3525             m->m_data += (iphlen + PIM_MINLEN);
 3526             m->m_len  -= (iphlen + PIM_MINLEN);
 3527 
 3528             encap_ip->ip_sum = 0;
 3529             encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2);
 3530 
 3531             /* restore mbuf to point back to the outer IP header */
 3532             m->m_data -= (iphlen + PIM_MINLEN);
 3533             m->m_len  += (iphlen + PIM_MINLEN);
 3534         }
 3535 
 3536         /*
 3537          * Decapsulate the inner IP packet and loopback to forward it
 3538          * as a normal multicast packet. Also, make a copy of the
 3539          *     outer_iphdr + pimhdr + reghdr + encap_iphdr
 3540          * to pass to the daemon later, so it can take the appropriate
 3541          * actions (e.g., send back PIM_REGISTER_STOP).
 3542          * XXX: here m->m_data points to the outer IP header.
 3543          */
 3544         mcp = m_copy(m, 0, iphlen + PIM_REG_MINLEN);
 3545         if (mcp == NULL) {
 3546             log(LOG_ERR,
 3547                 "pim_input: pim register: could not copy register head\n");
 3548             m_freem(m);
 3549             return;
 3550         }
 3551 
 3552         /* Keep statistics */
 3553         /* XXX: registers_bytes include only the encap. mcast pkt */
 3554         pimstat.pims_rcv_registers_msgs++;
 3555         pimstat.pims_rcv_registers_bytes += ntohs(encap_ip->ip_len);
 3556 
 3557         /*
 3558          * forward the inner ip packet; point m_data at the inner ip.
 3559          */
 3560         m_adj(m, iphlen + PIM_MINLEN);
 3561 
 3562         if (mrtdebug & DEBUG_PIM) {
 3563             log(LOG_DEBUG,
 3564                 "pim_input: forwarding decapsulated register: "
 3565                 "src %lx, dst %lx, vif %d\n",
 3566                 (u_long)ntohl(encap_ip->ip_src.s_addr),
 3567                 (u_long)ntohl(encap_ip->ip_dst.s_addr),
 3568                 reg_vif_num);
 3569         }
 3570         /* NB: vifp was collected above; can it change on us? */
 3571         looutput(vifp, m, (struct sockaddr *)&dst, (struct rtentry *)NULL);
 3572 
 3573         /* prepare the register head to send to the mrouting daemon */
 3574         m = mcp;
 3575     }
 3576 
 3577 pim_input_to_daemon:
 3578     /*
 3579      * Pass the PIM message up to the daemon; if it is a Register message,
 3580      * pass the 'head' only up to the daemon. This includes the
 3581      * outer IP header, PIM header, PIM-Register header and the
 3582      * inner IP header.
 3583      * XXX: the outer IP header pkt size of a Register is not adjust to
 3584      * reflect the fact that the inner multicast data is truncated.
 3585      */
 3586     rip_input(m, iphlen, proto);
 3587 
 3588     return;
 3589 }
 3590 #endif /* PIM */

Cache object: fdb3b8fe4293abdf54f1bcb3de0d57b1


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.