The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/netinet/ip_dummynet.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
    3  * Portions Copyright (c) 2000 Akamba Corp.
    4  * All rights reserved
    5  *
    6  * Redistribution and use in source and binary forms, with or without
    7  * modification, are permitted provided that the following conditions
    8  * are met:
    9  * 1. Redistributions of source code must retain the above copyright
   10  *    notice, this list of conditions and the following disclaimer.
   11  * 2. Redistributions in binary form must reproduce the above copyright
   12  *    notice, this list of conditions and the following disclaimer in the
   13  *    documentation and/or other materials provided with the distribution.
   14  *
   15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   25  * SUCH DAMAGE.
   26  */
   27 
   28 #include <sys/cdefs.h>
   29 __FBSDID("$FreeBSD$");
   30 
   31 #define DUMMYNET_DEBUG
   32 
   33 #include "opt_inet6.h"
   34 
   35 /*
   36  * This module implements IP dummynet, a bandwidth limiter/delay emulator
   37  * used in conjunction with the ipfw package.
   38  * Description of the data structures used is in ip_dummynet.h
   39  * Here you mainly find the following blocks of code:
   40  *  + variable declarations;
   41  *  + heap management functions;
   42  *  + scheduler and dummynet functions;
   43  *  + configuration and initialization.
   44  *
   45  * NOTA BENE: critical sections are protected by the "dummynet lock".
   46  *
   47  * Most important Changes:
   48  *
   49  * 011004: KLDable
   50  * 010124: Fixed WF2Q behaviour
   51  * 010122: Fixed spl protection.
   52  * 000601: WF2Q support
   53  * 000106: large rewrite, use heaps to handle very many pipes.
   54  * 980513:      initial release
   55  *
   56  * include files marked with XXX are probably not needed
   57  */
   58 
   59 #include <sys/param.h>
   60 #include <sys/systm.h>
   61 #include <sys/malloc.h>
   62 #include <sys/mbuf.h>
   63 #include <sys/kernel.h>
   64 #include <sys/module.h>
   65 #include <sys/proc.h>
   66 #include <sys/socket.h>
   67 #include <sys/socketvar.h>
   68 #include <sys/time.h>
   69 #include <sys/sysctl.h>
   70 #include <sys/taskqueue.h>
   71 #include <net/if.h>
   72 #include <net/netisr.h>
   73 #include <net/route.h>
   74 #include <netinet/in.h>
   75 #include <netinet/in_systm.h>
   76 #include <netinet/in_var.h>
   77 #include <netinet/ip.h>
   78 #include <netinet/ip_fw.h>
   79 #include <netinet/ip_dummynet.h>
   80 #include <netinet/ip_var.h>
   81 
   82 #include <netinet/if_ether.h> /* for struct arpcom */
   83 
   84 #include <netinet/ip6.h>       /* for ip6_input, ip6_output prototypes */
   85 #include <netinet6/ip6_var.h>
   86 
   87 /*
   88  * We keep a private variable for the simulation time, but we could
   89  * probably use an existing one ("softticks" in sys/kern/kern_timeout.c)
   90  */
   91 static dn_key curr_time = 0 ; /* current simulation time */
   92 
   93 static int dn_hash_size = 64 ;  /* default hash size */
   94 
   95 /* statistics on number of queue searches and search steps */
   96 static long searches, search_steps ;
   97 static int pipe_expire = 1 ;   /* expire queue if empty */
   98 static int dn_max_ratio = 16 ; /* max queues/buckets ratio */
   99 
  100 static int red_lookup_depth = 256;      /* RED - default lookup table depth */
  101 static int red_avg_pkt_size = 512;      /* RED - default medium packet size */
  102 static int red_max_pkt_size = 1500;     /* RED - default max packet size */
  103 
  104 static struct timeval prev_t, t;
  105 static long tick_last;                  /* Last tick duration (usec). */
  106 static long tick_delta;                 /* Last vs standard tick diff (usec). */
  107 static long tick_delta_sum;             /* Accumulated tick difference (usec).*/
  108 static long tick_adjustment;            /* Tick adjustments done. */
  109 static long tick_lost;                  /* Lost(coalesced) ticks number. */
  110 /* Adjusted vs non-adjusted curr_time difference (ticks). */
  111 static long tick_diff;
  112 
  113 /*
  114  * Three heaps contain queues and pipes that the scheduler handles:
  115  *
  116  * ready_heap contains all dn_flow_queue related to fixed-rate pipes.
  117  *
  118  * wfq_ready_heap contains the pipes associated with WF2Q flows
  119  *
  120  * extract_heap contains pipes associated with delay lines.
  121  *
  122  */
  123 
  124 MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap");
  125 
  126 static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ;
  127 
  128 static int      heap_init(struct dn_heap *h, int size);
  129 static int      heap_insert (struct dn_heap *h, dn_key key1, void *p);
  130 static void     heap_extract(struct dn_heap *h, void *obj);
  131 static void     transmit_event(struct dn_pipe *pipe, struct mbuf **head,
  132                     struct mbuf **tail);
  133 static void     ready_event(struct dn_flow_queue *q, struct mbuf **head,
  134                     struct mbuf **tail);
  135 static void     ready_event_wfq(struct dn_pipe *p, struct mbuf **head,
  136                     struct mbuf **tail);
  137 
  138 #define HASHSIZE        16
  139 #define HASH(num)       ((((num) >> 8) ^ ((num) >> 4) ^ (num)) & 0x0f)
  140 static struct dn_pipe_head      pipehash[HASHSIZE];     /* all pipes */
  141 static struct dn_flow_set_head  flowsethash[HASHSIZE];  /* all flowsets */
  142 
  143 static struct callout dn_timeout;
  144 
  145 extern  void (*bridge_dn_p)(struct mbuf *, struct ifnet *);
  146 
  147 #ifdef SYSCTL_NODE
  148 SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");
  149 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
  150     CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size");
  151 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, curr_time,
  152     CTLFLAG_RD, &curr_time, 0, "Current tick");
  153 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
  154     CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap");
  155 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
  156     CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap");
  157 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, searches,
  158     CTLFLAG_RD, &searches, 0, "Number of queue searches");
  159 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, search_steps,
  160     CTLFLAG_RD, &search_steps, 0, "Number of queue search steps");
  161 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
  162     CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty");
  163 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
  164     CTLFLAG_RW, &dn_max_ratio, 0,
  165     "Max ratio between dynamic queues and buckets");
  166 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
  167     CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table");
  168 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
  169     CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size");
  170 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
  171     CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size");
  172 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta,
  173     CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec).");
  174 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum,
  175     CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec).");
  176 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment,
  177     CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done.");
  178 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff,
  179     CTLFLAG_RD, &tick_diff, 0,
  180     "Adjusted vs non-adjusted curr_time difference (ticks).");
  181 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost,
  182     CTLFLAG_RD, &tick_lost, 0,
  183     "Number of ticks coalesced by dummynet taskqueue.");
  184 #endif
  185 
  186 #ifdef DUMMYNET_DEBUG
  187 int     dummynet_debug = 0;
  188 #ifdef SYSCTL_NODE
  189 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dummynet_debug,
  190             0, "control debugging printfs");
  191 #endif
  192 #define DPRINTF(X)      if (dummynet_debug) printf X
  193 #else
  194 #define DPRINTF(X)
  195 #endif
  196 
  197 static struct task      dn_task;
  198 static struct taskqueue *dn_tq = NULL;
  199 static void dummynet_task(void *, int);
  200 
  201 static struct mtx dummynet_mtx;
  202 #define DUMMYNET_LOCK_INIT() \
  203         mtx_init(&dummynet_mtx, "dummynet", NULL, MTX_DEF)
  204 #define DUMMYNET_LOCK_DESTROY() mtx_destroy(&dummynet_mtx)
  205 #define DUMMYNET_LOCK()         mtx_lock(&dummynet_mtx)
  206 #define DUMMYNET_UNLOCK()       mtx_unlock(&dummynet_mtx)
  207 #define DUMMYNET_LOCK_ASSERT()  mtx_assert(&dummynet_mtx, MA_OWNED)
  208 
  209 static int config_pipe(struct dn_pipe *p);
  210 static int ip_dn_ctl(struct sockopt *sopt);
  211 
  212 static void dummynet(void *);
  213 static void dummynet_flush(void);
  214 static void dummynet_send(struct mbuf *);
  215 void dummynet_drain(void);
  216 static ip_dn_io_t dummynet_io;
  217 static void dn_rule_delete(void *);
  218 
  219 /*
  220  * Heap management functions.
  221  *
  222  * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
  223  * Some macros help finding parent/children so we can optimize them.
  224  *
  225  * heap_init() is called to expand the heap when needed.
  226  * Increment size in blocks of 16 entries.
  227  * XXX failure to allocate a new element is a pretty bad failure
  228  * as we basically stall a whole queue forever!!
  229  * Returns 1 on error, 0 on success
  230  */
  231 #define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
  232 #define HEAP_LEFT(x) ( 2*(x) + 1 )
  233 #define HEAP_IS_LEFT(x) ( (x) & 1 )
  234 #define HEAP_RIGHT(x) ( 2*(x) + 2 )
  235 #define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
  236 #define HEAP_INCREMENT  15
  237 
  238 static int
  239 heap_init(struct dn_heap *h, int new_size)
  240 {
  241     struct dn_heap_entry *p;
  242 
  243     if (h->size >= new_size ) {
  244         printf("dummynet: %s, Bogus call, have %d want %d\n", __func__,
  245                 h->size, new_size);
  246         return 0 ;
  247     }
  248     new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ;
  249     p = malloc(new_size * sizeof(*p), M_DUMMYNET, M_NOWAIT);
  250     if (p == NULL) {
  251         printf("dummynet: %s, resize %d failed\n", __func__, new_size );
  252         return 1 ; /* error */
  253     }
  254     if (h->size > 0) {
  255         bcopy(h->p, p, h->size * sizeof(*p) );
  256         free(h->p, M_DUMMYNET);
  257     }
  258     h->p = p ;
  259     h->size = new_size ;
  260     return 0 ;
  261 }
  262 
  263 /*
  264  * Insert element in heap. Normally, p != NULL, we insert p in
  265  * a new position and bubble up. If p == NULL, then the element is
  266  * already in place, and key is the position where to start the
  267  * bubble-up.
  268  * Returns 1 on failure (cannot allocate new heap entry)
  269  *
  270  * If offset > 0 the position (index, int) of the element in the heap is
  271  * also stored in the element itself at the given offset in bytes.
  272  */
  273 #define SET_OFFSET(heap, node) \
  274     if (heap->offset > 0) \
  275             *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ;
  276 /*
  277  * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value.
  278  */
  279 #define RESET_OFFSET(heap, node) \
  280     if (heap->offset > 0) \
  281             *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ;
  282 static int
  283 heap_insert(struct dn_heap *h, dn_key key1, void *p)
  284 {
  285     int son = h->elements ;
  286 
  287     if (p == NULL)      /* data already there, set starting point */
  288         son = key1 ;
  289     else {              /* insert new element at the end, possibly resize */
  290         son = h->elements ;
  291         if (son == h->size) /* need resize... */
  292             if (heap_init(h, h->elements+1) )
  293                 return 1 ; /* failure... */
  294         h->p[son].object = p ;
  295         h->p[son].key = key1 ;
  296         h->elements++ ;
  297     }
  298     while (son > 0) {                           /* bubble up */
  299         int father = HEAP_FATHER(son) ;
  300         struct dn_heap_entry tmp  ;
  301 
  302         if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
  303             break ; /* found right position */
  304         /* son smaller than father, swap and repeat */
  305         HEAP_SWAP(h->p[son], h->p[father], tmp) ;
  306         SET_OFFSET(h, son);
  307         son = father ;
  308     }
  309     SET_OFFSET(h, son);
  310     return 0 ;
  311 }
  312 
  313 /*
  314  * remove top element from heap, or obj if obj != NULL
  315  */
  316 static void
  317 heap_extract(struct dn_heap *h, void *obj)
  318 {
  319     int child, father, max = h->elements - 1 ;
  320 
  321     if (max < 0) {
  322         printf("dummynet: warning, extract from empty heap 0x%p\n", h);
  323         return ;
  324     }
  325     father = 0 ; /* default: move up smallest child */
  326     if (obj != NULL) { /* extract specific element, index is at offset */
  327         if (h->offset <= 0)
  328             panic("dummynet: heap_extract from middle not supported on this heap!!!\n");
  329         father = *((int *)((char *)obj + h->offset)) ;
  330         if (father < 0 || father >= h->elements) {
  331             printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
  332                 father, h->elements);
  333             panic("dummynet: heap_extract");
  334         }
  335     }
  336     RESET_OFFSET(h, father);
  337     child = HEAP_LEFT(father) ;         /* left child */
  338     while (child <= max) {              /* valid entry */
  339         if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
  340             child = child+1 ;           /* take right child, otherwise left */
  341         h->p[father] = h->p[child] ;
  342         SET_OFFSET(h, father);
  343         father = child ;
  344         child = HEAP_LEFT(child) ;   /* left child for next loop */
  345     }
  346     h->elements-- ;
  347     if (father != max) {
  348         /*
  349          * Fill hole with last entry and bubble up, reusing the insert code
  350          */
  351         h->p[father] = h->p[max] ;
  352         heap_insert(h, father, NULL); /* this one cannot fail */
  353     }
  354 }
  355 
  356 #if 0
  357 /*
  358  * change object position and update references
  359  * XXX this one is never used!
  360  */
  361 static void
  362 heap_move(struct dn_heap *h, dn_key new_key, void *object)
  363 {
  364     int temp;
  365     int i ;
  366     int max = h->elements-1 ;
  367     struct dn_heap_entry buf ;
  368 
  369     if (h->offset <= 0)
  370         panic("cannot move items on this heap");
  371 
  372     i = *((int *)((char *)object + h->offset));
  373     if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */
  374         h->p[i].key = new_key ;
  375         for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ;
  376                  i = temp ) { /* bubble up */
  377             HEAP_SWAP(h->p[i], h->p[temp], buf) ;
  378             SET_OFFSET(h, i);
  379         }
  380     } else {            /* must move down */
  381         h->p[i].key = new_key ;
  382         while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */
  383             if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key))
  384                 temp++ ; /* select child with min key */
  385             if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */
  386                 HEAP_SWAP(h->p[i], h->p[temp], buf) ;
  387                 SET_OFFSET(h, i);
  388             } else
  389                 break ;
  390             i = temp ;
  391         }
  392     }
  393     SET_OFFSET(h, i);
  394 }
  395 #endif /* heap_move, unused */
  396 
  397 /*
  398  * heapify() will reorganize data inside an array to maintain the
  399  * heap property. It is needed when we delete a bunch of entries.
  400  */
  401 static void
  402 heapify(struct dn_heap *h)
  403 {
  404     int i ;
  405 
  406     for (i = 0 ; i < h->elements ; i++ )
  407         heap_insert(h, i , NULL) ;
  408 }
  409 
  410 /*
  411  * cleanup the heap and free data structure
  412  */
  413 static void
  414 heap_free(struct dn_heap *h)
  415 {
  416     if (h->size >0 )
  417         free(h->p, M_DUMMYNET);
  418     bzero(h, sizeof(*h) );
  419 }
  420 
  421 /*
  422  * --- end of heap management functions ---
  423  */
  424 
  425 /*
  426  * Return the mbuf tag holding the dummynet state.  As an optimization
  427  * this is assumed to be the first tag on the list.  If this turns out
  428  * wrong we'll need to search the list.
  429  */
  430 static struct dn_pkt_tag *
  431 dn_tag_get(struct mbuf *m)
  432 {
  433     struct m_tag *mtag = m_tag_first(m);
  434     KASSERT(mtag != NULL &&
  435             mtag->m_tag_cookie == MTAG_ABI_COMPAT &&
  436             mtag->m_tag_id == PACKET_TAG_DUMMYNET,
  437             ("packet on dummynet queue w/o dummynet tag!"));
  438     return (struct dn_pkt_tag *)(mtag+1);
  439 }
  440 
  441 /*
  442  * Scheduler functions:
  443  *
  444  * transmit_event() is called when the delay-line needs to enter
  445  * the scheduler, either because of existing pkts getting ready,
  446  * or new packets entering the queue. The event handled is the delivery
  447  * time of the packet.
  448  *
  449  * ready_event() does something similar with fixed-rate queues, and the
  450  * event handled is the finish time of the head pkt.
  451  *
  452  * wfq_ready_event() does something similar with WF2Q queues, and the
  453  * event handled is the start time of the head pkt.
  454  *
  455  * In all cases, we make sure that the data structures are consistent
  456  * before passing pkts out, because this might trigger recursive
  457  * invocations of the procedures.
  458  */
  459 static void
  460 transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail)
  461 {
  462         struct mbuf *m;
  463         struct dn_pkt_tag *pkt;
  464 
  465         DUMMYNET_LOCK_ASSERT();
  466 
  467         while ((m = pipe->head) != NULL) {
  468                 pkt = dn_tag_get(m);
  469                 if (!DN_KEY_LEQ(pkt->output_time, curr_time))
  470                         break;
  471 
  472                 pipe->head = m->m_nextpkt;
  473                 if (*tail != NULL)
  474                         (*tail)->m_nextpkt = m;
  475                 else
  476                         *head = m;
  477                 *tail = m;
  478         }
  479         if (*tail != NULL)
  480                 (*tail)->m_nextpkt = NULL;
  481 
  482         /* If there are leftover packets, put into the heap for next event. */
  483         if ((m = pipe->head) != NULL) {
  484                 pkt = dn_tag_get(m);
  485                 /*
  486                  * XXX: Should check errors on heap_insert, by draining the
  487                  * whole pipe p and hoping in the future we are more successful.
  488                  */
  489                 heap_insert(&extract_heap, pkt->output_time, pipe);
  490         }
  491 }
  492 
  493 /*
  494  * the following macro computes how many ticks we have to wait
  495  * before being able to transmit a packet. The credit is taken from
  496  * either a pipe (WF2Q) or a flow_queue (per-flow queueing)
  497  */
  498 #define SET_TICKS(_m, q, p)     \
  499     ((_m)->m_pkthdr.len*8*hz - (q)->numbytes + p->bandwidth - 1 ) / \
  500             p->bandwidth ;
  501 
  502 /*
  503  * extract pkt from queue, compute output time (could be now)
  504  * and put into delay line (p_queue)
  505  */
  506 static void
  507 move_pkt(struct mbuf *pkt, struct dn_flow_queue *q, struct dn_pipe *p,
  508     int len)
  509 {
  510     struct dn_pkt_tag *dt = dn_tag_get(pkt);
  511 
  512     q->head = pkt->m_nextpkt ;
  513     q->len-- ;
  514     q->len_bytes -= len ;
  515 
  516     dt->output_time = curr_time + p->delay ;
  517 
  518     if (p->head == NULL)
  519         p->head = pkt;
  520     else
  521         p->tail->m_nextpkt = pkt;
  522     p->tail = pkt;
  523     p->tail->m_nextpkt = NULL;
  524 }
  525 
  526 /*
  527  * ready_event() is invoked every time the queue must enter the
  528  * scheduler, either because the first packet arrives, or because
  529  * a previously scheduled event fired.
  530  * On invokation, drain as many pkts as possible (could be 0) and then
  531  * if there are leftover packets reinsert the pkt in the scheduler.
  532  */
  533 static void
  534 ready_event(struct dn_flow_queue *q, struct mbuf **head, struct mbuf **tail)
  535 {
  536     struct mbuf *pkt;
  537     struct dn_pipe *p = q->fs->pipe ;
  538     int p_was_empty ;
  539 
  540     DUMMYNET_LOCK_ASSERT();
  541 
  542     if (p == NULL) {
  543         printf("dummynet: ready_event- pipe is gone\n");
  544         return ;
  545     }
  546     p_was_empty = (p->head == NULL) ;
  547 
  548     /*
  549      * schedule fixed-rate queues linked to this pipe:
  550      * Account for the bw accumulated since last scheduling, then
  551      * drain as many pkts as allowed by q->numbytes and move to
  552      * the delay line (in p) computing output time.
  553      * bandwidth==0 (no limit) means we can drain the whole queue,
  554      * setting len_scaled = 0 does the job.
  555      */
  556     q->numbytes += ( curr_time - q->sched_time ) * p->bandwidth;
  557     while ( (pkt = q->head) != NULL ) {
  558         int len = pkt->m_pkthdr.len;
  559         int len_scaled = p->bandwidth ? len*8*hz : 0 ;
  560         if (len_scaled > q->numbytes )
  561             break ;
  562         q->numbytes -= len_scaled ;
  563         move_pkt(pkt, q, p, len);
  564     }
  565     /*
  566      * If we have more packets queued, schedule next ready event
  567      * (can only occur when bandwidth != 0, otherwise we would have
  568      * flushed the whole queue in the previous loop).
  569      * To this purpose we record the current time and compute how many
  570      * ticks to go for the finish time of the packet.
  571      */
  572     if ( (pkt = q->head) != NULL ) { /* this implies bandwidth != 0 */
  573         dn_key t = SET_TICKS(pkt, q, p); /* ticks i have to wait */
  574         q->sched_time = curr_time ;
  575         heap_insert(&ready_heap, curr_time + t, (void *)q );
  576         /* XXX should check errors on heap_insert, and drain the whole
  577          * queue on error hoping next time we are luckier.
  578          */
  579     } else {    /* RED needs to know when the queue becomes empty */
  580         q->q_time = curr_time;
  581         q->numbytes = 0;
  582     }
  583     /*
  584      * If the delay line was empty call transmit_event() now.
  585      * Otherwise, the scheduler will take care of it.
  586      */
  587     if (p_was_empty)
  588         transmit_event(p, head, tail);
  589 }
  590 
  591 /*
  592  * Called when we can transmit packets on WF2Q queues. Take pkts out of
  593  * the queues at their start time, and enqueue into the delay line.
  594  * Packets are drained until p->numbytes < 0. As long as
  595  * len_scaled >= p->numbytes, the packet goes into the delay line
  596  * with a deadline p->delay. For the last packet, if p->numbytes<0,
  597  * there is an additional delay.
  598  */
  599 static void
  600 ready_event_wfq(struct dn_pipe *p, struct mbuf **head, struct mbuf **tail)
  601 {
  602     int p_was_empty = (p->head == NULL) ;
  603     struct dn_heap *sch = &(p->scheduler_heap);
  604     struct dn_heap *neh = &(p->not_eligible_heap) ;
  605 
  606     DUMMYNET_LOCK_ASSERT();
  607 
  608     if (p->if_name[0] == 0) /* tx clock is simulated */
  609         p->numbytes += ( curr_time - p->sched_time ) * p->bandwidth;
  610     else { /* tx clock is for real, the ifq must be empty or this is a NOP */
  611         if (p->ifp && p->ifp->if_snd.ifq_head != NULL)
  612             return ;
  613         else {
  614             DPRINTF(("dummynet: pipe %d ready from %s --\n",
  615                 p->pipe_nr, p->if_name));
  616         }
  617     }
  618 
  619     /*
  620      * While we have backlogged traffic AND credit, we need to do
  621      * something on the queue.
  622      */
  623     while ( p->numbytes >=0 && (sch->elements>0 || neh->elements >0) ) {
  624         if (sch->elements > 0) { /* have some eligible pkts to send out */
  625             struct dn_flow_queue *q = sch->p[0].object ;
  626             struct mbuf *pkt = q->head;
  627             struct dn_flow_set *fs = q->fs;
  628             u_int64_t len = pkt->m_pkthdr.len;
  629             int len_scaled = p->bandwidth ? len*8*hz : 0 ;
  630 
  631             heap_extract(sch, NULL); /* remove queue from heap */
  632             p->numbytes -= len_scaled ;
  633             move_pkt(pkt, q, p, len);
  634 
  635             p->V += (len<<MY_M) / p->sum ; /* update V */
  636             q->S = q->F ; /* update start time */
  637             if (q->len == 0) { /* Flow not backlogged any more */
  638                 fs->backlogged-- ;
  639                 heap_insert(&(p->idle_heap), q->F, q);
  640             } else { /* still backlogged */
  641                 /*
  642                  * update F and position in backlogged queue, then
  643                  * put flow in not_eligible_heap (we will fix this later).
  644                  */
  645                 len = (q->head)->m_pkthdr.len;
  646                 q->F += (len<<MY_M)/(u_int64_t) fs->weight ;
  647                 if (DN_KEY_LEQ(q->S, p->V))
  648                     heap_insert(neh, q->S, q);
  649                 else
  650                     heap_insert(sch, q->F, q);
  651             }
  652         }
  653         /*
  654          * now compute V = max(V, min(S_i)). Remember that all elements in sch
  655          * have by definition S_i <= V so if sch is not empty, V is surely
  656          * the max and we must not update it. Conversely, if sch is empty
  657          * we only need to look at neh.
  658          */
  659         if (sch->elements == 0 && neh->elements > 0)
  660             p->V = MAX64 ( p->V, neh->p[0].key );
  661         /* move from neh to sch any packets that have become eligible */
  662         while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V) ) {
  663             struct dn_flow_queue *q = neh->p[0].object ;
  664             heap_extract(neh, NULL);
  665             heap_insert(sch, q->F, q);
  666         }
  667 
  668         if (p->if_name[0] != '\0') {/* tx clock is from a real thing */
  669             p->numbytes = -1 ; /* mark not ready for I/O */
  670             break ;
  671         }
  672     }
  673     if (sch->elements == 0 && neh->elements == 0 && p->numbytes >= 0
  674             && p->idle_heap.elements > 0) {
  675         /*
  676          * no traffic and no events scheduled. We can get rid of idle-heap.
  677          */
  678         int i ;
  679 
  680         for (i = 0 ; i < p->idle_heap.elements ; i++) {
  681             struct dn_flow_queue *q = p->idle_heap.p[i].object ;
  682 
  683             q->F = 0 ;
  684             q->S = q->F + 1 ;
  685         }
  686         p->sum = 0 ;
  687         p->V = 0 ;
  688         p->idle_heap.elements = 0 ;
  689     }
  690     /*
  691      * If we are getting clocks from dummynet (not a real interface) and
  692      * If we are under credit, schedule the next ready event.
  693      * Also fix the delivery time of the last packet.
  694      */
  695     if (p->if_name[0]==0 && p->numbytes < 0) { /* this implies bandwidth >0 */
  696         dn_key t=0 ; /* number of ticks i have to wait */
  697 
  698         if (p->bandwidth > 0)
  699             t = ( p->bandwidth -1 - p->numbytes) / p->bandwidth ;
  700         dn_tag_get(p->tail)->output_time += t ;
  701         p->sched_time = curr_time ;
  702         heap_insert(&wfq_ready_heap, curr_time + t, (void *)p);
  703         /* XXX should check errors on heap_insert, and drain the whole
  704          * queue on error hoping next time we are luckier.
  705          */
  706     }
  707     /*
  708      * If the delay line was empty call transmit_event() now.
  709      * Otherwise, the scheduler will take care of it.
  710      */
  711     if (p_was_empty)
  712         transmit_event(p, head, tail);
  713 }
  714 
  715 /*
  716  * This is called one tick, after previous run. It is used to
  717  * schedule next run.
  718  */
  719 static void
  720 dummynet(void * __unused unused)
  721 {
  722 
  723         taskqueue_enqueue(dn_tq, &dn_task);
  724 }
  725 
  726 /*
  727  * The main dummynet processing function.
  728  */
  729 static void
  730 dummynet_task(void *context, int pending)
  731 {
  732         struct mbuf *head = NULL, *tail = NULL;
  733         struct dn_pipe *pipe;
  734         struct dn_heap *heaps[3];
  735         struct dn_heap *h;
  736         void *p;        /* generic parameter to handler */
  737         int i;
  738 
  739         DUMMYNET_LOCK();
  740 
  741         heaps[0] = &ready_heap;                 /* fixed-rate queues */
  742         heaps[1] = &wfq_ready_heap;             /* wfq queues */
  743         heaps[2] = &extract_heap;               /* delay line */
  744 
  745         /* Update number of lost(coalesced) ticks. */
  746         tick_lost += pending - 1;
  747  
  748         getmicrouptime(&t);
  749         /* Last tick duration (usec). */
  750         tick_last = (t.tv_sec - prev_t.tv_sec) * 1000000 +
  751             (t.tv_usec - prev_t.tv_usec);
  752         /* Last tick vs standard tick difference (usec). */
  753         tick_delta = (tick_last * hz - 1000000) / hz;
  754         /* Accumulated tick difference (usec). */
  755         tick_delta_sum += tick_delta;
  756  
  757         prev_t = t;
  758  
  759         /*
  760          * Adjust curr_time if accumulated tick difference greater than
  761          * 'standard' tick. Since curr_time should be monotonically increasing,
  762          * we do positive adjustment as required and throttle curr_time in
  763          * case of negative adjustment.
  764          */
  765         curr_time++;
  766         if (tick_delta_sum - tick >= 0) {
  767                 int diff = tick_delta_sum / tick;
  768  
  769                 curr_time += diff;
  770                 tick_diff += diff;
  771                 tick_delta_sum %= tick;
  772                 tick_adjustment++;
  773         } else if (tick_delta_sum + tick <= 0) {
  774                 curr_time--;
  775                 tick_diff--;
  776                 tick_delta_sum += tick;
  777                 tick_adjustment++;
  778         }
  779 
  780         for (i = 0; i < 3; i++) {
  781                 h = heaps[i];
  782                 while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time)) {
  783                         if (h->p[0].key > curr_time)
  784                                 printf("dummynet: warning, "
  785                                     "heap %d is %d ticks late\n",
  786                                     i, (int)(curr_time - h->p[0].key));
  787                         /* store a copy before heap_extract */
  788                         p = h->p[0].object;
  789                         /* need to extract before processing */
  790                         heap_extract(h, NULL);
  791                         if (i == 0)
  792                                 ready_event(p, &head, &tail);
  793                         else if (i == 1) {
  794                                 struct dn_pipe *pipe = p;
  795                                 if (pipe->if_name[0] != '\0')
  796                                         printf("dummynet: bad ready_event_wfq "
  797                                             "for pipe %s\n", pipe->if_name);
  798                                 else
  799                                         ready_event_wfq(p, &head, &tail);
  800                         } else
  801                                 transmit_event(p, &head, &tail);
  802                 }
  803         }
  804 
  805         /* Sweep pipes trying to expire idle flow_queues. */
  806         for (i = 0; i < HASHSIZE; i++)
  807                 SLIST_FOREACH(pipe, &pipehash[i], next)
  808                         if (pipe->idle_heap.elements > 0 &&
  809                             DN_KEY_LT(pipe->idle_heap.p[0].key, pipe->V)) {
  810                                 struct dn_flow_queue *q =
  811                                     pipe->idle_heap.p[0].object;
  812 
  813                                 heap_extract(&(pipe->idle_heap), NULL);
  814                                 /* Mark timestamp as invalid. */
  815                                 q->S = q->F + 1;
  816                                 pipe->sum -= q->fs->weight;
  817                         }
  818 
  819         DUMMYNET_UNLOCK();
  820 
  821         if (head != NULL)
  822                 dummynet_send(head);
  823 
  824         callout_reset(&dn_timeout, 1, dummynet, NULL);
  825 }
  826 
  827 static void
  828 dummynet_send(struct mbuf *m)
  829 {
  830         struct dn_pkt_tag *pkt;
  831         struct mbuf *n;
  832         struct ip *ip;
  833 
  834         for (; m != NULL; m = n) {
  835                 n = m->m_nextpkt;
  836                 m->m_nextpkt = NULL;
  837                 pkt = dn_tag_get(m);
  838                 switch (pkt->dn_dir) {
  839                 case DN_TO_IP_OUT:
  840                         ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
  841                         break ;
  842                   case DN_TO_IP_IN :
  843                         ip = mtod(m, struct ip *);
  844                         ip->ip_len = htons(ip->ip_len);
  845                         ip->ip_off = htons(ip->ip_off);
  846                         netisr_dispatch(NETISR_IP, m);
  847                         break;
  848 #ifdef INET6
  849                 case DN_TO_IP6_IN:
  850                         netisr_dispatch(NETISR_IPV6, m);
  851                         break;
  852 
  853                 case DN_TO_IP6_OUT:
  854                         ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL);
  855                         break;
  856 #endif
  857                 case DN_TO_IFB_FWD:
  858                         if (bridge_dn_p != NULL)
  859                                 ((*bridge_dn_p)(m, pkt->ifp));
  860                         else
  861                                 printf("dummynet: if_bridge not loaded\n");
  862 
  863                         break;
  864                 case DN_TO_ETH_DEMUX:
  865                         /*
  866                          * The Ethernet code assumes the Ethernet header is
  867                          * contiguous in the first mbuf header.
  868                          * Insure this is true.
  869                          */
  870                         if (m->m_len < ETHER_HDR_LEN &&
  871                             (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
  872                                 printf("dummynet/ether: pullup failed, "
  873                                     "dropping packet\n");
  874                                 break;
  875                         }
  876                         ether_demux(m->m_pkthdr.rcvif, m);
  877                         break;
  878                 case DN_TO_ETH_OUT:
  879                         ether_output_frame(pkt->ifp, m);
  880                         break;
  881                 default:
  882                         printf("dummynet: bad switch %d!\n", pkt->dn_dir);
  883                         m_freem(m);
  884                         break;
  885                 }
  886         }
  887 }
  888 
  889 /*
  890  * Unconditionally expire empty queues in case of shortage.
  891  * Returns the number of queues freed.
  892  */
  893 static int
  894 expire_queues(struct dn_flow_set *fs)
  895 {
  896     struct dn_flow_queue *q, *prev ;
  897     int i, initial_elements = fs->rq_elements ;
  898 
  899     if (fs->last_expired == time_uptime)
  900         return 0 ;
  901     fs->last_expired = time_uptime ;
  902     for (i = 0 ; i <= fs->rq_size ; i++) /* last one is overflow */
  903         for (prev=NULL, q = fs->rq[i] ; q != NULL ; )
  904             if (q->head != NULL || q->S != q->F+1) {
  905                 prev = q ;
  906                 q = q->next ;
  907             } else { /* entry is idle, expire it */
  908                 struct dn_flow_queue *old_q = q ;
  909 
  910                 if (prev != NULL)
  911                     prev->next = q = q->next ;
  912                 else
  913                     fs->rq[i] = q = q->next ;
  914                 fs->rq_elements-- ;
  915                 free(old_q, M_DUMMYNET);
  916             }
  917     return initial_elements - fs->rq_elements ;
  918 }
  919 
  920 /*
  921  * If room, create a new queue and put at head of slot i;
  922  * otherwise, create or use the default queue.
  923  */
  924 static struct dn_flow_queue *
  925 create_queue(struct dn_flow_set *fs, int i)
  926 {
  927     struct dn_flow_queue *q ;
  928 
  929     if (fs->rq_elements > fs->rq_size * dn_max_ratio &&
  930             expire_queues(fs) == 0) {
  931         /*
  932          * No way to get room, use or create overflow queue.
  933          */
  934         i = fs->rq_size ;
  935         if ( fs->rq[i] != NULL )
  936             return fs->rq[i] ;
  937     }
  938     q = malloc(sizeof(*q), M_DUMMYNET, M_NOWAIT | M_ZERO);
  939     if (q == NULL) {
  940         printf("dummynet: sorry, cannot allocate queue for new flow\n");
  941         return NULL ;
  942     }
  943     q->fs = fs ;
  944     q->hash_slot = i ;
  945     q->next = fs->rq[i] ;
  946     q->S = q->F + 1;   /* hack - mark timestamp as invalid */
  947     fs->rq[i] = q ;
  948     fs->rq_elements++ ;
  949     return q ;
  950 }
  951 
  952 /*
  953  * Given a flow_set and a pkt in last_pkt, find a matching queue
  954  * after appropriate masking. The queue is moved to front
  955  * so that further searches take less time.
  956  */
  957 static struct dn_flow_queue *
  958 find_queue(struct dn_flow_set *fs, struct ipfw_flow_id *id)
  959 {
  960     int i = 0 ; /* we need i and q for new allocations */
  961     struct dn_flow_queue *q, *prev;
  962     int is_v6 = IS_IP6_FLOW_ID(id);
  963 
  964     if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) )
  965         q = fs->rq[0] ;
  966     else {
  967         /* first, do the masking, then hash */
  968         id->dst_port &= fs->flow_mask.dst_port ;
  969         id->src_port &= fs->flow_mask.src_port ;
  970         id->proto &= fs->flow_mask.proto ;
  971         id->flags = 0 ; /* we don't care about this one */
  972         if (is_v6) {
  973             APPLY_MASK(&id->dst_ip6, &fs->flow_mask.dst_ip6);
  974             APPLY_MASK(&id->src_ip6, &fs->flow_mask.src_ip6);
  975             id->flow_id6 &= fs->flow_mask.flow_id6;
  976 
  977             i = ((id->dst_ip6.__u6_addr.__u6_addr32[0]) & 0xffff)^
  978                 ((id->dst_ip6.__u6_addr.__u6_addr32[1]) & 0xffff)^
  979                 ((id->dst_ip6.__u6_addr.__u6_addr32[2]) & 0xffff)^
  980                 ((id->dst_ip6.__u6_addr.__u6_addr32[3]) & 0xffff)^
  981 
  982                 ((id->dst_ip6.__u6_addr.__u6_addr32[0] >> 15) & 0xffff)^
  983                 ((id->dst_ip6.__u6_addr.__u6_addr32[1] >> 15) & 0xffff)^
  984                 ((id->dst_ip6.__u6_addr.__u6_addr32[2] >> 15) & 0xffff)^
  985                 ((id->dst_ip6.__u6_addr.__u6_addr32[3] >> 15) & 0xffff)^
  986 
  987                 ((id->src_ip6.__u6_addr.__u6_addr32[0] << 1) & 0xfffff)^
  988                 ((id->src_ip6.__u6_addr.__u6_addr32[1] << 1) & 0xfffff)^
  989                 ((id->src_ip6.__u6_addr.__u6_addr32[2] << 1) & 0xfffff)^
  990                 ((id->src_ip6.__u6_addr.__u6_addr32[3] << 1) & 0xfffff)^
  991 
  992                 ((id->src_ip6.__u6_addr.__u6_addr32[0] << 16) & 0xffff)^
  993                 ((id->src_ip6.__u6_addr.__u6_addr32[1] << 16) & 0xffff)^
  994                 ((id->src_ip6.__u6_addr.__u6_addr32[2] << 16) & 0xffff)^
  995                 ((id->src_ip6.__u6_addr.__u6_addr32[3] << 16) & 0xffff)^
  996 
  997                 (id->dst_port << 1) ^ (id->src_port) ^
  998                 (id->proto ) ^
  999                 (id->flow_id6);
 1000         } else {
 1001             id->dst_ip &= fs->flow_mask.dst_ip ;
 1002             id->src_ip &= fs->flow_mask.src_ip ;
 1003 
 1004             i = ( (id->dst_ip) & 0xffff ) ^
 1005                 ( (id->dst_ip >> 15) & 0xffff ) ^
 1006                 ( (id->src_ip << 1) & 0xffff ) ^
 1007                 ( (id->src_ip >> 16 ) & 0xffff ) ^
 1008                 (id->dst_port << 1) ^ (id->src_port) ^
 1009                 (id->proto );
 1010         }
 1011         i = i % fs->rq_size ;
 1012         /* finally, scan the current list for a match */
 1013         searches++ ;
 1014         for (prev=NULL, q = fs->rq[i] ; q ; ) {
 1015             search_steps++;
 1016             if (is_v6 &&
 1017                     IN6_ARE_ADDR_EQUAL(&id->dst_ip6,&q->id.dst_ip6) &&  
 1018                     IN6_ARE_ADDR_EQUAL(&id->src_ip6,&q->id.src_ip6) &&  
 1019                     id->dst_port == q->id.dst_port &&
 1020                     id->src_port == q->id.src_port &&
 1021                     id->proto == q->id.proto &&
 1022                     id->flags == q->id.flags &&
 1023                     id->flow_id6 == q->id.flow_id6)
 1024                 break ; /* found */
 1025 
 1026             if (!is_v6 && id->dst_ip == q->id.dst_ip &&
 1027                     id->src_ip == q->id.src_ip &&
 1028                     id->dst_port == q->id.dst_port &&
 1029                     id->src_port == q->id.src_port &&
 1030                     id->proto == q->id.proto &&
 1031                     id->flags == q->id.flags)
 1032                 break ; /* found */
 1033 
 1034             /* No match. Check if we can expire the entry */
 1035             if (pipe_expire && q->head == NULL && q->S == q->F+1 ) {
 1036                 /* entry is idle and not in any heap, expire it */
 1037                 struct dn_flow_queue *old_q = q ;
 1038 
 1039                 if (prev != NULL)
 1040                     prev->next = q = q->next ;
 1041                 else
 1042                     fs->rq[i] = q = q->next ;
 1043                 fs->rq_elements-- ;
 1044                 free(old_q, M_DUMMYNET);
 1045                 continue ;
 1046             }
 1047             prev = q ;
 1048             q = q->next ;
 1049         }
 1050         if (q && prev != NULL) { /* found and not in front */
 1051             prev->next = q->next ;
 1052             q->next = fs->rq[i] ;
 1053             fs->rq[i] = q ;
 1054         }
 1055     }
 1056     if (q == NULL) { /* no match, need to allocate a new entry */
 1057         q = create_queue(fs, i);
 1058         if (q != NULL)
 1059         q->id = *id ;
 1060     }
 1061     return q ;
 1062 }
 1063 
 1064 static int
 1065 red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len)
 1066 {
 1067         /*
 1068          * RED algorithm
 1069          *
 1070          * RED calculates the average queue size (avg) using a low-pass filter
 1071          * with an exponential weighted (w_q) moving average:
 1072          *      avg  <-  (1-w_q) * avg + w_q * q_size
 1073          * where q_size is the queue length (measured in bytes or * packets).
 1074          *
 1075          * If q_size == 0, we compute the idle time for the link, and set
 1076          *      avg = (1 - w_q)^(idle/s)
 1077          * where s is the time needed for transmitting a medium-sized packet.
 1078          *
 1079          * Now, if avg < min_th the packet is enqueued.
 1080          * If avg > max_th the packet is dropped. Otherwise, the packet is
 1081          * dropped with probability P function of avg.
 1082          */
 1083 
 1084         int64_t p_b = 0;
 1085 
 1086         /* Queue in bytes or packets? */
 1087         u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ?
 1088             q->len_bytes : q->len;
 1089 
 1090         DPRINTF(("\ndummynet: %d q: %2u ", (int)curr_time, q_size));
 1091 
 1092         /* Average queue size estimation. */
 1093         if (q_size != 0) {
 1094                 /* Queue is not empty, avg <- avg + (q_size - avg) * w_q */
 1095                 int diff = SCALE(q_size) - q->avg;
 1096                 int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q);
 1097 
 1098                 q->avg += (int)v;
 1099         } else {
 1100                 /*
 1101                  * Queue is empty, find for how long the queue has been
 1102                  * empty and use a lookup table for computing
 1103                  * (1 - * w_q)^(idle_time/s) where s is the time to send a
 1104                  * (small) packet.
 1105                  * XXX check wraps...
 1106                  */
 1107                 if (q->avg) {
 1108                         u_int t = (curr_time - q->q_time) / fs->lookup_step;
 1109 
 1110                         q->avg = (t < fs->lookup_depth) ?
 1111                             SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
 1112                 }
 1113         }
 1114         DPRINTF(("dummynet: avg: %u ", SCALE_VAL(q->avg)));
 1115 
 1116         /* Should i drop? */
 1117         if (q->avg < fs->min_th) {
 1118                 q->count = -1;
 1119                 return (0);     /* accept packet */
 1120         }
 1121         if (q->avg >= fs->max_th) {     /* average queue >=  max threshold */
 1122                 if (fs->flags_fs & DN_IS_GENTLE_RED) {
 1123                         /*
 1124                          * According to Gentle-RED, if avg is greater than
 1125                          * max_th the packet is dropped with a probability
 1126                          *       p_b = c_3 * avg - c_4
 1127                          * where c_3 = (1 - max_p) / max_th
 1128                          *       c_4 = 1 - 2 * max_p
 1129                          */
 1130                         p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) -
 1131                             fs->c_4;
 1132                 } else {
 1133                         q->count = -1;
 1134                         DPRINTF(("dummynet: - drop"));
 1135                         return (1);
 1136                 }
 1137         } else if (q->avg > fs->min_th) {
 1138                 /*
 1139                  * We compute p_b using the linear dropping function
 1140                  *       p_b = c_1 * avg - c_2
 1141                  * where c_1 = max_p / (max_th - min_th)
 1142                  *       c_2 = max_p * min_th / (max_th - min_th)
 1143                  */
 1144                 p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2;
 1145         }
 1146 
 1147         if (fs->flags_fs & DN_QSIZE_IS_BYTES)
 1148                 p_b = (p_b * len) / fs->max_pkt_size;
 1149         if (++q->count == 0)
 1150                 q->random = random() & 0xffff;
 1151         else {
 1152                 /*
 1153                  * q->count counts packets arrived since last drop, so a greater
 1154                  * value of q->count means a greater packet drop probability.
 1155                  */
 1156                 if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) {
 1157                         q->count = 0;
 1158                         DPRINTF(("dummynet: - red drop"));
 1159                         /* After a drop we calculate a new random value. */
 1160                         q->random = random() & 0xffff;
 1161                         return (1);     /* drop */
 1162                 }
 1163         }
 1164         /* End of RED algorithm. */
 1165 
 1166         return (0);     /* accept */
 1167 }
 1168 
 1169 static __inline struct dn_flow_set *
 1170 locate_flowset(int fs_nr)
 1171 {
 1172         struct dn_flow_set *fs;
 1173 
 1174         SLIST_FOREACH(fs, &flowsethash[HASH(fs_nr)], next)
 1175                 if (fs->fs_nr == fs_nr)
 1176                         return (fs);
 1177 
 1178         return (NULL);
 1179 }
 1180 
 1181 static __inline struct dn_pipe *
 1182 locate_pipe(int pipe_nr)
 1183 {
 1184         struct dn_pipe *pipe;
 1185 
 1186         SLIST_FOREACH(pipe, &pipehash[HASH(pipe_nr)], next)
 1187                 if (pipe->pipe_nr == pipe_nr)
 1188                         return (pipe);
 1189 
 1190         return (NULL);
 1191 }
 1192 
 1193 /*
 1194  * dummynet hook for packets. Below 'pipe' is a pipe or a queue
 1195  * depending on whether WF2Q or fixed bw is used.
 1196  *
 1197  * pipe_nr      pipe or queue the packet is destined for.
 1198  * dir          where shall we send the packet after dummynet.
 1199  * m            the mbuf with the packet
 1200  * ifp          the 'ifp' parameter from the caller.
 1201  *              NULL in ip_input, destination interface in ip_output,
 1202  * rule         matching rule, in case of multiple passes
 1203  *
 1204  */
 1205 static int
 1206 dummynet_io(struct mbuf *m, int dir, struct ip_fw_args *fwa)
 1207 {
 1208     struct mbuf *head = NULL, *tail = NULL;
 1209     struct dn_pkt_tag *pkt;
 1210     struct m_tag *mtag;
 1211     struct dn_flow_set *fs = NULL;
 1212     struct dn_pipe *pipe ;
 1213     u_int64_t len = m->m_pkthdr.len ;
 1214     struct dn_flow_queue *q = NULL ;
 1215     int is_pipe;
 1216     ipfw_insn *cmd = ACTION_PTR(fwa->rule);
 1217 
 1218     KASSERT(m->m_nextpkt == NULL,
 1219         ("dummynet_io: mbuf queue passed to dummynet"));
 1220 
 1221     if (cmd->opcode == O_LOG)
 1222         cmd += F_LEN(cmd);
 1223     if (cmd->opcode == O_ALTQ)
 1224         cmd += F_LEN(cmd);
 1225     if (cmd->opcode == O_TAG)
 1226         cmd += F_LEN(cmd);
 1227     is_pipe = (cmd->opcode == O_PIPE);
 1228 
 1229     DUMMYNET_LOCK();
 1230     /*
 1231      * This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule.
 1232      *
 1233      * XXXGL: probably the pipe->fs and fs->pipe logic here
 1234      * below can be simplified.
 1235      */
 1236     if (is_pipe) {
 1237         pipe = locate_pipe(fwa->cookie);
 1238         if (pipe != NULL)
 1239                 fs = &(pipe->fs);
 1240     } else
 1241         fs = locate_flowset(fwa->cookie);
 1242 
 1243     if (fs == NULL)
 1244         goto dropit;    /* This queue/pipe does not exist! */
 1245     pipe = fs->pipe;
 1246     if (pipe == NULL) { /* Must be a queue, try find a matching pipe. */
 1247         pipe = locate_pipe(fs->parent_nr);
 1248         if (pipe != NULL)
 1249             fs->pipe = pipe;
 1250         else {
 1251             printf("dummynet: no pipe %d for queue %d, drop pkt\n",
 1252                 fs->parent_nr, fs->fs_nr);
 1253             goto dropit ;
 1254         }
 1255     }
 1256     q = find_queue(fs, &(fwa->f_id));
 1257     if ( q == NULL )
 1258         goto dropit ;           /* cannot allocate queue                */
 1259     /*
 1260      * update statistics, then check reasons to drop pkt
 1261      */
 1262     q->tot_bytes += len ;
 1263     q->tot_pkts++ ;
 1264     if ( fs->plr && random() < fs->plr )
 1265         goto dropit ;           /* random pkt drop                      */
 1266     if ( fs->flags_fs & DN_QSIZE_IS_BYTES) {
 1267         if (q->len_bytes > fs->qsize)
 1268             goto dropit ;       /* queue size overflow                  */
 1269     } else {
 1270         if (q->len >= fs->qsize)
 1271             goto dropit ;       /* queue count overflow                 */
 1272     }
 1273     if ( fs->flags_fs & DN_IS_RED && red_drops(fs, q, len) )
 1274         goto dropit ;
 1275 
 1276     /* XXX expensive to zero, see if we can remove it*/
 1277     mtag = m_tag_get(PACKET_TAG_DUMMYNET,
 1278                 sizeof(struct dn_pkt_tag), M_NOWAIT|M_ZERO);
 1279     if ( mtag == NULL )
 1280         goto dropit ;           /* cannot allocate packet header        */
 1281     m_tag_prepend(m, mtag);     /* attach to mbuf chain */
 1282 
 1283     pkt = (struct dn_pkt_tag *)(mtag+1);
 1284     /* ok, i can handle the pkt now... */
 1285     /* build and enqueue packet + parameters */
 1286     pkt->rule = fwa->rule ;
 1287     pkt->dn_dir = dir ;
 1288 
 1289     pkt->ifp = fwa->oif;
 1290 
 1291     if (q->head == NULL)
 1292         q->head = m;
 1293     else
 1294         q->tail->m_nextpkt = m;
 1295     q->tail = m;
 1296     q->len++;
 1297     q->len_bytes += len ;
 1298 
 1299     if ( q->head != m )         /* flow was not idle, we are done */
 1300         goto done;
 1301     /*
 1302      * If we reach this point the flow was previously idle, so we need
 1303      * to schedule it. This involves different actions for fixed-rate or
 1304      * WF2Q queues.
 1305      */
 1306     if (is_pipe) {
 1307         /*
 1308          * Fixed-rate queue: just insert into the ready_heap.
 1309          */
 1310         dn_key t = 0 ;
 1311         if (pipe->bandwidth)
 1312             t = SET_TICKS(m, q, pipe);
 1313         q->sched_time = curr_time ;
 1314         if (t == 0)     /* must process it now */
 1315             ready_event(q, &head, &tail);
 1316         else
 1317             heap_insert(&ready_heap, curr_time + t , q );
 1318     } else {
 1319         /*
 1320          * WF2Q. First, compute start time S: if the flow was idle (S=F+1)
 1321          * set S to the virtual time V for the controlling pipe, and update
 1322          * the sum of weights for the pipe; otherwise, remove flow from
 1323          * idle_heap and set S to max(F,V).
 1324          * Second, compute finish time F = S + len/weight.
 1325          * Third, if pipe was idle, update V=max(S, V).
 1326          * Fourth, count one more backlogged flow.
 1327          */
 1328         if (DN_KEY_GT(q->S, q->F)) { /* means timestamps are invalid */
 1329             q->S = pipe->V ;
 1330             pipe->sum += fs->weight ; /* add weight of new queue */
 1331         } else {
 1332             heap_extract(&(pipe->idle_heap), q);
 1333             q->S = MAX64(q->F, pipe->V ) ;
 1334         }
 1335         q->F = q->S + ( len<<MY_M )/(u_int64_t) fs->weight;
 1336 
 1337         if (pipe->not_eligible_heap.elements == 0 &&
 1338                 pipe->scheduler_heap.elements == 0)
 1339             pipe->V = MAX64 ( q->S, pipe->V );
 1340         fs->backlogged++ ;
 1341         /*
 1342          * Look at eligibility. A flow is not eligibile if S>V (when
 1343          * this happens, it means that there is some other flow already
 1344          * scheduled for the same pipe, so the scheduler_heap cannot be
 1345          * empty). If the flow is not eligible we just store it in the
 1346          * not_eligible_heap. Otherwise, we store in the scheduler_heap
 1347          * and possibly invoke ready_event_wfq() right now if there is
 1348          * leftover credit.
 1349          * Note that for all flows in scheduler_heap (SCH), S_i <= V,
 1350          * and for all flows in not_eligible_heap (NEH), S_i > V .
 1351          * So when we need to compute max( V, min(S_i) ) forall i in SCH+NEH,
 1352          * we only need to look into NEH.
 1353          */
 1354         if (DN_KEY_GT(q->S, pipe->V) ) { /* not eligible */
 1355             if (pipe->scheduler_heap.elements == 0)
 1356                 printf("dummynet: ++ ouch! not eligible but empty scheduler!\n");
 1357             heap_insert(&(pipe->not_eligible_heap), q->S, q);
 1358         } else {
 1359             heap_insert(&(pipe->scheduler_heap), q->F, q);
 1360             if (pipe->numbytes >= 0) { /* pipe is idle */
 1361                 if (pipe->scheduler_heap.elements != 1)
 1362                     printf("dummynet: OUCH! pipe should have been idle!\n");
 1363                 DPRINTF(("dummynet: waking up pipe %d at %d\n",
 1364                         pipe->pipe_nr, (int)(q->F >> MY_M)));
 1365                 pipe->sched_time = curr_time ;
 1366                 ready_event_wfq(pipe, &head, &tail);
 1367             }
 1368         }
 1369     }
 1370 done:
 1371     DUMMYNET_UNLOCK();
 1372     if (head != NULL)
 1373         dummynet_send(head);
 1374     return 0;
 1375 
 1376 dropit:
 1377     if (q)
 1378         q->drops++ ;
 1379     DUMMYNET_UNLOCK();
 1380     m_freem(m);
 1381     return ( (fs && (fs->flags_fs & DN_NOERROR)) ? 0 : ENOBUFS);
 1382 }
 1383 
 1384 /*
 1385  * Below, the rt_unref is only needed when (pkt->dn_dir == DN_TO_IP_OUT)
 1386  * Doing this would probably save us the initial bzero of dn_pkt
 1387  */
 1388 #define DN_FREE_PKT(_m) do {                            \
 1389         m_freem(_m);                                    \
 1390 } while (0)
 1391 
 1392 /*
 1393  * Dispose all packets and flow_queues on a flow_set.
 1394  * If all=1, also remove red lookup table and other storage,
 1395  * including the descriptor itself.
 1396  * For the one in dn_pipe MUST also cleanup ready_heap...
 1397  */
 1398 static void
 1399 purge_flow_set(struct dn_flow_set *fs, int all)
 1400 {
 1401         struct dn_flow_queue *q, *qn;
 1402         int i;
 1403 
 1404         DUMMYNET_LOCK_ASSERT();
 1405 
 1406         for (i = 0; i <= fs->rq_size; i++) {
 1407                 for (q = fs->rq[i]; q != NULL; q = qn) {
 1408                         struct mbuf *m, *mnext;
 1409 
 1410                         mnext = q->head;
 1411                         while ((m = mnext) != NULL) {
 1412                                 mnext = m->m_nextpkt;
 1413                                 DN_FREE_PKT(m);
 1414                         }
 1415                         qn = q->next;
 1416                         free(q, M_DUMMYNET);
 1417                 }
 1418                 fs->rq[i] = NULL;
 1419         }
 1420 
 1421         fs->rq_elements = 0;
 1422         if (all) {
 1423                 /* RED - free lookup table. */
 1424                 if (fs->w_q_lookup != NULL)
 1425                         free(fs->w_q_lookup, M_DUMMYNET);
 1426                 if (fs->rq != NULL)
 1427                         free(fs->rq, M_DUMMYNET);
 1428                 /* If this fs is not part of a pipe, free it. */
 1429                 if (fs->pipe == NULL || fs != &(fs->pipe->fs))
 1430                         free(fs, M_DUMMYNET);
 1431         }
 1432 }
 1433 
 1434 /*
 1435  * Dispose all packets queued on a pipe (not a flow_set).
 1436  * Also free all resources associated to a pipe, which is about
 1437  * to be deleted.
 1438  */
 1439 static void
 1440 purge_pipe(struct dn_pipe *pipe)
 1441 {
 1442     struct mbuf *m, *mnext;
 1443 
 1444     purge_flow_set( &(pipe->fs), 1 );
 1445 
 1446     mnext = pipe->head;
 1447     while ((m = mnext) != NULL) {
 1448         mnext = m->m_nextpkt;
 1449         DN_FREE_PKT(m);
 1450     }
 1451 
 1452     heap_free( &(pipe->scheduler_heap) );
 1453     heap_free( &(pipe->not_eligible_heap) );
 1454     heap_free( &(pipe->idle_heap) );
 1455 }
 1456 
 1457 /*
 1458  * Delete all pipes and heaps returning memory. Must also
 1459  * remove references from all ipfw rules to all pipes.
 1460  */
 1461 static void
 1462 dummynet_flush(void)
 1463 {
 1464         struct dn_pipe *pipe, *pipe1;
 1465         struct dn_flow_set *fs, *fs1;
 1466         int i;
 1467 
 1468         DUMMYNET_LOCK();
 1469         /* Free heaps so we don't have unwanted events. */
 1470         heap_free(&ready_heap);
 1471         heap_free(&wfq_ready_heap);
 1472         heap_free(&extract_heap);
 1473 
 1474         /*
 1475          * Now purge all queued pkts and delete all pipes.
 1476          *
 1477          * XXXGL: can we merge the for(;;) cycles into one or not?
 1478          */
 1479         for (i = 0; i < HASHSIZE; i++)
 1480                 SLIST_FOREACH_SAFE(fs, &flowsethash[i], next, fs1) {
 1481                         SLIST_REMOVE(&flowsethash[i], fs, dn_flow_set, next);
 1482                         purge_flow_set(fs, 1);
 1483                 }
 1484         for (i = 0; i < HASHSIZE; i++)
 1485                 SLIST_FOREACH_SAFE(pipe, &pipehash[i], next, pipe1) {
 1486                         SLIST_REMOVE(&pipehash[i], pipe, dn_pipe, next);
 1487                         purge_pipe(pipe);
 1488                         free(pipe, M_DUMMYNET);
 1489                 }
 1490         DUMMYNET_UNLOCK();
 1491 }
 1492 
 1493 extern struct ip_fw *ip_fw_default_rule ;
 1494 static void
 1495 dn_rule_delete_fs(struct dn_flow_set *fs, void *r)
 1496 {
 1497     int i ;
 1498     struct dn_flow_queue *q ;
 1499     struct mbuf *m ;
 1500 
 1501     for (i = 0 ; i <= fs->rq_size ; i++) /* last one is ovflow */
 1502         for (q = fs->rq[i] ; q ; q = q->next )
 1503             for (m = q->head ; m ; m = m->m_nextpkt ) {
 1504                 struct dn_pkt_tag *pkt = dn_tag_get(m) ;
 1505                 if (pkt->rule == r)
 1506                     pkt->rule = ip_fw_default_rule ;
 1507             }
 1508 }
 1509 /*
 1510  * when a firewall rule is deleted, scan all queues and remove the flow-id
 1511  * from packets matching this rule.
 1512  */
 1513 void
 1514 dn_rule_delete(void *r)
 1515 {
 1516     struct dn_pipe *pipe;
 1517     struct dn_flow_set *fs;
 1518     struct dn_pkt_tag *pkt;
 1519     struct mbuf *m;
 1520     int i;
 1521 
 1522     DUMMYNET_LOCK();
 1523     /*
 1524      * If the rule references a queue (dn_flow_set), then scan
 1525      * the flow set, otherwise scan pipes. Should do either, but doing
 1526      * both does not harm.
 1527      */
 1528     for (i = 0; i < HASHSIZE; i++)
 1529         SLIST_FOREACH(fs, &flowsethash[i], next)
 1530                 dn_rule_delete_fs(fs, r);
 1531 
 1532     for (i = 0; i < HASHSIZE; i++)
 1533         SLIST_FOREACH(pipe, &pipehash[i], next) {
 1534                 fs = &(pipe->fs);
 1535                 dn_rule_delete_fs(fs, r);
 1536                 for (m = pipe->head ; m ; m = m->m_nextpkt ) {
 1537                         pkt = dn_tag_get(m);
 1538                         if (pkt->rule == r)
 1539                                 pkt->rule = ip_fw_default_rule;
 1540                 }
 1541         }
 1542     DUMMYNET_UNLOCK();
 1543 }
 1544 
 1545 /*
 1546  * setup RED parameters
 1547  */
 1548 static int
 1549 config_red(struct dn_flow_set *p, struct dn_flow_set *x)
 1550 {
 1551         int i;
 1552 
 1553         x->w_q = p->w_q;
 1554         x->min_th = SCALE(p->min_th);
 1555         x->max_th = SCALE(p->max_th);
 1556         x->max_p = p->max_p;
 1557 
 1558         x->c_1 = p->max_p / (p->max_th - p->min_th);
 1559         x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th));
 1560 
 1561         if (x->flags_fs & DN_IS_GENTLE_RED) {
 1562                 x->c_3 = (SCALE(1) - p->max_p) / p->max_th;
 1563                 x->c_4 = SCALE(1) - 2 * p->max_p;
 1564         }
 1565 
 1566         /* If the lookup table already exist, free and create it again. */
 1567         if (x->w_q_lookup) {
 1568                 free(x->w_q_lookup, M_DUMMYNET);
 1569                 x->w_q_lookup = NULL;
 1570         }
 1571         if (red_lookup_depth == 0) {
 1572                 printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth"
 1573                     "must be > 0\n");
 1574                 free(x, M_DUMMYNET);
 1575                 return (EINVAL);
 1576         }
 1577         x->lookup_depth = red_lookup_depth;
 1578         x->w_q_lookup = (u_int *)malloc(x->lookup_depth * sizeof(int),
 1579             M_DUMMYNET, M_NOWAIT);
 1580         if (x->w_q_lookup == NULL) {
 1581                 printf("dummynet: sorry, cannot allocate red lookup table\n");
 1582                 free(x, M_DUMMYNET);
 1583                 return(ENOSPC);
 1584         }
 1585 
 1586         /* Fill the lookup table with (1 - w_q)^x */
 1587         x->lookup_step = p->lookup_step;
 1588         x->lookup_weight = p->lookup_weight;
 1589         x->w_q_lookup[0] = SCALE(1) - x->w_q;
 1590 
 1591         for (i = 1; i < x->lookup_depth; i++)
 1592                 x->w_q_lookup[i] =
 1593                     SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight);
 1594 
 1595         if (red_avg_pkt_size < 1)
 1596                 red_avg_pkt_size = 512;
 1597         x->avg_pkt_size = red_avg_pkt_size;
 1598         if (red_max_pkt_size < 1)
 1599                 red_max_pkt_size = 1500;
 1600         x->max_pkt_size = red_max_pkt_size;
 1601         return (0);
 1602 }
 1603 
 1604 static int
 1605 alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs)
 1606 {
 1607     if (x->flags_fs & DN_HAVE_FLOW_MASK) {     /* allocate some slots */
 1608         int l = pfs->rq_size;
 1609 
 1610         if (l == 0)
 1611             l = dn_hash_size;
 1612         if (l < 4)
 1613             l = 4;
 1614         else if (l > DN_MAX_HASH_SIZE)
 1615             l = DN_MAX_HASH_SIZE;
 1616         x->rq_size = l;
 1617     } else                  /* one is enough for null mask */
 1618         x->rq_size = 1;
 1619     x->rq = malloc((1 + x->rq_size) * sizeof(struct dn_flow_queue *),
 1620             M_DUMMYNET, M_NOWAIT | M_ZERO);
 1621     if (x->rq == NULL) {
 1622         printf("dummynet: sorry, cannot allocate queue\n");
 1623         return (ENOMEM);
 1624     }
 1625     x->rq_elements = 0;
 1626     return 0 ;
 1627 }
 1628 
 1629 static void
 1630 set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src)
 1631 {
 1632         x->flags_fs = src->flags_fs;
 1633         x->qsize = src->qsize;
 1634         x->plr = src->plr;
 1635         x->flow_mask = src->flow_mask;
 1636         if (x->flags_fs & DN_QSIZE_IS_BYTES) {
 1637                 if (x->qsize > 1024 * 1024)
 1638                         x->qsize = 1024 * 1024;
 1639         } else {
 1640                 if (x->qsize == 0)
 1641                         x->qsize = 50;
 1642                 if (x->qsize > 100)
 1643                         x->qsize = 50;
 1644         }
 1645         /* Configuring RED. */
 1646         if (x->flags_fs & DN_IS_RED)
 1647                 config_red(src, x);     /* XXX should check errors */
 1648 }
 1649 
 1650 /*
 1651  * Setup pipe or queue parameters.
 1652  */
 1653 static int
 1654 config_pipe(struct dn_pipe *p)
 1655 {
 1656         struct dn_flow_set *pfs = &(p->fs);
 1657         struct dn_flow_queue *q;
 1658         int i, error;
 1659 
 1660         /*
 1661          * The config program passes parameters as follows:
 1662          * bw = bits/second (0 means no limits),
 1663          * delay = ms, must be translated into ticks.
 1664          * qsize = slots/bytes
 1665          */
 1666         p->delay = (p->delay * hz) / 1000;
 1667         /* We need either a pipe number or a flow_set number. */
 1668         if (p->pipe_nr == 0 && pfs->fs_nr == 0)
 1669                 return (EINVAL);
 1670         if (p->pipe_nr != 0 && pfs->fs_nr != 0)
 1671                 return (EINVAL);
 1672         if (p->pipe_nr != 0) {                  /* this is a pipe */
 1673                 struct dn_pipe *pipe;
 1674 
 1675                 DUMMYNET_LOCK();
 1676                 pipe = locate_pipe(p->pipe_nr); /* locate pipe */
 1677 
 1678                 if (pipe == NULL) {             /* new pipe */
 1679                         pipe = malloc(sizeof(struct dn_pipe), M_DUMMYNET,
 1680                             M_NOWAIT | M_ZERO);
 1681                         if (pipe == NULL) {
 1682                                 DUMMYNET_UNLOCK();
 1683                                 printf("dummynet: no memory for new pipe\n");
 1684                                 return (ENOMEM);
 1685                         }
 1686                         pipe->pipe_nr = p->pipe_nr;
 1687                         pipe->fs.pipe = pipe;
 1688                         /*
 1689                          * idle_heap is the only one from which
 1690                          * we extract from the middle.
 1691                          */
 1692                         pipe->idle_heap.size = pipe->idle_heap.elements = 0;
 1693                         pipe->idle_heap.offset =
 1694                             offsetof(struct dn_flow_queue, heap_pos);
 1695                 } else
 1696                         /* Flush accumulated credit for all queues. */
 1697                         for (i = 0; i <= pipe->fs.rq_size; i++)
 1698                                 for (q = pipe->fs.rq[i]; q; q = q->next)
 1699                                         q->numbytes = 0;
 1700 
 1701                 pipe->bandwidth = p->bandwidth;
 1702                 pipe->numbytes = 0;             /* just in case... */
 1703                 bcopy(p->if_name, pipe->if_name, sizeof(p->if_name));
 1704                 pipe->ifp = NULL;               /* reset interface ptr */
 1705                 pipe->delay = p->delay;
 1706                 set_fs_parms(&(pipe->fs), pfs);
 1707 
 1708                 if (pipe->fs.rq == NULL) {      /* a new pipe */
 1709                         error = alloc_hash(&(pipe->fs), pfs);
 1710                         if (error) {
 1711                                 DUMMYNET_UNLOCK();
 1712                                 free(pipe, M_DUMMYNET);
 1713                                 return (error);
 1714                         }
 1715                         SLIST_INSERT_HEAD(&pipehash[HASH(pipe->pipe_nr)],
 1716                             pipe, next);
 1717                 }
 1718                 DUMMYNET_UNLOCK();
 1719         } else {                                /* config queue */
 1720                 struct dn_flow_set *fs;
 1721 
 1722                 DUMMYNET_LOCK();
 1723                 fs = locate_flowset(pfs->fs_nr); /* locate flow_set */
 1724 
 1725                 if (fs == NULL) {               /* new */
 1726                         if (pfs->parent_nr == 0) { /* need link to a pipe */
 1727                                 DUMMYNET_UNLOCK();
 1728                                 return (EINVAL);
 1729                         }
 1730                         fs = malloc(sizeof(struct dn_flow_set), M_DUMMYNET,
 1731                             M_NOWAIT | M_ZERO);
 1732                         if (fs == NULL) {
 1733                                 DUMMYNET_UNLOCK();
 1734                                 printf(
 1735                                     "dummynet: no memory for new flow_set\n");
 1736                                 return (ENOMEM);
 1737                         }
 1738                         fs->fs_nr = pfs->fs_nr;
 1739                         fs->parent_nr = pfs->parent_nr;
 1740                         fs->weight = pfs->weight;
 1741                         if (fs->weight == 0)
 1742                                 fs->weight = 1;
 1743                         else if (fs->weight > 100)
 1744                                 fs->weight = 100;
 1745                 } else {
 1746                         /*
 1747                          * Change parent pipe not allowed;
 1748                          * must delete and recreate.
 1749                          */
 1750                         if (pfs->parent_nr != 0 &&
 1751                             fs->parent_nr != pfs->parent_nr) {
 1752                                 DUMMYNET_UNLOCK();
 1753                                 return (EINVAL);
 1754                         }
 1755                 }
 1756 
 1757                 set_fs_parms(fs, pfs);
 1758 
 1759                 if (fs->rq == NULL) {           /* a new flow_set */
 1760                         error = alloc_hash(fs, pfs);
 1761                         if (error) {
 1762                                 DUMMYNET_UNLOCK();
 1763                                 free(fs, M_DUMMYNET);
 1764                                 return (error);
 1765                         }
 1766                         SLIST_INSERT_HEAD(&flowsethash[HASH(fs->fs_nr)],
 1767                             fs, next);
 1768                 }
 1769                 DUMMYNET_UNLOCK();
 1770         }
 1771         return (0);
 1772 }
 1773 
 1774 /*
 1775  * Helper function to remove from a heap queues which are linked to
 1776  * a flow_set about to be deleted.
 1777  */
 1778 static void
 1779 fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs)
 1780 {
 1781     int i = 0, found = 0 ;
 1782     for (; i < h->elements ;)
 1783         if ( ((struct dn_flow_queue *)h->p[i].object)->fs == fs) {
 1784             h->elements-- ;
 1785             h->p[i] = h->p[h->elements] ;
 1786             found++ ;
 1787         } else
 1788             i++ ;
 1789     if (found)
 1790         heapify(h);
 1791 }
 1792 
 1793 /*
 1794  * helper function to remove a pipe from a heap (can be there at most once)
 1795  */
 1796 static void
 1797 pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p)
 1798 {
 1799     if (h->elements > 0) {
 1800         int i = 0 ;
 1801         for (i=0; i < h->elements ; i++ ) {
 1802             if (h->p[i].object == p) { /* found it */
 1803                 h->elements-- ;
 1804                 h->p[i] = h->p[h->elements] ;
 1805                 heapify(h);
 1806                 break ;
 1807             }
 1808         }
 1809     }
 1810 }
 1811 
 1812 /*
 1813  * drain all queues. Called in case of severe mbuf shortage.
 1814  */
 1815 void
 1816 dummynet_drain(void)
 1817 {
 1818     struct dn_flow_set *fs;
 1819     struct dn_pipe *pipe;
 1820     struct mbuf *m, *mnext;
 1821     int i;
 1822 
 1823     DUMMYNET_LOCK_ASSERT();
 1824 
 1825     heap_free(&ready_heap);
 1826     heap_free(&wfq_ready_heap);
 1827     heap_free(&extract_heap);
 1828     /* remove all references to this pipe from flow_sets */
 1829     for (i = 0; i < HASHSIZE; i++)
 1830         SLIST_FOREACH(fs, &flowsethash[i], next)
 1831                 purge_flow_set(fs, 0);
 1832 
 1833     for (i = 0; i < HASHSIZE; i++) {
 1834         SLIST_FOREACH(pipe, &pipehash[i], next) {
 1835                 purge_flow_set(&(pipe->fs), 0);
 1836 
 1837                 mnext = pipe->head;
 1838                 while ((m = mnext) != NULL) {
 1839                         mnext = m->m_nextpkt;
 1840                         DN_FREE_PKT(m);
 1841                 }
 1842                 pipe->head = pipe->tail = NULL;
 1843         }
 1844     }
 1845 }
 1846 
 1847 /*
 1848  * Fully delete a pipe or a queue, cleaning up associated info.
 1849  */
 1850 static int
 1851 delete_pipe(struct dn_pipe *p)
 1852 {
 1853 
 1854     if (p->pipe_nr == 0 && p->fs.fs_nr == 0)
 1855         return EINVAL ;
 1856     if (p->pipe_nr != 0 && p->fs.fs_nr != 0)
 1857         return EINVAL ;
 1858     if (p->pipe_nr != 0) { /* this is an old-style pipe */
 1859         struct dn_pipe *pipe;
 1860         struct dn_flow_set *fs;
 1861         int i;
 1862 
 1863         DUMMYNET_LOCK();
 1864         pipe = locate_pipe(p->pipe_nr); /* locate pipe */
 1865 
 1866         if (pipe == NULL) {
 1867             DUMMYNET_UNLOCK();
 1868             return (ENOENT);    /* not found */
 1869         }
 1870 
 1871         /* Unlink from list of pipes. */
 1872         SLIST_REMOVE(&pipehash[HASH(pipe->pipe_nr)], pipe, dn_pipe, next);
 1873 
 1874         /* Remove all references to this pipe from flow_sets. */
 1875         for (i = 0; i < HASHSIZE; i++)
 1876             SLIST_FOREACH(fs, &flowsethash[i], next)
 1877                 if (fs->pipe == pipe) {
 1878                         printf("dummynet: ++ ref to pipe %d from fs %d\n",
 1879                             p->pipe_nr, fs->fs_nr);
 1880                         fs->pipe = NULL ;
 1881                         purge_flow_set(fs, 0);
 1882                 }
 1883         fs_remove_from_heap(&ready_heap, &(pipe->fs));
 1884         purge_pipe(pipe); /* remove all data associated to this pipe */
 1885         /* remove reference to here from extract_heap and wfq_ready_heap */
 1886         pipe_remove_from_heap(&extract_heap, pipe);
 1887         pipe_remove_from_heap(&wfq_ready_heap, pipe);
 1888         DUMMYNET_UNLOCK();
 1889 
 1890         free(pipe, M_DUMMYNET);
 1891     } else { /* this is a WF2Q queue (dn_flow_set) */
 1892         struct dn_flow_set *fs;
 1893 
 1894         DUMMYNET_LOCK();
 1895         fs = locate_flowset(p->fs.fs_nr); /* locate set */
 1896 
 1897         if (fs == NULL) {
 1898             DUMMYNET_UNLOCK();
 1899             return (ENOENT); /* not found */
 1900         }
 1901 
 1902         /* Unlink from list of flowsets. */
 1903         SLIST_REMOVE( &flowsethash[HASH(fs->fs_nr)], fs, dn_flow_set, next);
 1904 
 1905         if (fs->pipe != NULL) {
 1906             /* Update total weight on parent pipe and cleanup parent heaps. */
 1907             fs->pipe->sum -= fs->weight * fs->backlogged ;
 1908             fs_remove_from_heap(&(fs->pipe->not_eligible_heap), fs);
 1909             fs_remove_from_heap(&(fs->pipe->scheduler_heap), fs);
 1910 #if 1   /* XXX should i remove from idle_heap as well ? */
 1911             fs_remove_from_heap(&(fs->pipe->idle_heap), fs);
 1912 #endif
 1913         }
 1914         purge_flow_set(fs, 1);
 1915         DUMMYNET_UNLOCK();
 1916     }
 1917     return 0 ;
 1918 }
 1919 
 1920 /*
 1921  * helper function used to copy data from kernel in DUMMYNET_GET
 1922  */
 1923 static char *
 1924 dn_copy_set(struct dn_flow_set *set, char *bp)
 1925 {
 1926     int i, copied = 0 ;
 1927     struct dn_flow_queue *q, *qp = (struct dn_flow_queue *)bp;
 1928 
 1929     DUMMYNET_LOCK_ASSERT();
 1930 
 1931     for (i = 0 ; i <= set->rq_size ; i++)
 1932         for (q = set->rq[i] ; q ; q = q->next, qp++ ) {
 1933             if (q->hash_slot != i)
 1934                 printf("dummynet: ++ at %d: wrong slot (have %d, "
 1935                     "should be %d)\n", copied, q->hash_slot, i);
 1936             if (q->fs != set)
 1937                 printf("dummynet: ++ at %d: wrong fs ptr (have %p, should be %p)\n",
 1938                         i, q->fs, set);
 1939             copied++ ;
 1940             bcopy(q, qp, sizeof( *q ) );
 1941             /* cleanup pointers */
 1942             qp->next = NULL ;
 1943             qp->head = qp->tail = NULL ;
 1944             qp->fs = NULL ;
 1945         }
 1946     if (copied != set->rq_elements)
 1947         printf("dummynet: ++ wrong count, have %d should be %d\n",
 1948             copied, set->rq_elements);
 1949     return (char *)qp ;
 1950 }
 1951 
 1952 static size_t
 1953 dn_calc_size(void)
 1954 {
 1955     struct dn_flow_set *fs;
 1956     struct dn_pipe *pipe;
 1957     size_t size = 0;
 1958     int i;
 1959 
 1960     DUMMYNET_LOCK_ASSERT();
 1961     /*
 1962      * Compute size of data structures: list of pipes and flow_sets.
 1963      */
 1964     for (i = 0; i < HASHSIZE; i++) {
 1965         SLIST_FOREACH(pipe, &pipehash[i], next)
 1966                 size += sizeof(*pipe) +
 1967                     pipe->fs.rq_elements * sizeof(struct dn_flow_queue);
 1968         SLIST_FOREACH(fs, &flowsethash[i], next)
 1969                 size += sizeof (*fs) +
 1970                     fs->rq_elements * sizeof(struct dn_flow_queue);
 1971     }
 1972     return size;
 1973 }
 1974 
 1975 static int
 1976 dummynet_get(struct sockopt *sopt)
 1977 {
 1978     char *buf, *bp ; /* bp is the "copy-pointer" */
 1979     size_t size ;
 1980     struct dn_flow_set *fs;
 1981     struct dn_pipe *pipe;
 1982     int error=0, i ;
 1983 
 1984     /* XXX lock held too long */
 1985     DUMMYNET_LOCK();
 1986     /*
 1987      * XXX: Ugly, but we need to allocate memory with M_WAITOK flag and we
 1988      *      cannot use this flag while holding a mutex.
 1989      */
 1990     for (i = 0; i < 10; i++) {
 1991         size = dn_calc_size();
 1992         DUMMYNET_UNLOCK();
 1993         buf = malloc(size, M_TEMP, M_WAITOK);
 1994         DUMMYNET_LOCK();
 1995         if (size == dn_calc_size())
 1996                 break;
 1997         free(buf, M_TEMP);
 1998         buf = NULL;
 1999     }
 2000     if (buf == NULL) {
 2001         DUMMYNET_UNLOCK();
 2002         return ENOBUFS ;
 2003     }
 2004     bp = buf;
 2005     for (i = 0; i < HASHSIZE; i++)
 2006         SLIST_FOREACH(pipe, &pipehash[i], next) {
 2007                 struct dn_pipe *pipe_bp = (struct dn_pipe *)bp;
 2008 
 2009                 /*
 2010                  * Copy pipe descriptor into *bp, convert delay back to ms,
 2011                  * then copy the flow_set descriptor(s) one at a time.
 2012                  * After each flow_set, copy the queue descriptor it owns.
 2013                  */
 2014                 bcopy(pipe, bp, sizeof(*pipe));
 2015                 pipe_bp->delay = (pipe_bp->delay * 1000) / hz;
 2016                 /*
 2017                  * XXX the following is a hack based on ->next being the
 2018                  * first field in dn_pipe and dn_flow_set. The correct
 2019                  * solution would be to move the dn_flow_set to the beginning
 2020                  * of struct dn_pipe.
 2021                  */
 2022                 pipe_bp->next.sle_next = (struct dn_pipe *)DN_IS_PIPE;
 2023                 /* Clean pointers. */
 2024                 pipe_bp->head = pipe_bp->tail = NULL;
 2025                 pipe_bp->fs.next.sle_next = NULL;
 2026                 pipe_bp->fs.pipe = NULL;
 2027                 pipe_bp->fs.rq = NULL;
 2028 
 2029                 bp += sizeof(*pipe) ;
 2030                 bp = dn_copy_set(&(pipe->fs), bp);
 2031         }
 2032 
 2033     for (i = 0; i < HASHSIZE; i++)
 2034         SLIST_FOREACH(fs, &flowsethash[i], next) {
 2035                 struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp;
 2036 
 2037                 bcopy(fs, bp, sizeof(*fs));
 2038                 /* XXX same hack as above */
 2039                 fs_bp->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE;
 2040                 fs_bp->pipe = NULL;
 2041                 fs_bp->rq = NULL;
 2042                 bp += sizeof(*fs);
 2043                 bp = dn_copy_set(fs, bp);
 2044         }
 2045 
 2046     DUMMYNET_UNLOCK();
 2047 
 2048     error = sooptcopyout(sopt, buf, size);
 2049     free(buf, M_TEMP);
 2050     return error ;
 2051 }
 2052 
 2053 /*
 2054  * Handler for the various dummynet socket options (get, flush, config, del)
 2055  */
 2056 static int
 2057 ip_dn_ctl(struct sockopt *sopt)
 2058 {
 2059     int error = 0 ;
 2060     struct dn_pipe *p, tmp_pipe;
 2061 
 2062     /* Disallow sets in really-really secure mode. */
 2063     if (sopt->sopt_dir == SOPT_SET) {
 2064 #if __FreeBSD_version >= 500034
 2065         error =  securelevel_ge(sopt->sopt_td->td_ucred, 3);
 2066         if (error)
 2067             return (error);
 2068 #else
 2069         if (securelevel >= 3)
 2070             return (EPERM);
 2071 #endif
 2072     }
 2073 
 2074     switch (sopt->sopt_name) {
 2075     default :
 2076         printf("dummynet: -- unknown option %d", sopt->sopt_name);
 2077         return EINVAL ;
 2078 
 2079     case IP_DUMMYNET_GET :
 2080         error = dummynet_get(sopt);
 2081         break ;
 2082 
 2083     case IP_DUMMYNET_FLUSH :
 2084         dummynet_flush() ;
 2085         break ;
 2086 
 2087     case IP_DUMMYNET_CONFIGURE :
 2088         p = &tmp_pipe ;
 2089         error = sooptcopyin(sopt, p, sizeof *p, sizeof *p);
 2090         if (error)
 2091             break ;
 2092         error = config_pipe(p);
 2093         break ;
 2094 
 2095     case IP_DUMMYNET_DEL :      /* remove a pipe or queue */
 2096         p = &tmp_pipe ;
 2097         error = sooptcopyin(sopt, p, sizeof *p, sizeof *p);
 2098         if (error)
 2099             break ;
 2100 
 2101         error = delete_pipe(p);
 2102         break ;
 2103     }
 2104     return error ;
 2105 }
 2106 
 2107 static void
 2108 ip_dn_init(void)
 2109 {
 2110         int i;
 2111 
 2112         if (bootverbose)
 2113                 printf("DUMMYNET with IPv6 initialized (040826)\n");
 2114 
 2115         DUMMYNET_LOCK_INIT();
 2116 
 2117         for (i = 0; i < HASHSIZE; i++) {
 2118                 SLIST_INIT(&pipehash[i]);
 2119                 SLIST_INIT(&flowsethash[i]);
 2120         }
 2121         ready_heap.size = ready_heap.elements = 0;
 2122         ready_heap.offset = 0;
 2123 
 2124         wfq_ready_heap.size = wfq_ready_heap.elements = 0;
 2125         wfq_ready_heap.offset = 0;
 2126 
 2127         extract_heap.size = extract_heap.elements = 0;
 2128         extract_heap.offset = 0;
 2129 
 2130         ip_dn_ctl_ptr = ip_dn_ctl;
 2131         ip_dn_io_ptr = dummynet_io;
 2132         ip_dn_ruledel_ptr = dn_rule_delete;
 2133 
 2134         TASK_INIT(&dn_task, 0, dummynet_task, NULL);
 2135         dn_tq = taskqueue_create_fast("dummynet", M_NOWAIT,
 2136             taskqueue_thread_enqueue, &dn_tq);
 2137         taskqueue_start_threads(&dn_tq, 1, PI_NET, "dummynet");
 2138 
 2139         callout_init(&dn_timeout, CALLOUT_MPSAFE);
 2140         callout_reset(&dn_timeout, 1, dummynet, NULL);
 2141 
 2142         /* Initialize curr_time adjustment mechanics. */
 2143         getmicrouptime(&prev_t);
 2144 }
 2145 
 2146 #ifdef KLD_MODULE
 2147 static void
 2148 ip_dn_destroy(void)
 2149 {
 2150         ip_dn_ctl_ptr = NULL;
 2151         ip_dn_io_ptr = NULL;
 2152         ip_dn_ruledel_ptr = NULL;
 2153 
 2154         DUMMYNET_LOCK();
 2155         callout_stop(&dn_timeout);
 2156         DUMMYNET_UNLOCK();
 2157         taskqueue_drain(dn_tq, &dn_task);
 2158         taskqueue_free(dn_tq);
 2159 
 2160         dummynet_flush();
 2161 
 2162         DUMMYNET_LOCK_DESTROY();
 2163 }
 2164 #endif /* KLD_MODULE */
 2165 
 2166 static int
 2167 dummynet_modevent(module_t mod, int type, void *data)
 2168 {
 2169 
 2170         switch (type) {
 2171         case MOD_LOAD:
 2172                 if (DUMMYNET_LOADED) {
 2173                     printf("DUMMYNET already loaded\n");
 2174                     return EEXIST ;
 2175                 }
 2176                 ip_dn_init();
 2177                 break;
 2178 
 2179         case MOD_UNLOAD:
 2180 #if !defined(KLD_MODULE)
 2181                 printf("dummynet statically compiled, cannot unload\n");
 2182                 return EINVAL ;
 2183 #else
 2184                 ip_dn_destroy();
 2185 #endif
 2186                 break ;
 2187         default:
 2188                 return EOPNOTSUPP;
 2189                 break ;
 2190         }
 2191         return 0 ;
 2192 }
 2193 
 2194 static moduledata_t dummynet_mod = {
 2195         "dummynet",
 2196         dummynet_modevent,
 2197         NULL
 2198 };
 2199 DECLARE_MODULE(dummynet, dummynet_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
 2200 MODULE_DEPEND(dummynet, ipfw, 2, 2, 2);
 2201 MODULE_VERSION(dummynet, 1);

Cache object: a1aa3796e239ec31d5c97f4c3caa2ab5


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.