[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ]

FreeBSD/Linux Kernel Cross Reference
sys/netinet/ip_dummynet.c

Version: -  FREEBSD  -  FREEBSD7  -  FREEBSD70  -  FREEBSD6  -  FREEBSD64  -  FREEBSD63  -  FREEBSD62  -  FREEBSD61  -  FREEBSD60  -  FREEBSD5  -  FREEBSD55  -  FREEBSD54  -  FREEBSD53  -  FREEBSD52  -  FREEBSD51  -  FREEBSD50  -  FREEBSD4  -  FREEBSD3  -  FREEBSD22  -  linux-2.6  -  linux-2.4.22  -  MK83  -  MK84  -  PLAN9  -  DFBSD  -  NETBSD  -  NETBSD5  -  NETBSD4  -  NETBSD3  -  NETBSD20  -  OPENBSD  -  xnu-517  -  xnu-792  -  xnu-792.6.70  -  xnu-1228  -  OPENSOLARIS  -  minix-3-1-1  -  TRUSTEDBSD-SEBSD  -  FREEBSD-LIBC  -  FREEBSD7-LIBC  -  FREEBSD6-LIBC  -  GLIBC27 
SearchContext: -  none  -  excerpts  -  bigexcerpts 

  1 /*-
  2  * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
  3  * Portions Copyright (c) 2000 Akamba Corp.
  4  * All rights reserved
  5  *
  6  * Redistribution and use in source and binary forms, with or without
  7  * modification, are permitted provided that the following conditions
  8  * are met:
  9  * 1. Redistributions of source code must retain the above copyright
 10  *    notice, this list of conditions and the following disclaimer.
 11  * 2. Redistributions in binary form must reproduce the above copyright
 12  *    notice, this list of conditions and the following disclaimer in the
 13  *    documentation and/or other materials provided with the distribution.
 14  *
 15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 25  * SUCH DAMAGE.
 26  */
 27 
 28 #include <sys/cdefs.h>
 29 __FBSDID("$FreeBSD: src/sys/netinet/ip_dummynet.c,v 1.116 2008/05/22 08:10:31 rwatson Exp $");
 30 
 31 #define DUMMYNET_DEBUG
 32 
 33 #include "opt_inet6.h"
 34 
 35 /*
 36  * This module implements IP dummynet, a bandwidth limiter/delay emulator
 37  * used in conjunction with the ipfw package.
 38  * Description of the data structures used is in ip_dummynet.h
 39  * Here you mainly find the following blocks of code:
 40  *  + variable declarations;
 41  *  + heap management functions;
 42  *  + scheduler and dummynet functions;
 43  *  + configuration and initialization.
 44  *
 45  * NOTA BENE: critical sections are protected by the "dummynet lock".
 46  *
 47  * Most important Changes:
 48  *
 49  * 011004: KLDable
 50  * 010124: Fixed WF2Q behaviour
 51  * 010122: Fixed spl protection.
 52  * 000601: WF2Q support
 53  * 000106: large rewrite, use heaps to handle very many pipes.
 54  * 980513:      initial release
 55  *
 56  * include files marked with XXX are probably not needed
 57  */
 58 
 59 #include <sys/limits.h>
 60 #include <sys/param.h>
 61 #include <sys/systm.h>
 62 #include <sys/malloc.h>
 63 #include <sys/mbuf.h>
 64 #include <sys/kernel.h>
 65 #include <sys/module.h>
 66 #include <sys/priv.h>
 67 #include <sys/proc.h>
 68 #include <sys/socket.h>
 69 #include <sys/socketvar.h>
 70 #include <sys/time.h>
 71 #include <sys/sysctl.h>
 72 #include <sys/taskqueue.h>
 73 #include <net/if.h>
 74 #include <net/netisr.h>
 75 #include <net/route.h>
 76 #include <netinet/in.h>
 77 #include <netinet/in_systm.h>
 78 #include <netinet/in_var.h>
 79 #include <netinet/ip.h>
 80 #include <netinet/ip_fw.h>
 81 #include <netinet/ip_dummynet.h>
 82 #include <netinet/ip_var.h>
 83 
 84 #include <netinet/if_ether.h> /* for struct arpcom */
 85 
 86 #include <netinet/ip6.h>       /* for ip6_input, ip6_output prototypes */
 87 #include <netinet6/ip6_var.h>
 88 
 89 /*
 90  * We keep a private variable for the simulation time, but we could
 91  * probably use an existing one ("softticks" in sys/kern/kern_timeout.c)
 92  */
 93 static dn_key curr_time = 0 ; /* current simulation time */
 94 
 95 static int dn_hash_size = 64 ;  /* default hash size */
 96 
 97 /* statistics on number of queue searches and search steps */
 98 static long searches, search_steps ;
 99 static int pipe_expire = 1 ;   /* expire queue if empty */
100 static int dn_max_ratio = 16 ; /* max queues/buckets ratio */
101 
102 static long pipe_slot_limit = 100; /* Foot shooting limit for pipe queues. */
103 static long pipe_byte_limit = 1024 * 1024;
104 
105 static int red_lookup_depth = 256;      /* RED - default lookup table depth */
106 static int red_avg_pkt_size = 512;      /* RED - default medium packet size */
107 static int red_max_pkt_size = 1500;     /* RED - default max packet size */
108 
109 static struct timeval prev_t, t;
110 static long tick_last;                  /* Last tick duration (usec). */
111 static long tick_delta;                 /* Last vs standard tick diff (usec). */
112 static long tick_delta_sum;             /* Accumulated tick difference (usec).*/
113 static long tick_adjustment;            /* Tick adjustments done. */
114 static long tick_lost;                  /* Lost(coalesced) ticks number. */
115 /* Adjusted vs non-adjusted curr_time difference (ticks). */
116 static long tick_diff;
117 
118 static int              io_fast;
119 static unsigned long    io_pkt;
120 static unsigned long    io_pkt_fast;
121 static unsigned long    io_pkt_drop;
122 
123 /*
124  * Three heaps contain queues and pipes that the scheduler handles:
125  *
126  * ready_heap contains all dn_flow_queue related to fixed-rate pipes.
127  *
128  * wfq_ready_heap contains the pipes associated with WF2Q flows
129  *
130  * extract_heap contains pipes associated with delay lines.
131  *
132  */
133 
134 MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap");
135 
136 static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ;
137 
138 static int      heap_init(struct dn_heap *h, int size);
139 static int      heap_insert (struct dn_heap *h, dn_key key1, void *p);
140 static void     heap_extract(struct dn_heap *h, void *obj);
141 static void     transmit_event(struct dn_pipe *pipe, struct mbuf **head,
142                     struct mbuf **tail);
143 static void     ready_event(struct dn_flow_queue *q, struct mbuf **head,
144                     struct mbuf **tail);
145 static void     ready_event_wfq(struct dn_pipe *p, struct mbuf **head,
146                     struct mbuf **tail);
147 
148 #define HASHSIZE        16
149 #define HASH(num)       ((((num) >> 8) ^ ((num) >> 4) ^ (num)) & 0x0f)
150 static struct dn_pipe_head      pipehash[HASHSIZE];     /* all pipes */
151 static struct dn_flow_set_head  flowsethash[HASHSIZE];  /* all flowsets */
152 
153 static struct callout dn_timeout;
154 
155 extern  void (*bridge_dn_p)(struct mbuf *, struct ifnet *);
156 
157 #ifdef SYSCTL_NODE
158 SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");
159 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
160     CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size");
161 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, curr_time,
162     CTLFLAG_RD, &curr_time, 0, "Current tick");
163 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
164     CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap");
165 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
166     CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap");
167 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, searches,
168     CTLFLAG_RD, &searches, 0, "Number of queue searches");
169 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, search_steps,
170     CTLFLAG_RD, &search_steps, 0, "Number of queue search steps");
171 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
172     CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty");
173 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
174     CTLFLAG_RW, &dn_max_ratio, 0,
175     "Max ratio between dynamic queues and buckets");
176 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
177     CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table");
178 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
179     CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size");
180 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
181     CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size");
182 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta,
183     CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec).");
184 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum,
185     CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec).");
186 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment,
187     CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done.");
188 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff,
189     CTLFLAG_RD, &tick_diff, 0,
190     "Adjusted vs non-adjusted curr_time difference (ticks).");
191 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost,
192     CTLFLAG_RD, &tick_lost, 0,
193     "Number of ticks coalesced by dummynet taskqueue.");
194 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast,
195     CTLFLAG_RW, &io_fast, 0, "Enable fast dummynet io.");
196 SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt,
197     CTLFLAG_RD, &io_pkt, 0,
198     "Number of packets passed to dummynet.");
199 SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast,
200     CTLFLAG_RD, &io_pkt_fast, 0,
201     "Number of packets bypassed dummynet scheduler.");
202 SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop,
203     CTLFLAG_RD, &io_pkt_drop, 0,
204     "Number of packets dropped by dummynet.");
205 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit,
206     CTLFLAG_RW, &pipe_slot_limit, 0, "Upper limit in slots for pipe queue.");
207 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit,
208     CTLFLAG_RW, &pipe_byte_limit, 0, "Upper limit in bytes for pipe queue.");
209 #endif
210 
211 #ifdef DUMMYNET_DEBUG
212 int     dummynet_debug = 0;
213 #ifdef SYSCTL_NODE
214 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dummynet_debug,
215             0, "control debugging printfs");
216 #endif
217 #define DPRINTF(X)      if (dummynet_debug) printf X
218 #else
219 #define DPRINTF(X)
220 #endif
221 
222 static struct task      dn_task;
223 static struct taskqueue *dn_tq = NULL;
224 static void dummynet_task(void *, int);
225 
226 static struct mtx dummynet_mtx;
227 #define DUMMYNET_LOCK_INIT() \
228         mtx_init(&dummynet_mtx, "dummynet", NULL, MTX_DEF)
229 #define DUMMYNET_LOCK_DESTROY() mtx_destroy(&dummynet_mtx)
230 #define DUMMYNET_LOCK()         mtx_lock(&dummynet_mtx)
231 #define DUMMYNET_UNLOCK()       mtx_unlock(&dummynet_mtx)
232 #define DUMMYNET_LOCK_ASSERT()  mtx_assert(&dummynet_mtx, MA_OWNED)
233 
234 static int      config_pipe(struct dn_pipe *p);
235 static int      ip_dn_ctl(struct sockopt *sopt);
236 
237 static void     dummynet(void *);
238 static void     dummynet_flush(void);
239 static void     dummynet_send(struct mbuf *);
240 void            dummynet_drain(void);
241 static ip_dn_io_t dummynet_io;
242 static void     dn_rule_delete(void *);
243 
244 /*
245  * Heap management functions.
246  *
247  * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
248  * Some macros help finding parent/children so we can optimize them.
249  *
250  * heap_init() is called to expand the heap when needed.
251  * Increment size in blocks of 16 entries.
252  * XXX failure to allocate a new element is a pretty bad failure
253  * as we basically stall a whole queue forever!!
254  * Returns 1 on error, 0 on success
255  */
256 #define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
257 #define HEAP_LEFT(x) ( 2*(x) + 1 )
258 #define HEAP_IS_LEFT(x) ( (x) & 1 )
259 #define HEAP_RIGHT(x) ( 2*(x) + 2 )
260 #define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
261 #define HEAP_INCREMENT  15
262 
263 static int
264 heap_init(struct dn_heap *h, int new_size)
265 {
266     struct dn_heap_entry *p;
267 
268     if (h->size >= new_size ) {
269         printf("dummynet: %s, Bogus call, have %d want %d\n", __func__,
270                 h->size, new_size);
271         return 0 ;
272     }
273     new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ;
274     p = malloc(new_size * sizeof(*p), M_DUMMYNET, M_NOWAIT);
275     if (p == NULL) {
276         printf("dummynet: %s, resize %d failed\n", __func__, new_size );
277         return 1 ; /* error */
278     }
279     if (h->size > 0) {
280         bcopy(h->p, p, h->size * sizeof(*p) );
281         free(h->p, M_DUMMYNET);
282     }
283     h->p = p ;
284     h->size = new_size ;
285     return 0 ;
286 }
287 
288 /*
289  * Insert element in heap. Normally, p != NULL, we insert p in
290  * a new position and bubble up. If p == NULL, then the element is
291  * already in place, and key is the position where to start the
292  * bubble-up.
293  * Returns 1 on failure (cannot allocate new heap entry)
294  *
295  * If offset > 0 the position (index, int) of the element in the heap is
296  * also stored in the element itself at the given offset in bytes.
297  */
298 #define SET_OFFSET(heap, node) \
299     if (heap->offset > 0) \
300             *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ;
301 /*
302  * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value.
303  */
304 #define RESET_OFFSET(heap, node) \
305     if (heap->offset > 0) \
306             *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ;
307 static int
308 heap_insert(struct dn_heap *h, dn_key key1, void *p)
309 {
310     int son = h->elements ;
311 
312     if (p == NULL)      /* data already there, set starting point */
313         son = key1 ;
314     else {              /* insert new element at the end, possibly resize */
315         son = h->elements ;
316         if (son == h->size) /* need resize... */
317             if (heap_init(h, h->elements+1) )
318                 return 1 ; /* failure... */
319         h->p[son].object = p ;
320         h->p[son].key = key1 ;
321         h->elements++ ;
322     }
323     while (son > 0) {                           /* bubble up */
324         int father = HEAP_FATHER(son) ;
325         struct dn_heap_entry tmp  ;
326 
327         if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
328             break ; /* found right position */
329         /* son smaller than father, swap and repeat */
330         HEAP_SWAP(h->p[son], h->p[father], tmp) ;
331         SET_OFFSET(h, son);
332         son = father ;
333     }
334     SET_OFFSET(h, son);
335     return 0 ;
336 }
337 
338 /*
339  * remove top element from heap, or obj if obj != NULL
340  */
341 static void
342 heap_extract(struct dn_heap *h, void *obj)
343 {
344     int child, father, max = h->elements - 1 ;
345 
346     if (max < 0) {
347         printf("dummynet: warning, extract from empty heap 0x%p\n", h);
348         return ;
349     }
350     father = 0 ; /* default: move up smallest child */
351     if (obj != NULL) { /* extract specific element, index is at offset */
352         if (h->offset <= 0)
353             panic("dummynet: heap_extract from middle not supported on this heap!!!\n");
354         father = *((int *)((char *)obj + h->offset)) ;
355         if (father < 0 || father >= h->elements) {
356             printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
357                 father, h->elements);
358             panic("dummynet: heap_extract");
359         }
360     }
361     RESET_OFFSET(h, father);
362     child = HEAP_LEFT(father) ;         /* left child */
363     while (child <= max) {              /* valid entry */
364         if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
365             child = child+1 ;           /* take right child, otherwise left */
366         h->p[father] = h->p[child] ;
367         SET_OFFSET(h, father);
368         father = child ;
369         child = HEAP_LEFT(child) ;   /* left child for next loop */
370     }
371     h->elements-- ;
372     if (father != max) {
373         /*
374          * Fill hole with last entry and bubble up, reusing the insert code
375          */
376         h->p[father] = h->p[max] ;
377         heap_insert(h, father, NULL); /* this one cannot fail */
378     }
379 }
380 
381 #if 0
382 /*
383  * change object position and update references
384  * XXX this one is never used!
385  */
386 static void
387 heap_move(struct dn_heap *h, dn_key new_key, void *object)
388 {
389     int temp;
390     int i ;
391     int max = h->elements-1 ;
392     struct dn_heap_entry buf ;
393 
394     if (h->offset <= 0)
395         panic("cannot move items on this heap");
396 
397     i = *((int *)((char *)object + h->offset));
398     if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */
399         h->p[i].key = new_key ;
400         for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ;
401                  i = temp ) { /* bubble up */
402             HEAP_SWAP(h->p[i], h->p[temp], buf) ;
403             SET_OFFSET(h, i);
404         }
405     } else {            /* must move down */
406         h->p[i].key = new_key ;
407         while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */
408             if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key))
409                 temp++ ; /* select child with min key */
410             if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */
411                 HEAP_SWAP(h->p[i], h->p[temp], buf) ;
412                 SET_OFFSET(h, i);
413             } else
414                 break ;
415             i = temp ;
416         }
417     }
418     SET_OFFSET(h, i);
419 }
420 #endif /* heap_move, unused */
421 
422 /*
423  * heapify() will reorganize data inside an array to maintain the
424  * heap property. It is needed when we delete a bunch of entries.
425  */
426 static void
427 heapify(struct dn_heap *h)
428 {
429     int i ;
430 
431     for (i = 0 ; i < h->elements ; i++ )
432         heap_insert(h, i , NULL) ;
433 }
434 
435 /*
436  * cleanup the heap and free data structure
437  */
438 static void
439 heap_free(struct dn_heap *h)
440 {
441     if (h->size >0 )
442         free(h->p, M_DUMMYNET);
443     bzero(h, sizeof(*h) );
444 }
445 
446 /*
447  * --- end of heap management functions ---
448  */
449 
450 /*
451  * Return the mbuf tag holding the dummynet state.  As an optimization
452  * this is assumed to be the first tag on the list.  If this turns out
453  * wrong we'll need to search the list.
454  */
455 static struct dn_pkt_tag *
456 dn_tag_get(struct mbuf *m)
457 {
458     struct m_tag *mtag = m_tag_first(m);
459     KASSERT(mtag != NULL &&
460             mtag->m_tag_cookie == MTAG_ABI_COMPAT &&
461             mtag->m_tag_id == PACKET_TAG_DUMMYNET,
462             ("packet on dummynet queue w/o dummynet tag!"));
463     return (struct dn_pkt_tag *)(mtag+1);
464 }
465 
466 /*
467  * Scheduler functions:
468  *
469  * transmit_event() is called when the delay-line needs to enter
470  * the scheduler, either because of existing pkts getting ready,
471  * or new packets entering the queue. The event handled is the delivery
472  * time of the packet.
473  *
474  * ready_event() does something similar with fixed-rate queues, and the
475  * event handled is the finish time of the head pkt.
476  *
477  * wfq_ready_event() does something similar with WF2Q queues, and the
478  * event handled is the start time of the head pkt.
479  *
480  * In all cases, we make sure that the data structures are consistent
481  * before passing pkts out, because this might trigger recursive
482  * invocations of the procedures.
483  */
484 static void
485 transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail)
486 {
487         struct mbuf *m;
488         struct dn_pkt_tag *pkt;
489 
490         DUMMYNET_LOCK_ASSERT();
491 
492         while ((m = pipe->head) != NULL) {
493                 pkt = dn_tag_get(m);
494                 if (!DN_KEY_LEQ(pkt->output_time, curr_time))
495                         break;
496 
497                 pipe->head = m->m_nextpkt;
498                 if (*tail != NULL)
499                         (*tail)->m_nextpkt = m;
500                 else
501                         *head = m;
502                 *tail = m;
503         }
504         if (*tail != NULL)
505                 (*tail)->m_nextpkt = NULL;
506 
507         /* If there are leftover packets, put into the heap for next event. */
508         if ((m = pipe->head) != NULL) {
509                 pkt = dn_tag_get(m);
510                 /*
511                  * XXX Should check errors on heap_insert, by draining the
512                  * whole pipe p and hoping in the future we are more successful.
513                  */
514                 heap_insert(&extract_heap, pkt->output_time, pipe);
515         }
516 }
517 
518 /*
519  * the following macro computes how many ticks we have to wait
520  * before being able to transmit a packet. The credit is taken from
521  * either a pipe (WF2Q) or a flow_queue (per-flow queueing)
522  */
523 #define SET_TICKS(_m, q, p)     \
524     ((_m)->m_pkthdr.len * 8 * hz - (q)->numbytes + p->bandwidth - 1) / \
525     p->bandwidth;
526 
527 /*
528  * extract pkt from queue, compute output time (could be now)
529  * and put into delay line (p_queue)
530  */
531 static void
532 move_pkt(struct mbuf *pkt, struct dn_flow_queue *q, struct dn_pipe *p,
533     int len)
534 {
535     struct dn_pkt_tag *dt = dn_tag_get(pkt);
536 
537     q->head = pkt->m_nextpkt ;
538     q->len-- ;
539     q->len_bytes -= len ;
540 
541     dt->output_time = curr_time + p->delay ;
542 
543     if (p->head == NULL)
544         p->head = pkt;
545     else
546         p->tail->m_nextpkt = pkt;
547     p->tail = pkt;
548     p->tail->m_nextpkt = NULL;
549 }
550 
551 /*
552  * ready_event() is invoked every time the queue must enter the
553  * scheduler, either because the first packet arrives, or because
554  * a previously scheduled event fired.
555  * On invokation, drain as many pkts as possible (could be 0) and then
556  * if there are leftover packets reinsert the pkt in the scheduler.
557  */
558 static void
559 ready_event(struct dn_flow_queue *q, struct mbuf **head, struct mbuf **tail)
560 {
561         struct mbuf *pkt;
562         struct dn_pipe *p = q->fs->pipe;
563         int p_was_empty;
564 
565         DUMMYNET_LOCK_ASSERT();
566 
567         if (p == NULL) {
568                 printf("dummynet: ready_event- pipe is gone\n");
569                 return;
570         }
571         p_was_empty = (p->head == NULL);
572 
573         /*
574          * Schedule fixed-rate queues linked to this pipe:
575          * account for the bw accumulated since last scheduling, then
576          * drain as many pkts as allowed by q->numbytes and move to
577          * the delay line (in p) computing output time.
578          * bandwidth==0 (no limit) means we can drain the whole queue,
579          * setting len_scaled = 0 does the job.
580          */
581         q->numbytes += (curr_time - q->sched_time) * p->bandwidth;
582         while ((pkt = q->head) != NULL) {
583                 int len = pkt->m_pkthdr.len;
584                 int len_scaled = p->bandwidth ? len * 8 * hz : 0;
585 
586                 if (len_scaled > q->numbytes)
587                         break;
588                 q->numbytes -= len_scaled;
589                 move_pkt(pkt, q, p, len);
590         }
591         /*
592          * If we have more packets queued, schedule next ready event
593          * (can only occur when bandwidth != 0, otherwise we would have
594          * flushed the whole queue in the previous loop).
595          * To this purpose we record the current time and compute how many
596          * ticks to go for the finish time of the packet.
597          */
598         if ((pkt = q->head) != NULL) {  /* this implies bandwidth != 0 */
599                 dn_key t = SET_TICKS(pkt, q, p); /* ticks i have to wait */
600 
601                 q->sched_time = curr_time;
602                 heap_insert(&ready_heap, curr_time + t, (void *)q);
603                 /*
604                  * XXX Should check errors on heap_insert, and drain the whole
605                  * queue on error hoping next time we are luckier.
606                  */
607         } else          /* RED needs to know when the queue becomes empty. */
608                 q->q_time = curr_time;
609 
610         /*
611          * If the delay line was empty call transmit_event() now.
612          * Otherwise, the scheduler will take care of it.
613          */
614         if (p_was_empty)
615                 transmit_event(p, head, tail);
616 }
617 
618 /*
619  * Called when we can transmit packets on WF2Q queues. Take pkts out of
620  * the queues at their start time, and enqueue into the delay line.
621  * Packets are drained until p->numbytes < 0. As long as
622  * len_scaled >= p->numbytes, the packet goes into the delay line
623  * with a deadline p->delay. For the last packet, if p->numbytes < 0,
624  * there is an additional delay.
625  */
626 static void
627 ready_event_wfq(struct dn_pipe *p, struct mbuf **head, struct mbuf **tail)
628 {
629         int p_was_empty = (p->head == NULL);
630         struct dn_heap *sch = &(p->scheduler_heap);
631         struct dn_heap *neh = &(p->not_eligible_heap);
632         int64_t p_numbytes = p->numbytes;
633 
634         DUMMYNET_LOCK_ASSERT();
635 
636         if (p->if_name[0] == 0)         /* tx clock is simulated */
637                 /*
638                  * Since result may not fit into p->numbytes (32bit) we
639                  * are using 64bit var here.
640                  */
641                 p_numbytes += (curr_time - p->sched_time) * p->bandwidth;
642         else {  /*
643                  * tx clock is for real,
644                  * the ifq must be empty or this is a NOP.
645                  */
646                 if (p->ifp && p->ifp->if_snd.ifq_head != NULL)
647                         return;
648                 else {
649                         DPRINTF(("dummynet: pipe %d ready from %s --\n",
650                             p->pipe_nr, p->if_name));
651                 }
652         }
653 
654         /*
655          * While we have backlogged traffic AND credit, we need to do
656          * something on the queue.
657          */
658         while (p_numbytes >= 0 && (sch->elements > 0 || neh->elements > 0)) {
659                 if (sch->elements > 0) {
660                         /* Have some eligible pkts to send out. */
661                         struct dn_flow_queue *q = sch->p[0].object;
662                         struct mbuf *pkt = q->head;
663                         struct dn_flow_set *fs = q->fs;
664                         uint64_t len = pkt->m_pkthdr.len;
665                         int len_scaled = p->bandwidth ? len * 8 * hz : 0;
666 
667                         heap_extract(sch, NULL); /* Remove queue from heap. */
668                         p_numbytes -= len_scaled;
669                         move_pkt(pkt, q, p, len);
670 
671                         p->V += (len << MY_M) / p->sum; /* Update V. */
672                         q->S = q->F;                    /* Update start time. */
673                         if (q->len == 0) {
674                                 /* Flow not backlogged any more. */
675                                 fs->backlogged--;
676                                 heap_insert(&(p->idle_heap), q->F, q);
677                         } else {
678                                 /* Still backlogged. */
679 
680                                 /*
681                                  * Update F and position in backlogged queue,
682                                  * then put flow in not_eligible_heap
683                                  * (we will fix this later).
684                                  */
685                                 len = (q->head)->m_pkthdr.len;
686                                 q->F += (len << MY_M) / (uint64_t)fs->weight;
687                                 if (DN_KEY_LEQ(q->S, p->V))
688                                         heap_insert(neh, q->S, q);
689                                 else
690                                         heap_insert(sch, q->F, q);
691                         }
692                 }
693                 /*
694                  * Now compute V = max(V, min(S_i)). Remember that all elements
695                  * in sch have by definition S_i <= V so if sch is not empty,
696                  * V is surely the max and we must not update it. Conversely,
697                  * if sch is empty we only need to look at neh.
698                  */
699                 if (sch->elements == 0 && neh->elements > 0)
700                         p->V = MAX64(p->V, neh->p[0].key);
701                 /* Move from neh to sch any packets that have become eligible */
702                 while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V)) {
703                         struct dn_flow_queue *q = neh->p[0].object;
704                         heap_extract(neh, NULL);
705                         heap_insert(sch, q->F, q);
706                 }
707 
708                 if (p->if_name[0] != '\0') { /* Tx clock is from a real thing */
709                         p_numbytes = -1;        /* Mark not ready for I/O. */
710                         break;
711                 }
712         }
713         if (sch->elements == 0 && neh->elements == 0 && p_numbytes >= 0 &&
714             p->idle_heap.elements > 0) {
715                 /*
716                  * No traffic and no events scheduled.
717                  * We can get rid of idle-heap.
718                  */
719                 int i;
720 
721                 for (i = 0; i < p->idle_heap.elements; i++) {
722                         struct dn_flow_queue *q = p->idle_heap.p[i].object;
723 
724                         q->F = 0;
725                         q->S = q->F + 1;
726                 }
727                 p->sum = 0;
728                 p->V = 0;
729                 p->idle_heap.elements = 0;
730         }
731         /*
732          * If we are getting clocks from dummynet (not a real interface) and
733          * If we are under credit, schedule the next ready event.
734          * Also fix the delivery time of the last packet.
735          */
736         if (p->if_name[0]==0 && p_numbytes < 0) { /* This implies bw > 0. */
737                 dn_key t = 0;           /* Number of ticks i have to wait. */
738 
739                 if (p->bandwidth > 0)
740                         t = (p->bandwidth - 1 - p_numbytes) / p->bandwidth;
741                 dn_tag_get(p->tail)->output_time += t;
742                 p->sched_time = curr_time;
743                 heap_insert(&wfq_ready_heap, curr_time + t, (void *)p);
744                 /*
745                  * XXX Should check errors on heap_insert, and drain the whole
746                  * queue on error hoping next time we are luckier.
747                  */
748         }
749 
750         /* Fit (adjust if necessary) 64bit result into 32bit variable. */
751         if (p_numbytes > INT_MAX)
752                 p->numbytes = INT_MAX;
753         else if (p_numbytes < INT_MIN)
754                 p->numbytes = INT_MIN;
755         else
756                 p->numbytes = p_numbytes;
757 
758         /*
759          * If the delay line was empty call transmit_event() now.
760          * Otherwise, the scheduler will take care of it.
761          */
762         if (p_was_empty)
763                 transmit_event(p, head, tail);
764 }
765 
766 /*
767  * This is called one tick, after previous run. It is used to
768  * schedule next run.
769  */
770 static void
771 dummynet(void * __unused unused)
772 {
773 
774         taskqueue_enqueue(dn_tq, &dn_task);
775 }
776 
777 /*
778  * The main dummynet processing function.
779  */
780 static void
781 dummynet_task(void *context, int pending)
782 {
783         struct mbuf *head = NULL, *tail = NULL;
784         struct dn_pipe *pipe;
785         struct dn_heap *heaps[3];
786         struct dn_heap *h;
787         void *p;        /* generic parameter to handler */
788         int i;
789 
790         DUMMYNET_LOCK();
791 
792         heaps[0] = &ready_heap;                 /* fixed-rate queues */
793         heaps[1] = &wfq_ready_heap;             /* wfq queues */
794         heaps[2] = &extract_heap;               /* delay line */
795 
796         /* Update number of lost(coalesced) ticks. */
797         tick_lost += pending - 1;
798  
799         getmicrouptime(&t);
800         /* Last tick duration (usec). */
801         tick_last = (t.tv_sec - prev_t.tv_sec) * 1000000 +
802             (t.tv_usec - prev_t.tv_usec);
803         /* Last tick vs standard tick difference (usec). */
804         tick_delta = (tick_last * hz - 1000000) / hz;
805         /* Accumulated tick difference (usec). */
806         tick_delta_sum += tick_delta;
807  
808         prev_t = t;
809  
810         /*
811          * Adjust curr_time if accumulated tick difference greater than
812          * 'standard' tick. Since curr_time should be monotonically increasing,
813          * we do positive adjustment as required and throttle curr_time in
814          * case of negative adjustment.
815          */
816         curr_time++;
817         if (tick_delta_sum - tick >= 0) {
818                 int diff = tick_delta_sum / tick;
819  
820                 curr_time += diff;
821                 tick_diff += diff;
822                 tick_delta_sum %= tick;
823                 tick_adjustment++;
824         } else if (tick_delta_sum + tick <= 0) {
825                 curr_time--;
826                 tick_diff--;
827                 tick_delta_sum += tick;
828                 tick_adjustment++;
829         }
830 
831         for (i = 0; i < 3; i++) {
832                 h = heaps[i];
833                 while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time)) {
834                         if (h->p[0].key > curr_time)
835                                 printf("dummynet: warning, "
836                                     "heap %d is %d ticks late\n",
837                                     i, (int)(curr_time - h->p[0].key));
838                         /* store a copy before heap_extract */
839                         p = h->p[0].object;
840                         /* need to extract before processing */
841                         heap_extract(h, NULL);
842                         if (i == 0)
843                                 ready_event(p, &head, &tail);
844                         else if (i == 1) {
845                                 struct dn_pipe *pipe = p;
846                                 if (pipe->if_name[0] != '\0')
847                                         printf("dummynet: bad ready_event_wfq "
848                                             "for pipe %s\n", pipe->if_name);
849                                 else
850                                         ready_event_wfq(p, &head, &tail);
851                         } else
852                                 transmit_event(p, &head, &tail);
853                 }
854         }
855 
856         /* Sweep pipes trying to expire idle flow_queues. */
857         for (i = 0; i < HASHSIZE; i++)
858                 SLIST_FOREACH(pipe, &pipehash[i], next)
859                         if (pipe->idle_heap.elements > 0 &&
860                             DN_KEY_LT(pipe->idle_heap.p[0].key, pipe->V)) {
861                                 struct dn_flow_queue *q =
862                                     pipe->idle_heap.p[0].object;
863 
864                                 heap_extract(&(pipe->idle_heap), NULL);
865                                 /* Mark timestamp as invalid. */
866                                 q->S = q->F + 1;
867                                 pipe->sum -= q->fs->weight;
868                         }
869 
870         DUMMYNET_UNLOCK();
871 
872         if (head != NULL)
873                 dummynet_send(head);
874 
875         callout_reset(&dn_timeout, 1, dummynet, NULL);
876 }
877 
878 static void
879 dummynet_send(struct mbuf *m)
880 {
881         struct dn_pkt_tag *pkt;
882         struct mbuf *n;
883         struct ip *ip;
884 
885         for (; m != NULL; m = n) {
886                 n = m->m_nextpkt;
887                 m->m_nextpkt = NULL;
888                 pkt = dn_tag_get(m);
889                 switch (pkt->dn_dir) {
890                 case DN_TO_IP_OUT:
891                         ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
892                         break ;
893                   case DN_TO_IP_IN :
894                         ip = mtod(m, struct ip *);
895                         ip->ip_len = htons(ip->ip_len);