1 /*-
2 * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
3 * Portions Copyright (c) 2000 Akamba Corp.
4 * All rights reserved
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD: src/sys/netinet/ip_dummynet.c,v 1.116 2008/05/22 08:10:31 rwatson Exp $");
30
31 #define DUMMYNET_DEBUG
32
33 #include "opt_inet6.h"
34
35 /*
36 * This module implements IP dummynet, a bandwidth limiter/delay emulator
37 * used in conjunction with the ipfw package.
38 * Description of the data structures used is in ip_dummynet.h
39 * Here you mainly find the following blocks of code:
40 * + variable declarations;
41 * + heap management functions;
42 * + scheduler and dummynet functions;
43 * + configuration and initialization.
44 *
45 * NOTA BENE: critical sections are protected by the "dummynet lock".
46 *
47 * Most important Changes:
48 *
49 * 011004: KLDable
50 * 010124: Fixed WF2Q behaviour
51 * 010122: Fixed spl protection.
52 * 000601: WF2Q support
53 * 000106: large rewrite, use heaps to handle very many pipes.
54 * 980513: initial release
55 *
56 * include files marked with XXX are probably not needed
57 */
58
59 #include <sys/limits.h>
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/malloc.h>
63 #include <sys/mbuf.h>
64 #include <sys/kernel.h>
65 #include <sys/module.h>
66 #include <sys/priv.h>
67 #include <sys/proc.h>
68 #include <sys/socket.h>
69 #include <sys/socketvar.h>
70 #include <sys/time.h>
71 #include <sys/sysctl.h>
72 #include <sys/taskqueue.h>
73 #include <net/if.h>
74 #include <net/netisr.h>
75 #include <net/route.h>
76 #include <netinet/in.h>
77 #include <netinet/in_systm.h>
78 #include <netinet/in_var.h>
79 #include <netinet/ip.h>
80 #include <netinet/ip_fw.h>
81 #include <netinet/ip_dummynet.h>
82 #include <netinet/ip_var.h>
83
84 #include <netinet/if_ether.h> /* for struct arpcom */
85
86 #include <netinet/ip6.h> /* for ip6_input, ip6_output prototypes */
87 #include <netinet6/ip6_var.h>
88
89 /*
90 * We keep a private variable for the simulation time, but we could
91 * probably use an existing one ("softticks" in sys/kern/kern_timeout.c)
92 */
93 static dn_key curr_time = 0 ; /* current simulation time */
94
95 static int dn_hash_size = 64 ; /* default hash size */
96
97 /* statistics on number of queue searches and search steps */
98 static long searches, search_steps ;
99 static int pipe_expire = 1 ; /* expire queue if empty */
100 static int dn_max_ratio = 16 ; /* max queues/buckets ratio */
101
102 static long pipe_slot_limit = 100; /* Foot shooting limit for pipe queues. */
103 static long pipe_byte_limit = 1024 * 1024;
104
105 static int red_lookup_depth = 256; /* RED - default lookup table depth */
106 static int red_avg_pkt_size = 512; /* RED - default medium packet size */
107 static int red_max_pkt_size = 1500; /* RED - default max packet size */
108
109 static struct timeval prev_t, t;
110 static long tick_last; /* Last tick duration (usec). */
111 static long tick_delta; /* Last vs standard tick diff (usec). */
112 static long tick_delta_sum; /* Accumulated tick difference (usec).*/
113 static long tick_adjustment; /* Tick adjustments done. */
114 static long tick_lost; /* Lost(coalesced) ticks number. */
115 /* Adjusted vs non-adjusted curr_time difference (ticks). */
116 static long tick_diff;
117
118 static int io_fast;
119 static unsigned long io_pkt;
120 static unsigned long io_pkt_fast;
121 static unsigned long io_pkt_drop;
122
123 /*
124 * Three heaps contain queues and pipes that the scheduler handles:
125 *
126 * ready_heap contains all dn_flow_queue related to fixed-rate pipes.
127 *
128 * wfq_ready_heap contains the pipes associated with WF2Q flows
129 *
130 * extract_heap contains pipes associated with delay lines.
131 *
132 */
133
134 MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap");
135
136 static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ;
137
138 static int heap_init(struct dn_heap *h, int size);
139 static int heap_insert (struct dn_heap *h, dn_key key1, void *p);
140 static void heap_extract(struct dn_heap *h, void *obj);
141 static void transmit_event(struct dn_pipe *pipe, struct mbuf **head,
142 struct mbuf **tail);
143 static void ready_event(struct dn_flow_queue *q, struct mbuf **head,
144 struct mbuf **tail);
145 static void ready_event_wfq(struct dn_pipe *p, struct mbuf **head,
146 struct mbuf **tail);
147
148 #define HASHSIZE 16
149 #define HASH(num) ((((num) >> 8) ^ ((num) >> 4) ^ (num)) & 0x0f)
150 static struct dn_pipe_head pipehash[HASHSIZE]; /* all pipes */
151 static struct dn_flow_set_head flowsethash[HASHSIZE]; /* all flowsets */
152
153 static struct callout dn_timeout;
154
155 extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *);
156
157 #ifdef SYSCTL_NODE
158 SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");
159 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
160 CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size");
161 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, curr_time,
162 CTLFLAG_RD, &curr_time, 0, "Current tick");
163 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
164 CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap");
165 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
166 CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap");
167 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, searches,
168 CTLFLAG_RD, &searches, 0, "Number of queue searches");
169 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, search_steps,
170 CTLFLAG_RD, &search_steps, 0, "Number of queue search steps");
171 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
172 CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty");
173 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
174 CTLFLAG_RW, &dn_max_ratio, 0,
175 "Max ratio between dynamic queues and buckets");
176 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
177 CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table");
178 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
179 CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size");
180 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
181 CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size");
182 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta,
183 CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec).");
184 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum,
185 CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec).");
186 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment,
187 CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done.");
188 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff,
189 CTLFLAG_RD, &tick_diff, 0,
190 "Adjusted vs non-adjusted curr_time difference (ticks).");
191 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost,
192 CTLFLAG_RD, &tick_lost, 0,
193 "Number of ticks coalesced by dummynet taskqueue.");
194 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast,
195 CTLFLAG_RW, &io_fast, 0, "Enable fast dummynet io.");
196 SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt,
197 CTLFLAG_RD, &io_pkt, 0,
198 "Number of packets passed to dummynet.");
199 SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast,
200 CTLFLAG_RD, &io_pkt_fast, 0,
201 "Number of packets bypassed dummynet scheduler.");
202 SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop,
203 CTLFLAG_RD, &io_pkt_drop, 0,
204 "Number of packets dropped by dummynet.");
205 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit,
206 CTLFLAG_RW, &pipe_slot_limit, 0, "Upper limit in slots for pipe queue.");
207 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit,
208 CTLFLAG_RW, &pipe_byte_limit, 0, "Upper limit in bytes for pipe queue.");
209 #endif
210
211 #ifdef DUMMYNET_DEBUG
212 int dummynet_debug = 0;
213 #ifdef SYSCTL_NODE
214 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dummynet_debug,
215 0, "control debugging printfs");
216 #endif
217 #define DPRINTF(X) if (dummynet_debug) printf X
218 #else
219 #define DPRINTF(X)
220 #endif
221
222 static struct task dn_task;
223 static struct taskqueue *dn_tq = NULL;
224 static void dummynet_task(void *, int);
225
226 static struct mtx dummynet_mtx;
227 #define DUMMYNET_LOCK_INIT() \
228 mtx_init(&dummynet_mtx, "dummynet", NULL, MTX_DEF)
229 #define DUMMYNET_LOCK_DESTROY() mtx_destroy(&dummynet_mtx)
230 #define DUMMYNET_LOCK() mtx_lock(&dummynet_mtx)
231 #define DUMMYNET_UNLOCK() mtx_unlock(&dummynet_mtx)
232 #define DUMMYNET_LOCK_ASSERT() mtx_assert(&dummynet_mtx, MA_OWNED)
233
234 static int config_pipe(struct dn_pipe *p);
235 static int ip_dn_ctl(struct sockopt *sopt);
236
237 static void dummynet(void *);
238 static void dummynet_flush(void);
239 static void dummynet_send(struct mbuf *);
240 void dummynet_drain(void);
241 static ip_dn_io_t dummynet_io;
242 static void dn_rule_delete(void *);
243
244 /*
245 * Heap management functions.
246 *
247 * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
248 * Some macros help finding parent/children so we can optimize them.
249 *
250 * heap_init() is called to expand the heap when needed.
251 * Increment size in blocks of 16 entries.
252 * XXX failure to allocate a new element is a pretty bad failure
253 * as we basically stall a whole queue forever!!
254 * Returns 1 on error, 0 on success
255 */
256 #define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
257 #define HEAP_LEFT(x) ( 2*(x) + 1 )
258 #define HEAP_IS_LEFT(x) ( (x) & 1 )
259 #define HEAP_RIGHT(x) ( 2*(x) + 2 )
260 #define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
261 #define HEAP_INCREMENT 15
262
263 static int
264 heap_init(struct dn_heap *h, int new_size)
265 {
266 struct dn_heap_entry *p;
267
268 if (h->size >= new_size ) {
269 printf("dummynet: %s, Bogus call, have %d want %d\n", __func__,
270 h->size, new_size);
271 return 0 ;
272 }
273 new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ;
274 p = malloc(new_size * sizeof(*p), M_DUMMYNET, M_NOWAIT);
275 if (p == NULL) {
276 printf("dummynet: %s, resize %d failed\n", __func__, new_size );
277 return 1 ; /* error */
278 }
279 if (h->size > 0) {
280 bcopy(h->p, p, h->size * sizeof(*p) );
281 free(h->p, M_DUMMYNET);
282 }
283 h->p = p ;
284 h->size = new_size ;
285 return 0 ;
286 }
287
288 /*
289 * Insert element in heap. Normally, p != NULL, we insert p in
290 * a new position and bubble up. If p == NULL, then the element is
291 * already in place, and key is the position where to start the
292 * bubble-up.
293 * Returns 1 on failure (cannot allocate new heap entry)
294 *
295 * If offset > 0 the position (index, int) of the element in the heap is
296 * also stored in the element itself at the given offset in bytes.
297 */
298 #define SET_OFFSET(heap, node) \
299 if (heap->offset > 0) \
300 *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ;
301 /*
302 * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value.
303 */
304 #define RESET_OFFSET(heap, node) \
305 if (heap->offset > 0) \
306 *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ;
307 static int
308 heap_insert(struct dn_heap *h, dn_key key1, void *p)
309 {
310 int son = h->elements ;
311
312 if (p == NULL) /* data already there, set starting point */
313 son = key1 ;
314 else { /* insert new element at the end, possibly resize */
315 son = h->elements ;
316 if (son == h->size) /* need resize... */
317 if (heap_init(h, h->elements+1) )
318 return 1 ; /* failure... */
319 h->p[son].object = p ;
320 h->p[son].key = key1 ;
321 h->elements++ ;
322 }
323 while (son > 0) { /* bubble up */
324 int father = HEAP_FATHER(son) ;
325 struct dn_heap_entry tmp ;
326
327 if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
328 break ; /* found right position */
329 /* son smaller than father, swap and repeat */
330 HEAP_SWAP(h->p[son], h->p[father], tmp) ;
331 SET_OFFSET(h, son);
332 son = father ;
333 }
334 SET_OFFSET(h, son);
335 return 0 ;
336 }
337
338 /*
339 * remove top element from heap, or obj if obj != NULL
340 */
341 static void
342 heap_extract(struct dn_heap *h, void *obj)
343 {
344 int child, father, max = h->elements - 1 ;
345
346 if (max < 0) {
347 printf("dummynet: warning, extract from empty heap 0x%p\n", h);
348 return ;
349 }
350 father = 0 ; /* default: move up smallest child */
351 if (obj != NULL) { /* extract specific element, index is at offset */
352 if (h->offset <= 0)
353 panic("dummynet: heap_extract from middle not supported on this heap!!!\n");
354 father = *((int *)((char *)obj + h->offset)) ;
355 if (father < 0 || father >= h->elements) {
356 printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
357 father, h->elements);
358 panic("dummynet: heap_extract");
359 }
360 }
361 RESET_OFFSET(h, father);
362 child = HEAP_LEFT(father) ; /* left child */
363 while (child <= max) { /* valid entry */
364 if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
365 child = child+1 ; /* take right child, otherwise left */
366 h->p[father] = h->p[child] ;
367 SET_OFFSET(h, father);
368 father = child ;
369 child = HEAP_LEFT(child) ; /* left child for next loop */
370 }
371 h->elements-- ;
372 if (father != max) {
373 /*
374 * Fill hole with last entry and bubble up, reusing the insert code
375 */
376 h->p[father] = h->p[max] ;
377 heap_insert(h, father, NULL); /* this one cannot fail */
378 }
379 }
380
381 #if 0
382 /*
383 * change object position and update references
384 * XXX this one is never used!
385 */
386 static void
387 heap_move(struct dn_heap *h, dn_key new_key, void *object)
388 {
389 int temp;
390 int i ;
391 int max = h->elements-1 ;
392 struct dn_heap_entry buf ;
393
394 if (h->offset <= 0)
395 panic("cannot move items on this heap");
396
397 i = *((int *)((char *)object + h->offset));
398 if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */
399 h->p[i].key = new_key ;
400 for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ;
401 i = temp ) { /* bubble up */
402 HEAP_SWAP(h->p[i], h->p[temp], buf) ;
403 SET_OFFSET(h, i);
404 }
405 } else { /* must move down */
406 h->p[i].key = new_key ;
407 while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */
408 if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key))
409 temp++ ; /* select child with min key */
410 if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */
411 HEAP_SWAP(h->p[i], h->p[temp], buf) ;
412 SET_OFFSET(h, i);
413 } else
414 break ;
415 i = temp ;
416 }
417 }
418 SET_OFFSET(h, i);
419 }
420 #endif /* heap_move, unused */
421
422 /*
423 * heapify() will reorganize data inside an array to maintain the
424 * heap property. It is needed when we delete a bunch of entries.
425 */
426 static void
427 heapify(struct dn_heap *h)
428 {
429 int i ;
430
431 for (i = 0 ; i < h->elements ; i++ )
432 heap_insert(h, i , NULL) ;
433 }
434
435 /*
436 * cleanup the heap and free data structure
437 */
438 static void
439 heap_free(struct dn_heap *h)
440 {
441 if (h->size >0 )
442 free(h->p, M_DUMMYNET);
443 bzero(h, sizeof(*h) );
444 }
445
446 /*
447 * --- end of heap management functions ---
448 */
449
450 /*
451 * Return the mbuf tag holding the dummynet state. As an optimization
452 * this is assumed to be the first tag on the list. If this turns out
453 * wrong we'll need to search the list.
454 */
455 static struct dn_pkt_tag *
456 dn_tag_get(struct mbuf *m)
457 {
458 struct m_tag *mtag = m_tag_first(m);
459 KASSERT(mtag != NULL &&
460 mtag->m_tag_cookie == MTAG_ABI_COMPAT &&
461 mtag->m_tag_id == PACKET_TAG_DUMMYNET,
462 ("packet on dummynet queue w/o dummynet tag!"));
463 return (struct dn_pkt_tag *)(mtag+1);
464 }
465
466 /*
467 * Scheduler functions:
468 *
469 * transmit_event() is called when the delay-line needs to enter
470 * the scheduler, either because of existing pkts getting ready,
471 * or new packets entering the queue. The event handled is the delivery
472 * time of the packet.
473 *
474 * ready_event() does something similar with fixed-rate queues, and the
475 * event handled is the finish time of the head pkt.
476 *
477 * wfq_ready_event() does something similar with WF2Q queues, and the
478 * event handled is the start time of the head pkt.
479 *
480 * In all cases, we make sure that the data structures are consistent
481 * before passing pkts out, because this might trigger recursive
482 * invocations of the procedures.
483 */
484 static void
485 transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail)
486 {
487 struct mbuf *m;
488 struct dn_pkt_tag *pkt;
489
490 DUMMYNET_LOCK_ASSERT();
491
492 while ((m = pipe->head) != NULL) {
493 pkt = dn_tag_get(m);
494 if (!DN_KEY_LEQ(pkt->output_time, curr_time))
495 break;
496
497 pipe->head = m->m_nextpkt;
498 if (*tail != NULL)
499 (*tail)->m_nextpkt = m;
500 else
501 *head = m;
502 *tail = m;
503 }
504 if (*tail != NULL)
505 (*tail)->m_nextpkt = NULL;
506
507 /* If there are leftover packets, put into the heap for next event. */
508 if ((m = pipe->head) != NULL) {
509 pkt = dn_tag_get(m);
510 /*
511 * XXX Should check errors on heap_insert, by draining the
512 * whole pipe p and hoping in the future we are more successful.
513 */
514 heap_insert(&extract_heap, pkt->output_time, pipe);
515 }
516 }
517
518 /*
519 * the following macro computes how many ticks we have to wait
520 * before being able to transmit a packet. The credit is taken from
521 * either a pipe (WF2Q) or a flow_queue (per-flow queueing)
522 */
523 #define SET_TICKS(_m, q, p) \
524 ((_m)->m_pkthdr.len * 8 * hz - (q)->numbytes + p->bandwidth - 1) / \
525 p->bandwidth;
526
527 /*
528 * extract pkt from queue, compute output time (could be now)
529 * and put into delay line (p_queue)
530 */
531 static void
532 move_pkt(struct mbuf *pkt, struct dn_flow_queue *q, struct dn_pipe *p,
533 int len)
534 {
535 struct dn_pkt_tag *dt = dn_tag_get(pkt);
536
537 q->head = pkt->m_nextpkt ;
538 q->len-- ;
539 q->len_bytes -= len ;
540
541 dt->output_time = curr_time + p->delay ;
542
543 if (p->head == NULL)
544 p->head = pkt;
545 else
546 p->tail->m_nextpkt = pkt;
547 p->tail = pkt;
548 p->tail->m_nextpkt = NULL;
549 }
550
551 /*
552 * ready_event() is invoked every time the queue must enter the
553 * scheduler, either because the first packet arrives, or because
554 * a previously scheduled event fired.
555 * On invokation, drain as many pkts as possible (could be 0) and then
556 * if there are leftover packets reinsert the pkt in the scheduler.
557 */
558 static void
559 ready_event(struct dn_flow_queue *q, struct mbuf **head, struct mbuf **tail)
560 {
561 struct mbuf *pkt;
562 struct dn_pipe *p = q->fs->pipe;
563 int p_was_empty;
564
565 DUMMYNET_LOCK_ASSERT();
566
567 if (p == NULL) {
568 printf("dummynet: ready_event- pipe is gone\n");
569 return;
570 }
571 p_was_empty = (p->head == NULL);
572
573 /*
574 * Schedule fixed-rate queues linked to this pipe:
575 * account for the bw accumulated since last scheduling, then
576 * drain as many pkts as allowed by q->numbytes and move to
577 * the delay line (in p) computing output time.
578 * bandwidth==0 (no limit) means we can drain the whole queue,
579 * setting len_scaled = 0 does the job.
580 */
581 q->numbytes += (curr_time - q->sched_time) * p->bandwidth;
582 while ((pkt = q->head) != NULL) {
583 int len = pkt->m_pkthdr.len;
584 int len_scaled = p->bandwidth ? len * 8 * hz : 0;
585
586 if (len_scaled > q->numbytes)
587 break;
588 q->numbytes -= len_scaled;
589 move_pkt(pkt, q, p, len);
590 }
591 /*
592 * If we have more packets queued, schedule next ready event
593 * (can only occur when bandwidth != 0, otherwise we would have
594 * flushed the whole queue in the previous loop).
595 * To this purpose we record the current time and compute how many
596 * ticks to go for the finish time of the packet.
597 */
598 if ((pkt = q->head) != NULL) { /* this implies bandwidth != 0 */
599 dn_key t = SET_TICKS(pkt, q, p); /* ticks i have to wait */
600
601 q->sched_time = curr_time;
602 heap_insert(&ready_heap, curr_time + t, (void *)q);
603 /*
604 * XXX Should check errors on heap_insert, and drain the whole
605 * queue on error hoping next time we are luckier.
606 */
607 } else /* RED needs to know when the queue becomes empty. */
608 q->q_time = curr_time;
609
610 /*
611 * If the delay line was empty call transmit_event() now.
612 * Otherwise, the scheduler will take care of it.
613 */
614 if (p_was_empty)
615 transmit_event(p, head, tail);
616 }
617
618 /*
619 * Called when we can transmit packets on WF2Q queues. Take pkts out of
620 * the queues at their start time, and enqueue into the delay line.
621 * Packets are drained until p->numbytes < 0. As long as
622 * len_scaled >= p->numbytes, the packet goes into the delay line
623 * with a deadline p->delay. For the last packet, if p->numbytes < 0,
624 * there is an additional delay.
625 */
626 static void
627 ready_event_wfq(struct dn_pipe *p, struct mbuf **head, struct mbuf **tail)
628 {
629 int p_was_empty = (p->head == NULL);
630 struct dn_heap *sch = &(p->scheduler_heap);
631 struct dn_heap *neh = &(p->not_eligible_heap);
632 int64_t p_numbytes = p->numbytes;
633
634 DUMMYNET_LOCK_ASSERT();
635
636 if (p->if_name[0] == 0) /* tx clock is simulated */
637 /*
638 * Since result may not fit into p->numbytes (32bit) we
639 * are using 64bit var here.
640 */
641 p_numbytes += (curr_time - p->sched_time) * p->bandwidth;
642 else { /*
643 * tx clock is for real,
644 * the ifq must be empty or this is a NOP.
645 */
646 if (p->ifp && p->ifp->if_snd.ifq_head != NULL)
647 return;
648 else {
649 DPRINTF(("dummynet: pipe %d ready from %s --\n",
650 p->pipe_nr, p->if_name));
651 }
652 }
653
654 /*
655 * While we have backlogged traffic AND credit, we need to do
656 * something on the queue.
657 */
658 while (p_numbytes >= 0 && (sch->elements > 0 || neh->elements > 0)) {
659 if (sch->elements > 0) {
660 /* Have some eligible pkts to send out. */
661 struct dn_flow_queue *q = sch->p[0].object;
662 struct mbuf *pkt = q->head;
663 struct dn_flow_set *fs = q->fs;
664 uint64_t len = pkt->m_pkthdr.len;
665 int len_scaled = p->bandwidth ? len * 8 * hz : 0;
666
667 heap_extract(sch, NULL); /* Remove queue from heap. */
668 p_numbytes -= len_scaled;
669 move_pkt(pkt, q, p, len);
670
671 p->V += (len << MY_M) / p->sum; /* Update V. */
672 q->S = q->F; /* Update start time. */
673 if (q->len == 0) {
674 /* Flow not backlogged any more. */
675 fs->backlogged--;
676 heap_insert(&(p->idle_heap), q->F, q);
677 } else {
678 /* Still backlogged. */
679
680 /*
681 * Update F and position in backlogged queue,
682 * then put flow in not_eligible_heap
683 * (we will fix this later).
684 */
685 len = (q->head)->m_pkthdr.len;
686 q->F += (len << MY_M) / (uint64_t)fs->weight;
687 if (DN_KEY_LEQ(q->S, p->V))
688 heap_insert(neh, q->S, q);
689 else
690 heap_insert(sch, q->F, q);
691 }
692 }
693 /*
694 * Now compute V = max(V, min(S_i)). Remember that all elements
695 * in sch have by definition S_i <= V so if sch is not empty,
696 * V is surely the max and we must not update it. Conversely,
697 * if sch is empty we only need to look at neh.
698 */
699 if (sch->elements == 0 && neh->elements > 0)
700 p->V = MAX64(p->V, neh->p[0].key);
701 /* Move from neh to sch any packets that have become eligible */
702 while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V)) {
703 struct dn_flow_queue *q = neh->p[0].object;
704 heap_extract(neh, NULL);
705 heap_insert(sch, q->F, q);
706 }
707
708 if (p->if_name[0] != '\0') { /* Tx clock is from a real thing */
709 p_numbytes = -1; /* Mark not ready for I/O. */
710 break;
711 }
712 }
713 if (sch->elements == 0 && neh->elements == 0 && p_numbytes >= 0 &&
714 p->idle_heap.elements > 0) {
715 /*
716 * No traffic and no events scheduled.
717 * We can get rid of idle-heap.
718 */
719 int i;
720
721 for (i = 0; i < p->idle_heap.elements; i++) {
722 struct dn_flow_queue *q = p->idle_heap.p[i].object;
723
724 q->F = 0;
725 q->S = q->F + 1;
726 }
727 p->sum = 0;
728 p->V = 0;
729 p->idle_heap.elements = 0;
730 }
731 /*
732 * If we are getting clocks from dummynet (not a real interface) and
733 * If we are under credit, schedule the next ready event.
734 * Also fix the delivery time of the last packet.
735 */
736 if (p->if_name[0]==0 && p_numbytes < 0) { /* This implies bw > 0. */
737 dn_key t = 0; /* Number of ticks i have to wait. */
738
739 if (p->bandwidth > 0)
740 t = (p->bandwidth - 1 - p_numbytes) / p->bandwidth;
741 dn_tag_get(p->tail)->output_time += t;
742 p->sched_time = curr_time;
743 heap_insert(&wfq_ready_heap, curr_time + t, (void *)p);
744 /*
745 * XXX Should check errors on heap_insert, and drain the whole
746 * queue on error hoping next time we are luckier.
747 */
748 }
749
750 /* Fit (adjust if necessary) 64bit result into 32bit variable. */
751 if (p_numbytes > INT_MAX)
752 p->numbytes = INT_MAX;
753 else if (p_numbytes < INT_MIN)
754 p->numbytes = INT_MIN;
755 else
756 p->numbytes = p_numbytes;
757
758 /*
759 * If the delay line was empty call transmit_event() now.
760 * Otherwise, the scheduler will take care of it.
761 */
762 if (p_was_empty)
763 transmit_event(p, head, tail);
764 }
765
766 /*
767 * This is called one tick, after previous run. It is used to
768 * schedule next run.
769 */
770 static void
771 dummynet(void * __unused unused)
772 {
773
774 taskqueue_enqueue(dn_tq, &dn_task);
775 }
776
777 /*
778 * The main dummynet processing function.
779 */
780 static void
781 dummynet_task(void *context, int pending)
782 {
783 struct mbuf *head = NULL, *tail = NULL;
784 struct dn_pipe *pipe;
785 struct dn_heap *heaps[3];
786 struct dn_heap *h;
787 void *p; /* generic parameter to handler */
788 int i;
789
790 DUMMYNET_LOCK();
791
792 heaps[0] = &ready_heap; /* fixed-rate queues */
793 heaps[1] = &wfq_ready_heap; /* wfq queues */
794 heaps[2] = &extract_heap; /* delay line */
795
796 /* Update number of lost(coalesced) ticks. */
797 tick_lost += pending - 1;
798
799 getmicrouptime(&t);
800 /* Last tick duration (usec). */
801 tick_last = (t.tv_sec - prev_t.tv_sec) * 1000000 +
802 (t.tv_usec - prev_t.tv_usec);
803 /* Last tick vs standard tick difference (usec). */
804 tick_delta = (tick_last * hz - 1000000) / hz;
805 /* Accumulated tick difference (usec). */
806 tick_delta_sum += tick_delta;
807
808 prev_t = t;
809
810 /*
811 * Adjust curr_time if accumulated tick difference greater than
812 * 'standard' tick. Since curr_time should be monotonically increasing,
813 * we do positive adjustment as required and throttle curr_time in
814 * case of negative adjustment.
815 */
816 curr_time++;
817 if (tick_delta_sum - tick >= 0) {
818 int diff = tick_delta_sum / tick;
819
820 curr_time += diff;
821 tick_diff += diff;
822 tick_delta_sum %= tick;
823 tick_adjustment++;
824 } else if (tick_delta_sum + tick <= 0) {
825 curr_time--;
826 tick_diff--;
827 tick_delta_sum += tick;
828 tick_adjustment++;
829 }
830
831 for (i = 0; i < 3; i++) {
832 h = heaps[i];
833 while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time)) {
834 if (h->p[0].key > curr_time)
835 printf("dummynet: warning, "
836 "heap %d is %d ticks late\n",
837 i, (int)(curr_time - h->p[0].key));
838 /* store a copy before heap_extract */
839 p = h->p[0].object;
840 /* need to extract before processing */
841 heap_extract(h, NULL);
842 if (i == 0)
843 ready_event(p, &head, &tail);
844 else if (i == 1) {
845 struct dn_pipe *pipe = p;
846 if (pipe->if_name[0] != '\0')
847 printf("dummynet: bad ready_event_wfq "
848 "for pipe %s\n", pipe->if_name);
849 else
850 ready_event_wfq(p, &head, &tail);
851 } else
852 transmit_event(p, &head, &tail);
853 }
854 }
855
856 /* Sweep pipes trying to expire idle flow_queues. */
857 for (i = 0; i < HASHSIZE; i++)
858 SLIST_FOREACH(pipe, &pipehash[i], next)
859 if (pipe->idle_heap.elements > 0 &&
860 DN_KEY_LT(pipe->idle_heap.p[0].key, pipe->V)) {
861 struct dn_flow_queue *q =
862 pipe->idle_heap.p[0].object;
863
864 heap_extract(&(pipe->idle_heap), NULL);
865 /* Mark timestamp as invalid. */
866 q->S = q->F + 1;
867 pipe->sum -= q->fs->weight;
868 }
869
870 DUMMYNET_UNLOCK();
871
872 if (head != NULL)
873 dummynet_send(head);
874
875 callout_reset(&dn_timeout, 1, dummynet, NULL);
876 }
877
878 static void
879 dummynet_send(struct mbuf *m)
880 {
881 struct dn_pkt_tag *pkt;
882 struct mbuf *n;
883 struct ip *ip;
884
885 for (; m != NULL; m = n) {
886 n = m->m_nextpkt;
887 m->m_nextpkt = NULL;
888 pkt = dn_tag_get(m);
889 switch (pkt->dn_dir) {
890 case DN_TO_IP_OUT:
891 ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
892 break ;
893 case DN_TO_IP_IN :
894 ip = mtod(m, struct ip *);
895 ip->ip_len = htons(ip->ip_len);
|