1 /*-
2 * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
3 * Portions Copyright (c) 2000 Akamba Corp.
4 * All rights reserved
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30
31 #define DUMMYNET_DEBUG
32
33 #include "opt_inet6.h"
34
35 /*
36 * This module implements IP dummynet, a bandwidth limiter/delay emulator
37 * used in conjunction with the ipfw package.
38 * Description of the data structures used is in ip_dummynet.h
39 * Here you mainly find the following blocks of code:
40 * + variable declarations;
41 * + heap management functions;
42 * + scheduler and dummynet functions;
43 * + configuration and initialization.
44 *
45 * NOTA BENE: critical sections are protected by the "dummynet lock".
46 *
47 * Most important Changes:
48 *
49 * 011004: KLDable
50 * 010124: Fixed WF2Q behaviour
51 * 010122: Fixed spl protection.
52 * 000601: WF2Q support
53 * 000106: large rewrite, use heaps to handle very many pipes.
54 * 980513: initial release
55 *
56 * include files marked with XXX are probably not needed
57 */
58
59 #include <sys/param.h>
60 #include <sys/systm.h>
61 #include <sys/malloc.h>
62 #include <sys/mbuf.h>
63 #include <sys/kernel.h>
64 #include <sys/module.h>
65 #include <sys/proc.h>
66 #include <sys/socket.h>
67 #include <sys/socketvar.h>
68 #include <sys/time.h>
69 #include <sys/sysctl.h>
70 #include <sys/taskqueue.h>
71 #include <net/if.h>
72 #include <net/netisr.h>
73 #include <net/route.h>
74 #include <netinet/in.h>
75 #include <netinet/in_systm.h>
76 #include <netinet/in_var.h>
77 #include <netinet/ip.h>
78 #include <netinet/ip_fw.h>
79 #include <netinet/ip_dummynet.h>
80 #include <netinet/ip_var.h>
81
82 #include <netinet/if_ether.h> /* for struct arpcom */
83
84 #include <netinet/ip6.h> /* for ip6_input, ip6_output prototypes */
85 #include <netinet6/ip6_var.h>
86
87 /*
88 * We keep a private variable for the simulation time, but we could
89 * probably use an existing one ("softticks" in sys/kern/kern_timeout.c)
90 */
static dn_key curr_time = 0 ; /* current simulation time (ticks) */

static int dn_hash_size = 64 ; /* default hash size (buckets per flow set) */

/* statistics on number of queue searches and search steps */
static long searches, search_steps ;
static int pipe_expire = 1 ; /* expire queue if empty */
static int dn_max_ratio = 16 ; /* max queues/buckets ratio */

static int red_lookup_depth = 256; /* RED - default lookup table depth */
static int red_avg_pkt_size = 512; /* RED - default medium packet size */
static int red_max_pkt_size = 1500; /* RED - default max packet size */

/* Tick-drift bookkeeping, maintained by dummynet_task(). */
static struct timeval prev_t, t;
static long tick_last; /* Last tick duration (usec). */
static long tick_delta; /* Last vs standard tick diff (usec). */
static long tick_delta_sum; /* Accumulated tick difference (usec).*/
static long tick_adjustment; /* Tick adjustments done. */
static long tick_lost; /* Lost(coalesced) ticks number. */
/* Adjusted vs non-adjusted curr_time difference (ticks). */
static long tick_diff;
112
113 /*
114 * Three heaps contain queues and pipes that the scheduler handles:
115 *
116 * ready_heap contains all dn_flow_queue related to fixed-rate pipes.
117 *
118 * wfq_ready_heap contains the pipes associated with WF2Q flows
119 *
120 * extract_heap contains pipes associated with delay lines.
121 *
122 */
123
124 MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap");
125
126 static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ;
127
128 static int heap_init(struct dn_heap *h, int size);
129 static int heap_insert (struct dn_heap *h, dn_key key1, void *p);
130 static void heap_extract(struct dn_heap *h, void *obj);
131 static void transmit_event(struct dn_pipe *pipe, struct mbuf **head,
132 struct mbuf **tail);
133 static void ready_event(struct dn_flow_queue *q, struct mbuf **head,
134 struct mbuf **tail);
135 static void ready_event_wfq(struct dn_pipe *p, struct mbuf **head,
136 struct mbuf **tail);
137
138 #define HASHSIZE 16
139 #define HASH(num) ((((num) >> 8) ^ ((num) >> 4) ^ (num)) & 0x0f)
140 static struct dn_pipe_head pipehash[HASHSIZE]; /* all pipes */
141 static struct dn_flow_set_head flowsethash[HASHSIZE]; /* all flowsets */
142
143 static struct callout dn_timeout;
144
145 extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *);
146
147 #ifdef SYSCTL_NODE
148 SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");
149 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
150 CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size");
151 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, curr_time,
152 CTLFLAG_RD, &curr_time, 0, "Current tick");
153 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
154 CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap");
155 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
156 CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap");
157 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, searches,
158 CTLFLAG_RD, &searches, 0, "Number of queue searches");
159 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, search_steps,
160 CTLFLAG_RD, &search_steps, 0, "Number of queue search steps");
161 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
162 CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty");
163 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
164 CTLFLAG_RW, &dn_max_ratio, 0,
165 "Max ratio between dynamic queues and buckets");
166 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
167 CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table");
168 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
169 CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size");
170 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
171 CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size");
172 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta,
173 CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec).");
174 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum,
175 CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec).");
176 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment,
177 CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done.");
178 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff,
179 CTLFLAG_RD, &tick_diff, 0,
180 "Adjusted vs non-adjusted curr_time difference (ticks).");
181 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost,
182 CTLFLAG_RD, &tick_lost, 0,
183 "Number of ticks coalesced by dummynet taskqueue.");
184 #endif
185
186 #ifdef DUMMYNET_DEBUG
187 int dummynet_debug = 0;
188 #ifdef SYSCTL_NODE
189 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dummynet_debug,
190 0, "control debugging printfs");
191 #endif
192 #define DPRINTF(X) if (dummynet_debug) printf X
193 #else
194 #define DPRINTF(X)
195 #endif
196
197 static struct task dn_task;
198 static struct taskqueue *dn_tq = NULL;
199 static void dummynet_task(void *, int);
200
201 static struct mtx dummynet_mtx;
202 #define DUMMYNET_LOCK_INIT() \
203 mtx_init(&dummynet_mtx, "dummynet", NULL, MTX_DEF)
204 #define DUMMYNET_LOCK_DESTROY() mtx_destroy(&dummynet_mtx)
205 #define DUMMYNET_LOCK() mtx_lock(&dummynet_mtx)
206 #define DUMMYNET_UNLOCK() mtx_unlock(&dummynet_mtx)
207 #define DUMMYNET_LOCK_ASSERT() mtx_assert(&dummynet_mtx, MA_OWNED)
208
209 static int config_pipe(struct dn_pipe *p);
210 static int ip_dn_ctl(struct sockopt *sopt);
211
212 static void dummynet(void *);
213 static void dummynet_flush(void);
214 static void dummynet_send(struct mbuf *);
215 void dummynet_drain(void);
216 static ip_dn_io_t dummynet_io;
217 static void dn_rule_delete(void *);
218
219 /*
220 * Heap management functions.
221 *
222 * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
223 * Some macros help finding parent/children so we can optimize them.
224 *
225 * heap_init() is called to expand the heap when needed.
226 * Increment size in blocks of 16 entries.
227 * XXX failure to allocate a new element is a pretty bad failure
228 * as we basically stall a whole queue forever!!
229 * Returns 1 on error, 0 on success
230 */
231 #define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
232 #define HEAP_LEFT(x) ( 2*(x) + 1 )
233 #define HEAP_IS_LEFT(x) ( (x) & 1 )
234 #define HEAP_RIGHT(x) ( 2*(x) + 2 )
235 #define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
236 #define HEAP_INCREMENT 15
237
238 static int
239 heap_init(struct dn_heap *h, int new_size)
240 {
241 struct dn_heap_entry *p;
242
243 if (h->size >= new_size ) {
244 printf("dummynet: %s, Bogus call, have %d want %d\n", __func__,
245 h->size, new_size);
246 return 0 ;
247 }
248 new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ;
249 p = malloc(new_size * sizeof(*p), M_DUMMYNET, M_NOWAIT);
250 if (p == NULL) {
251 printf("dummynet: %s, resize %d failed\n", __func__, new_size );
252 return 1 ; /* error */
253 }
254 if (h->size > 0) {
255 bcopy(h->p, p, h->size * sizeof(*p) );
256 free(h->p, M_DUMMYNET);
257 }
258 h->p = p ;
259 h->size = new_size ;
260 return 0 ;
261 }
262
263 /*
264 * Insert element in heap. Normally, p != NULL, we insert p in
265 * a new position and bubble up. If p == NULL, then the element is
266 * already in place, and key is the position where to start the
267 * bubble-up.
268 * Returns 1 on failure (cannot allocate new heap entry)
269 *
270 * If offset > 0 the position (index, int) of the element in the heap is
271 * also stored in the element itself at the given offset in bytes.
272 */
273 #define SET_OFFSET(heap, node) \
274 if (heap->offset > 0) \
275 *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ;
276 /*
277 * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value.
278 */
279 #define RESET_OFFSET(heap, node) \
280 if (heap->offset > 0) \
281 *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ;
static int
heap_insert(struct dn_heap *h, dn_key key1, void *p)
{
    int son = h->elements ;

    if (p == NULL)	/* data already there, set starting point */
	son = key1 ;
    else {		/* insert new element at the end, possibly resize */
	son = h->elements ;
	if (son == h->size) /* need resize... */
	    if (heap_init(h, h->elements+1) )
		return 1 ; /* failure... */
	h->p[son].object = p ;
	h->p[son].key = key1 ;
	h->elements++ ;
    }
    while (son > 0) {				/* bubble up */
	int father = HEAP_FATHER(son) ;
	struct dn_heap_entry tmp ;

	if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
	    break ; /* found right position */
	/* son smaller than father, swap and repeat */
	HEAP_SWAP(h->p[son], h->p[father], tmp) ;
	/* the entry displaced down to 'son' must learn its new index */
	SET_OFFSET(h, son);
	son = father ;
    }
    /* record the final resting position of the inserted/moved entry */
    SET_OFFSET(h, son);
    return 0 ;
}
312
313 /*
314 * remove top element from heap, or obj if obj != NULL
315 */
316 static void
317 heap_extract(struct dn_heap *h, void *obj)
318 {
319 int child, father, max = h->elements - 1 ;
320
321 if (max < 0) {
322 printf("dummynet: warning, extract from empty heap 0x%p\n", h);
323 return ;
324 }
325 father = 0 ; /* default: move up smallest child */
326 if (obj != NULL) { /* extract specific element, index is at offset */
327 if (h->offset <= 0)
328 panic("dummynet: heap_extract from middle not supported on this heap!!!\n");
329 father = *((int *)((char *)obj + h->offset)) ;
330 if (father < 0 || father >= h->elements) {
331 printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
332 father, h->elements);
333 panic("dummynet: heap_extract");
334 }
335 }
336 RESET_OFFSET(h, father);
337 child = HEAP_LEFT(father) ; /* left child */
338 while (child <= max) { /* valid entry */
339 if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
340 child = child+1 ; /* take right child, otherwise left */
341 h->p[father] = h->p[child] ;
342 SET_OFFSET(h, father);
343 father = child ;
344 child = HEAP_LEFT(child) ; /* left child for next loop */
345 }
346 h->elements-- ;
347 if (father != max) {
348 /*
349 * Fill hole with last entry and bubble up, reusing the insert code
350 */
351 h->p[father] = h->p[max] ;
352 heap_insert(h, father, NULL); /* this one cannot fail */
353 }
354 }
355
#if 0
/*
 * change object position and update references
 * XXX this one is never used!
 *
 * Moves 'object' (already in the heap) to the position dictated by
 * new_key, bubbling up or down as needed. Requires a heap that
 * records element positions (h->offset > 0).
 */
static void
heap_move(struct dn_heap *h, dn_key new_key, void *object)
{
    int temp;
    int i ;
    int max = h->elements-1 ;
    struct dn_heap_entry buf ;

    if (h->offset <= 0)
	panic("cannot move items on this heap");

    /* fetch the object's current index, stored inside the object */
    i = *((int *)((char *)object + h->offset));
    if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */
	h->p[i].key = new_key ;
	for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ;
		 i = temp ) { /* bubble up */
	    HEAP_SWAP(h->p[i], h->p[temp], buf) ;
	    SET_OFFSET(h, i);
	}
    } else {		/* must move down */
	h->p[i].key = new_key ;
	while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */
	    if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key))
		temp++ ; /* select child with min key */
	    if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */
		HEAP_SWAP(h->p[i], h->p[temp], buf) ;
		SET_OFFSET(h, i);
	    } else
		break ;
	    i = temp ;
	}
    }
    SET_OFFSET(h, i);
}
#endif /* heap_move, unused */
396
397 /*
398 * heapify() will reorganize data inside an array to maintain the
399 * heap property. It is needed when we delete a bunch of entries.
400 */
401 static void
402 heapify(struct dn_heap *h)
403 {
404 int i ;
405
406 for (i = 0 ; i < h->elements ; i++ )
407 heap_insert(h, i , NULL) ;
408 }
409
410 /*
411 * cleanup the heap and free data structure
412 */
413 static void
414 heap_free(struct dn_heap *h)
415 {
416 if (h->size >0 )
417 free(h->p, M_DUMMYNET);
418 bzero(h, sizeof(*h) );
419 }
420
421 /*
422 * --- end of heap management functions ---
423 */
424
425 /*
426 * Return the mbuf tag holding the dummynet state. As an optimization
427 * this is assumed to be the first tag on the list. If this turns out
428 * wrong we'll need to search the list.
429 */
static struct dn_pkt_tag *
dn_tag_get(struct mbuf *m)
{
    struct m_tag *mtag = m_tag_first(m);
    /* The dummynet tag must be the first tag on the mbuf (see the
     * comment above); anything else is a logic error, so panic. */
    KASSERT(mtag != NULL &&
	    mtag->m_tag_cookie == MTAG_ABI_COMPAT &&
	    mtag->m_tag_id == PACKET_TAG_DUMMYNET,
	    ("packet on dummynet queue w/o dummynet tag!"));
    /* The tag payload lives immediately after the m_tag header. */
    return (struct dn_pkt_tag *)(mtag+1);
}
440
441 /*
442 * Scheduler functions:
443 *
444 * transmit_event() is called when the delay-line needs to enter
445 * the scheduler, either because of existing pkts getting ready,
446 * or new packets entering the queue. The event handled is the delivery
447 * time of the packet.
448 *
449 * ready_event() does something similar with fixed-rate queues, and the
450 * event handled is the finish time of the head pkt.
451 *
452 * wfq_ready_event() does something similar with WF2Q queues, and the
453 * event handled is the start time of the head pkt.
454 *
455 * In all cases, we make sure that the data structures are consistent
456 * before passing pkts out, because this might trigger recursive
457 * invocations of the procedures.
458 */
/*
 * Move every packet on the pipe's delay line whose delivery time has
 * arrived onto the caller's head/tail output chain. If packets remain,
 * re-arm the pipe in extract_heap at the next packet's delivery time.
 */
static void
transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail)
{
	struct mbuf *m;
	struct dn_pkt_tag *pkt;

	DUMMYNET_LOCK_ASSERT();

	while ((m = pipe->head) != NULL) {
		pkt = dn_tag_get(m);
		/* Stop at the first packet that is not yet due. */
		if (!DN_KEY_LEQ(pkt->output_time, curr_time))
			break;

		/* Unlink from the delay line, append to head/tail chain. */
		pipe->head = m->m_nextpkt;
		if (*tail != NULL)
			(*tail)->m_nextpkt = m;
		else
			*head = m;
		*tail = m;
	}
	/* Terminate the output chain. */
	if (*tail != NULL)
		(*tail)->m_nextpkt = NULL;

	/* If there are leftover packets, put into the heap for next event. */
	if ((m = pipe->head) != NULL) {
		pkt = dn_tag_get(m);
		/*
		 * XXX: Should check errors on heap_insert, by draining the
		 * whole pipe p and hoping in the future we are more successful.
		 */
		heap_insert(&extract_heap, pkt->output_time, pipe);
	}
}
492
493 /*
494 * the following macro computes how many ticks we have to wait
495 * before being able to transmit a packet. The credit is taken from
496 * either a pipe (WF2Q) or a flow_queue (per-flow queueing)
497 */
498 #define SET_TICKS(_m, q, p) \
499 ((_m)->m_pkthdr.len*8*hz - (q)->numbytes + p->bandwidth - 1 ) / \
500 p->bandwidth ;
501
502 /*
503 * extract pkt from queue, compute output time (could be now)
504 * and put into delay line (p_queue)
505 */
506 static void
507 move_pkt(struct mbuf *pkt, struct dn_flow_queue *q, struct dn_pipe *p,
508 int len)
509 {
510 struct dn_pkt_tag *dt = dn_tag_get(pkt);
511
512 q->head = pkt->m_nextpkt ;
513 q->len-- ;
514 q->len_bytes -= len ;
515
516 dt->output_time = curr_time + p->delay ;
517
518 if (p->head == NULL)
519 p->head = pkt;
520 else
521 p->tail->m_nextpkt = pkt;
522 p->tail = pkt;
523 p->tail->m_nextpkt = NULL;
524 }
525
526 /*
527 * ready_event() is invoked every time the queue must enter the
528 * scheduler, either because the first packet arrives, or because
529 * a previously scheduled event fired.
530 * On invokation, drain as many pkts as possible (could be 0) and then
531 * if there are leftover packets reinsert the pkt in the scheduler.
532 */
static void
ready_event(struct dn_flow_queue *q, struct mbuf **head, struct mbuf **tail)
{
    struct mbuf *pkt;
    struct dn_pipe *p = q->fs->pipe ;
    int p_was_empty ;

    DUMMYNET_LOCK_ASSERT();

    if (p == NULL) {
	printf("dummynet: ready_event- pipe is gone\n");
	return ;
    }
    /* Remember whether the delay line was empty before we add to it,
     * so we know whether to kick transmit_event() ourselves below. */
    p_was_empty = (p->head == NULL) ;

    /*
     * schedule fixed-rate queues linked to this pipe:
     * Account for the bw accumulated since last scheduling, then
     * drain as many pkts as allowed by q->numbytes and move to
     * the delay line (in p) computing output time.
     * bandwidth==0 (no limit) means we can drain the whole queue,
     * setting len_scaled = 0 does the job.
     */
    q->numbytes += ( curr_time - q->sched_time ) * p->bandwidth;
    while ( (pkt = q->head) != NULL ) {
	int len = pkt->m_pkthdr.len;
	/* credit is in bytes*8*hz units, matching SET_TICKS */
	int len_scaled = p->bandwidth ? len*8*hz : 0 ;
	if (len_scaled > q->numbytes )
	    break ;
	q->numbytes -= len_scaled ;
	move_pkt(pkt, q, p, len);
    }
    /*
     * If we have more packets queued, schedule next ready event
     * (can only occur when bandwidth != 0, otherwise we would have
     * flushed the whole queue in the previous loop).
     * To this purpose we record the current time and compute how many
     * ticks to go for the finish time of the packet.
     */
    if ( (pkt = q->head) != NULL ) { /* this implies bandwidth != 0 */
	dn_key t = SET_TICKS(pkt, q, p); /* ticks i have to wait */
	q->sched_time = curr_time ;
	heap_insert(&ready_heap, curr_time + t, (void *)q );
	/* XXX should check errors on heap_insert, and drain the whole
	 * queue on error hoping next time we are luckier.
	 */
    } else { /* RED needs to know when the queue becomes empty */
	q->q_time = curr_time;
	q->numbytes = 0;
    }
    /*
     * If the delay line was empty call transmit_event() now.
     * Otherwise, the scheduler will take care of it.
     */
    if (p_was_empty)
	transmit_event(p, head, tail);
}
590
591 /*
592 * Called when we can transmit packets on WF2Q queues. Take pkts out of
593 * the queues at their start time, and enqueue into the delay line.
594 * Packets are drained until p->numbytes < 0. As long as
595 * len_scaled >= p->numbytes, the packet goes into the delay line
596 * with a deadline p->delay. For the last packet, if p->numbytes<0,
597 * there is an additional delay.
598 */
static void
ready_event_wfq(struct dn_pipe *p, struct mbuf **head, struct mbuf **tail)
{
    int p_was_empty = (p->head == NULL) ;
    struct dn_heap *sch = &(p->scheduler_heap);
    struct dn_heap *neh = &(p->not_eligible_heap) ;

    DUMMYNET_LOCK_ASSERT();

    if (p->if_name[0] == 0) /* tx clock is simulated */
	p->numbytes += ( curr_time - p->sched_time ) * p->bandwidth;
    else { /* tx clock is for real, the ifq must be empty or this is a NOP */
	if (p->ifp && p->ifp->if_snd.ifq_head != NULL)
	    return ;
	else {
	    DPRINTF(("dummynet: pipe %d ready from %s --\n",
		p->pipe_nr, p->if_name));
	}
    }

    /*
     * While we have backlogged traffic AND credit, we need to do
     * something on the queue.
     */
    while ( p->numbytes >=0 && (sch->elements>0 || neh->elements >0) ) {
	if (sch->elements > 0) { /* have some eligible pkts to send out */
	    struct dn_flow_queue *q = sch->p[0].object ;
	    struct mbuf *pkt = q->head;
	    struct dn_flow_set *fs = q->fs;
	    u_int64_t len = pkt->m_pkthdr.len;
	    int len_scaled = p->bandwidth ? len*8*hz : 0 ;

	    heap_extract(sch, NULL); /* remove queue from heap */
	    p->numbytes -= len_scaled ;
	    move_pkt(pkt, q, p, len);

	    /* WF2Q virtual time advances by len scaled by total weight */
	    p->V += (len<<MY_M) / p->sum ; /* update V */
	    q->S = q->F ; /* update start time */
	    if (q->len == 0) { /* Flow not backlogged any more */
		fs->backlogged-- ;
		heap_insert(&(p->idle_heap), q->F, q);
	    } else { /* still backlogged */
		/*
		 * update F and position in backlogged queue, then
		 * put flow in not_eligible_heap (we will fix this later).
		 */
		len = (q->head)->m_pkthdr.len;
		q->F += (len<<MY_M)/(u_int64_t) fs->weight ;
		if (DN_KEY_LEQ(q->S, p->V))
		    heap_insert(neh, q->S, q);
		else
		    heap_insert(sch, q->F, q);
	    }
	}
	/*
	 * now compute V = max(V, min(S_i)). Remember that all elements in sch
	 * have by definition S_i <= V so if sch is not empty, V is surely
	 * the max and we must not update it. Conversely, if sch is empty
	 * we only need to look at neh.
	 */
	if (sch->elements == 0 && neh->elements > 0)
	    p->V = MAX64 ( p->V, neh->p[0].key );
	/* move from neh to sch any packets that have become eligible */
	while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V) ) {
	    struct dn_flow_queue *q = neh->p[0].object ;
	    heap_extract(neh, NULL);
	    heap_insert(sch, q->F, q);
	}

	if (p->if_name[0] != '\0') {/* tx clock is from a real thing */
	    p->numbytes = -1 ; /* mark not ready for I/O */
	    break ;
	}
    }
    if (sch->elements == 0 && neh->elements == 0 && p->numbytes >= 0
	    && p->idle_heap.elements > 0) {
	/*
	 * no traffic and no events scheduled. We can get rid of idle-heap.
	 */
	int i ;

	/* invalidate all timestamps so queues re-initialize on next use */
	for (i = 0 ; i < p->idle_heap.elements ; i++) {
	    struct dn_flow_queue *q = p->idle_heap.p[i].object ;

	    q->F = 0 ;
	    q->S = q->F + 1 ;
	}
	p->sum = 0 ;
	p->V = 0 ;
	p->idle_heap.elements = 0 ;
    }
    /*
     * If we are getting clocks from dummynet (not a real interface) and
     * If we are under credit, schedule the next ready event.
     * Also fix the delivery time of the last packet.
     */
    if (p->if_name[0]==0 && p->numbytes < 0) { /* this implies bandwidth >0 */
	dn_key t=0 ; /* number of ticks i have to wait */

	if (p->bandwidth > 0)
	    t = ( p->bandwidth -1 - p->numbytes) / p->bandwidth ;
	/* the overdrawn credit delays the last queued packet */
	dn_tag_get(p->tail)->output_time += t ;
	p->sched_time = curr_time ;
	heap_insert(&wfq_ready_heap, curr_time + t, (void *)p);
	/* XXX should check errors on heap_insert, and drain the whole
	 * queue on error hoping next time we are luckier.
	 */
    }
    /*
     * If the delay line was empty call transmit_event() now.
     * Otherwise, the scheduler will take care of it.
     */
    if (p_was_empty)
	transmit_event(p, head, tail);
}
714
715 /*
716 * This is called one tick, after previous run. It is used to
717 * schedule next run.
718 */
/*
 * Per-tick callout handler: defers all real work to the dummynet
 * taskqueue (dummynet_task), which re-arms this callout when done.
 */
static void
dummynet(void * __unused unused)
{

	taskqueue_enqueue(dn_tq, &dn_task);
}
725
726 /*
727 * The main dummynet processing function.
728 */
729 static void
730 dummynet_task(void *context, int pending)
731 {
732 struct mbuf *head = NULL, *tail = NULL;
733 struct dn_pipe *pipe;
734 struct dn_heap *heaps[3];
735 struct dn_heap *h;
736 void *p; /* generic parameter to handler */
737 int i;
738
739 DUMMYNET_LOCK();
740
741 heaps[0] = &ready_heap; /* fixed-rate queues */
742 heaps[1] = &wfq_ready_heap; /* wfq queues */
743 heaps[2] = &extract_heap; /* delay line */
744
745 /* Update number of lost(coalesced) ticks. */
746 tick_lost += pending - 1;
747
748 getmicrouptime(&t);
749 /* Last tick duration (usec). */
750 tick_last = (t.tv_sec - prev_t.tv_sec) * 1000000 +
751 (t.tv_usec - prev_t.tv_usec);
752 /* Last tick vs standard tick difference (usec). */
753 tick_delta = (tick_last * hz - 1000000) / hz;
754 /* Accumulated tick difference (usec). */
755 tick_delta_sum += tick_delta;
756
757 prev_t = t;
758
759 /*
760 * Adjust curr_time if accumulated tick difference greater than
761 * 'standard' tick. Since curr_time should be monotonically increasing,
762 * we do positive adjustment as required and throttle curr_time in
763 * case of negative adjustment.
764 */
765 curr_time++;
766 if (tick_delta_sum - tick >= 0) {
767 int diff = tick_delta_sum / tick;
768
769 curr_time += diff;
770 tick_diff += diff;
771 tick_delta_sum %= tick;
772 tick_adjustment++;
773 } else if (tick_delta_sum + tick <= 0) {
774 curr_time--;
775 tick_diff--;
776 tick_delta_sum += tick;
777 tick_adjustment++;
778 }
779
780 for (i = 0; i < 3; i++) {
781 h = heaps[i];
782 while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time)) {
783 if (h->p[0].key > curr_time)
784 printf("dummynet: warning, "
785 "heap %d is %d ticks late\n",
786 i, (int)(curr_time - h->p[0].key));
787 /* store a copy before heap_extract */
788 p = h->p[0].object;
789 /* need to extract before processing */
790 heap_extract(h, NULL);
791 if (i == 0)
792 ready_event(p, &head, &tail);
793 else if (i == 1) {
794 struct dn_pipe *pipe = p;
795 if (pipe->if_name[0] != '\0')
796 printf("dummynet: bad ready_event_wfq "
797 "for pipe %s\n", pipe->if_name);
798 else
799 ready_event_wfq(p, &head, &tail);
800 } else
801 transmit_event(p, &head, &tail);
802 }
803 }
804
805 /* Sweep pipes trying to expire idle flow_queues. */
806 for (i = 0; i < HASHSIZE; i++)
807 SLIST_FOREACH(pipe, &pipehash[i], next)
808 if (pipe->idle_heap.elements > 0 &&
809 DN_KEY_LT(pipe->idle_heap.p[0].key, pipe->V)) {
810 struct dn_flow_queue *q =
811 pipe->idle_heap.p[0].object;
812
813 heap_extract(&(pipe->idle_heap), NULL);
814 /* Mark timestamp as invalid. */
815 q->S = q->F + 1;
816 pipe->sum -= q->fs->weight;
817 }
818
819 DUMMYNET_UNLOCK();
820
821 if (head != NULL)
822 dummynet_send(head);
823
824 callout_reset(&dn_timeout, 1, dummynet, NULL);
825 }
826
/*
 * Deliver a chain of mbufs that have completed their dummynet delay,
 * dispatching each one back to the layer it came from (recorded in
 * the packet's dummynet tag as dn_dir).
 */
static void
dummynet_send(struct mbuf *m)
{
	struct dn_pkt_tag *pkt;
	struct mbuf *n;
	struct ip *ip;

	for (; m != NULL; m = n) {
		/* detach from the chain before handing off */
		n = m->m_nextpkt;
		m->m_nextpkt = NULL;
		pkt = dn_tag_get(m);
		switch (pkt->dn_dir) {
		case DN_TO_IP_OUT:
			ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
			break ;
		case DN_TO_IP_IN :
			ip = mtod(m, struct ip *);
			/* NOTE(review): converts ip_len/ip_off back to
			 * network order before reinjection — assumes they
			 * were in host order while queued; confirm against
			 * the enqueue path. */
			ip->ip_len = htons(ip->ip_len);
			ip->ip_off = htons(ip->ip_off);
			netisr_dispatch(NETISR_IP, m);
			break;
#ifdef INET6
		case DN_TO_IP6_IN:
			netisr_dispatch(NETISR_IPV6, m);
			break;

		case DN_TO_IP6_OUT:
			ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL);
			break;
#endif
		case DN_TO_IFB_FWD:
			if (bridge_dn_p != NULL)
				((*bridge_dn_p)(m, pkt->ifp));
			else
				printf("dummynet: if_bridge not loaded\n");

			break;
		case DN_TO_ETH_DEMUX:
			/*
			 * The Ethernet code assumes the Ethernet header is
			 * contiguous in the first mbuf header.
			 * Insure this is true.
			 */
			if (m->m_len < ETHER_HDR_LEN &&
			    (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
				printf("dummynet/ether: pullup failed, "
				    "dropping packet\n");
				break;
			}
			ether_demux(m->m_pkthdr.rcvif, m);
			break;
		case DN_TO_ETH_OUT:
			ether_output_frame(pkt->ifp, m);
			break;
		default:
			printf("dummynet: bad switch %d!\n", pkt->dn_dir);
			m_freem(m);
			break;
		}
	}
}
888
889 /*
890 * Unconditionally expire empty queues in case of shortage.
891 * Returns the number of queues freed.
892 */
static int
expire_queues(struct dn_flow_set *fs)
{
    struct dn_flow_queue *q, *prev ;
    int i, initial_elements = fs->rq_elements ;

    /* rate-limit: run at most once per second of uptime */
    if (fs->last_expired == time_uptime)
	return 0 ;
    fs->last_expired = time_uptime ;
    for (i = 0 ; i <= fs->rq_size ; i++) /* last one is overflow */
	for (prev=NULL, q = fs->rq[i] ; q != NULL ; )
	    /* keep queues that are non-empty or have a valid timestamp
	     * (S == F+1 marks an idle/invalid queue, see create_queue) */
	    if (q->head != NULL || q->S != q->F+1) {
		prev = q ;
		q = q->next ;
	    } else { /* entry is idle, expire it */
		struct dn_flow_queue *old_q = q ;

		/* unlink from the bucket's singly-linked list */
		if (prev != NULL)
		    prev->next = q = q->next ;
		else
		    fs->rq[i] = q = q->next ;
		fs->rq_elements-- ;
		free(old_q, M_DUMMYNET);
	    }
    return initial_elements - fs->rq_elements ;
}
919
920 /*
921 * If room, create a new queue and put at head of slot i;
922 * otherwise, create or use the default queue.
923 */
924 static struct dn_flow_queue *
925 create_queue(struct dn_flow_set *fs, int i)
926 {
927 struct dn_flow_queue *q ;
928
929 if (fs->rq_elements > fs->rq_size * dn_max_ratio &&
930 expire_queues(fs) == 0) {
931 /*
932 * No way to get room, use or create overflow queue.
933 */
934 i = fs->rq_size ;
935 if ( fs->rq[i] != NULL )
936 return fs->rq[i] ;
937 }
938 q = malloc(sizeof(*q), M_DUMMYNET, M_NOWAIT | M_ZERO);
939 if (q == NULL) {
940 printf("dummynet: sorry, cannot allocate queue for new flow\n");
941 return NULL ;
942 }
943 q->fs = fs ;
944 q->hash_slot = i ;
945 q->next = fs->rq[i] ;
946 q->S = q->F + 1; /* hack - mark timestamp as invalid */
947 fs->rq[i] = q ;
948 fs->rq_elements++ ;
949 return q ;
950 }
951
952 /*
953 * Given a flow_set and a pkt in last_pkt, find a matching queue
954 * after appropriate masking. The queue is moved to front
955 * so that further searches take less time.
956 */
957 static struct dn_flow_queue *
958 find_queue(struct dn_flow_set *fs, struct ipfw_flow_id *id)
959 {
960 int i = 0 ; /* we need i and q for new allocations */
961 struct dn_flow_queue *q, *prev;
962 int is_v6 = IS_IP6_FLOW_ID(id);
963
964 if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) )
965 q = fs->rq[0] ;
966 else {
967 /* first, do the masking, then hash */
968 id->dst_port &= fs->flow_mask.dst_port ;
969 id->src_port &= fs->flow_mask.src_port ;
970 id->proto &= fs->flow_mask.proto ;
971 id->flags = 0 ; /* we don't care about this one */
972 if (is_v6) {
973 APPLY_MASK(&id->dst_ip6, &fs->flow_mask.dst_ip6);
974 APPLY_MASK(&id->src_ip6, &fs->flow_mask.src_ip6);
975 id->flow_id6 &= fs->flow_mask.flow_id6;
976
977 i = ((id->dst_ip6.__u6_addr.__u6_addr32[0]) & 0xffff)^
978 ((id->dst_ip6.__u6_addr.__u6_addr32[1]) & 0xffff)^
979 ((id->dst_ip6.__u6_addr.__u6_addr32[2]) & 0xffff)^
980 ((id->dst_ip6.__u6_addr.__u6_addr32[3]) & 0xffff)^
981
982 ((id->dst_ip6.__u6_addr.__u6_addr32[0] >> 15) & 0xffff)^
983 ((id->dst_ip6.__u6_addr.__u6_addr32[1] >> 15) & 0xffff)^
984 ((id->dst_ip6.__u6_addr.__u6_addr32[2] >> 15) & 0xffff)^
985 ((id->dst_ip6.__u6_addr.__u6_addr32[3] >> 15) & 0xffff)^
986
987 ((id->src_ip6.__u6_addr.__u6_addr32[0] << 1) & 0xfffff)^
988 ((id->src_ip6.__u6_addr.__u6_addr32[1] << 1) & 0xfffff)^
989 ((id->src_ip6.__u6_addr.__u6_addr32[2] << 1) & 0xfffff)^
990 ((id->src_ip6.__u6_addr.__u6_addr32[3] << 1) & 0xfffff)^
991
992 ((id->src_ip6.__u6_addr.__u6_addr32[0] << 16) & 0xffff)^
993 ((id->src_ip6.__u6_addr.__u6_addr32[1] << 16) & 0xffff)^
994 ((id->src_ip6.__u6_addr.__u6_addr32[2] << 16) & 0xffff)^
995 ((id->src_ip6.__u6_addr.__u6_addr32[3] << 16) & 0xffff)^
996
997 (id->dst_port << 1) ^ (id->src_port) ^
998 (id->proto ) ^
999 (id->flow_id6);
1000 } else {
1001 id->dst_ip &= fs->flow_mask.dst_ip ;
1002 id->src_ip &= fs->flow_mask.src_ip ;
1003
1004 i = ( (id->dst_ip) & 0xffff ) ^
1005 ( (id->dst_ip >> 15) & 0xffff ) ^
1006 ( (id->src_ip << 1) & 0xffff ) ^
1007 ( (id->src_ip >> 16 ) & 0xffff ) ^
1008 (id->dst_port << 1) ^ (id->src_port) ^
1009 (id->proto );
1010 }
1011 i = i % fs->rq_size ;
1012 /* finally, scan the current list for a match */
1013 searches++ ;
1014 for (prev=NULL, q = fs->rq[i] ; q ; ) {
1015 search_steps++;
1016 if (is_v6 &&
1017 IN6_ARE_ADDR_EQUAL(&id->dst_ip6,&q->id.dst_ip6) &&
1018 IN6_ARE_ADDR_EQUAL(&id->src_ip6,&q->id.src_ip6) &&
1019 id->dst_port == q->id.dst_port &&
1020 id->src_port == q->id.src_port &&
1021 id->proto == q->id.proto &&
1022 id->flags == q->id.flags &&
1023 id->flow_id6 == q->id.flow_id6)
1024 break ; /* found */
1025
1026 if (!is_v6 && id->dst_ip == q->id.dst_ip &&
1027 id->src_ip == q->id.src_ip &&
1028 id->dst_port == q->id.dst_port &&
1029 id->src_port == q->id.src_port &&
1030 id->proto == q->id.proto &&
1031 id->flags == q->id.flags)
1032 break ; /* found */
1033
1034 /* No match. Check if we can expire the entry */
1035 if (pipe_expire && q->head == NULL && q->S == q->F+1 ) {
1036 /* entry is idle and not in any heap, expire it */
1037 struct dn_flow_queue *old_q = q ;
1038
1039 if (prev != NULL)
1040 prev->next = q = q->next ;
1041 else
1042 fs->rq[i] = q = q->next ;
1043 fs->rq_elements-- ;
1044 free(old_q, M_DUMMYNET);
1045 continue ;
1046 }
1047 prev = q ;
1048 q = q->next ;
1049 }
1050 if (q && prev != NULL) { /* found and not in front */
1051 prev->next = q->next ;
1052 q->next = fs->rq[i] ;
1053 fs->rq[i] = q ;
1054 }
1055 }
1056 if (q == NULL) { /* no match, need to allocate a new entry */
1057 q = create_queue(fs, i);
1058 if (q != NULL)
1059 q->id = *id ;
1060 }
1061 return q ;
1062 }
1063
/*
 * RED drop decision for one incoming packet of 'len' bytes headed to
 * queue 'q' of flow_set 'fs'. Returns 1 if the packet must be dropped,
 * 0 if it may be enqueued. Updates q->avg, q->count and q->random as a
 * side effect. All avg/threshold arithmetic is in SCALE() fixed point.
 */
static int
red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len)
{
	/*
	 * RED algorithm
	 *
	 * RED calculates the average queue size (avg) using a low-pass filter
	 * with an exponential weighted (w_q) moving average:
	 * 	avg  <-  (1-w_q) * avg + w_q * q_size
	 * where q_size is the queue length (measured in bytes or * packets).
	 *
	 * If q_size == 0, we compute the idle time for the link, and set
	 *	avg = (1 - w_q)^(idle/s)
	 * where s is the time needed for transmitting a medium-sized packet.
	 *
	 * Now, if avg < min_th the packet is enqueued.
	 * If avg > max_th the packet is dropped. Otherwise, the packet is
	 * dropped with probability P function of avg.
	 */

	int64_t p_b = 0;

	/* Queue in bytes or packets? */
	u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ?
	    q->len_bytes : q->len;

	DPRINTF(("\ndummynet: %d q: %2u ", (int)curr_time, q_size));

	/* Average queue size estimation. */
	if (q_size != 0) {
		/* Queue is not empty, avg <- avg + (q_size - avg) * w_q */
		int diff = SCALE(q_size) - q->avg;
		int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q);

		q->avg += (int)v;
	} else {
		/*
		 * Queue is empty, find for how long the queue has been
		 * empty and use a lookup table for computing
		 * (1 - * w_q)^(idle_time/s) where s is the time to send a
		 * (small) packet.
		 * XXX check wraps...
		 */
		if (q->avg) {
			/* Idle periods beyond the table depth decay avg to 0. */
			u_int t = (curr_time - q->q_time) / fs->lookup_step;

			q->avg = (t < fs->lookup_depth) ?
			    SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
		}
	}
	DPRINTF(("dummynet: avg: %u ", SCALE_VAL(q->avg)));

	/* Should i drop? */
	if (q->avg < fs->min_th) {
		q->count = -1;
		return (0); /* accept packet */
	}
	if (q->avg >= fs->max_th) { /* average queue >=  max threshold */
		if (fs->flags_fs & DN_IS_GENTLE_RED) {
			/*
			 * According to Gentle-RED, if avg is greater than
			 * max_th the packet is dropped with a probability
			 *	 p_b = c_3 * avg - c_4
			 * where c_3 = (1 - max_p) / max_th
			 * 	 c_4 = 1 - 2 * max_p
			 */
			p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) -
			    fs->c_4;
		} else {
			q->count = -1;
			DPRINTF(("dummynet: - drop"));
			return (1);
		}
	} else if (q->avg > fs->min_th) {
		/*
		 * We compute p_b using the linear dropping function
		 *	 p_b = c_1 * avg - c_2
		 * where c_1 = max_p / (max_th - min_th)
		 * 	 c_2 = max_p * min_th / (max_th - min_th)
		 */
		p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2;
	}

	if (fs->flags_fs & DN_QSIZE_IS_BYTES)
		p_b = (p_b * len) / fs->max_pkt_size;
	if (++q->count == 0)
		/* First packet after a drop: draw a fresh random threshold. */
		q->random = random() & 0xffff;
	else {
		/*
		 * q->count counts packets arrived since last drop, so a greater
		 * value of q->count means a greater packet drop probability.
		 */
		if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) {
			q->count = 0;
			DPRINTF(("dummynet: - red drop"));
			/* After a drop we calculate a new random value. */
			q->random = random() & 0xffff;
			return (1);    /* drop */
		}
	}
	/* End of RED algorithm. */

	return (0); /* accept */
}
1168
1169 static __inline struct dn_flow_set *
1170 locate_flowset(int fs_nr)
1171 {
1172 struct dn_flow_set *fs;
1173
1174 SLIST_FOREACH(fs, &flowsethash[HASH(fs_nr)], next)
1175 if (fs->fs_nr == fs_nr)
1176 return (fs);
1177
1178 return (NULL);
1179 }
1180
1181 static __inline struct dn_pipe *
1182 locate_pipe(int pipe_nr)
1183 {
1184 struct dn_pipe *pipe;
1185
1186 SLIST_FOREACH(pipe, &pipehash[HASH(pipe_nr)], next)
1187 if (pipe->pipe_nr == pipe_nr)
1188 return (pipe);
1189
1190 return (NULL);
1191 }
1192
1193 /*
1194 * dummynet hook for packets. Below 'pipe' is a pipe or a queue
1195 * depending on whether WF2Q or fixed bw is used.
1196 *
1197 * pipe_nr pipe or queue the packet is destined for.
1198 * dir where shall we send the packet after dummynet.
1199 * m the mbuf with the packet
1200 * ifp the 'ifp' parameter from the caller.
1201 * NULL in ip_input, destination interface in ip_output,
1202 * rule matching rule, in case of multiple passes
1203 *
1204 */
1205 static int
1206 dummynet_io(struct mbuf *m, int dir, struct ip_fw_args *fwa)
1207 {
1208 struct mbuf *head = NULL, *tail = NULL;
1209 struct dn_pkt_tag *pkt;
1210 struct m_tag *mtag;
1211 struct dn_flow_set *fs = NULL;
1212 struct dn_pipe *pipe ;
1213 u_int64_t len = m->m_pkthdr.len ;
1214 struct dn_flow_queue *q = NULL ;
1215 int is_pipe;
1216 ipfw_insn *cmd = ACTION_PTR(fwa->rule);
1217
1218 KASSERT(m->m_nextpkt == NULL,
1219 ("dummynet_io: mbuf queue passed to dummynet"));
1220
1221 if (cmd->opcode == O_LOG)
1222 cmd += F_LEN(cmd);
1223 if (cmd->opcode == O_ALTQ)
1224 cmd += F_LEN(cmd);
1225 if (cmd->opcode == O_TAG)
1226 cmd += F_LEN(cmd);
1227 is_pipe = (cmd->opcode == O_PIPE);
1228
1229 DUMMYNET_LOCK();
1230 /*
1231 * This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule.
1232 *
1233 * XXXGL: probably the pipe->fs and fs->pipe logic here
1234 * below can be simplified.
1235 */
1236 if (is_pipe) {
1237 pipe = locate_pipe(fwa->cookie);
1238 if (pipe != NULL)
1239 fs = &(pipe->fs);
1240 } else
1241 fs = locate_flowset(fwa->cookie);
1242
1243 if (fs == NULL)
1244 goto dropit; /* This queue/pipe does not exist! */
1245 pipe = fs->pipe;
1246 if (pipe == NULL) { /* Must be a queue, try find a matching pipe. */
1247 pipe = locate_pipe(fs->parent_nr);
1248 if (pipe != NULL)
1249 fs->pipe = pipe;
1250 else {
1251 printf("dummynet: no pipe %d for queue %d, drop pkt\n",
1252 fs->parent_nr, fs->fs_nr);
1253 goto dropit ;
1254 }
1255 }
1256 q = find_queue(fs, &(fwa->f_id));
1257 if ( q == NULL )
1258 goto dropit ; /* cannot allocate queue */
1259 /*
1260 * update statistics, then check reasons to drop pkt
1261 */
1262 q->tot_bytes += len ;
1263 q->tot_pkts++ ;
1264 if ( fs->plr && random() < fs->plr )
1265 goto dropit ; /* random pkt drop */
1266 if ( fs->flags_fs & DN_QSIZE_IS_BYTES) {
1267 if (q->len_bytes > fs->qsize)
1268 goto dropit ; /* queue size overflow */
1269 } else {
1270 if (q->len >= fs->qsize)
1271 goto dropit ; /* queue count overflow */
1272 }
1273 if ( fs->flags_fs & DN_IS_RED && red_drops(fs, q, len) )
1274 goto dropit ;
1275
1276 /* XXX expensive to zero, see if we can remove it*/
1277 mtag = m_tag_get(PACKET_TAG_DUMMYNET,
1278 sizeof(struct dn_pkt_tag), M_NOWAIT|M_ZERO);
1279 if ( mtag == NULL )
1280 goto dropit ; /* cannot allocate packet header */
1281 m_tag_prepend(m, mtag); /* attach to mbuf chain */
1282
1283 pkt = (struct dn_pkt_tag *)(mtag+1);
1284 /* ok, i can handle the pkt now... */
1285 /* build and enqueue packet + parameters */
1286 pkt->rule = fwa->rule ;
1287 pkt->dn_dir = dir ;
1288
1289 pkt->ifp = fwa->oif;
1290
1291 if (q->head == NULL)
1292 q->head = m;
1293 else
1294 q->tail->m_nextpkt = m;
1295 q->tail = m;
1296 q->len++;
1297 q->len_bytes += len ;
1298
1299 if ( q->head != m ) /* flow was not idle, we are done */
1300 goto done;
1301 /*
1302 * If we reach this point the flow was previously idle, so we need
1303 * to schedule it. This involves different actions for fixed-rate or
1304 * WF2Q queues.
1305 */
1306 if (is_pipe) {
1307 /*
1308 * Fixed-rate queue: just insert into the ready_heap.
1309 */
1310 dn_key t = 0 ;
1311 if (pipe->bandwidth)
1312 t = SET_TICKS(m, q, pipe);
1313 q->sched_time = curr_time ;
1314 if (t == 0) /* must process it now */
1315 ready_event(q, &head, &tail);
1316 else
1317 heap_insert(&ready_heap, curr_time + t , q );
1318 } else {
1319 /*
1320 * WF2Q. First, compute start time S: if the flow was idle (S=F+1)
1321 * set S to the virtual time V for the controlling pipe, and update
1322 * the sum of weights for the pipe; otherwise, remove flow from
1323 * idle_heap and set S to max(F,V).
1324 * Second, compute finish time F = S + len/weight.
1325 * Third, if pipe was idle, update V=max(S, V).
1326 * Fourth, count one more backlogged flow.
1327 */
1328 if (DN_KEY_GT(q->S, q->F)) { /* means timestamps are invalid */
1329 q->S = pipe->V ;
1330 pipe->sum += fs->weight ; /* add weight of new queue */
1331 } else {
1332 heap_extract(&(pipe->idle_heap), q);
1333 q->S = MAX64(q->F, pipe->V ) ;
1334 }
1335 q->F = q->S + ( len<<MY_M )/(u_int64_t) fs->weight;
1336
1337 if (pipe->not_eligible_heap.elements == 0 &&
1338 pipe->scheduler_heap.elements == 0)
1339 pipe->V = MAX64 ( q->S, pipe->V );
1340 fs->backlogged++ ;
1341 /*
1342 * Look at eligibility. A flow is not eligibile if S>V (when
1343 * this happens, it means that there is some other flow already
1344 * scheduled for the same pipe, so the scheduler_heap cannot be
1345 * empty). If the flow is not eligible we just store it in the
1346 * not_eligible_heap. Otherwise, we store in the scheduler_heap
1347 * and possibly invoke ready_event_wfq() right now if there is
1348 * leftover credit.
1349 * Note that for all flows in scheduler_heap (SCH), S_i <= V,
1350 * and for all flows in not_eligible_heap (NEH), S_i > V .
1351 * So when we need to compute max( V, min(S_i) ) forall i in SCH+NEH,
1352 * we only need to look into NEH.
1353 */
1354 if (DN_KEY_GT(q->S, pipe->V) ) { /* not eligible */
1355 if (pipe->scheduler_heap.elements == 0)
1356 printf("dummynet: ++ ouch! not eligible but empty scheduler!\n");
1357 heap_insert(&(pipe->not_eligible_heap), q->S, q);
1358 } else {
1359 heap_insert(&(pipe->scheduler_heap), q->F, q);
1360 if (pipe->numbytes >= 0) { /* pipe is idle */
1361 if (pipe->scheduler_heap.elements != 1)
1362 printf("dummynet: OUCH! pipe should have been idle!\n");
1363 DPRINTF(("dummynet: waking up pipe %d at %d\n",
1364 pipe->pipe_nr, (int)(q->F >> MY_M)));
1365 pipe->sched_time = curr_time ;
1366 ready_event_wfq(pipe, &head, &tail);
1367 }
1368 }
1369 }
1370 done:
1371 DUMMYNET_UNLOCK();
1372 if (head != NULL)
1373 dummynet_send(head);
1374 return 0;
1375
1376 dropit:
1377 if (q)
1378 q->drops++ ;
1379 DUMMYNET_UNLOCK();
1380 m_freem(m);
1381 return ( (fs && (fs->flags_fs & DN_NOERROR)) ? 0 : ENOBUFS);
1382 }
1383
1384 /*
1385 * Below, the rt_unref is only needed when (pkt->dn_dir == DN_TO_IP_OUT)
1386 * Doing this would probably save us the initial bzero of dn_pkt
1387 */
1388 #define DN_FREE_PKT(_m) do { \
1389 m_freem(_m); \
1390 } while (0)
1391
1392 /*
1393 * Dispose all packets and flow_queues on a flow_set.
1394 * If all=1, also remove red lookup table and other storage,
1395 * including the descriptor itself.
1396 * For the one in dn_pipe MUST also cleanup ready_heap...
1397 */
1398 static void
1399 purge_flow_set(struct dn_flow_set *fs, int all)
1400 {
1401 struct dn_flow_queue *q, *qn;
1402 int i;
1403
1404 DUMMYNET_LOCK_ASSERT();
1405
1406 for (i = 0; i <= fs->rq_size; i++) {
1407 for (q = fs->rq[i]; q != NULL; q = qn) {
1408 struct mbuf *m, *mnext;
1409
1410 mnext = q->head;
1411 while ((m = mnext) != NULL) {
1412 mnext = m->m_nextpkt;
1413 DN_FREE_PKT(m);
1414 }
1415 qn = q->next;
1416 free(q, M_DUMMYNET);
1417 }
1418 fs->rq[i] = NULL;
1419 }
1420
1421 fs->rq_elements = 0;
1422 if (all) {
1423 /* RED - free lookup table. */
1424 if (fs->w_q_lookup != NULL)
1425 free(fs->w_q_lookup, M_DUMMYNET);
1426 if (fs->rq != NULL)
1427 free(fs->rq, M_DUMMYNET);
1428 /* If this fs is not part of a pipe, free it. */
1429 if (fs->pipe == NULL || fs != &(fs->pipe->fs))
1430 free(fs, M_DUMMYNET);
1431 }
1432 }
1433
1434 /*
1435 * Dispose all packets queued on a pipe (not a flow_set).
1436 * Also free all resources associated to a pipe, which is about
1437 * to be deleted.
1438 */
1439 static void
1440 purge_pipe(struct dn_pipe *pipe)
1441 {
1442 struct mbuf *m, *mnext;
1443
1444 purge_flow_set( &(pipe->fs), 1 );
1445
1446 mnext = pipe->head;
1447 while ((m = mnext) != NULL) {
1448 mnext = m->m_nextpkt;
1449 DN_FREE_PKT(m);
1450 }
1451
1452 heap_free( &(pipe->scheduler_heap) );
1453 heap_free( &(pipe->not_eligible_heap) );
1454 heap_free( &(pipe->idle_heap) );
1455 }
1456
1457 /*
1458 * Delete all pipes and heaps returning memory. Must also
1459 * remove references from all ipfw rules to all pipes.
1460 */
1461 static void
1462 dummynet_flush(void)
1463 {
1464 struct dn_pipe *pipe, *pipe1;
1465 struct dn_flow_set *fs, *fs1;
1466 int i;
1467
1468 DUMMYNET_LOCK();
1469 /* Free heaps so we don't have unwanted events. */
1470 heap_free(&ready_heap);
1471 heap_free(&wfq_ready_heap);
1472 heap_free(&extract_heap);
1473
1474 /*
1475 * Now purge all queued pkts and delete all pipes.
1476 *
1477 * XXXGL: can we merge the for(;;) cycles into one or not?
1478 */
1479 for (i = 0; i < HASHSIZE; i++)
1480 SLIST_FOREACH_SAFE(fs, &flowsethash[i], next, fs1) {
1481 SLIST_REMOVE(&flowsethash[i], fs, dn_flow_set, next);
1482 purge_flow_set(fs, 1);
1483 }
1484 for (i = 0; i < HASHSIZE; i++)
1485 SLIST_FOREACH_SAFE(pipe, &pipehash[i], next, pipe1) {
1486 SLIST_REMOVE(&pipehash[i], pipe, dn_pipe, next);
1487 purge_pipe(pipe);
1488 free(pipe, M_DUMMYNET);
1489 }
1490 DUMMYNET_UNLOCK();
1491 }
1492
1493 extern struct ip_fw *ip_fw_default_rule ;
1494 static void
1495 dn_rule_delete_fs(struct dn_flow_set *fs, void *r)
1496 {
1497 int i ;
1498 struct dn_flow_queue *q ;
1499 struct mbuf *m ;
1500
1501 for (i = 0 ; i <= fs->rq_size ; i++) /* last one is ovflow */
1502 for (q = fs->rq[i] ; q ; q = q->next )
1503 for (m = q->head ; m ; m = m->m_nextpkt ) {
1504 struct dn_pkt_tag *pkt = dn_tag_get(m) ;
1505 if (pkt->rule == r)
1506 pkt->rule = ip_fw_default_rule ;
1507 }
1508 }
1509 /*
1510 * when a firewall rule is deleted, scan all queues and remove the flow-id
1511 * from packets matching this rule.
1512 */
1513 void
1514 dn_rule_delete(void *r)
1515 {
1516 struct dn_pipe *pipe;
1517 struct dn_flow_set *fs;
1518 struct dn_pkt_tag *pkt;
1519 struct mbuf *m;
1520 int i;
1521
1522 DUMMYNET_LOCK();
1523 /*
1524 * If the rule references a queue (dn_flow_set), then scan
1525 * the flow set, otherwise scan pipes. Should do either, but doing
1526 * both does not harm.
1527 */
1528 for (i = 0; i < HASHSIZE; i++)
1529 SLIST_FOREACH(fs, &flowsethash[i], next)
1530 dn_rule_delete_fs(fs, r);
1531
1532 for (i = 0; i < HASHSIZE; i++)
1533 SLIST_FOREACH(pipe, &pipehash[i], next) {
1534 fs = &(pipe->fs);
1535 dn_rule_delete_fs(fs, r);
1536 for (m = pipe->head ; m ; m = m->m_nextpkt ) {
1537 pkt = dn_tag_get(m);
1538 if (pkt->rule == r)
1539 pkt->rule = ip_fw_default_rule;
1540 }
1541 }
1542 DUMMYNET_UNLOCK();
1543 }
1544
1545 /*
1546 * setup RED parameters
1547 */
1548 static int
1549 config_red(struct dn_flow_set *p, struct dn_flow_set *x)
1550 {
1551 int i;
1552
1553 x->w_q = p->w_q;
1554 x->min_th = SCALE(p->min_th);
1555 x->max_th = SCALE(p->max_th);
1556 x->max_p = p->max_p;
1557
1558 x->c_1 = p->max_p / (p->max_th - p->min_th);
1559 x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th));
1560
1561 if (x->flags_fs & DN_IS_GENTLE_RED) {
1562 x->c_3 = (SCALE(1) - p->max_p) / p->max_th;
1563 x->c_4 = SCALE(1) - 2 * p->max_p;
1564 }
1565
1566 /* If the lookup table already exist, free and create it again. */
1567 if (x->w_q_lookup) {
1568 free(x->w_q_lookup, M_DUMMYNET);
1569 x->w_q_lookup = NULL;
1570 }
1571 if (red_lookup_depth == 0) {
1572 printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth"
1573 "must be > 0\n");
1574 free(x, M_DUMMYNET);
1575 return (EINVAL);
1576 }
1577 x->lookup_depth = red_lookup_depth;
1578 x->w_q_lookup = (u_int *)malloc(x->lookup_depth * sizeof(int),
1579 M_DUMMYNET, M_NOWAIT);
1580 if (x->w_q_lookup == NULL) {
1581 printf("dummynet: sorry, cannot allocate red lookup table\n");
1582 free(x, M_DUMMYNET);
1583 return(ENOSPC);
1584 }
1585
1586 /* Fill the lookup table with (1 - w_q)^x */
1587 x->lookup_step = p->lookup_step;
1588 x->lookup_weight = p->lookup_weight;
1589 x->w_q_lookup[0] = SCALE(1) - x->w_q;
1590
1591 for (i = 1; i < x->lookup_depth; i++)
1592 x->w_q_lookup[i] =
1593 SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight);
1594
1595 if (red_avg_pkt_size < 1)
1596 red_avg_pkt_size = 512;
1597 x->avg_pkt_size = red_avg_pkt_size;
1598 if (red_max_pkt_size < 1)
1599 red_max_pkt_size = 1500;
1600 x->max_pkt_size = red_max_pkt_size;
1601 return (0);
1602 }
1603
1604 static int
1605 alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs)
1606 {
1607 if (x->flags_fs & DN_HAVE_FLOW_MASK) { /* allocate some slots */
1608 int l = pfs->rq_size;
1609
1610 if (l == 0)
1611 l = dn_hash_size;
1612 if (l < 4)
1613 l = 4;
1614 else if (l > DN_MAX_HASH_SIZE)
1615 l = DN_MAX_HASH_SIZE;
1616 x->rq_size = l;
1617 } else /* one is enough for null mask */
1618 x->rq_size = 1;
1619 x->rq = malloc((1 + x->rq_size) * sizeof(struct dn_flow_queue *),
1620 M_DUMMYNET, M_NOWAIT | M_ZERO);
1621 if (x->rq == NULL) {
1622 printf("dummynet: sorry, cannot allocate queue\n");
1623 return (ENOMEM);
1624 }
1625 x->rq_elements = 0;
1626 return 0 ;
1627 }
1628
1629 static void
1630 set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src)
1631 {
1632 x->flags_fs = src->flags_fs;
1633 x->qsize = src->qsize;
1634 x->plr = src->plr;
1635 x->flow_mask = src->flow_mask;
1636 if (x->flags_fs & DN_QSIZE_IS_BYTES) {
1637 if (x->qsize > 1024 * 1024)
1638 x->qsize = 1024 * 1024;
1639 } else {
1640 if (x->qsize == 0)
1641 x->qsize = 50;
1642 if (x->qsize > 100)
1643 x->qsize = 50;
1644 }
1645 /* Configuring RED. */
1646 if (x->flags_fs & DN_IS_RED)
1647 config_red(src, x); /* XXX should check errors */
1648 }
1649
1650 /*
1651 * Setup pipe or queue parameters.
1652 */
1653 static int
1654 config_pipe(struct dn_pipe *p)
1655 {
1656 struct dn_flow_set *pfs = &(p->fs);
1657 struct dn_flow_queue *q;
1658 int i, error;
1659
1660 /*
1661 * The config program passes parameters as follows:
1662 * bw = bits/second (0 means no limits),
1663 * delay = ms, must be translated into ticks.
1664 * qsize = slots/bytes
1665 */
1666 p->delay = (p->delay * hz) / 1000;
1667 /* We need either a pipe number or a flow_set number. */
1668 if (p->pipe_nr == 0 && pfs->fs_nr == 0)
1669 return (EINVAL);
1670 if (p->pipe_nr != 0 && pfs->fs_nr != 0)
1671 return (EINVAL);
1672 if (p->pipe_nr != 0) { /* this is a pipe */
1673 struct dn_pipe *pipe;
1674
1675 DUMMYNET_LOCK();
1676 pipe = locate_pipe(p->pipe_nr); /* locate pipe */
1677
1678 if (pipe == NULL) { /* new pipe */
1679 pipe = malloc(sizeof(struct dn_pipe), M_DUMMYNET,
1680 M_NOWAIT | M_ZERO);
1681 if (pipe == NULL) {
1682 DUMMYNET_UNLOCK();
1683 printf("dummynet: no memory for new pipe\n");
1684 return (ENOMEM);
1685 }
1686 pipe->pipe_nr = p->pipe_nr;
1687 pipe->fs.pipe = pipe;
1688 /*
1689 * idle_heap is the only one from which
1690 * we extract from the middle.
1691 */
1692 pipe->idle_heap.size = pipe->idle_heap.elements = 0;
1693 pipe->idle_heap.offset =
1694 offsetof(struct dn_flow_queue, heap_pos);
1695 } else
1696 /* Flush accumulated credit for all queues. */
1697 for (i = 0; i <= pipe->fs.rq_size; i++)
1698 for (q = pipe->fs.rq[i]; q; q = q->next)
1699 q->numbytes = 0;
1700
1701 pipe->bandwidth = p->bandwidth;
1702 pipe->numbytes = 0; /* just in case... */
1703 bcopy(p->if_name, pipe->if_name, sizeof(p->if_name));
1704 pipe->ifp = NULL; /* reset interface ptr */
1705 pipe->delay = p->delay;
1706 set_fs_parms(&(pipe->fs), pfs);
1707
1708 if (pipe->fs.rq == NULL) { /* a new pipe */
1709 error = alloc_hash(&(pipe->fs), pfs);
1710 if (error) {
1711 DUMMYNET_UNLOCK();
1712 free(pipe, M_DUMMYNET);
1713 return (error);
1714 }
1715 SLIST_INSERT_HEAD(&pipehash[HASH(pipe->pipe_nr)],
1716 pipe, next);
1717 }
1718 DUMMYNET_UNLOCK();
1719 } else { /* config queue */
1720 struct dn_flow_set *fs;
1721
1722 DUMMYNET_LOCK();
1723 fs = locate_flowset(pfs->fs_nr); /* locate flow_set */
1724
1725 if (fs == NULL) { /* new */
1726 if (pfs->parent_nr == 0) { /* need link to a pipe */
1727 DUMMYNET_UNLOCK();
1728 return (EINVAL);
1729 }
1730 fs = malloc(sizeof(struct dn_flow_set), M_DUMMYNET,
1731 M_NOWAIT | M_ZERO);
1732 if (fs == NULL) {
1733 DUMMYNET_UNLOCK();
1734 printf(
1735 "dummynet: no memory for new flow_set\n");
1736 return (ENOMEM);
1737 }
1738 fs->fs_nr = pfs->fs_nr;
1739 fs->parent_nr = pfs->parent_nr;
1740 fs->weight = pfs->weight;
1741 if (fs->weight == 0)
1742 fs->weight = 1;
1743 else if (fs->weight > 100)
1744 fs->weight = 100;
1745 } else {
1746 /*
1747 * Change parent pipe not allowed;
1748 * must delete and recreate.
1749 */
1750 if (pfs->parent_nr != 0 &&
1751 fs->parent_nr != pfs->parent_nr) {
1752 DUMMYNET_UNLOCK();
1753 return (EINVAL);
1754 }
1755 }
1756
1757 set_fs_parms(fs, pfs);
1758
1759 if (fs->rq == NULL) { /* a new flow_set */
1760 error = alloc_hash(fs, pfs);
1761 if (error) {
1762 DUMMYNET_UNLOCK();
1763 free(fs, M_DUMMYNET);
1764 return (error);
1765 }
1766 SLIST_INSERT_HEAD(&flowsethash[HASH(fs->fs_nr)],
1767 fs, next);
1768 }
1769 DUMMYNET_UNLOCK();
1770 }
1771 return (0);
1772 }
1773
1774 /*
1775 * Helper function to remove from a heap queues which are linked to
1776 * a flow_set about to be deleted.
1777 */
1778 static void
1779 fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs)
1780 {
1781 int i = 0, found = 0 ;
1782 for (; i < h->elements ;)
1783 if ( ((struct dn_flow_queue *)h->p[i].object)->fs == fs) {
1784 h->elements-- ;
1785 h->p[i] = h->p[h->elements] ;
1786 found++ ;
1787 } else
1788 i++ ;
1789 if (found)
1790 heapify(h);
1791 }
1792
1793 /*
1794 * helper function to remove a pipe from a heap (can be there at most once)
1795 */
1796 static void
1797 pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p)
1798 {
1799 if (h->elements > 0) {
1800 int i = 0 ;
1801 for (i=0; i < h->elements ; i++ ) {
1802 if (h->p[i].object == p) { /* found it */
1803 h->elements-- ;
1804 h->p[i] = h->p[h->elements] ;
1805 heapify(h);
1806 break ;
1807 }
1808 }
1809 }
1810 }
1811
1812 /*
1813 * drain all queues. Called in case of severe mbuf shortage.
1814 */
1815 void
1816 dummynet_drain(void)
1817 {
1818 struct dn_flow_set *fs;
1819 struct dn_pipe *pipe;
1820 struct mbuf *m, *mnext;
1821 int i;
1822
1823 DUMMYNET_LOCK_ASSERT();
1824
1825 heap_free(&ready_heap);
1826 heap_free(&wfq_ready_heap);
1827 heap_free(&extract_heap);
1828 /* remove all references to this pipe from flow_sets */
1829 for (i = 0; i < HASHSIZE; i++)
1830 SLIST_FOREACH(fs, &flowsethash[i], next)
1831 purge_flow_set(fs, 0);
1832
1833 for (i = 0; i < HASHSIZE; i++) {
1834 SLIST_FOREACH(pipe, &pipehash[i], next) {
1835 purge_flow_set(&(pipe->fs), 0);
1836
1837 mnext = pipe->head;
1838 while ((m = mnext) != NULL) {
1839 mnext = m->m_nextpkt;
1840 DN_FREE_PKT(m);
1841 }
1842 pipe->head = pipe->tail = NULL;
1843 }
1844 }
1845 }
1846
1847 /*
1848 * Fully delete a pipe or a queue, cleaning up associated info.
1849 */
1850 static int
1851 delete_pipe(struct dn_pipe *p)
1852 {
1853
1854 if (p->pipe_nr == 0 && p->fs.fs_nr == 0)
1855 return EINVAL ;
1856 if (p->pipe_nr != 0 && p->fs.fs_nr != 0)
1857 return EINVAL ;
1858 if (p->pipe_nr != 0) { /* this is an old-style pipe */
1859 struct dn_pipe *pipe;
1860 struct dn_flow_set *fs;
1861 int i;
1862
1863 DUMMYNET_LOCK();
1864 pipe = locate_pipe(p->pipe_nr); /* locate pipe */
1865
1866 if (pipe == NULL) {
1867 DUMMYNET_UNLOCK();
1868 return (ENOENT); /* not found */
1869 }
1870
1871 /* Unlink from list of pipes. */
1872 SLIST_REMOVE(&pipehash[HASH(pipe->pipe_nr)], pipe, dn_pipe, next);
1873
1874 /* Remove all references to this pipe from flow_sets. */
1875 for (i = 0; i < HASHSIZE; i++)
1876 SLIST_FOREACH(fs, &flowsethash[i], next)
1877 if (fs->pipe == pipe) {
1878 printf("dummynet: ++ ref to pipe %d from fs %d\n",
1879 p->pipe_nr, fs->fs_nr);
1880 fs->pipe = NULL ;
1881 purge_flow_set(fs, 0);
1882 }
1883 fs_remove_from_heap(&ready_heap, &(pipe->fs));
1884 purge_pipe(pipe); /* remove all data associated to this pipe */
1885 /* remove reference to here from extract_heap and wfq_ready_heap */
1886 pipe_remove_from_heap(&extract_heap, pipe);
1887 pipe_remove_from_heap(&wfq_ready_heap, pipe);
1888 DUMMYNET_UNLOCK();
1889
1890 free(pipe, M_DUMMYNET);
1891 } else { /* this is a WF2Q queue (dn_flow_set) */
1892 struct dn_flow_set *fs;
1893
1894 DUMMYNET_LOCK();
1895 fs = locate_flowset(p->fs.fs_nr); /* locate set */
1896
1897 if (fs == NULL) {
1898 DUMMYNET_UNLOCK();
1899 return (ENOENT); /* not found */
1900 }
1901
1902 /* Unlink from list of flowsets. */
1903 SLIST_REMOVE( &flowsethash[HASH(fs->fs_nr)], fs, dn_flow_set, next);
1904
1905 if (fs->pipe != NULL) {
1906 /* Update total weight on parent pipe and cleanup parent heaps. */
1907 fs->pipe->sum -= fs->weight * fs->backlogged ;
1908 fs_remove_from_heap(&(fs->pipe->not_eligible_heap), fs);
1909 fs_remove_from_heap(&(fs->pipe->scheduler_heap), fs);
1910 #if 1 /* XXX should i remove from idle_heap as well ? */
1911 fs_remove_from_heap(&(fs->pipe->idle_heap), fs);
1912 #endif
1913 }
1914 purge_flow_set(fs, 1);
1915 DUMMYNET_UNLOCK();
1916 }
1917 return 0 ;
1918 }
1919
1920 /*
1921 * helper function used to copy data from kernel in DUMMYNET_GET
1922 */
1923 static char *
1924 dn_copy_set(struct dn_flow_set *set, char *bp)
1925 {
1926 int i, copied = 0 ;
1927 struct dn_flow_queue *q, *qp = (struct dn_flow_queue *)bp;
1928
1929 DUMMYNET_LOCK_ASSERT();
1930
1931 for (i = 0 ; i <= set->rq_size ; i++)
1932 for (q = set->rq[i] ; q ; q = q->next, qp++ ) {
1933 if (q->hash_slot != i)
1934 printf("dummynet: ++ at %d: wrong slot (have %d, "
1935 "should be %d)\n", copied, q->hash_slot, i);
1936 if (q->fs != set)
1937 printf("dummynet: ++ at %d: wrong fs ptr (have %p, should be %p)\n",
1938 i, q->fs, set);
1939 copied++ ;
1940 bcopy(q, qp, sizeof( *q ) );
1941 /* cleanup pointers */
1942 qp->next = NULL ;
1943 qp->head = qp->tail = NULL ;
1944 qp->fs = NULL ;
1945 }
1946 if (copied != set->rq_elements)
1947 printf("dummynet: ++ wrong count, have %d should be %d\n",
1948 copied, set->rq_elements);
1949 return (char *)qp ;
1950 }
1951
1952 static size_t
1953 dn_calc_size(void)
1954 {
1955 struct dn_flow_set *fs;
1956 struct dn_pipe *pipe;
1957 size_t size = 0;
1958 int i;
1959
1960 DUMMYNET_LOCK_ASSERT();
1961 /*
1962 * Compute size of data structures: list of pipes and flow_sets.
1963 */
1964 for (i = 0; i < HASHSIZE; i++) {
1965 SLIST_FOREACH(pipe, &pipehash[i], next)
1966 size += sizeof(*pipe) +
1967 pipe->fs.rq_elements * sizeof(struct dn_flow_queue);
1968 SLIST_FOREACH(fs, &flowsethash[i], next)
1969 size += sizeof (*fs) +
1970 fs->rq_elements * sizeof(struct dn_flow_queue);
1971 }
1972 return size;
1973 }
1974
/*
 * Handle IP_DUMMYNET_GET: serialize all pipes and flow_sets into a
 * temporary buffer and copy it out to the socket option.
 * Returns 0 or an errno.
 */
static int
dummynet_get(struct sockopt *sopt)
{
    char *buf, *bp ; /* bp is the "copy-pointer" */
    size_t size ;
    struct dn_flow_set *fs;
    struct dn_pipe *pipe;
    int error=0, i ;

    /* XXX lock held too long */
    DUMMYNET_LOCK();
    /*
     * XXX: Ugly, but we need to allocate memory with M_WAITOK flag and we
     * cannot use this flag while holding a mutex.
     * So: compute the size, drop the lock to allocate, retake the lock
     * and retry (up to 10 times) if the size changed in between.
     */
    for (i = 0; i < 10; i++) {
	size = dn_calc_size();
	DUMMYNET_UNLOCK();
	buf = malloc(size, M_TEMP, M_WAITOK);
	DUMMYNET_LOCK();
	if (size == dn_calc_size())
		break;
	free(buf, M_TEMP);
	buf = NULL;
    }
    if (buf == NULL) {
	DUMMYNET_UNLOCK();
	return ENOBUFS ;
    }
    bp = buf;
    for (i = 0; i < HASHSIZE; i++)
	SLIST_FOREACH(pipe, &pipehash[i], next) {
		struct dn_pipe *pipe_bp = (struct dn_pipe *)bp;

		/*
		 * Copy pipe descriptor into *bp, convert delay back to ms,
		 * then copy the flow_set descriptor(s) one at a time.
		 * After each flow_set, copy the queue descriptor it owns.
		 */
		bcopy(pipe, bp, sizeof(*pipe));
		pipe_bp->delay = (pipe_bp->delay * 1000) / hz;
		/*
		 * XXX the following is a hack based on ->next being the
		 * first field in dn_pipe and dn_flow_set. The correct
		 * solution would be to move the dn_flow_set to the beginning
		 * of struct dn_pipe.
		 */
		pipe_bp->next.sle_next = (struct dn_pipe *)DN_IS_PIPE;
		/* Clean pointers. */
		pipe_bp->head = pipe_bp->tail = NULL;
		pipe_bp->fs.next.sle_next = NULL;
		pipe_bp->fs.pipe = NULL;
		pipe_bp->fs.rq = NULL;

		bp += sizeof(*pipe) ;
		bp = dn_copy_set(&(pipe->fs), bp);
	}

    for (i = 0; i < HASHSIZE; i++)
	SLIST_FOREACH(fs, &flowsethash[i], next) {
		struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp;

		bcopy(fs, bp, sizeof(*fs));
		/* XXX same hack as above */
		fs_bp->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE;
		fs_bp->pipe = NULL;
		fs_bp->rq = NULL;
		bp += sizeof(*fs);
		bp = dn_copy_set(fs, bp);
	}

    DUMMYNET_UNLOCK();

    error = sooptcopyout(sopt, buf, size);
    free(buf, M_TEMP);
    return error ;
}
2052
2053 /*
2054 * Handler for the various dummynet socket options (get, flush, config, del)
2055 */
2056 static int
2057 ip_dn_ctl(struct sockopt *sopt)
2058 {
2059 int error = 0 ;
2060 struct dn_pipe *p, tmp_pipe;
2061
2062 /* Disallow sets in really-really secure mode. */
2063 if (sopt->sopt_dir == SOPT_SET) {
2064 #if __FreeBSD_version >= 500034
2065 error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
2066 if (error)
2067 return (error);
2068 #else
2069 if (securelevel >= 3)
2070 return (EPERM);
2071 #endif
2072 }
2073
2074 switch (sopt->sopt_name) {
2075 default :
2076 printf("dummynet: -- unknown option %d", sopt->sopt_name);
2077 return EINVAL ;
2078
2079 case IP_DUMMYNET_GET :
2080 error = dummynet_get(sopt);
2081 break ;
2082
2083 case IP_DUMMYNET_FLUSH :
2084 dummynet_flush() ;
2085 break ;
2086
2087 case IP_DUMMYNET_CONFIGURE :
2088 p = &tmp_pipe ;
2089 error = sooptcopyin(sopt, p, sizeof *p, sizeof *p);
2090 if (error)
2091 break ;
2092 error = config_pipe(p);
2093 break ;
2094
2095 case IP_DUMMYNET_DEL : /* remove a pipe or queue */
2096 p = &tmp_pipe ;
2097 error = sooptcopyin(sopt, p, sizeof *p, sizeof *p);
2098 if (error)
2099 break ;
2100
2101 error = delete_pipe(p);
2102 break ;
2103 }
2104 return error ;
2105 }
2106
/*
 * Module initialization: set up the dummynet lock, hash tables and
 * scheduler heaps, publish the ipfw hook pointers, and start the
 * taskqueue thread and periodic callout that drive packet scheduling.
 */
static void
ip_dn_init(void)
{
	int i;

	if (bootverbose)
		printf("DUMMYNET with IPv6 initialized (040826)\n");

	DUMMYNET_LOCK_INIT();

	/* Start with empty hash buckets for pipes and flow_sets. */
	for (i = 0; i < HASHSIZE; i++) {
		SLIST_INIT(&pipehash[i]);
		SLIST_INIT(&flowsethash[i]);
	}
	/* All three scheduler heaps start empty. */
	ready_heap.size = ready_heap.elements = 0;
	ready_heap.offset = 0;

	wfq_ready_heap.size = wfq_ready_heap.elements = 0;
	wfq_ready_heap.offset = 0;

	extract_heap.size = extract_heap.elements = 0;
	extract_heap.offset = 0;

	/*
	 * Publish the hooks through which ipfw reaches dummynet.  These
	 * must be in place before the callout below starts ticking.
	 */
	ip_dn_ctl_ptr = ip_dn_ctl;
	ip_dn_io_ptr = dummynet_io;
	ip_dn_ruledel_ptr = dn_rule_delete;

	/*
	 * Deferred processing runs in a dedicated taskqueue thread.
	 * NOTE(review): the M_NOWAIT taskqueue_create_fast() result is
	 * not checked for NULL here — confirm allocation cannot fail at
	 * this point in boot/module load.
	 */
	TASK_INIT(&dn_task, 0, dummynet_task, NULL);
	dn_tq = taskqueue_create_fast("dummynet", M_NOWAIT,
	    taskqueue_thread_enqueue, &dn_tq);
	taskqueue_start_threads(&dn_tq, 1, PI_NET, "dummynet");

	/* Arm the periodic tick last, once everything else is ready. */
	callout_init(&dn_timeout, CALLOUT_MPSAFE);
	callout_reset(&dn_timeout, 1, dummynet, NULL);

	/* Initialize curr_time adjustment mechanics. */
	getmicrouptime(&prev_t);
}
2145
#ifdef KLD_MODULE
/*
 * Module teardown (KLD unload only).  Order matters: unhook from ipfw
 * first so no new work arrives, stop the periodic callout under the
 * lock, drain and free the taskqueue, flush remaining dummynet state,
 * and finally destroy the lock.
 */
static void
ip_dn_destroy(void)
{
	/* Unhook from ipfw so no new packets/commands reach us. */
	ip_dn_ctl_ptr = NULL;
	ip_dn_io_ptr = NULL;
	ip_dn_ruledel_ptr = NULL;

	DUMMYNET_LOCK();
	callout_stop(&dn_timeout);
	DUMMYNET_UNLOCK();
	/* Wait for any in-flight task before freeing the queue. */
	taskqueue_drain(dn_tq, &dn_task);
	taskqueue_free(dn_tq);

	/* Discard all remaining pipes/flow_sets (defined elsewhere). */
	dummynet_flush();

	DUMMYNET_LOCK_DESTROY();
}
#endif /* KLD_MODULE */
2165
2166 static int
2167 dummynet_modevent(module_t mod, int type, void *data)
2168 {
2169
2170 switch (type) {
2171 case MOD_LOAD:
2172 if (DUMMYNET_LOADED) {
2173 printf("DUMMYNET already loaded\n");
2174 return EEXIST ;
2175 }
2176 ip_dn_init();
2177 break;
2178
2179 case MOD_UNLOAD:
2180 #if !defined(KLD_MODULE)
2181 printf("dummynet statically compiled, cannot unload\n");
2182 return EINVAL ;
2183 #else
2184 ip_dn_destroy();
2185 #endif
2186 break ;
2187 default:
2188 return EOPNOTSUPP;
2189 break ;
2190 }
2191 return 0 ;
2192 }
2193
/* Module glue: name, event handler, no private data. */
static moduledata_t dummynet_mod = {
	"dummynet",
	dummynet_modevent,
	NULL
};
/*
 * Register with the kernel linker at protocol/interface-attach time and
 * declare the dependency on ipfw, which diverts traffic into dummynet.
 */
DECLARE_MODULE(dummynet, dummynet_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
MODULE_DEPEND(dummynet, ipfw, 2, 2, 2);
MODULE_VERSION(dummynet, 1);
Cache object: a1aa3796e239ec31d5c97f4c3caa2ab5
|