1 /*-
2 * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
3 * Portions Copyright (c) 2000 Akamba Corp.
4 * All rights reserved
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 *
27 * $FreeBSD: releng/6.0/sys/netinet/ip_dummynet.c 147205 2005-06-10 01:25:22Z thompsa $
28 */
29
30 #define DUMMYNET_DEBUG
31
32 #if !defined(KLD_MODULE)
33 #include "opt_inet6.h"
34 #endif
35
36 /*
37 * This module implements IP dummynet, a bandwidth limiter/delay emulator
38 * used in conjunction with the ipfw package.
39 * Description of the data structures used is in ip_dummynet.h
40 * Here you mainly find the following blocks of code:
41 * + variable declarations;
42 * + heap management functions;
43 * + scheduler and dummynet functions;
44 * + configuration and initialization.
45 *
46 * NOTA BENE: critical sections are protected by the "dummynet lock".
47 *
48 * Most important Changes:
49 *
50 * 011004: KLDable
51 * 010124: Fixed WF2Q behaviour
52 * 010122: Fixed spl protection.
53 * 000601: WF2Q support
54 * 000106: large rewrite, use heaps to handle very many pipes.
55 * 980513: initial release
56 *
57 * include files marked with XXX are probably not needed
58 */
59
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/malloc.h>
63 #include <sys/mbuf.h>
64 #include <sys/kernel.h>
65 #include <sys/module.h>
66 #include <sys/proc.h>
67 #include <sys/socket.h>
68 #include <sys/socketvar.h>
69 #include <sys/time.h>
70 #include <sys/sysctl.h>
71 #include <net/if.h>
72 #include <net/route.h>
73 #include <netinet/in.h>
74 #include <netinet/in_systm.h>
75 #include <netinet/in_var.h>
76 #include <netinet/ip.h>
77 #include <netinet/ip_fw.h>
78 #include <netinet/ip_dummynet.h>
79 #include <netinet/ip_var.h>
80
81 #include <netinet/if_ether.h> /* for struct arpcom */
82 #include <net/bridge.h>
83
84 #include <netinet/ip6.h> /* for ip6_input, ip6_output prototypes */
85 #include <netinet6/ip6_var.h>
86
87 /*
88 * We keep a private variable for the simulation time, but we could
89 * probably use an existing one ("softticks" in sys/kern/kern_timeout.c)
90 */
91 static dn_key curr_time = 0 ; /* current simulation time */
92
93 static int dn_hash_size = 64 ; /* default hash size */
94
95 /* statistics on number of queue searches and search steps */
96 static int searches, search_steps ;
97 static int pipe_expire = 1 ; /* expire queue if empty */
98 static int dn_max_ratio = 16 ; /* max queues/buckets ratio */
99
100 static int red_lookup_depth = 256; /* RED - default lookup table depth */
101 static int red_avg_pkt_size = 512; /* RED - default medium packet size */
102 static int red_max_pkt_size = 1500; /* RED - default max packet size */
103
104 /*
105 * Three heaps contain queues and pipes that the scheduler handles:
106 *
107 * ready_heap contains all dn_flow_queue related to fixed-rate pipes.
108 *
109 * wfq_ready_heap contains the pipes associated with WF2Q flows
110 *
111 * extract_heap contains pipes associated with delay lines.
112 *
113 */
114
115 MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap");
116
117 static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ;
118
119 static int heap_init(struct dn_heap *h, int size) ;
120 static int heap_insert (struct dn_heap *h, dn_key key1, void *p);
121 static void heap_extract(struct dn_heap *h, void *obj);
122
123 static void transmit_event(struct dn_pipe *pipe);
124 static void ready_event(struct dn_flow_queue *q);
125
126 static struct dn_pipe *all_pipes = NULL ; /* list of all pipes */
127 static struct dn_flow_set *all_flow_sets = NULL ;/* list of all flow_sets */
128
129 static struct callout dn_timeout;
130
131 extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *);
132
133 #ifdef SYSCTL_NODE
134 SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet,
135 CTLFLAG_RW, 0, "Dummynet");
136 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
137 CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size");
138 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, curr_time,
139 CTLFLAG_RD, &curr_time, 0, "Current tick");
140 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
141 CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap");
142 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
143 CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap");
144 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, searches,
145 CTLFLAG_RD, &searches, 0, "Number of queue searches");
146 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, search_steps,
147 CTLFLAG_RD, &search_steps, 0, "Number of queue search steps");
148 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
149 CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty");
150 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
151 CTLFLAG_RW, &dn_max_ratio, 0,
152 "Max ratio between dynamic queues and buckets");
153 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
154 CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table");
155 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
156 CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size");
157 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
158 CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size");
159 #endif
160
161 #ifdef DUMMYNET_DEBUG
162 int dummynet_debug = 0;
163 #ifdef SYSCTL_NODE
164 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dummynet_debug,
165 0, "control debugging printfs");
166 #endif
167 #define DPRINTF(X) if (dummynet_debug) printf X
168 #else
169 #define DPRINTF(X)
170 #endif
171
172 static struct mtx dummynet_mtx;
173 /*
174 * NB: Recursion is needed to deal with re-entry via ICMP. That is,
175 * a packet may be dispatched via ip_input from dummynet_io and
176 * re-enter through ip_output. Yech.
177 */
178 #define DUMMYNET_LOCK_INIT() \
179 mtx_init(&dummynet_mtx, "dummynet", NULL, MTX_DEF | MTX_RECURSE)
180 #define DUMMYNET_LOCK_DESTROY() mtx_destroy(&dummynet_mtx)
181 #define DUMMYNET_LOCK() mtx_lock(&dummynet_mtx)
182 #define DUMMYNET_UNLOCK() mtx_unlock(&dummynet_mtx)
183 #define DUMMYNET_LOCK_ASSERT() do { \
184 mtx_assert(&dummynet_mtx, MA_OWNED); \
185 NET_ASSERT_GIANT(); \
186 } while (0)
187
188 static int config_pipe(struct dn_pipe *p);
189 static int ip_dn_ctl(struct sockopt *sopt);
190
191 static void dummynet(void *);
192 static void dummynet_flush(void);
193 void dummynet_drain(void);
194 static ip_dn_io_t dummynet_io;
195 static void dn_rule_delete(void *);
196
197 int if_tx_rdy(struct ifnet *ifp);
198
199 /*
200 * Heap management functions.
201 *
202 * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
203 * Some macros help finding parent/children so we can optimize them.
204 *
205 * heap_init() is called to expand the heap when needed.
206 * Increment size in blocks of 16 entries.
207 * XXX failure to allocate a new element is a pretty bad failure
208 * as we basically stall a whole queue forever!!
209 * Returns 1 on error, 0 on success
210 */
211 #define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
212 #define HEAP_LEFT(x) ( 2*(x) + 1 )
213 #define HEAP_IS_LEFT(x) ( (x) & 1 )
214 #define HEAP_RIGHT(x) ( 2*(x) + 2 )
215 #define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
216 #define HEAP_INCREMENT 15
217
218 static int
219 heap_init(struct dn_heap *h, int new_size)
220 {
221 struct dn_heap_entry *p;
222
223 if (h->size >= new_size ) {
224 printf("dummynet: %s, Bogus call, have %d want %d\n", __func__,
225 h->size, new_size);
226 return 0 ;
227 }
228 new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ;
229 p = malloc(new_size * sizeof(*p), M_DUMMYNET, M_NOWAIT);
230 if (p == NULL) {
231 printf("dummynet: %s, resize %d failed\n", __func__, new_size );
232 return 1 ; /* error */
233 }
234 if (h->size > 0) {
235 bcopy(h->p, p, h->size * sizeof(*p) );
236 free(h->p, M_DUMMYNET);
237 }
238 h->p = p ;
239 h->size = new_size ;
240 return 0 ;
241 }
242
243 /*
244 * Insert element in heap. Normally, p != NULL, we insert p in
245 * a new position and bubble up. If p == NULL, then the element is
246 * already in place, and key is the position where to start the
247 * bubble-up.
248 * Returns 1 on failure (cannot allocate new heap entry)
249 *
250 * If offset > 0 the position (index, int) of the element in the heap is
251 * also stored in the element itself at the given offset in bytes.
252 */
/*
 * SET_OFFSET refreshes the back-pointer (the index of the object's slot,
 * stored "offset" bytes into the object itself) after a node has moved.
 * Both macros are wrapped in do { } while (0) so each expands to exactly
 * one statement and is safe inside unbraced if/else bodies.
 */
#define SET_OFFSET(heap, node) do {					\
	if (heap->offset > 0)						\
	    *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ; \
    } while (0)
/*
 * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value.
 */
#define RESET_OFFSET(heap, node) do {					\
	if (heap->offset > 0)						\
	    *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ; \
    } while (0)
262 static int
263 heap_insert(struct dn_heap *h, dn_key key1, void *p)
264 {
265 int son = h->elements ;
266
267 if (p == NULL) /* data already there, set starting point */
268 son = key1 ;
269 else { /* insert new element at the end, possibly resize */
270 son = h->elements ;
271 if (son == h->size) /* need resize... */
272 if (heap_init(h, h->elements+1) )
273 return 1 ; /* failure... */
274 h->p[son].object = p ;
275 h->p[son].key = key1 ;
276 h->elements++ ;
277 }
278 while (son > 0) { /* bubble up */
279 int father = HEAP_FATHER(son) ;
280 struct dn_heap_entry tmp ;
281
282 if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
283 break ; /* found right position */
284 /* son smaller than father, swap and repeat */
285 HEAP_SWAP(h->p[son], h->p[father], tmp) ;
286 SET_OFFSET(h, son);
287 son = father ;
288 }
289 SET_OFFSET(h, son);
290 return 0 ;
291 }
292
293 /*
294 * remove top element from heap, or obj if obj != NULL
295 */
static void
heap_extract(struct dn_heap *h, void *obj)
{
    int child, father, max = h->elements - 1 ;

    if (max < 0) {		/* empty heap: nothing to extract */
	printf("dummynet: warning, extract from empty heap 0x%p\n", h);
	return ;
    }
    father = 0 ;		/* default: move up smallest child */
    if (obj != NULL) {		/* extract specific element, index is at offset */
	if (h->offset <= 0)	/* heap keeps no back-pointers: cannot do it */
	    panic("dummynet: heap_extract from middle not supported on this heap!!!\n");
	father = *((int *)((char *)obj + h->offset)) ;
	if (father < 0 || father >= h->elements) {
	    printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
		father, h->elements);
	    panic("dummynet: heap_extract");
	}
    }
    RESET_OFFSET(h, father);
    /*
     * Percolate the hole down: at each level pull the smaller of the two
     * children up into the hole, until the hole reaches a leaf position.
     */
    child = HEAP_LEFT(father) ;		/* left child */
    while (child <= max) {		/* valid entry */
	if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
	    child = child+1 ;		/* take right child, otherwise left */
	h->p[father] = h->p[child] ;
	SET_OFFSET(h, father);
	father = child ;
	child = HEAP_LEFT(child) ;	/* left child for next loop */
    }
    h->elements-- ;
    /* if the hole did not end up at the last slot, refill and re-sort */
    if (father != max) {
	/*
	 * Fill hole with last entry and bubble up, reusing the insert code
	 */
	h->p[father] = h->p[max] ;
	heap_insert(h, father, NULL); /* this one cannot fail */
    }
}
335
#if 0
/*
 * change object position and update references
 * XXX this one is never used!
 */
static void
heap_move(struct dn_heap *h, dn_key new_key, void *object)
{
    int temp;
    int i ;
    int max = h->elements-1 ;
    struct dn_heap_entry buf ;

    if (h->offset <= 0)		/* need back-pointers to locate the object */
	panic("cannot move items on this heap");

    i = *((int *)((char *)object + h->offset));	/* current slot of object */
    if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */
	h->p[i].key = new_key ;
	for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ;
		 i = temp ) { /* bubble up */
	    HEAP_SWAP(h->p[i], h->p[temp], buf) ;
	    SET_OFFSET(h, i);
	}
    } else { /* must move down */
	h->p[i].key = new_key ;
	while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */
	    if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key))
		temp++ ; /* select child with min key */
	    if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */
		HEAP_SWAP(h->p[i], h->p[temp], buf) ;
		SET_OFFSET(h, i);
	    } else
		break ;
	    i = temp ;
	}
    }
    SET_OFFSET(h, i);
}
#endif /* heap_move, unused */
376
377 /*
378 * heapify() will reorganize data inside an array to maintain the
379 * heap property. It is needed when we delete a bunch of entries.
380 */
381 static void
382 heapify(struct dn_heap *h)
383 {
384 int i ;
385
386 for (i = 0 ; i < h->elements ; i++ )
387 heap_insert(h, i , NULL) ;
388 }
389
390 /*
391 * cleanup the heap and free data structure
392 */
393 static void
394 heap_free(struct dn_heap *h)
395 {
396 if (h->size >0 )
397 free(h->p, M_DUMMYNET);
398 bzero(h, sizeof(*h) );
399 }
400
401 /*
402 * --- end of heap management functions ---
403 */
404
405 /*
406 * Return the mbuf tag holding the dummynet state. As an optimization
407 * this is assumed to be the first tag on the list. If this turns out
408 * wrong we'll need to search the list.
409 */
static struct dn_pkt_tag *
dn_tag_get(struct mbuf *m)
{
    struct m_tag *mtag = m_tag_first(m);	/* tag assumed to be first */
    /* every packet on a dummynet queue must carry the dummynet tag */
    KASSERT(mtag != NULL &&
	    mtag->m_tag_cookie == MTAG_ABI_COMPAT &&
	    mtag->m_tag_id == PACKET_TAG_DUMMYNET,
	    ("packet on dummynet queue w/o dummynet tag!"));
    /* the dummynet state is stored immediately after the m_tag header */
    return (struct dn_pkt_tag *)(mtag+1);
}
420
421 /*
422 * Scheduler functions:
423 *
424 * transmit_event() is called when the delay-line needs to enter
425 * the scheduler, either because of existing pkts getting ready,
426 * or new packets entering the queue. The event handled is the delivery
427 * time of the packet.
428 *
429 * ready_event() does something similar with fixed-rate queues, and the
430 * event handled is the finish time of the head pkt.
431 *
432 * wfq_ready_event() does something similar with WF2Q queues, and the
433 * event handled is the start time of the head pkt.
434 *
435 * In all cases, we make sure that the data structures are consistent
436 * before passing pkts out, because this might trigger recursive
437 * invocations of the procedures.
438 */
static void
transmit_event(struct dn_pipe *pipe)
{
    struct mbuf *m ;
    struct dn_pkt_tag *pkt ;
    struct ip *ip;

    DUMMYNET_LOCK_ASSERT();

    /* deliver every packet whose output_time has been reached */
    while ( (m = pipe->head) ) {
	pkt = dn_tag_get(m);
	if ( !DN_KEY_LEQ(pkt->output_time, curr_time) )
	    break;	/* head not due yet; the rest are even later */
	/*
	 * first unlink, then call procedures, since ip_input() can invoke
	 * ip_output() and viceversa, thus causing nested calls
	 */
	pipe->head = m->m_nextpkt ;
	m->m_nextpkt = NULL;

	/* XXX: drop the lock for now to avoid LOR's */
	DUMMYNET_UNLOCK();
	switch (pkt->dn_dir) {
	case DN_TO_IP_OUT:
	    (void)ip_output(m, NULL, NULL, pkt->flags, NULL, NULL);
	    break ;

	case DN_TO_IP_IN :
	    ip = mtod(m, struct ip *);
	    /* restore network byte order before re-injecting into ip_input */
	    ip->ip_len = htons(ip->ip_len);
	    ip->ip_off = htons(ip->ip_off);
	    ip_input(m) ;
	    break ;

#ifdef INET6
	case DN_TO_IP6_IN:
	    ip6_input(m) ;
	    break ;

	case DN_TO_IP6_OUT:
	    (void)ip6_output(m, NULL, NULL, pkt->flags, NULL, NULL, NULL);
	    break ;
#endif

	case DN_TO_IFB_FWD:
	    /* hand the packet back to if_bridge, if its hook is present */
	    if (bridge_dn_p != NULL)
		((*bridge_dn_p)(m, pkt->ifp));
	    else
		printf("dummynet: if_bridge not loaded\n");

	    break;

	case DN_TO_BDG_FWD :
	    /*
	     * The bridge requires/assumes the Ethernet header is
	     * contiguous in the first mbuf header. Ensure this is true.
	     */
	    if (BDG_LOADED) {
		if (m->m_len < ETHER_HDR_LEN &&
		    (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
		    printf("dummynet/bridge: pullup fail, dropping pkt\n");
		    break;
		}
		m = bdg_forward_ptr(m, pkt->ifp);
	    } else {
		/* somebody unloaded the bridge module. Drop pkt */
		/* XXX rate limit */
		printf("dummynet: dropping bridged packet trapped in pipe\n");
	    }
	    /* bdg_forward_ptr may return a residual mbuf to free */
	    if (m)
		m_freem(m);
	    break;

	case DN_TO_ETH_DEMUX:
	    /*
	     * The Ethernet code assumes the Ethernet header is
	     * contiguous in the first mbuf header. Ensure this is true.
	     */
	    if (m->m_len < ETHER_HDR_LEN &&
		(m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
		printf("dummynet/ether: pullup fail, dropping pkt\n");
		break;
	    }
	    ether_demux(m->m_pkthdr.rcvif, m); /* which consumes the mbuf */
	    break ;

	case DN_TO_ETH_OUT:
	    ether_output_frame(pkt->ifp, m);
	    break;

	default:
	    printf("dummynet: bad switch %d!\n", pkt->dn_dir);
	    m_freem(m);
	    break ;
	}
	DUMMYNET_LOCK();
    }
    /* if there are leftover packets, put into the heap for next event */
    if ( (m = pipe->head) ) {
	pkt = dn_tag_get(m) ;
	/* XXX should check errors on heap_insert, by draining the
	 * whole pipe p and hoping in the future we are more successful
	 */
	heap_insert(&extract_heap, pkt->output_time, pipe ) ;
    }
}
545
546 /*
547 * the following macro computes how many ticks we have to wait
548 * before being able to transmit a packet. The credit is taken from
549 * either a pipe (WF2Q) or a flow_queue (per-flow queueing)
550 */
551 #define SET_TICKS(_m, q, p) \
552 ((_m)->m_pkthdr.len*8*hz - (q)->numbytes + p->bandwidth - 1 ) / \
553 p->bandwidth ;
554
555 /*
556 * extract pkt from queue, compute output time (could be now)
557 * and put into delay line (p_queue)
558 */
559 static void
560 move_pkt(struct mbuf *pkt, struct dn_flow_queue *q,
561 struct dn_pipe *p, int len)
562 {
563 struct dn_pkt_tag *dt = dn_tag_get(pkt);
564
565 q->head = pkt->m_nextpkt ;
566 q->len-- ;
567 q->len_bytes -= len ;
568
569 dt->output_time = curr_time + p->delay ;
570
571 if (p->head == NULL)
572 p->head = pkt;
573 else
574 p->tail->m_nextpkt = pkt;
575 p->tail = pkt;
576 p->tail->m_nextpkt = NULL;
577 }
578
579 /*
580 * ready_event() is invoked every time the queue must enter the
581 * scheduler, either because the first packet arrives, or because
582 * a previously scheduled event fired.
583 * On invokation, drain as many pkts as possible (could be 0) and then
584 * if there are leftover packets reinsert the pkt in the scheduler.
585 */
static void
ready_event(struct dn_flow_queue *q)
{
    struct mbuf *pkt;
    struct dn_pipe *p = q->fs->pipe ;
    int p_was_empty ;

    DUMMYNET_LOCK_ASSERT();

    if (p == NULL) {	/* flow_set lost its pipe: nothing we can do */
	printf("dummynet: ready_event- pipe is gone\n");
	return ;
    }
    p_was_empty = (p->head == NULL) ;	/* was the delay line empty? */

    /*
     * schedule fixed-rate queues linked to this pipe:
     * Account for the bw accumulated since last scheduling, then
     * drain as many pkts as allowed by q->numbytes and move to
     * the delay line (in p) computing output time.
     * bandwidth==0 (no limit) means we can drain the whole queue,
     * setting len_scaled = 0 does the job.
     */
    q->numbytes += ( curr_time - q->sched_time ) * p->bandwidth;
    while ( (pkt = q->head) != NULL ) {
	int len = pkt->m_pkthdr.len;
	int len_scaled = p->bandwidth ? len*8*hz : 0 ;
	if (len_scaled > q->numbytes )
	    break ;	/* not enough credit for the head packet yet */
	q->numbytes -= len_scaled ;
	move_pkt(pkt, q, p, len);
    }
    /*
     * If we have more packets queued, schedule next ready event
     * (can only occur when bandwidth != 0, otherwise we would have
     * flushed the whole queue in the previous loop).
     * To this purpose we record the current time and compute how many
     * ticks to go for the finish time of the packet.
     */
    if ( (pkt = q->head) != NULL ) { /* this implies bandwidth != 0 */
	dn_key t = SET_TICKS(pkt, q, p); /* ticks i have to wait */
	q->sched_time = curr_time ;
	heap_insert(&ready_heap, curr_time + t, (void *)q );
	/* XXX should check errors on heap_insert, and drain the whole
	 * queue on error hoping next time we are luckier.
	 */
    } else { /* RED needs to know when the queue becomes empty */
	q->q_time = curr_time;
	q->numbytes = 0;	/* no packets, so no accumulated credit */
    }
    /*
     * If the delay line was empty call transmit_event(p) now.
     * Otherwise, the scheduler will take care of it.
     */
    if (p_was_empty)
	transmit_event(p);
}
643
644 /*
645 * Called when we can transmit packets on WF2Q queues. Take pkts out of
646 * the queues at their start time, and enqueue into the delay line.
647 * Packets are drained until p->numbytes < 0. As long as
648 * len_scaled >= p->numbytes, the packet goes into the delay line
649 * with a deadline p->delay. For the last packet, if p->numbytes<0,
650 * there is an additional delay.
651 */
static void
ready_event_wfq(struct dn_pipe *p)
{
    int p_was_empty = (p->head == NULL) ;	/* was the delay line empty? */
    struct dn_heap *sch = &(p->scheduler_heap);	  /* eligible flows, by F */
    struct dn_heap *neh = &(p->not_eligible_heap) ; /* S_i > V flows, by S */

    DUMMYNET_LOCK_ASSERT();

    if (p->if_name[0] == 0) /* tx clock is simulated */
	p->numbytes += ( curr_time - p->sched_time ) * p->bandwidth;
    else { /* tx clock is for real, the ifq must be empty or this is a NOP */
	if (p->ifp && p->ifp->if_snd.ifq_head != NULL)
	    return ;
	else {
	    DPRINTF(("dummynet: pipe %d ready from %s --\n",
		p->pipe_nr, p->if_name));
	}
    }

    /*
     * While we have backlogged traffic AND credit, we need to do
     * something on the queue.
     */
    while ( p->numbytes >=0 && (sch->elements>0 || neh->elements >0) ) {
	if (sch->elements > 0) { /* have some eligible pkts to send out */
	    struct dn_flow_queue *q = sch->p[0].object ;
	    struct mbuf *pkt = q->head;
	    struct dn_flow_set *fs = q->fs;
	    u_int64_t len = pkt->m_pkthdr.len;
	    int len_scaled = p->bandwidth ? len*8*hz : 0 ;

	    heap_extract(sch, NULL); /* remove queue from heap */
	    p->numbytes -= len_scaled ;
	    move_pkt(pkt, q, p, len);

	    p->V += (len<<MY_M) / p->sum ; /* update V */
	    q->S = q->F ; /* update start time */
	    if (q->len == 0) { /* Flow not backlogged any more */
		fs->backlogged-- ;
		heap_insert(&(p->idle_heap), q->F, q);
	    } else { /* still backlogged */
		/*
		 * update F and position in backlogged queue, then
		 * put flow in not_eligible_heap (we will fix this later).
		 */
		len = (q->head)->m_pkthdr.len;
		q->F += (len<<MY_M)/(u_int64_t) fs->weight ;
		if (DN_KEY_LEQ(q->S, p->V))
		    heap_insert(neh, q->S, q);
		else
		    heap_insert(sch, q->F, q);
	    }
	}
	/*
	 * now compute V = max(V, min(S_i)). Remember that all elements in sch
	 * have by definition S_i <= V so if sch is not empty, V is surely
	 * the max and we must not update it. Conversely, if sch is empty
	 * we only need to look at neh.
	 */
	if (sch->elements == 0 && neh->elements > 0)
	    p->V = MAX64 ( p->V, neh->p[0].key );
	/* move from neh to sch any packets that have become eligible */
	while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V) ) {
	    struct dn_flow_queue *q = neh->p[0].object ;
	    heap_extract(neh, NULL);
	    heap_insert(sch, q->F, q);
	}

	if (p->if_name[0] != '\0') {/* tx clock is from a real thing */
	    p->numbytes = -1 ; /* mark not ready for I/O */
	    break ;
	}
    }
    if (sch->elements == 0 && neh->elements == 0 && p->numbytes >= 0
	    && p->idle_heap.elements > 0) {
	/*
	 * no traffic and no events scheduled. We can get rid of idle-heap.
	 */
	int i ;

	for (i = 0 ; i < p->idle_heap.elements ; i++) {
	    struct dn_flow_queue *q = p->idle_heap.p[i].object ;

	    q->F = 0 ;
	    q->S = q->F + 1 ;	/* S == F+1 marks the timestamp invalid */
	}
	p->sum = 0 ;
	p->V = 0 ;
	p->idle_heap.elements = 0 ;
    }
    /*
     * If we are getting clocks from dummynet (not a real interface) and
     * If we are under credit, schedule the next ready event.
     * Also fix the delivery time of the last packet.
     */
    if (p->if_name[0]==0 && p->numbytes < 0) { /* this implies bandwidth >0 */
	dn_key t=0 ; /* number of ticks i have to wait */

	if (p->bandwidth > 0)
	    t = ( p->bandwidth -1 - p->numbytes) / p->bandwidth ;
	/* last packet pays the extra delay for the missing credit */
	dn_tag_get(p->tail)->output_time += t ;
	p->sched_time = curr_time ;
	heap_insert(&wfq_ready_heap, curr_time + t, (void *)p);
	/* XXX should check errors on heap_insert, and drain the whole
	 * queue on error hoping next time we are luckier.
	 */
    }
    /*
     * If the delay line was empty call transmit_event(p) now.
     * Otherwise, the scheduler will take care of it.
     */
    if (p_was_empty)
	transmit_event(p);
}
767
768 /*
769 * This is called once per tick, or HZ times per second. It is used to
770 * increment the current tick counter and schedule expired events.
771 */
static void
dummynet(void * __unused unused)
{
    void *p ; /* generic parameter to handler */
    struct dn_heap *h ;
    struct dn_heap *heaps[3];
    int i;
    struct dn_pipe *pe ;

    heaps[0] = &ready_heap ; /* fixed-rate queues */
    heaps[1] = &wfq_ready_heap ; /* wfq queues */
    heaps[2] = &extract_heap ; /* delay line */

    DUMMYNET_LOCK();
    curr_time++ ;	/* advance the simulated clock by one tick */
    /* run every event whose key is <= the new current time */
    for (i=0; i < 3 ; i++) {
	h = heaps[i];
	while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time) ) {
	    /*
	     * DN_KEY_LEQ is wraparound-aware; a plain '>' here means the
	     * key is far behind curr_time, i.e. the event fired late.
	     */
	    if (h->p[0].key > curr_time)
		printf("dummynet: warning, heap %d is %d ticks late\n",
		    i, (int)(curr_time - h->p[0].key));
	    p = h->p[0].object ; /* store a copy before heap_extract */
	    heap_extract(h, NULL); /* need to extract before processing */
	    if (i == 0)
		ready_event(p) ;
	    else if (i == 1) {
		struct dn_pipe *pipe = p;
		/* pipes driven by a real interface never belong here */
		if (pipe->if_name[0] != '\0')
		    printf("dummynet: bad ready_event_wfq for pipe %s\n",
			pipe->if_name);
		else
		    ready_event_wfq(p) ;
	    } else
		transmit_event(p);
	}
    }
    /* sweep pipes trying to expire idle flow_queues */
    for (pe = all_pipes; pe ; pe = pe->next )
	if (pe->idle_heap.elements > 0 &&
		DN_KEY_LT(pe->idle_heap.p[0].key, pe->V) ) {
	    struct dn_flow_queue *q = pe->idle_heap.p[0].object ;

	    heap_extract(&(pe->idle_heap), NULL);
	    q->S = q->F + 1 ; /* mark timestamp as invalid */
	    pe->sum -= q->fs->weight ;
	}
    DUMMYNET_UNLOCK();

    /* re-arm ourselves for the next tick */
    callout_reset(&dn_timeout, 1, dummynet, NULL);
}
822
823 /*
824 * called by an interface when tx_rdy occurs.
825 */
826 int
827 if_tx_rdy(struct ifnet *ifp)
828 {
829 struct dn_pipe *p;
830
831 DUMMYNET_LOCK();
832 for (p = all_pipes; p ; p = p->next )
833 if (p->ifp == ifp)
834 break ;
835 if (p == NULL) {
836 for (p = all_pipes; p ; p = p->next )
837 if (!strcmp(p->if_name, ifp->if_xname) ) {
838 p->ifp = ifp ;
839 DPRINTF(("dummynet: ++ tx rdy from %s (now found)\n",
840 ifp->if_xname));
841 break ;
842 }
843 }
844 if (p != NULL) {
845 DPRINTF(("dummynet: ++ tx rdy from %s - qlen %d\n", ifp->if_xname,
846 ifp->if_snd.ifq_len));
847 p->numbytes = 0 ; /* mark ready for I/O */
848 ready_event_wfq(p);
849 }
850 DUMMYNET_UNLOCK();
851
852 return 0;
853 }
854
855 /*
856 * Unconditionally expire empty queues in case of shortage.
857 * Returns the number of queues freed.
858 */
static int
expire_queues(struct dn_flow_set *fs)
{
    struct dn_flow_queue *q, *prev ;
    int i, initial_elements = fs->rq_elements ;

    /* rate-limit: at most one sweep per second for each flow_set */
    if (fs->last_expired == time_second)
	return 0 ;
    fs->last_expired = time_second ;
    for (i = 0 ; i <= fs->rq_size ; i++) /* last one is overflow */
	for (prev=NULL, q = fs->rq[i] ; q != NULL ; )
	    if (q->head != NULL || q->S != q->F+1) {
		/* busy (pkts queued) or in a heap (S != F+1): keep it */
		prev = q ;
		q = q->next ;
	    } else { /* entry is idle, expire it */
		struct dn_flow_queue *old_q = q ;

		/* unlink from the hash chain, then free */
		if (prev != NULL)
		    prev->next = q = q->next ;
		else
		    fs->rq[i] = q = q->next ;
		fs->rq_elements-- ;
		free(old_q, M_DUMMYNET);
	    }
    return initial_elements - fs->rq_elements ;
}
885
886 /*
887 * If room, create a new queue and put at head of slot i;
888 * otherwise, create or use the default queue.
889 */
890 static struct dn_flow_queue *
891 create_queue(struct dn_flow_set *fs, int i)
892 {
893 struct dn_flow_queue *q ;
894
895 if (fs->rq_elements > fs->rq_size * dn_max_ratio &&
896 expire_queues(fs) == 0) {
897 /*
898 * No way to get room, use or create overflow queue.
899 */
900 i = fs->rq_size ;
901 if ( fs->rq[i] != NULL )
902 return fs->rq[i] ;
903 }
904 q = malloc(sizeof(*q), M_DUMMYNET, M_NOWAIT | M_ZERO);
905 if (q == NULL) {
906 printf("dummynet: sorry, cannot allocate queue for new flow\n");
907 return NULL ;
908 }
909 q->fs = fs ;
910 q->hash_slot = i ;
911 q->next = fs->rq[i] ;
912 q->S = q->F + 1; /* hack - mark timestamp as invalid */
913 fs->rq[i] = q ;
914 fs->rq_elements++ ;
915 return q ;
916 }
917
918 /*
919 * Given a flow_set and a pkt in last_pkt, find a matching queue
920 * after appropriate masking. The queue is moved to front
921 * so that further searches take less time.
922 */
static struct dn_flow_queue *
find_queue(struct dn_flow_set *fs, struct ipfw_flow_id *id)
{
    int i = 0 ; /* we need i and q for new allocations */
    struct dn_flow_queue *q, *prev;
    int is_v6 = IS_IP6_FLOW_ID(id);

    if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) )
	q = fs->rq[0] ;	/* no mask: all traffic shares the single queue */
    else {
	/* first, do the masking, then hash */
	id->dst_port &= fs->flow_mask.dst_port ;
	id->src_port &= fs->flow_mask.src_port ;
	id->proto &= fs->flow_mask.proto ;
	id->flags = 0 ; /* we don't care about this one */
	if (is_v6) {
	    APPLY_MASK(&id->dst_ip6, &fs->flow_mask.dst_ip6);
	    APPLY_MASK(&id->src_ip6, &fs->flow_mask.src_ip6);
	    id->flow_id6 &= fs->flow_mask.flow_id6;

	    /* XOR-fold the masked v6 addresses, ports, proto and flow id */
	    i = ((id->dst_ip6.__u6_addr.__u6_addr32[0]) & 0xffff)^
		((id->dst_ip6.__u6_addr.__u6_addr32[1]) & 0xffff)^
		((id->dst_ip6.__u6_addr.__u6_addr32[2]) & 0xffff)^
		((id->dst_ip6.__u6_addr.__u6_addr32[3]) & 0xffff)^

		((id->dst_ip6.__u6_addr.__u6_addr32[0] >> 15) & 0xffff)^
		((id->dst_ip6.__u6_addr.__u6_addr32[1] >> 15) & 0xffff)^
		((id->dst_ip6.__u6_addr.__u6_addr32[2] >> 15) & 0xffff)^
		((id->dst_ip6.__u6_addr.__u6_addr32[3] >> 15) & 0xffff)^

		((id->src_ip6.__u6_addr.__u6_addr32[0] << 1) & 0xfffff)^
		((id->src_ip6.__u6_addr.__u6_addr32[1] << 1) & 0xfffff)^
		((id->src_ip6.__u6_addr.__u6_addr32[2] << 1) & 0xfffff)^
		((id->src_ip6.__u6_addr.__u6_addr32[3] << 1) & 0xfffff)^

		((id->src_ip6.__u6_addr.__u6_addr32[0] << 16) & 0xffff)^
		((id->src_ip6.__u6_addr.__u6_addr32[1] << 16) & 0xffff)^
		((id->src_ip6.__u6_addr.__u6_addr32[2] << 16) & 0xffff)^
		((id->src_ip6.__u6_addr.__u6_addr32[3] << 16) & 0xffff)^

		(id->dst_port << 1) ^ (id->src_port) ^
		(id->proto ) ^
		(id->flow_id6);
	} else {
	    id->dst_ip &= fs->flow_mask.dst_ip ;
	    id->src_ip &= fs->flow_mask.src_ip ;

	    /* XOR-fold the masked v4 addresses, ports and proto */
	    i = ( (id->dst_ip) & 0xffff ) ^
		( (id->dst_ip >> 15) & 0xffff ) ^
		( (id->src_ip << 1) & 0xffff ) ^
		( (id->src_ip >> 16 ) & 0xffff ) ^
		(id->dst_port << 1) ^ (id->src_port) ^
		(id->proto );
	}
	i = i % fs->rq_size ;
	/* finally, scan the current list for a match */
	searches++ ;
	for (prev=NULL, q = fs->rq[i] ; q ; ) {
	    search_steps++;
	    if (is_v6 &&
		IN6_ARE_ADDR_EQUAL(&id->dst_ip6,&q->id.dst_ip6) &&
		IN6_ARE_ADDR_EQUAL(&id->src_ip6,&q->id.src_ip6) &&
		id->dst_port == q->id.dst_port &&
		id->src_port == q->id.src_port &&
		id->proto == q->id.proto &&
		id->flags == q->id.flags &&
		id->flow_id6 == q->id.flow_id6)
		break ; /* found */

	    if (!is_v6 && id->dst_ip == q->id.dst_ip &&
		id->src_ip == q->id.src_ip &&
		id->dst_port == q->id.dst_port &&
		id->src_port == q->id.src_port &&
		id->proto == q->id.proto &&
		id->flags == q->id.flags)
		break ; /* found */

	    /* No match. Check if we can expire the entry */
	    if (pipe_expire && q->head == NULL && q->S == q->F+1 ) {
		/* entry is idle and not in any heap, expire it */
		struct dn_flow_queue *old_q = q ;

		if (prev != NULL)
		    prev->next = q = q->next ;
		else
		    fs->rq[i] = q = q->next ;
		fs->rq_elements-- ;
		free(old_q, M_DUMMYNET);
		continue ;
	    }
	    prev = q ;
	    q = q->next ;
	}
	if (q && prev != NULL) { /* found and not in front */
	    /* move-to-front so future lookups of this flow are faster */
	    prev->next = q->next ;
	    q->next = fs->rq[i] ;
	    fs->rq[i] = q ;
	}
    }
    if (q == NULL) { /* no match, need to allocate a new entry */
	q = create_queue(fs, i);
	if (q != NULL)
	    q->id = *id ;
    }
    return q ;
}
1029
/*
 * Decide whether an arriving packet of 'len' bytes must be dropped by
 * RED (or Gentle RED) on queue 'q' of flow_set 'fs'.
 * Updates the running average queue size (q->avg, fixed-point) and the
 * inter-drop packet counter (q->count).
 * Returns 1 to drop the packet, 0 to accept it.
 */
static int
red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len)
{
    /*
     * RED algorithm
     *
     * RED calculates the average queue size (avg) using a low-pass filter
     * with an exponential weighted (w_q) moving average:
     * 	avg  <-  (1-w_q) * avg + w_q * q_size
     * where q_size is the queue length (measured in bytes or * packets).
     *
     * If q_size == 0, we compute the idle time for the link, and set
     *	avg = (1 - w_q)^(idle/s)
     * where s is the time needed for transmitting a medium-sized packet.
     *
     * Now, if avg < min_th the packet is enqueued.
     * If avg > max_th the packet is dropped. Otherwise, the packet is
     * dropped with probability P function of avg.
     *
     */

    int64_t p_b = 0;
    /* queue in bytes or packets ? */
    u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ? q->len_bytes : q->len;

    DPRINTF(("\ndummynet: %d q: %2u ", (int) curr_time, q_size));

    /* average queue size estimation */
    if (q_size != 0) {
	/*
	 * queue is not empty, avg <- avg + (q_size - avg) * w_q
	 */
	int diff = SCALE(q_size) - q->avg;
	int64_t v = SCALE_MUL((int64_t) diff, (int64_t) fs->w_q);

	q->avg += (int) v;
    } else {
	/*
	 * queue is empty, find for how long the queue has been
	 * empty and use a lookup table for computing
	 * (1 - * w_q)^(idle_time/s) where s is the time to send a
	 * (small) packet.
	 * XXX check wraps...
	 */
	if (q->avg) {
	    u_int t = (curr_time - q->q_time) / fs->lookup_step;

	    q->avg = (t < fs->lookup_depth) ?
		    SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
	}
    }
    DPRINTF(("dummynet: avg: %u ", SCALE_VAL(q->avg)));

    /* should i drop ? */

    if (q->avg < fs->min_th) {
	/* below min_th: always accept, reset the inter-drop counter */
	q->count = -1;
	return 0; /* accept packet ; */
    }
    if (q->avg >= fs->max_th) { /* average queue >= max threshold */
	if (fs->flags_fs & DN_IS_GENTLE_RED) {
	    /*
	     * According to Gentle-RED, if avg is greater than max_th the
	     * packet is dropped with a probability
	     *	 p_b = c_3 * avg - c_4
	     * where c_3 = (1 - max_p) / max_th, and c_4 = 1 - 2 * max_p
	     */
	    p_b = SCALE_MUL((int64_t) fs->c_3, (int64_t) q->avg) - fs->c_4;
	} else {
	    /* plain RED: unconditional drop above max_th */
	    q->count = -1;
	    DPRINTF(("dummynet: - drop"));
	    return 1 ;
	}
    } else if (q->avg > fs->min_th) {
	/*
	 * we compute p_b using the linear dropping function p_b = c_1 *
	 * avg - c_2, where c_1 = max_p / (max_th - min_th), and c_2 =
	 * max_p * min_th / (max_th - min_th)
	 */
	p_b = SCALE_MUL((int64_t) fs->c_1, (int64_t) q->avg) - fs->c_2;
    }
    /* byte-based queues: scale drop probability by the packet size */
    if (fs->flags_fs & DN_QSIZE_IS_BYTES)
	p_b = (p_b * len) / fs->max_pkt_size;
    if (++q->count == 0)
	q->random = random() & 0xffff;
    else {
	/*
	 * q->count counts packets arrived since last drop, so a greater
	 * value of q->count means a greater packet drop probability.
	 */
	if (SCALE_MUL(p_b, SCALE((int64_t) q->count)) > q->random) {
	    q->count = 0;
	    DPRINTF(("dummynet: - red drop"));
	    /* after a drop we calculate a new random value */
	    q->random = random() & 0xffff;
	    return 1;    /* drop */
	}
    }
    /* end of RED algorithm */
    return 0 ; /* accept */
}
1131
/*
 * Return the flow_set that a pipe/queue rule refers to, looking it up
 * by number (pipe_nr) in the global lists.  The result is cached back
 * into the rule's ipfw_insn_pipe so subsequent packets skip the list
 * scan; flush_pipe_ptrs() (called on delete/flush) invalidates the
 * cache.  Returns NULL if no matching pipe or queue exists.
 */
static __inline
struct dn_flow_set *
locate_flowset(int pipe_nr, struct ip_fw *rule)
{
    struct dn_flow_set *fs;
    ipfw_insn *cmd = ACTION_PTR(rule);

    /* skip an O_LOG instruction preceding the pipe/queue action */
    if (cmd->opcode == O_LOG)
	cmd += F_LEN(cmd);
#ifdef __i386__
    fs = ((ipfw_insn_pipe *)cmd)->pipe_ptr;
#else
    /* the pointer may be misaligned inside the instruction stream,
     * hence the bcopy instead of a direct load */
    bcopy(& ((ipfw_insn_pipe *)cmd)->pipe_ptr, &fs, sizeof(fs));
#endif

    if (fs != NULL)
	return fs;	/* still valid from a previous lookup */

    if (cmd->opcode == O_QUEUE)
	/* O_QUEUE: a stand-alone flow_set (WF2Q queue) */
	for (fs=all_flow_sets; fs && fs->fs_nr != pipe_nr; fs=fs->next)
	    ;
    else {
	/* O_PIPE: use the flow_set embedded in the pipe */
	struct dn_pipe *p1;
	for (p1 = all_pipes; p1 && p1->pipe_nr != pipe_nr; p1 = p1->next)
	    ;
	if (p1 != NULL)
	    fs = &(p1->fs) ;
    }
    /* record for the future */
#ifdef __i386__
    ((ipfw_insn_pipe *)cmd)->pipe_ptr = fs;
#else
    bcopy(&fs, & ((ipfw_insn_pipe *)cmd)->pipe_ptr, sizeof(fs));
#endif
    return fs ;
}
1168
1169 /*
1170 * dummynet hook for packets. Below 'pipe' is a pipe or a queue
1171 * depending on whether WF2Q or fixed bw is used.
1172 *
1173 * pipe_nr pipe or queue the packet is destined for.
1174 * dir where shall we send the packet after dummynet.
1175 * m the mbuf with the packet
1176 * ifp the 'ifp' parameter from the caller.
1177 * NULL in ip_input, destination interface in ip_output,
1178 * real_dst in bdg_forward
1179 * rule matching rule, in case of multiple passes
1180 * flags flags from the caller, only used in ip_output
1181 *
1182 */
1183 static int
1184 dummynet_io(struct mbuf *m, int dir, struct ip_fw_args *fwa)
1185 {
1186 struct dn_pkt_tag *pkt;
1187 struct m_tag *mtag;
1188 struct dn_flow_set *fs;
1189 struct dn_pipe *pipe ;
1190 u_int64_t len = m->m_pkthdr.len ;
1191 struct dn_flow_queue *q = NULL ;
1192 int is_pipe;
1193 ipfw_insn *cmd = ACTION_PTR(fwa->rule);
1194
1195 KASSERT(m->m_nextpkt == NULL,
1196 ("dummynet_io: mbuf queue passed to dummynet"));
1197
1198 if (cmd->opcode == O_LOG)
1199 cmd += F_LEN(cmd);
1200 is_pipe = (cmd->opcode == O_PIPE);
1201
1202 DUMMYNET_LOCK();
1203 /*
1204 * This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule.
1205 */
1206 fs = locate_flowset(fwa->cookie, fwa->rule);
1207 if (fs == NULL)
1208 goto dropit ; /* this queue/pipe does not exist! */
1209 pipe = fs->pipe ;
1210 if (pipe == NULL) { /* must be a queue, try find a matching pipe */
1211 for (pipe = all_pipes; pipe && pipe->pipe_nr != fs->parent_nr;
1212 pipe = pipe->next)
1213 ;
1214 if (pipe != NULL)
1215 fs->pipe = pipe ;
1216 else {
1217 printf("dummynet: no pipe %d for queue %d, drop pkt\n",
1218 fs->parent_nr, fs->fs_nr);
1219 goto dropit ;
1220 }
1221 }
1222 q = find_queue(fs, &(fwa->f_id));
1223 if ( q == NULL )
1224 goto dropit ; /* cannot allocate queue */
1225 /*
1226 * update statistics, then check reasons to drop pkt
1227 */
1228 q->tot_bytes += len ;
1229 q->tot_pkts++ ;
1230 if ( fs->plr && random() < fs->plr )
1231 goto dropit ; /* random pkt drop */
1232 if ( fs->flags_fs & DN_QSIZE_IS_BYTES) {
1233 if (q->len_bytes > fs->qsize)
1234 goto dropit ; /* queue size overflow */
1235 } else {
1236 if (q->len >= fs->qsize)
1237 goto dropit ; /* queue count overflow */
1238 }
1239 if ( fs->flags_fs & DN_IS_RED && red_drops(fs, q, len) )
1240 goto dropit ;
1241
1242 /* XXX expensive to zero, see if we can remove it*/
1243 mtag = m_tag_get(PACKET_TAG_DUMMYNET,
1244 sizeof(struct dn_pkt_tag), M_NOWAIT|M_ZERO);
1245 if ( mtag == NULL )
1246 goto dropit ; /* cannot allocate packet header */
1247 m_tag_prepend(m, mtag); /* attach to mbuf chain */
1248
1249 pkt = (struct dn_pkt_tag *)(mtag+1);
1250 /* ok, i can handle the pkt now... */
1251 /* build and enqueue packet + parameters */
1252 pkt->rule = fwa->rule ;
1253 pkt->dn_dir = dir ;
1254
1255 pkt->ifp = fwa->oif;
1256 if (dir == DN_TO_IP_OUT || dir == DN_TO_IP6_OUT)
1257 pkt->flags = fwa->flags;
1258
1259 if (q->head == NULL)
1260 q->head = m;
1261 else
1262 q->tail->m_nextpkt = m;
1263 q->tail = m;
1264 q->len++;
1265 q->len_bytes += len ;
1266
1267 if ( q->head != m ) /* flow was not idle, we are done */
1268 goto done;
1269 /*
1270 * If we reach this point the flow was previously idle, so we need
1271 * to schedule it. This involves different actions for fixed-rate or
1272 * WF2Q queues.
1273 */
1274 if (is_pipe) {
1275 /*
1276 * Fixed-rate queue: just insert into the ready_heap.
1277 */
1278 dn_key t = 0 ;
1279 if (pipe->bandwidth)
1280 t = SET_TICKS(m, q, pipe);
1281 q->sched_time = curr_time ;
1282 if (t == 0) /* must process it now */
1283 ready_event( q );
1284 else
1285 heap_insert(&ready_heap, curr_time + t , q );
1286 } else {
1287 /*
1288 * WF2Q. First, compute start time S: if the flow was idle (S=F+1)
1289 * set S to the virtual time V for the controlling pipe, and update
1290 * the sum of weights for the pipe; otherwise, remove flow from
1291 * idle_heap and set S to max(F,V).
1292 * Second, compute finish time F = S + len/weight.
1293 * Third, if pipe was idle, update V=max(S, V).
1294 * Fourth, count one more backlogged flow.
1295 */
1296 if (DN_KEY_GT(q->S, q->F)) { /* means timestamps are invalid */
1297 q->S = pipe->V ;
1298 pipe->sum += fs->weight ; /* add weight of new queue */
1299 } else {
1300 heap_extract(&(pipe->idle_heap), q);
1301 q->S = MAX64(q->F, pipe->V ) ;
1302 }
1303 q->F = q->S + ( len<<MY_M )/(u_int64_t) fs->weight;
1304
1305 if (pipe->not_eligible_heap.elements == 0 &&
1306 pipe->scheduler_heap.elements == 0)
1307 pipe->V = MAX64 ( q->S, pipe->V );
1308 fs->backlogged++ ;
1309 /*
1310 * Look at eligibility. A flow is not eligibile if S>V (when
1311 * this happens, it means that there is some other flow already
1312 * scheduled for the same pipe, so the scheduler_heap cannot be
1313 * empty). If the flow is not eligible we just store it in the
1314 * not_eligible_heap. Otherwise, we store in the scheduler_heap
1315 * and possibly invoke ready_event_wfq() right now if there is
1316 * leftover credit.
1317 * Note that for all flows in scheduler_heap (SCH), S_i <= V,
1318 * and for all flows in not_eligible_heap (NEH), S_i > V .
1319 * So when we need to compute max( V, min(S_i) ) forall i in SCH+NEH,
1320 * we only need to look into NEH.
1321 */
1322 if (DN_KEY_GT(q->S, pipe->V) ) { /* not eligible */
1323 if (pipe->scheduler_heap.elements == 0)
1324 printf("dummynet: ++ ouch! not eligible but empty scheduler!\n");
1325 heap_insert(&(pipe->not_eligible_heap), q->S, q);
1326 } else {
1327 heap_insert(&(pipe->scheduler_heap), q->F, q);
1328 if (pipe->numbytes >= 0) { /* pipe is idle */
1329 if (pipe->scheduler_heap.elements != 1)
1330 printf("dummynet: OUCH! pipe should have been idle!\n");
1331 DPRINTF(("dummynet: waking up pipe %d at %d\n",
1332 pipe->pipe_nr, (int)(q->F >> MY_M)));
1333 pipe->sched_time = curr_time ;
1334 ready_event_wfq(pipe);
1335 }
1336 }
1337 }
1338 done:
1339 DUMMYNET_UNLOCK();
1340 return 0;
1341
1342 dropit:
1343 if (q)
1344 q->drops++ ;
1345 DUMMYNET_UNLOCK();
1346 m_freem(m);
1347 return ( (fs && (fs->flags_fs & DN_NOERROR)) ? 0 : ENOBUFS);
1348 }
1349
1350 /*
1351 * Below, the rt_unref is only needed when (pkt->dn_dir == DN_TO_IP_OUT)
1352 * Doing this would probably save us the initial bzero of dn_pkt
1353 */
1354 #define DN_FREE_PKT(_m) do { \
1355 m_freem(_m); \
1356 } while (0)
1357
1358 /*
1359 * Dispose all packets and flow_queues on a flow_set.
1360 * If all=1, also remove red lookup table and other storage,
1361 * including the descriptor itself.
1362 * For the one in dn_pipe MUST also cleanup ready_heap...
1363 */
1364 static void
1365 purge_flow_set(struct dn_flow_set *fs, int all)
1366 {
1367 struct dn_flow_queue *q, *qn ;
1368 int i ;
1369
1370 DUMMYNET_LOCK_ASSERT();
1371
1372 for (i = 0 ; i <= fs->rq_size ; i++ ) {
1373 for (q = fs->rq[i] ; q ; q = qn ) {
1374 struct mbuf *m, *mnext;
1375
1376 mnext = q->head;
1377 while ((m = mnext) != NULL) {
1378 mnext = m->m_nextpkt;
1379 DN_FREE_PKT(m);
1380 }
1381 qn = q->next ;
1382 free(q, M_DUMMYNET);
1383 }
1384 fs->rq[i] = NULL ;
1385 }
1386 fs->rq_elements = 0 ;
1387 if (all) {
1388 /* RED - free lookup table */
1389 if (fs->w_q_lookup)
1390 free(fs->w_q_lookup, M_DUMMYNET);
1391 if (fs->rq)
1392 free(fs->rq, M_DUMMYNET);
1393 /* if this fs is not part of a pipe, free it */
1394 if (fs->pipe && fs != &(fs->pipe->fs) )
1395 free(fs, M_DUMMYNET);
1396 }
1397 }
1398
1399 /*
1400 * Dispose all packets queued on a pipe (not a flow_set).
1401 * Also free all resources associated to a pipe, which is about
1402 * to be deleted.
1403 */
1404 static void
1405 purge_pipe(struct dn_pipe *pipe)
1406 {
1407 struct mbuf *m, *mnext;
1408
1409 purge_flow_set( &(pipe->fs), 1 );
1410
1411 mnext = pipe->head;
1412 while ((m = mnext) != NULL) {
1413 mnext = m->m_nextpkt;
1414 DN_FREE_PKT(m);
1415 }
1416
1417 heap_free( &(pipe->scheduler_heap) );
1418 heap_free( &(pipe->not_eligible_heap) );
1419 heap_free( &(pipe->idle_heap) );
1420 }
1421
1422 /*
1423 * Delete all pipes and heaps returning memory. Must also
1424 * remove references from all ipfw rules to all pipes.
1425 */
1426 static void
1427 dummynet_flush(void)
1428 {
1429 struct dn_pipe *curr_p, *p ;
1430 struct dn_flow_set *fs, *curr_fs;
1431
1432 DUMMYNET_LOCK();
1433 /* remove all references to pipes ...*/
1434 flush_pipe_ptrs(NULL);
1435 /* prevent future matches... */
1436 p = all_pipes ;
1437 all_pipes = NULL ;
1438 fs = all_flow_sets ;
1439 all_flow_sets = NULL ;
1440 /* and free heaps so we don't have unwanted events */
1441 heap_free(&ready_heap);
1442 heap_free(&wfq_ready_heap);
1443 heap_free(&extract_heap);
1444
1445 /*
1446 * Now purge all queued pkts and delete all pipes
1447 */
1448 /* scan and purge all flow_sets. */
1449 for ( ; fs ; ) {
1450 curr_fs = fs ;
1451 fs = fs->next ;
1452 purge_flow_set(curr_fs, 1);
1453 }
1454 for ( ; p ; ) {
1455 purge_pipe(p);
1456 curr_p = p ;
1457 p = p->next ;
1458 free(curr_p, M_DUMMYNET);
1459 }
1460 DUMMYNET_UNLOCK();
1461 }
1462
1463
1464 extern struct ip_fw *ip_fw_default_rule ;
1465 static void
1466 dn_rule_delete_fs(struct dn_flow_set *fs, void *r)
1467 {
1468 int i ;
1469 struct dn_flow_queue *q ;
1470 struct mbuf *m ;
1471
1472 for (i = 0 ; i <= fs->rq_size ; i++) /* last one is ovflow */
1473 for (q = fs->rq[i] ; q ; q = q->next )
1474 for (m = q->head ; m ; m = m->m_nextpkt ) {
1475 struct dn_pkt_tag *pkt = dn_tag_get(m) ;
1476 if (pkt->rule == r)
1477 pkt->rule = ip_fw_default_rule ;
1478 }
1479 }
1480 /*
1481 * when a firewall rule is deleted, scan all queues and remove the flow-id
1482 * from packets matching this rule.
1483 */
1484 void
1485 dn_rule_delete(void *r)
1486 {
1487 struct dn_pipe *p ;
1488 struct dn_flow_set *fs ;
1489 struct dn_pkt_tag *pkt ;
1490 struct mbuf *m ;
1491
1492 DUMMYNET_LOCK();
1493 /*
1494 * If the rule references a queue (dn_flow_set), then scan
1495 * the flow set, otherwise scan pipes. Should do either, but doing
1496 * both does not harm.
1497 */
1498 for ( fs = all_flow_sets ; fs ; fs = fs->next )
1499 dn_rule_delete_fs(fs, r);
1500 for ( p = all_pipes ; p ; p = p->next ) {
1501 fs = &(p->fs) ;
1502 dn_rule_delete_fs(fs, r);
1503 for (m = p->head ; m ; m = m->m_nextpkt ) {
1504 pkt = dn_tag_get(m) ;
1505 if (pkt->rule == r)
1506 pkt->rule = ip_fw_default_rule ;
1507 }
1508 }
1509 DUMMYNET_UNLOCK();
1510 }
1511
1512 /*
1513 * setup RED parameters
1514 */
1515 static int
1516 config_red(struct dn_flow_set *p, struct dn_flow_set * x)
1517 {
1518 int i;
1519
1520 x->w_q = p->w_q;
1521 x->min_th = SCALE(p->min_th);
1522 x->max_th = SCALE(p->max_th);
1523 x->max_p = p->max_p;
1524
1525 x->c_1 = p->max_p / (p->max_th - p->min_th);
1526 x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th));
1527 if (x->flags_fs & DN_IS_GENTLE_RED) {
1528 x->c_3 = (SCALE(1) - p->max_p) / p->max_th;
1529 x->c_4 = (SCALE(1) - 2 * p->max_p);
1530 }
1531
1532 /* if the lookup table already exist, free and create it again */
1533 if (x->w_q_lookup) {
1534 free(x->w_q_lookup, M_DUMMYNET);
1535 x->w_q_lookup = NULL ;
1536 }
1537 if (red_lookup_depth == 0) {
1538 printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth must be > 0\n");
1539 free(x, M_DUMMYNET);
1540 return EINVAL;
1541 }
1542 x->lookup_depth = red_lookup_depth;
1543 x->w_q_lookup = (u_int *) malloc(x->lookup_depth * sizeof(int),
1544 M_DUMMYNET, M_NOWAIT);
1545 if (x->w_q_lookup == NULL) {
1546 printf("dummynet: sorry, cannot allocate red lookup table\n");
1547 free(x, M_DUMMYNET);
1548 return ENOSPC;
1549 }
1550
1551 /* fill the lookup table with (1 - w_q)^x */
1552 x->lookup_step = p->lookup_step ;
1553 x->lookup_weight = p->lookup_weight ;
1554 x->w_q_lookup[0] = SCALE(1) - x->w_q;
1555 for (i = 1; i < x->lookup_depth; i++)
1556 x->w_q_lookup[i] = SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight);
1557 if (red_avg_pkt_size < 1)
1558 red_avg_pkt_size = 512 ;
1559 x->avg_pkt_size = red_avg_pkt_size ;
1560 if (red_max_pkt_size < 1)
1561 red_max_pkt_size = 1500 ;
1562 x->max_pkt_size = red_max_pkt_size ;
1563 return 0 ;
1564 }
1565
1566 static int
1567 alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs)
1568 {
1569 if (x->flags_fs & DN_HAVE_FLOW_MASK) { /* allocate some slots */
1570 int l = pfs->rq_size;
1571
1572 if (l == 0)
1573 l = dn_hash_size;
1574 if (l < 4)
1575 l = 4;
1576 else if (l > DN_MAX_HASH_SIZE)
1577 l = DN_MAX_HASH_SIZE;
1578 x->rq_size = l;
1579 } else /* one is enough for null mask */
1580 x->rq_size = 1;
1581 x->rq = malloc((1 + x->rq_size) * sizeof(struct dn_flow_queue *),
1582 M_DUMMYNET, M_NOWAIT | M_ZERO);
1583 if (x->rq == NULL) {
1584 printf("dummynet: sorry, cannot allocate queue\n");
1585 return ENOSPC;
1586 }
1587 x->rq_elements = 0;
1588 return 0 ;
1589 }
1590
1591 static void
1592 set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src)
1593 {
1594 x->flags_fs = src->flags_fs;
1595 x->qsize = src->qsize;
1596 x->plr = src->plr;
1597 x->flow_mask = src->flow_mask;
1598 if (x->flags_fs & DN_QSIZE_IS_BYTES) {
1599 if (x->qsize > 1024*1024)
1600 x->qsize = 1024*1024 ;
1601 } else {
1602 if (x->qsize == 0)
1603 x->qsize = 50 ;
1604 if (x->qsize > 100)
1605 x->qsize = 50 ;
1606 }
1607 /* configuring RED */
1608 if ( x->flags_fs & DN_IS_RED )
1609 config_red(src, x) ; /* XXX should check errors */
1610 }
1611
1612 /*
1613 * setup pipe or queue parameters.
1614 */
1615
1616 static int
1617 config_pipe(struct dn_pipe *p)
1618 {
1619 int i, r;
1620 struct dn_flow_set *pfs = &(p->fs);
1621 struct dn_flow_queue *q;
1622
1623 /*
1624 * The config program passes parameters as follows:
1625 * bw = bits/second (0 means no limits),
1626 * delay = ms, must be translated into ticks.
1627 * qsize = slots/bytes
1628 */
1629 p->delay = ( p->delay * hz ) / 1000 ;
1630 /* We need either a pipe number or a flow_set number */
1631 if (p->pipe_nr == 0 && pfs->fs_nr == 0)
1632 return EINVAL ;
1633 if (p->pipe_nr != 0 && pfs->fs_nr != 0)
1634 return EINVAL ;
1635 if (p->pipe_nr != 0) { /* this is a pipe */
1636 struct dn_pipe *x, *a, *b;
1637
1638 DUMMYNET_LOCK();
1639 /* locate pipe */
1640 for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ;
1641 a = b , b = b->next) ;
1642
1643 if (b == NULL || b->pipe_nr != p->pipe_nr) { /* new pipe */
1644 x = malloc(sizeof(struct dn_pipe), M_DUMMYNET, M_NOWAIT | M_ZERO);
1645 if (x == NULL) {
1646 DUMMYNET_UNLOCK();
1647 printf("dummynet: no memory for new pipe\n");
1648 return ENOSPC;
1649 }
1650 x->pipe_nr = p->pipe_nr;
1651 x->fs.pipe = x ;
1652 /* idle_heap is the only one from which we extract from the middle.
1653 */
1654 x->idle_heap.size = x->idle_heap.elements = 0 ;
1655 x->idle_heap.offset=OFFSET_OF(struct dn_flow_queue, heap_pos);
1656 } else {
1657 x = b;
1658 /* Flush accumulated credit for all queues */
1659 for (i = 0; i <= x->fs.rq_size; i++)
1660 for (q = x->fs.rq[i]; q; q = q->next)
1661 q->numbytes = 0;
1662 }
1663
1664 x->bandwidth = p->bandwidth ;
1665 x->numbytes = 0; /* just in case... */
1666 bcopy(p->if_name, x->if_name, sizeof(p->if_name) );
1667 x->ifp = NULL ; /* reset interface ptr */
1668 x->delay = p->delay ;
1669 set_fs_parms(&(x->fs), pfs);
1670
1671
1672 if ( x->fs.rq == NULL ) { /* a new pipe */
1673 r = alloc_hash(&(x->fs), pfs) ;
1674 if (r) {
1675 DUMMYNET_UNLOCK();
1676 free(x, M_DUMMYNET);
1677 return r ;
1678 }
1679 x->next = b ;
1680 if (a == NULL)
1681 all_pipes = x ;
1682 else
1683 a->next = x ;
1684 }
1685 DUMMYNET_UNLOCK();
1686 } else { /* config queue */
1687 struct dn_flow_set *x, *a, *b ;
1688
1689 DUMMYNET_LOCK();
1690 /* locate flow_set */
1691 for (a=NULL, b=all_flow_sets ; b && b->fs_nr < pfs->fs_nr ;
1692 a = b , b = b->next) ;
1693
1694 if (b == NULL || b->fs_nr != pfs->fs_nr) { /* new */
1695 if (pfs->parent_nr == 0) { /* need link to a pipe */
1696 DUMMYNET_UNLOCK();
1697 return EINVAL ;
1698 }
1699 x = malloc(sizeof(struct dn_flow_set), M_DUMMYNET, M_NOWAIT|M_ZERO);
1700 if (x == NULL) {
1701 DUMMYNET_UNLOCK();
1702 printf("dummynet: no memory for new flow_set\n");
1703 return ENOSPC;
1704 }
1705 x->fs_nr = pfs->fs_nr;
1706 x->parent_nr = pfs->parent_nr;
1707 x->weight = pfs->weight ;
1708 if (x->weight == 0)
1709 x->weight = 1 ;
1710 else if (x->weight > 100)
1711 x->weight = 100 ;
1712 } else {
1713 /* Change parent pipe not allowed; must delete and recreate */
1714 if (pfs->parent_nr != 0 && b->parent_nr != pfs->parent_nr) {
1715 DUMMYNET_UNLOCK();
1716 return EINVAL ;
1717 }
1718 x = b;
1719 }
1720 set_fs_parms(x, pfs);
1721
1722 if ( x->rq == NULL ) { /* a new flow_set */
1723 r = alloc_hash(x, pfs) ;
1724 if (r) {
1725 DUMMYNET_UNLOCK();
1726 free(x, M_DUMMYNET);
1727 return r ;
1728 }
1729 x->next = b;
1730 if (a == NULL)
1731 all_flow_sets = x;
1732 else
1733 a->next = x;
1734 }
1735 DUMMYNET_UNLOCK();
1736 }
1737 return 0 ;
1738 }
1739
1740 /*
1741 * Helper function to remove from a heap queues which are linked to
1742 * a flow_set about to be deleted.
1743 */
1744 static void
1745 fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs)
1746 {
1747 int i = 0, found = 0 ;
1748 for (; i < h->elements ;)
1749 if ( ((struct dn_flow_queue *)h->p[i].object)->fs == fs) {
1750 h->elements-- ;
1751 h->p[i] = h->p[h->elements] ;
1752 found++ ;
1753 } else
1754 i++ ;
1755 if (found)
1756 heapify(h);
1757 }
1758
1759 /*
1760 * helper function to remove a pipe from a heap (can be there at most once)
1761 */
1762 static void
1763 pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p)
1764 {
1765 if (h->elements > 0) {
1766 int i = 0 ;
1767 for (i=0; i < h->elements ; i++ ) {
1768 if (h->p[i].object == p) { /* found it */
1769 h->elements-- ;
1770 h->p[i] = h->p[h->elements] ;
1771 heapify(h);
1772 break ;
1773 }
1774 }
1775 }
1776 }
1777
1778 /*
1779 * drain all queues. Called in case of severe mbuf shortage.
1780 */
1781 void
1782 dummynet_drain()
1783 {
1784 struct dn_flow_set *fs;
1785 struct dn_pipe *p;
1786 struct mbuf *m, *mnext;
1787
1788 DUMMYNET_LOCK_ASSERT();
1789
1790 heap_free(&ready_heap);
1791 heap_free(&wfq_ready_heap);
1792 heap_free(&extract_heap);
1793 /* remove all references to this pipe from flow_sets */
1794 for (fs = all_flow_sets; fs; fs= fs->next )
1795 purge_flow_set(fs, 0);
1796
1797 for (p = all_pipes; p; p= p->next ) {
1798 purge_flow_set(&(p->fs), 0);
1799
1800 mnext = p->head;
1801 while ((m = mnext) != NULL) {
1802 mnext = m->m_nextpkt;
1803 DN_FREE_PKT(m);
1804 }
1805 p->head = p->tail = NULL ;
1806 }
1807 }
1808
1809 /*
1810 * Fully delete a pipe or a queue, cleaning up associated info.
1811 */
1812 static int
1813 delete_pipe(struct dn_pipe *p)
1814 {
1815 if (p->pipe_nr == 0 && p->fs.fs_nr == 0)
1816 return EINVAL ;
1817 if (p->pipe_nr != 0 && p->fs.fs_nr != 0)
1818 return EINVAL ;
1819 if (p->pipe_nr != 0) { /* this is an old-style pipe */
1820 struct dn_pipe *a, *b;
1821 struct dn_flow_set *fs;
1822
1823 DUMMYNET_LOCK();
1824 /* locate pipe */
1825 for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ;
1826 a = b , b = b->next) ;
1827 if (b == NULL || (b->pipe_nr != p->pipe_nr) ) {
1828 DUMMYNET_UNLOCK();
1829 return EINVAL ; /* not found */
1830 }
1831
1832 /* unlink from list of pipes */
1833 if (a == NULL)
1834 all_pipes = b->next ;
1835 else
1836 a->next = b->next ;
1837 /* remove references to this pipe from the ip_fw rules. */
1838 flush_pipe_ptrs(&(b->fs));
1839
1840 /* remove all references to this pipe from flow_sets */
1841 for (fs = all_flow_sets; fs; fs= fs->next )
1842 if (fs->pipe == b) {
1843 printf("dummynet: ++ ref to pipe %d from fs %d\n",
1844 p->pipe_nr, fs->fs_nr);
1845 fs->pipe = NULL ;
1846 purge_flow_set(fs, 0);
1847 }
1848 fs_remove_from_heap(&ready_heap, &(b->fs));
1849 purge_pipe(b); /* remove all data associated to this pipe */
1850 /* remove reference to here from extract_heap and wfq_ready_heap */
1851 pipe_remove_from_heap(&extract_heap, b);
1852 pipe_remove_from_heap(&wfq_ready_heap, b);
1853 DUMMYNET_UNLOCK();
1854
1855 free(b, M_DUMMYNET);
1856 } else { /* this is a WF2Q queue (dn_flow_set) */
1857 struct dn_flow_set *a, *b;
1858
1859 DUMMYNET_LOCK();
1860 /* locate set */
1861 for (a = NULL, b = all_flow_sets ; b && b->fs_nr < p->fs.fs_nr ;
1862 a = b , b = b->next) ;
1863 if (b == NULL || (b->fs_nr != p->fs.fs_nr) ) {
1864 DUMMYNET_UNLOCK();
1865 return EINVAL ; /* not found */
1866 }
1867
1868 if (a == NULL)
1869 all_flow_sets = b->next ;
1870 else
1871 a->next = b->next ;
1872 /* remove references to this flow_set from the ip_fw rules. */
1873 flush_pipe_ptrs(b);
1874
1875 if (b->pipe != NULL) {
1876 /* Update total weight on parent pipe and cleanup parent heaps */
1877 b->pipe->sum -= b->weight * b->backlogged ;
1878 fs_remove_from_heap(&(b->pipe->not_eligible_heap), b);
1879 fs_remove_from_heap(&(b->pipe->scheduler_heap), b);
1880 #if 1 /* XXX should i remove from idle_heap as well ? */
1881 fs_remove_from_heap(&(b->pipe->idle_heap), b);
1882 #endif
1883 }
1884 purge_flow_set(b, 1);
1885 DUMMYNET_UNLOCK();
1886 }
1887 return 0 ;
1888 }
1889
1890 /*
1891 * helper function used to copy data from kernel in DUMMYNET_GET
1892 */
1893 static char *
1894 dn_copy_set(struct dn_flow_set *set, char *bp)
1895 {
1896 int i, copied = 0 ;
1897 struct dn_flow_queue *q, *qp = (struct dn_flow_queue *)bp;
1898
1899 DUMMYNET_LOCK_ASSERT();
1900
1901 for (i = 0 ; i <= set->rq_size ; i++)
1902 for (q = set->rq[i] ; q ; q = q->next, qp++ ) {
1903 if (q->hash_slot != i)
1904 printf("dummynet: ++ at %d: wrong slot (have %d, "
1905 "should be %d)\n", copied, q->hash_slot, i);
1906 if (q->fs != set)
1907 printf("dummynet: ++ at %d: wrong fs ptr (have %p, should be %p)\n",
1908 i, q->fs, set);
1909 copied++ ;
1910 bcopy(q, qp, sizeof( *q ) );
1911 /* cleanup pointers */
1912 qp->next = NULL ;
1913 qp->head = qp->tail = NULL ;
1914 qp->fs = NULL ;
1915 }
1916 if (copied != set->rq_elements)
1917 printf("dummynet: ++ wrong count, have %d should be %d\n",
1918 copied, set->rq_elements);
1919 return (char *)qp ;
1920 }
1921
1922 static size_t
1923 dn_calc_size(void)
1924 {
1925 struct dn_flow_set *set ;
1926 struct dn_pipe *p ;
1927 size_t size ;
1928
1929 DUMMYNET_LOCK_ASSERT();
1930 /*
1931 * compute size of data structures: list of pipes and flow_sets.
1932 */
1933 for (p = all_pipes, size = 0 ; p ; p = p->next )
1934 size += sizeof( *p ) +
1935 p->fs.rq_elements * sizeof(struct dn_flow_queue);
1936 for (set = all_flow_sets ; set ; set = set->next )
1937 size += sizeof ( *set ) +
1938 set->rq_elements * sizeof(struct dn_flow_queue);
1939 return size ;
1940 }
1941
/*
 * Handler for IP_DUMMYNET_GET: snapshot all pipes and flow_sets (each
 * followed by its queue records) into a malloc'ed buffer and copy it
 * out to userland.  The buffer is sized under the lock, allocated with
 * M_WAITOK outside it, and the sizing is retried (up to 10 times) if
 * the topology changed while the lock was dropped.
 */
static int
dummynet_get(struct sockopt *sopt)
{
    char *buf, *bp ; /* bp is the "copy-pointer" */
    size_t size ;
    struct dn_flow_set *set ;
    struct dn_pipe *p ;
    int error=0, i ;

    /* XXX lock held too long */
    DUMMYNET_LOCK();
    /*
     * XXX: Ugly, but we need to allocate memory with M_WAITOK flag and we
     * cannot use this flag while holding a mutex.
     */
    for (i = 0; i < 10; i++) {
	size = dn_calc_size();
	DUMMYNET_UNLOCK();
	buf = malloc(size, M_TEMP, M_WAITOK);
	DUMMYNET_LOCK();
	/* size unchanged: the snapshot will fit, keep this buffer */
	if (size == dn_calc_size())
	    break;
	free(buf, M_TEMP);
	buf = NULL;
    }
    if (buf == NULL) {
	DUMMYNET_UNLOCK();
	return ENOBUFS ;
    }
    for (p = all_pipes, bp = buf ; p ; p = p->next ) {
	struct dn_pipe *pipe_bp = (struct dn_pipe *)bp ;

	/*
	 * copy pipe descriptor into *bp, convert delay back to ms,
	 * then copy the flow_set descriptor(s) one at a time.
	 * After each flow_set, copy the queue descriptor it owns.
	 */
	bcopy(p, bp, sizeof( *p ) );
	pipe_bp->delay = (pipe_bp->delay * 1000) / hz ;
	/*
	 * XXX the following is a hack based on ->next being the
	 * first field in dn_pipe and dn_flow_set. The correct
	 * solution would be to move the dn_flow_set to the beginning
	 * of struct dn_pipe.
	 */
	pipe_bp->next = (struct dn_pipe *)DN_IS_PIPE ;
	/* clean pointers */
	pipe_bp->head = pipe_bp->tail = NULL ;
	pipe_bp->fs.next = NULL ;
	pipe_bp->fs.pipe = NULL ;
	pipe_bp->fs.rq = NULL ;

	bp += sizeof( *p ) ;
	bp = dn_copy_set( &(p->fs), bp );
    }
    for (set = all_flow_sets ; set ; set = set->next ) {
	struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp ;
	bcopy(set, bp, sizeof( *set ) );
	/* XXX same hack as above */
	fs_bp->next = (struct dn_flow_set *)DN_IS_QUEUE ;
	fs_bp->pipe = NULL ;
	fs_bp->rq = NULL ;
	bp += sizeof( *set ) ;
	bp = dn_copy_set( set, bp );
    }
    DUMMYNET_UNLOCK();

    error = sooptcopyout(sopt, buf, size);
    free(buf, M_TEMP);
    return error ;
}
2013
2014 /*
2015 * Handler for the various dummynet socket options (get, flush, config, del)
2016 */
2017 static int
2018 ip_dn_ctl(struct sockopt *sopt)
2019 {
2020 int error = 0 ;
2021 struct dn_pipe *p, tmp_pipe;
2022
2023 /* Disallow sets in really-really secure mode. */
2024 if (sopt->sopt_dir == SOPT_SET) {
2025 #if __FreeBSD_version >= 500034
2026 error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
2027 if (error)
2028 return (error);
2029 #else
2030 if (securelevel >= 3)
2031 return (EPERM);
2032 #endif
2033 }
2034
2035 switch (sopt->sopt_name) {
2036 default :
2037 printf("dummynet: -- unknown option %d", sopt->sopt_name);
2038 return EINVAL ;
2039
2040 case IP_DUMMYNET_GET :
2041 error = dummynet_get(sopt);
2042 break ;
2043
2044 case IP_DUMMYNET_FLUSH :
2045 dummynet_flush() ;
2046 break ;
2047
2048 case IP_DUMMYNET_CONFIGURE :
2049 p = &tmp_pipe ;
2050 error = sooptcopyin(sopt, p, sizeof *p, sizeof *p);
2051 if (error)
2052 break ;
2053 error = config_pipe(p);
2054 break ;
2055
2056 case IP_DUMMYNET_DEL : /* remove a pipe or queue */
2057 p = &tmp_pipe ;
2058 error = sooptcopyin(sopt, p, sizeof *p, sizeof *p);
2059 if (error)
2060 break ;
2061
2062 error = delete_pipe(p);
2063 break ;
2064 }
2065 return error ;
2066 }
2067
2068 static void
2069 ip_dn_init(void)
2070 {
2071 if (bootverbose)
2072 printf("DUMMYNET with IPv6 initialized (040826)\n");
2073
2074 DUMMYNET_LOCK_INIT();
2075
2076 all_pipes = NULL ;
2077 all_flow_sets = NULL ;
2078 ready_heap.size = ready_heap.elements = 0 ;
2079 ready_heap.offset = 0 ;
2080
2081 wfq_ready_heap.size = wfq_ready_heap.elements = 0 ;
2082 wfq_ready_heap.offset = 0 ;
2083
2084 extract_heap.size = extract_heap.elements = 0 ;
2085 extract_heap.offset = 0 ;
2086
2087 ip_dn_ctl_ptr = ip_dn_ctl;
2088 ip_dn_io_ptr = dummynet_io;
2089 ip_dn_ruledel_ptr = dn_rule_delete;
2090
2091 callout_init(&dn_timeout, NET_CALLOUT_MPSAFE);
2092 callout_reset(&dn_timeout, 1, dummynet, NULL);
2093 }
2094
#ifdef KLD_MODULE
/*
 * Module unload: unhook from ipfw, make sure the scheduler callout has
 * finished running, then release all pipes/queues and the lock.
 *
 * Fix: use callout_drain() instead of callout_stop().  callout_stop()
 * can return while the dummynet() handler is still executing on
 * another CPU; it would then operate on state freed by
 * dummynet_flush() and on the destroyed lock.  callout_drain() waits
 * for the handler to complete before returning.
 */
static void
ip_dn_destroy(void)
{
    ip_dn_ctl_ptr = NULL;
    ip_dn_io_ptr = NULL;
    ip_dn_ruledel_ptr = NULL;

    callout_drain(&dn_timeout);
    dummynet_flush();

    DUMMYNET_LOCK_DESTROY();
}
#endif /* KLD_MODULE */
2109
2110 static int
2111 dummynet_modevent(module_t mod, int type, void *data)
2112 {
2113 switch (type) {
2114 case MOD_LOAD:
2115 if (DUMMYNET_LOADED) {
2116 printf("DUMMYNET already loaded\n");
2117 return EEXIST ;
2118 }
2119 ip_dn_init();
2120 break;
2121
2122 case MOD_UNLOAD:
2123 #if !defined(KLD_MODULE)
2124 printf("dummynet statically compiled, cannot unload\n");
2125 return EINVAL ;
2126 #else
2127 ip_dn_destroy();
2128 #endif
2129 break ;
2130 default:
2131 return EOPNOTSUPP;
2132 break ;
2133 }
2134 return 0 ;
2135 }
2136
/* Module registration: name, event handler, no extra argument. */
static moduledata_t dummynet_mod = {
	"dummynet",
	dummynet_modevent,
	NULL
};
DECLARE_MODULE(dummynet, dummynet_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
/* dummynet requires ipfw version 2 to be loaded first */
MODULE_DEPEND(dummynet, ipfw, 2, 2, 2);
MODULE_VERSION(dummynet, 1);
/* Cache object: 893baa22b5fe103c6cc8cb182631625f */