The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/netpfil/ipfw/ip_dn_private.h

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa
    5  * All rights reserved
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  * 1. Redistributions of source code must retain the above copyright
   11  *    notice, this list of conditions and the following disclaimer.
   12  * 2. Redistributions in binary form must reproduce the above copyright
   13  *    notice, this list of conditions and the following disclaimer in the
   14  *    documentation and/or other materials provided with the distribution.
   15  *
   16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   26  * SUCH DAMAGE.
   27  */
   28 
   29 /*
   30  * internal dummynet APIs.
   31  *
   32  * $FreeBSD$
   33  */
   34 
   35 #ifndef _IP_DN_PRIVATE_H
   36 #define _IP_DN_PRIVATE_H
   37 
   38 /* Debugging support (skipped as a whole when D is already defined).
   39  * Use ND() to remove debugging (like D1() it expands to an empty statement),
   40  * D() to print one line prefixed by the enclosing function's name, and
   41  * DX(lev, ...) to print only when V_dn_cfg.debug > lev.
   42  * If you redefine D() you are expected to redefine all. */
   43 #ifndef D
   44 #define ND(fmt, ...) do {} while (0)
   45 #define D1(fmt, ...) do {} while (0)
   46 #define D(fmt, ...) printf("%-10s " fmt "\n",      \
   47         __FUNCTION__, ## __VA_ARGS__)     /* name left-justified in 10 columns, newline appended */
   48 #define DX(lev, fmt, ...) do {              \
   49         if (V_dn_cfg.debug > lev) D(fmt, ## __VA_ARGS__); } while (0)     /* NOTE(review): lev is expanded unparenthesized */
   50 #endif /* !D */
   51 
   52 MALLOC_DECLARE(M_DUMMYNET);     /* declares the M_DUMMYNET malloc(9) type; its definition is not in this header */
   53 
   54 #ifndef __linux__       /* div64() is only defined here for non-Linux builds */
   55 #define div64(a, b)  ((int64_t)(a) / (int64_t)(b))        /* signed 64-bit division; b == 0 is not guarded */
   56 #endif /* !__linux__ */
   57 
   58 #define DN_LOCK_INIT() do {     /* initialize both mutexes of V_dn_cfg */  \
   59         mtx_init(&V_dn_cfg.uh_mtx, "dn_uh", NULL, MTX_DEF);     \
   60         mtx_init(&V_dn_cfg.bh_mtx, "dn_bh", NULL, MTX_DEF);     \
   61         } while (0)
   62 #define DN_LOCK_DESTROY() do {  /* counterpart of DN_LOCK_INIT(): destroy both mutexes */ \
   63         mtx_destroy(&V_dn_cfg.uh_mtx);                  \
   64         mtx_destroy(&V_dn_cfg.bh_mtx);                  \
   65         } while (0)
   66 #if 0 /* not used yet: read and write variants would both take the same uh_mtx mutex */
   67 #define DN_UH_RLOCK()           mtx_lock(&V_dn_cfg.uh_mtx)
   68 #define DN_UH_RUNLOCK()         mtx_unlock(&V_dn_cfg.uh_mtx)
   69 #define DN_UH_WLOCK()           mtx_lock(&V_dn_cfg.uh_mtx)
   70 #define DN_UH_WUNLOCK()         mtx_unlock(&V_dn_cfg.uh_mtx)
   71 #define DN_UH_LOCK_ASSERT()     mtx_assert(&V_dn_cfg.uh_mtx, MA_OWNED)
   72 #endif /* 0 */
   73 
   74 #define DN_BH_RLOCK()           mtx_lock(&V_dn_cfg.uh_mtx)      /* NOTE(review): every DN_BH_* macro operates on uh_mtx, not bh_mtx -- confirm this is intentional */
   75 #define DN_BH_RUNLOCK()         mtx_unlock(&V_dn_cfg.uh_mtx)
   76 #define DN_BH_WLOCK()           mtx_lock(&V_dn_cfg.uh_mtx)      /* identical to DN_BH_RLOCK(): no separate reader/writer mode */
   77 #define DN_BH_WUNLOCK()         mtx_unlock(&V_dn_cfg.uh_mtx)
   78 #define DN_BH_LOCK_ASSERT()     mtx_assert(&V_dn_cfg.uh_mtx, MA_OWNED)  /* asserts the mutex is owned by the caller */
   79 
   80 SLIST_HEAD(dn_fsk_head, dn_fsk);        /* struct dn_fsk_head: head of a singly-linked list of struct dn_fsk */
   81 
   82 struct mq {     /* a basic queue of packets */
   83         struct mbuf *head, *tail;       /* first and last packet; mq_append() chains them through m_nextpkt */
   84         int count;                      /* incremented by mq_append() for every packet added */
   85 };
   86 
   87 static inline void
   88 set_oid(struct dn_id *o, int type, int len)
   89 {       /* Fill in the object header *o: store type and len, reset subtype. */
   90         o->type = type;
   91         o->len = len;
   92         o->subtype = 0; /* always cleared, whatever it held before */
   93 }
   94 
   95 /*
   96  * Configuration and data for a dummynet instance (reached as V_dn_cfg).
   97  *
   98  * When a configuration is modified from userland, 'id' is incremented
   99  * so we can use the value to check for stale pointers.
  100  */
  101 struct dn_parms {
  102         uint32_t        id;             /* configuration version */
  103 
  104         /* defaults (sysctl-accessible) */
  105         int     red_lookup_depth;       /* RED defaults; presumably seed the matching struct dn_fsk fields -- confirm */
  106         int     red_avg_pkt_size;
  107         int     red_max_pkt_size;
  108         int     hash_size;              /* number of buckets of fshash and schedhash (see below) */
  109         int     max_hash_size;          /* presumably the upper bound accepted for hash_size -- confirm */
  110         long    byte_limit;             /* max queue sizes */
  111         long    slot_limit;             /* (same, counted in slots) */
  112 
  113         int     io_fast;                /* NOTE(review): undocumented here; cf. the io_pkt_fast counter below */
  114         int     debug;                  /* verbosity level compared against 'lev' by DX() */
  115 
  116         /* timekeeping */
  117         struct timeval prev_t;          /* last time dummynet_tick ran */
  118         struct dn_heap  evheap;         /* scheduled events */
  119 
  120         long    tick_last;              /* Last tick duration (usec). */
  121         long    tick_delta;             /* Last vs standard tick diff (usec). */
  122         long    tick_delta_sum; /* Accumulated tick difference (usec).*/
  123         long    tick_adjustment;        /* Tick adjustments done. */
  124         long    tick_lost;              /* Lost(coalesced) ticks number. */
  125         /* Adjusted vs non-adjusted curr_time difference (ticks). */
  126         long    tick_diff;
  127 
  128         /* counters of objects -- used for reporting space */
  129         int     schk_count;             /* presumably struct dn_schk objects */
  130         int     si_count;               /* presumably struct dn_sch_inst objects */
  131         int     fsk_count;              /* presumably struct dn_fsk objects */
  132         int     queue_count;            /* presumably struct dn_queue objects */
  133 
  134         /* packet counters */
  135         unsigned long   io_pkt;
  136         unsigned long   io_pkt_fast;
  137         unsigned long   io_pkt_drop;
  138 
  139         /* ticks and other stuff */
  140         uint64_t        curr_time;
  141         /* flowsets and schedulers are in hash tables, with 'hash_size'
  142          * buckets. fshash is looked up at every packet arrival
  143          * so better be generous if we expect many entries.
  144          */
  145         struct dn_ht    *fshash;
  146         struct dn_ht    *schedhash;
  147         /* list of flowsets without a scheduler -- use sch_chain */
  148         struct dn_fsk_head      fsu;    /* list of unlinked flowsets */
  149 
  150         /* Store the fs/sch to scan when draining. The value is the
  151          * bucket number of the hash table. Expire can be disabled
  152          * with net.inet.ip.dummynet.expire=0, or it happens every
  153          * expire ticks.
  154          **/
  155         int drain_fs;           /* next fshash bucket to scan */
  156         int drain_sch;          /* next schedhash bucket to scan */
  157         uint32_t expire;        /* expire period in ticks, 0 = disabled (see above) */
  158         uint32_t expire_cycle;  /* tick count */
  159 
  160         int init_done;          /* presumably non-zero once the instance has been initialized -- confirm */
  161 
  162 #ifdef _KERNEL
  163         /*
  164          * This file is normally used in the kernel, unless we do
  165          * some userland tests, in which case we do not need a mtx.
  166          * uh_mtx arbitrates between system calls and also
  167          * protects fshash, schedhash and fsu (the unlinked flowsets).
  168          * These structures are readonly for the lower half.
  169          * bh_mtx protects all other structures which may be
  170          * modified upon packet arrivals.  NOTE(review): the DN_BH_*() macros in this header lock uh_mtx, not bh_mtx.
  171          */
  172 #if defined( __linux__ ) || defined( _WIN32 )
  173         spinlock_t uh_mtx;
  174         spinlock_t bh_mtx;
  175 #else
  176         struct mtx uh_mtx;
  177         struct mtx bh_mtx;
  178 #endif /* __linux__ || _WIN32 */
  179 
  180 #endif /* _KERNEL */
  181 };
  182 
  183 /*
  184  * Delay line, contains all packets on output from a link.
  185  * Every scheduler instance has one (the 'dline' member of struct dn_sch_inst).
  186  */
  187 struct delay_line {
  188         struct dn_id oid;               /* object header */
  189         struct dn_sch_inst *si;         /* presumably the scheduler instance owning this delay line -- confirm */
  190         struct mq mq;                   /* the packets held on the delay line */
  191 };
  192 
  193 /*
  194  * The kernel side of a flowset. It is linked in a hash table
  195  * of flowsets, and in a list of children of their parent scheduler.
  196  * qht is either the queue or (if HAVE_MASK) a hash table of queues.
  197  * Note that the mask to use is the (flow_mask|sched_mask), which
  198  * changes as we attach/detach schedulers. So we store it here.
  199  *
  200  * XXX If we want to add scheduler-specific parameters, we need to
  201  * put them in external storage because the scheduler may not be
  202  * available when the fsk is created.
  203  */
  204 struct dn_fsk { /* kernel side of a flowset */
  205         struct dn_fs fs;                /* flowset configuration; holds the original (unscaled) RED values */
  206         SLIST_ENTRY(dn_fsk) fsk_next;   /* hash chain for fshash */
  207 
  208         struct ipfw_flow_id fsk_mask;   /* the (flow_mask|sched_mask) described above */
  209 
  210         /* qht is a hash table of queues, or just a single queue
  211          * a bit in fs.flags tells us which one
  212          */
  213         struct dn_ht    *qht;
  214         struct dn_schk *sched;          /* Sched we are linked to */
  215         SLIST_ENTRY(dn_fsk) sch_chain;  /* list of fsk attached to sched */
  216 
  217         /* bucket index used by drain routine to drain queues for this
  218          * flowset
  219          */
  220         int drain_bucket;
  221         /* Parameters related to RED / GRED */
  222         /* original values are in dn_fs */
  223         int w_q ;               /* queue weight (scaled) */
  224         int max_th ;            /* maximum threshold for queue (scaled) */
  225         int min_th ;            /* minimum threshold for queue (scaled) */
  226         int max_p ;             /* maximum value for p_b (scaled) */
  227 
  228         u_int c_1 ;             /* max_p/(max_th-min_th) (scaled) */
  229         u_int c_2 ;             /* max_p*min_th/(max_th-min_th) (scaled) */
  230         u_int c_3 ;             /* for GRED, (1-max_p)/max_th (scaled) */
  231         u_int c_4 ;             /* for GRED, 1 - 2*max_p (scaled) */
  232         u_int * w_q_lookup ;    /* lookup table for computing (1-w_q)^t */
  233         u_int lookup_depth ;    /* depth of lookup table */
  234         int lookup_step ;       /* granularity inside the lookup table */
  235         int lookup_weight ;     /* equal to (1-w_q)^t / (1-w_q)^(t+1) */
  236         int avg_pkt_size ;      /* average packet size */
  237         int max_pkt_size ;      /* max packet size */
  238 #ifdef NEW_AQM
  239         struct dn_aqm *aqmfp;   /* Pointer to AQM functions */
  240         void *aqmcfg;   /* configuration parameters for AQM */
  241 #endif /* NEW_AQM */
  242 };
  243 
  244 /*
  245  * A queue is created as a child of a flowset unless it belongs to
  246  * a !MULTIQUEUE scheduler. It is normally in a hash table in the
  247  * flowset. fs always points to the parent flowset.
  248  * _si normally points to the sch_inst, unless the flowset has been
  249  * detached from the scheduler -- in this case _si == NULL and we
  250  * should not enqueue.
  251  */
  252 struct dn_queue {
  253         struct dn_flow ni;      /* oid, flow_id, stats */
  254         struct mq mq;   /* packets queue */
  255         struct dn_sch_inst *_si;        /* owner scheduler instance */
  256         SLIST_ENTRY(dn_queue) q_next; /* hash chain list for qht */
  257         struct dn_fsk *fs;              /* parent flowset. */
  258 
  259         /* RED parameters */
  260         int avg;                /* average queue length est. (scaled) */
  261         int count;              /* arrivals since last RED drop */
  262         int random;             /* random value (scaled) */
  263         uint64_t q_time;        /* start of queue idle time */
  264 #ifdef NEW_AQM
  265         void *aqm_status;       /* per-queue status variables */
  266 #endif /* NEW_AQM */
  267 
  268 };
  269 
  270 /*
  271  * The kernel side of a scheduler. Contains the userland config,
  272  * a link, pointer to extra config arguments from command line,
  273  * kernel flags, and a pointer to the scheduler methods.
  274  * It is stored in a hash table, and holds a list of all
  275  * flowsets and scheduler instances.
  276  * XXX sch must be at the beginning, see schk_hash().
  277  */
  278 struct dn_schk {
  279         struct dn_sch sch;      /* userland config; must stay the first member (see above) */
  280         struct dn_alg *fp;      /* Pointer to scheduler functions */
  281         struct dn_link link;    /* The link, embedded */
  282         struct dn_profile *profile; /* delay profile, if any */
  283         struct dn_id *cfg;      /* extra config arguments */
  284 
  285         SLIST_ENTRY(dn_schk) schk_next;  /* hash chain for schedhash */
  286 
  287         struct dn_fsk_head fsk_list;  /* all fsk linked to me */
  288         struct dn_fsk *fs;      /* Flowset for !MULTIQUEUE */
  289 
  290         /* bucket index used by the drain routine to drain the scheduler
  291          * instance for this flowset.
  292          */
  293         int drain_bucket;
  294 
  295         /* Hash table of all instances (through sch.sched_mask)
  296          * or single instance if no mask. Always valid.
  297          */
  298         struct dn_ht    *siht;
  299 };
  300 
  301 /*
  302  * Scheduler instance.
  303  * Contains variables and all queues relative to this instance.
  304  * This struct is created at runtime.
  305  */
  306 struct dn_sch_inst {
  307         struct dn_flow  ni;     /* oid, flowid and stats */
  308         SLIST_ENTRY(dn_sch_inst) si_next; /* hash chain for siht */
  309         struct delay_line dline;        /* packets on output from the link */
  310         struct dn_schk *sched;  /* the template */
  311         int             kflags; /* kernel-side flags, e.g. DN_ACTIVE (object is in evheap) */
  312 
  313         int64_t credit;         /* bits I can transmit (more or less). */
  314         uint64_t sched_time;    /* time link was scheduled in ready_heap */
  315         uint64_t idle_time;     /* start of scheduler instance idle time */
  316 
  317         /* q_count is the number of queues that this instance is using.
  318          * The counter is incremented or decremented when
  319          * a reference from the queue is created or deleted.
  320          * It is used to make sure that a scheduler instance can be safely
  321          * deleted by the drain routine. See notes below.
  322          */
  323         int q_count;
  324 
  325 };
  326 
  327 /*
  328  * NOTE about object drain.
  329  * The system will automatically (XXX check when) drain queues and
  330  * scheduler instances when they are idle.
  331  * A queue is idle when it has no packets; an instance is idle when
  332  * it is not in the evheap heap, and the corresponding delay line is empty.
  333  * A queue can be safely deleted when it is idle because the scheduler
  334  * function xxx_free_queue() will remove any references to it.
  335  * An instance can be only deleted when no queues reference it. To be sure
  336  * of that, a counter (q_count) stores the number of queues that are pointing
  337  * to the instance.
  338  *
  339  * XXX
  340  * Order of scan:
  341  * - take all flowset in a bucket for the flowset hash table
  342  * - take all queues in a bucket for the flowset
  343  * - increment the queue bucket
  344  * - scan next flowset bucket
  345  * Nothing is done if a bucket contains no entries.
  346  *
  347  * The same schema is used for scheduler instances
  348  */
  349 
  350 /* kernel-side flags (bit values). Linux has DN_DELETE in fcntl.h
  351  */
  352 enum {
  353         /* 1 and 2 are reserved for the SCAN flags */
  354         DN_DESTROY      = 0x0004, /* destroy */
  355         DN_DELETE_FS    = 0x0008, /* destroy flowset */
  356         DN_DETACH       = 0x0010, /* NOTE(review): undocumented; presumably detach from the scheduler -- confirm */
  357         DN_ACTIVE       = 0x0020, /* object is in evheap */
  358         DN_F_DLINE      = 0x0040, /* object is a delay line */
  359         DN_DEL_SAFE     = 0x0080, /* delete a queue only if no longer needed
  360                                    * by scheduler */
  361         DN_QHT_IS_Q     = 0x0100, /* in flowset, qht is a single queue */
  362 };
  363 
  364 /*
  365  * Packets processed by dummynet have an mbuf tag associated with
  366  * them that carries their dummynet state.
  367  * Outside dummynet, only the 'rule' field is relevant, and it must
  368  * be at the beginning of the structure.
  369  */
  370 struct dn_pkt_tag {
  371         struct ipfw_rule_ref rule;      /* matching rule        */
  372 
  373         /* second part, dummynet specific */
  374         int dn_dir;             /* action when packet comes out.*/
  375                                 /* see ip_fw_private.h          */
  376         uint64_t output_time;   /* when the pkt is due for delivery*/
  377         uint16_t if_index;      /* presumably the index of the interface involved -- confirm */
  378         uint16_t if_idxgen;     /* presumably the generation paired with if_index -- confirm */
  379         struct _ip6dn_args ip6opt;      /* XXX ipv6 options     */
  380         uint16_t iphdr_off;     /* IP header offset for mtodo() */
  381 };
  382 
  383 /*
  384  * Possible values for dn_dir. XXXGL: this needs to be reviewed
  385  * and converted to same values ip_fw_args.flags use.
  386  */
  387 enum {
  388         DIR_OUT =       0,      /* DIR_* are plain values 0..3, i.e. they fit in the two low bits */
  389         DIR_IN =        1,
  390         DIR_FWD =       2,
  391         DIR_DROP =      3,
  392         PROTO_LAYER2 =  0x4, /* set for layer 2 */
  393         PROTO_IPV4 =    0x08,
  394         PROTO_IPV6 =    0x10,
  395         PROTO_IFB =     0x0c, /* layer2 + ifbridge; same bits as PROTO_LAYER2 | PROTO_IPV4 */
  396 };
  397 
  398 //extern struct dn_parms V_dn_cfg;
  399 VNET_DECLARE(struct dn_parms, dn_cfg);  /* declares dn_cfg as a VNET (per network stack) variable -- see vnet(9) */
  400 #define V_dn_cfg        VNET(dn_cfg)    /* the name all macros in this header use to reach the configuration */
  401 
  402 int dummynet_io(struct mbuf **, struct ip_fw_args *);     /* declarations only: all of these are implemented outside this header */
  403 void dummynet_sched_lock(void);
  404 void dummynet_sched_unlock(void);
  405 void dummynet_task(void *context, int pending);
  406 void dn_reschedule(void);
  407 struct dn_pkt_tag * dn_tag_get(struct mbuf *m);   /* presumably returns the dummynet tag attached to m -- confirm */
  408 
  409 struct dn_queue *ipdn_q_find(struct dn_fsk *, struct dn_sch_inst *,       /* presumably looks up the queue of a flow id -- confirm */
  410         struct ipfw_flow_id *);
  411 struct dn_sch_inst *ipdn_si_find(struct dn_schk *, struct ipfw_flow_id *);        /* presumably looks up the instance of a flow id -- confirm */
  412 
  413 /*
  414  * copy_range is a template for requests for ranges of pipes/queues/scheds.
  415  * The number of ranges is variable and can be derived by o.len.
  416  * As a default, we use a small number of entries so that the struct
  417  * fits easily on the stack and is sufficient for most common requests.
  418  */
  419 #define DEFAULT_RANGES  5       /* ranges that fit in the default struct copy_range */
  420 struct copy_range {
  421         struct dn_id o;         /* object header; o.len tells how many ranges follow */
  422         uint32_t        r[ 2 * DEFAULT_RANGES ];  /* two values per range (presumably start and end -- confirm) */
  423 };
  424 
  425 struct copy_args {      /* NOTE(review): argument bundle for the copy helpers declared below; field semantics are not visible in this header */
  426         char **start;   /* presumably the current write position in the output buffer -- confirm */
  427         char *end;      /* presumably the end of the output buffer -- confirm */
  428         int flags;
  429         int type;
  430         struct copy_range *extra;       /* extra filtering */
  431 };
  432 
  433 struct sockopt;         /* forward declaration: only pointers to it are used below */
  434 int ip_dummynet_compat(struct sockopt *sopt);
  435 int dummynet_get(struct sockopt *sopt, void **compat);
  436 int dn_c_copy_q (void *_ni, void *arg);
  437 int dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq);
  438 int dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq);
  439 int dn_compat_copy_queue(struct copy_args *a, void *_o);
  440 int dn_compat_copy_pipe(struct copy_args *a, void *_o);
  441 int copy_data_helper_compat(void *_o, void *_arg);
  442 int dn_compat_calc_size(void);
  443 int do_config(void *p, size_t l);       /* presumably p is a configuration buffer of l bytes -- confirm */
  444 
  445 /* functions to drain idle objects (see the "NOTE about object drain" in this file) */
  446 void dn_drain_scheduler(void);
  447 void dn_drain_queue(void);
  448 
  449 #ifdef NEW_AQM  /* everything down to the matching #endif exists only for NEW_AQM builds */
  450 int ecn_mark(struct mbuf* m);   /* implemented elsewhere; presumably ECN-marks packet m -- confirm return convention */
  451 
  452 /* Append packet m to the tail of q; moved from ip_dn_io.c to here to be available for AQM modules. */
  453 static inline void
  454 mq_append(struct mq *q, struct mbuf *m)
  455 {
  456 #ifdef USERSPACE
  457         // buffers from netmap need to be copied
  458         // XXX note that the routine is not expected to fail
  459         ND("append %p to %p", m, q);
  460         if (m->m_flags & M_STACK) {     /* only M_STACK mbufs are copied; the others are queued as they are */
  461                 struct mbuf *m_new;
  462                 void *p;
  463                 int l, ofs;
  464 
  465                 ofs = m->m_data - m->__m_extbuf;        /* offset of the data inside the source buffer */
  466                 // XXX allocate
  467                 MGETHDR(m_new, M_NOWAIT, MT_DATA);      /* NOTE(review): m_new is dereferenced below without a NULL check */
  468                 ND("*** WARNING, volatile buf %p ext %p %d dofs %d m_new %p",
  469                         m, m->__m_extbuf, m->__m_extlen, ofs, m_new);
  470                 p = m_new->__m_extbuf;  /* new pointer */
  471                 l = m_new->__m_extlen;  /* new len */
  472                 if (l <= m->__m_extlen) {       /* panics unless the new buffer is strictly larger than the source */
  473                         panic("extlen too large");
  474                 }
  475 
  476                 *m_new = *m;    // copy the whole mbuf; this also overwrites m_new's own buffer pointer
  477                 m_new->m_flags &= ~M_STACK;
  478                 m_new->__m_extbuf = p; // point to new buffer
  479                 _pkt_copy(m->__m_extbuf, p, m->__m_extlen);
  480                 m_new->m_data = p + ofs;        /* keep the same data offset in the new buffer */
  481                 m = m_new;                      /* from here on the copy is what gets queued */
  482         }
  483 #endif /* USERSPACE */
  484         if (q->head == NULL)            /* empty queue: m becomes the head */
  485                 q->head = m;
  486         else                            /* otherwise link m after the current tail */
  487                 q->tail->m_nextpkt = m;
  488         q->count++;
  489         q->tail = m;
  490         m->m_nextpkt = NULL;            /* m is now the last packet of the chain */
  491 }
  492 #endif /* NEW_AQM */
  493 
  494 #endif /* _IP_DN_PRIVATE_H */

Cache object: 0d669da1d68c698711698da0b8f24902


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.