The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/altq/altq_red.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*      $NetBSD: altq_red.c,v 1.28 2008/06/18 09:06:27 yamt Exp $       */
    2 /*      $KAME: altq_red.c,v 1.20 2005/04/13 03:44:25 suz Exp $  */
    3 
    4 /*
    5  * Copyright (C) 1997-2003
    6  *      Sony Computer Science Laboratories Inc.  All rights reserved.
    7  *
    8  * Redistribution and use in source and binary forms, with or without
    9  * modification, are permitted provided that the following conditions
   10  * are met:
   11  * 1. Redistributions of source code must retain the above copyright
   12  *    notice, this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in the
   15  *    documentation and/or other materials provided with the distribution.
   16  *
   17  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
   18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   20  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
   21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   27  * SUCH DAMAGE.
   28  *
   29  */
   30 /*
   31  * Copyright (c) 1990-1994 Regents of the University of California.
   32  * All rights reserved.
   33  *
   34  * Redistribution and use in source and binary forms, with or without
   35  * modification, are permitted provided that the following conditions
   36  * are met:
   37  * 1. Redistributions of source code must retain the above copyright
   38  *    notice, this list of conditions and the following disclaimer.
   39  * 2. Redistributions in binary form must reproduce the above copyright
   40  *    notice, this list of conditions and the following disclaimer in the
   41  *    documentation and/or other materials provided with the distribution.
   42  * 3. All advertising materials mentioning features or use of this software
   43  *    must display the following acknowledgement:
   44  *      This product includes software developed by the Computer Systems
   45  *      Engineering Group at Lawrence Berkeley Laboratory.
   46  * 4. Neither the name of the University nor of the Laboratory may be used
   47  *    to endorse or promote products derived from this software without
   48  *    specific prior written permission.
   49  *
   50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   60  * SUCH DAMAGE.
   61  */
   62 
   63 #include <sys/cdefs.h>
   64 __KERNEL_RCSID(0, "$NetBSD: altq_red.c,v 1.28 2008/06/18 09:06:27 yamt Exp $");
   65 
   66 #ifdef _KERNEL_OPT
   67 #include "opt_altq.h"
   68 #include "opt_inet.h"
   69 #include "pf.h"
   70 #endif
   71 
   72 #ifdef ALTQ_RED /* red is enabled by ALTQ_RED option in opt_altq.h */
   73 
   74 #include <sys/param.h>
   75 #include <sys/malloc.h>
   76 #include <sys/mbuf.h>
   77 #include <sys/socket.h>
   78 #include <sys/systm.h>
   79 #include <sys/errno.h>
   80 #include <sys/kauth.h>
   81 #if 1 /* ALTQ3_COMPAT */
   82 #include <sys/sockio.h>
   83 #include <sys/proc.h>
   84 #include <sys/kernel.h>
   85 #ifdef ALTQ_FLOWVALVE
   86 #include <sys/queue.h>
   87 #include <sys/time.h>
   88 #endif
   89 #endif /* ALTQ3_COMPAT */
   90 
   91 #include <net/if.h>
   92 
   93 #include <netinet/in.h>
   94 #include <netinet/in_systm.h>
   95 #include <netinet/ip.h>
   96 #ifdef INET6
   97 #include <netinet/ip6.h>
   98 #endif
   99 
  100 #if NPF > 0
  101 #include <net/pfvar.h>
  102 #endif
  103 #include <altq/altq.h>
  104 #include <altq/altq_red.h>
  105 #ifdef ALTQ3_COMPAT
  106 #include <altq/altq_conf.h>
  107 #ifdef ALTQ_FLOWVALVE
  108 #include <altq/altq_flowvalve.h>
  109 #endif
  110 #endif
  111 
  112 /*
  113  * ALTQ/RED (Random Early Detection) implementation using 32-bit
  114  * fixed-point calculation.
  115  *
  116  * written by kjc using the ns code as a reference.
  117  * you can learn more about red and ns from Sally's home page at
  118  * http://www-nrg.ee.lbl.gov/floyd/
  119  *
  120  * most of the red parameter values are fixed in this implementation
  121  * to prevent fixed-point overflow/underflow.
  122  * if you change the parameters, watch out for overflow/underflow!
  123  *
  124  * the parameters used are recommended values by Sally.
  125  * the corresponding ns config looks:
  126  *      q_weight=0.00195
  127  *      minthresh=5 maxthresh=15 queue-size=60
  128  *      linterm=30
  129  *      dropmech=drop-tail
  130  *      bytes=false (can't be handled by 32-bit fixed-point)
  131  *      doubleq=false dqthresh=false
  132  *      wait=true
  133  */
  134 /*
  135  * alternative red parameters for a slow link.
  136  *
  137  * assume the queue length becomes from zero to L and keeps L, it takes
  138  * N packets for q_avg to reach 63% of L.
  139  * when q_weight is 0.002, N is about 500 packets.
  140  * for a slow link like dial-up, 500 packets takes more than 1 minute!
  141  * when q_weight is 0.008, N is about 127 packets.
  142  * when q_weight is 0.016, N is about 63 packets.
  143  * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
  144  * are allowed for 0.016.
  145  * see Sally's paper for more details.
  146  */
  147 /* normal red parameters */
  148 #define W_WEIGHT        512     /* inverse of weight of EWMA (511/512) */
  149                                 /* q_weight = 0.00195 */
  150 
  151 /* red parameters for a slow link */
  152 #define W_WEIGHT_1      128     /* inverse of weight of EWMA (127/128) */
  153                                 /* q_weight = 0.0078125 */
  154 
  155 /* red parameters for a very slow link (e.g., dialup) */
  156 #define W_WEIGHT_2      64      /* inverse of weight of EWMA (63/64) */
  157                                 /* q_weight = 0.015625 */
  158 
  159 /* fixed-point uses 12-bit decimal places */
  160 #define FP_SHIFT        12      /* fixed-point shift */
  161 
  162 /* red parameters for drop probability */
  163 #define INV_P_MAX       10      /* inverse of max drop probability */
  164 #define TH_MIN          5       /* min threshold */
  165 #define TH_MAX          15      /* max threshold */
  166 
  167 #define RED_LIMIT       60      /* default max queue lenght */
  168 #define RED_STATS               /* collect statistics */
  169 
  170 /*
  171  * our default policy for forced-drop is drop-tail.
  172  * (in altq-1.1.2 or earlier, the default was random-drop.
  173  * but it makes more sense to punish the cause of the surge.)
  174  * to switch to the random-drop policy, define "RED_RANDOM_DROP".
  175  */
  176 
  177 #ifdef ALTQ3_COMPAT
  178 #ifdef ALTQ_FLOWVALVE
  179 /*
  180  * flow-valve is an extention to protect red from unresponsive flows
  181  * and to promote end-to-end congestion control.
  182  * flow-valve observes the average drop rates of the flows that have
  183  * experienced packet drops in the recent past.
  184  * when the average drop rate exceeds the threshold, the flow is
  185  * blocked by the flow-valve.  the trapped flow should back off
  186  * exponentially to escape from the flow-valve.
  187  */
  188 #ifdef RED_RANDOM_DROP
  189 #error "random-drop can't be used with flow-valve!"
  190 #endif
  191 #endif /* ALTQ_FLOWVALVE */
  192 
  193 /* red_list keeps all red_queue_t's allocated. */
  194 static red_queue_t *red_list = NULL;
  195 
  196 #endif /* ALTQ3_COMPAT */
  197 
  198 /* default red parameter values */
  199 static int default_th_min = TH_MIN;
  200 static int default_th_max = TH_MAX;
  201 static int default_inv_pmax = INV_P_MAX;
  202 
  203 #ifdef ALTQ3_COMPAT
  204 /* internal function prototypes */
  205 static int red_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *);
  206 static struct mbuf *red_dequeue(struct ifaltq *, int);
  207 static int red_request(struct ifaltq *, int, void *);
  208 static void red_purgeq(red_queue_t *);
  209 static int red_detach(red_queue_t *);
  210 #ifdef ALTQ_FLOWVALVE
  211 static inline struct fve *flowlist_lookup(struct flowvalve *,
  212                          struct altq_pktattr *, struct timeval *);
  213 static inline struct fve *flowlist_reclaim(struct flowvalve *,
  214                                              struct altq_pktattr *);
  215 static inline void flowlist_move_to_head(struct flowvalve *, struct fve *);
  216 static inline int fv_p2f(struct flowvalve *, int);
  217 static struct flowvalve *fv_alloc(struct red *);
  218 static void fv_destroy(struct flowvalve *);
  219 static int fv_checkflow(struct flowvalve *, struct altq_pktattr *,
  220                         struct fve **);
  221 static void fv_dropbyred(struct flowvalve *fv, struct altq_pktattr *,
  222                          struct fve *);
  223 #endif
  224 #endif /* ALTQ3_COMPAT */
  225 
  226 /*
  227  * red support routines
  228  */
  229 red_t *
  230 red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags,
  231    int pkttime)
  232 {
  233         red_t   *rp;
  234         int      w, i;
  235         int      npkts_per_sec;
  236 
  237         rp = malloc(sizeof(red_t), M_DEVBUF, M_WAITOK|M_ZERO);
  238         if (rp == NULL)
  239                 return (NULL);
  240 
  241         rp->red_avg = 0;
  242         rp->red_idle = 1;
  243 
  244         if (weight == 0)
  245                 rp->red_weight = W_WEIGHT;
  246         else
  247                 rp->red_weight = weight;
  248         if (inv_pmax == 0)
  249                 rp->red_inv_pmax = default_inv_pmax;
  250         else
  251                 rp->red_inv_pmax = inv_pmax;
  252         if (th_min == 0)
  253                 rp->red_thmin = default_th_min;
  254         else
  255                 rp->red_thmin = th_min;
  256         if (th_max == 0)
  257                 rp->red_thmax = default_th_max;
  258         else
  259                 rp->red_thmax = th_max;
  260 
  261         rp->red_flags = flags;
  262 
  263         if (pkttime == 0)
  264                 /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
  265                 rp->red_pkttime = 800;
  266         else
  267                 rp->red_pkttime = pkttime;
  268 
  269         if (weight == 0) {
  270                 /* when the link is very slow, adjust red parameters */
  271                 npkts_per_sec = 1000000 / rp->red_pkttime;
  272                 if (npkts_per_sec < 50) {
  273                         /* up to about 400Kbps */
  274                         rp->red_weight = W_WEIGHT_2;
  275                 } else if (npkts_per_sec < 300) {
  276                         /* up to about 2.4Mbps */
  277                         rp->red_weight = W_WEIGHT_1;
  278                 }
  279         }
  280 
  281         /* calculate wshift.  weight must be power of 2 */
  282         w = rp->red_weight;
  283         for (i = 0; w > 1; i++)
  284                 w = w >> 1;
  285         rp->red_wshift = i;
  286         w = 1 << rp->red_wshift;
  287         if (w != rp->red_weight) {
  288                 printf("invalid weight value %d for red! use %d\n",
  289                        rp->red_weight, w);
  290                 rp->red_weight = w;
  291         }
  292 
  293         /*
  294          * thmin_s and thmax_s are scaled versions of th_min and th_max
  295          * to be compared with avg.
  296          */
  297         rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
  298         rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
  299 
  300         /*
  301          * precompute probability denominator
  302          *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
  303          */
  304         rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
  305                          * rp->red_inv_pmax) << FP_SHIFT;
  306 
  307         /* allocate weight table */
  308         rp->red_wtab = wtab_alloc(rp->red_weight);
  309 
  310         microtime(&rp->red_last);
  311 #ifdef ALTQ3_COMPAT
  312 #ifdef ALTQ_FLOWVALVE
  313         if (flags & REDF_FLOWVALVE)
  314                 rp->red_flowvalve = fv_alloc(rp);
  315         /* if fv_alloc failes, flowvalve is just disabled */
  316 #endif
  317 #endif /* ALTQ3_COMPAT */
  318         return (rp);
  319 }
  320 
  321 void
  322 red_destroy(red_t *rp)
  323 {
  324 #ifdef ALTQ3_COMPAT
  325 #ifdef ALTQ_FLOWVALVE
  326         if (rp->red_flowvalve != NULL)
  327                 fv_destroy(rp->red_flowvalve);
  328 #endif
  329 #endif /* ALTQ3_COMPAT */
  330         wtab_destroy(rp->red_wtab);
  331         free(rp, M_DEVBUF);
  332 }
  333 
  334 void
  335 red_getstats(red_t *rp, struct redstats *sp)
  336 {
  337         sp->q_avg               = rp->red_avg >> rp->red_wshift;
  338         sp->xmit_cnt            = rp->red_stats.xmit_cnt;
  339         sp->drop_cnt            = rp->red_stats.drop_cnt;
  340         sp->drop_forced         = rp->red_stats.drop_forced;
  341         sp->drop_unforced       = rp->red_stats.drop_unforced;
  342         sp->marked_packets      = rp->red_stats.marked_packets;
  343 }
  344 
  345 int
  346 red_addq(red_t *rp, class_queue_t *q, struct mbuf *m,
  347     struct altq_pktattr *pktattr)
  348 {
  349         int avg, droptype;
  350         int n;
  351 #ifdef ALTQ3_COMPAT
  352 #ifdef ALTQ_FLOWVALVE
  353         struct fve *fve = NULL;
  354 
  355         if (rp->red_flowvalve != NULL && rp->red_flowvalve->fv_flows > 0)
  356                 if (fv_checkflow(rp->red_flowvalve, pktattr, &fve)) {
  357                         m_freem(m);
  358                         return (-1);
  359                 }
  360 #endif
  361 #endif /* ALTQ3_COMPAT */
  362 
  363         avg = rp->red_avg;
  364 
  365         /*
  366          * if we were idle, we pretend that n packets arrived during
  367          * the idle period.
  368          */
  369         if (rp->red_idle) {
  370                 struct timeval now;
  371                 int t;
  372 
  373                 rp->red_idle = 0;
  374                 microtime(&now);
  375                 t = (now.tv_sec - rp->red_last.tv_sec);
  376                 if (t > 60) {
  377                         /*
  378                          * being idle for more than 1 minute, set avg to zero.
  379                          * this prevents t from overflow.
  380                          */
  381                         avg = 0;
  382                 } else {
  383                         t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
  384                         n = t / rp->red_pkttime - 1;
  385 
  386                         /* the following line does (avg = (1 - Wq)^n * avg) */
  387                         if (n > 0)
  388                                 avg = (avg >> FP_SHIFT) *
  389                                     pow_w(rp->red_wtab, n);
  390                 }
  391         }
  392 
  393         /* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
  394         avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
  395         rp->red_avg = avg;              /* save the new value */
  396 
  397         /*
  398          * red_count keeps a tally of arriving traffic that has not
  399          * been dropped.
  400          */
  401         rp->red_count++;
  402 
  403         /* see if we drop early */
  404         droptype = DTYPE_NODROP;
  405         if (avg >= rp->red_thmin_s && qlen(q) > 1) {
  406                 if (avg >= rp->red_thmax_s) {
  407                         /* avg >= th_max: forced drop */
  408                         droptype = DTYPE_FORCED;
  409                 } else if (rp->red_old == 0) {
  410                         /* first exceeds th_min */
  411                         rp->red_count = 1;
  412                         rp->red_old = 1;
  413                 } else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
  414                                       rp->red_probd, rp->red_count)) {
  415                         /* mark or drop by red */
  416                         if ((rp->red_flags & REDF_ECN) &&
  417                             mark_ecn(m, pktattr, rp->red_flags)) {
  418                                 /* successfully marked.  do not drop. */
  419                                 rp->red_count = 0;
  420 #ifdef RED_STATS
  421                                 rp->red_stats.marked_packets++;
  422 #endif
  423                         } else {
  424                                 /* unforced drop by red */
  425                                 droptype = DTYPE_EARLY;
  426                         }
  427                 }
  428         } else {
  429                 /* avg < th_min */
  430                 rp->red_old = 0;
  431         }
  432 
  433         /*
  434          * if the queue length hits the hard limit, it's a forced drop.
  435          */
  436         if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
  437                 droptype = DTYPE_FORCED;
  438 
  439 #ifdef RED_RANDOM_DROP
  440         /* if successful or forced drop, enqueue this packet. */
  441         if (droptype != DTYPE_EARLY)
  442                 _addq(q, m);
  443 #else
  444         /* if successful, enqueue this packet. */
  445         if (droptype == DTYPE_NODROP)
  446                 _addq(q, m);
  447 #endif
  448         if (droptype != DTYPE_NODROP) {
  449                 if (droptype == DTYPE_EARLY) {
  450                         /* drop the incoming packet */
  451 #ifdef RED_STATS
  452                         rp->red_stats.drop_unforced++;
  453 #endif
  454                 } else {
  455                         /* forced drop, select a victim packet in the queue. */
  456 #ifdef RED_RANDOM_DROP
  457                         m = _getq_random(q);
  458 #endif
  459 #ifdef RED_STATS
  460                         rp->red_stats.drop_forced++;
  461 #endif
  462                 }
  463 #ifdef RED_STATS
  464                 PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
  465 #endif
  466                 rp->red_count = 0;
  467 #ifdef ALTQ3_COMPAT
  468 #ifdef ALTQ_FLOWVALVE
  469                 if (rp->red_flowvalve != NULL)
  470                         fv_dropbyred(rp->red_flowvalve, pktattr, fve);
  471 #endif
  472 #endif /* ALTQ3_COMPAT */
  473                 m_freem(m);
  474                 return (-1);
  475         }
  476         /* successfully queued */
  477 #ifdef RED_STATS
  478         PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
  479 #endif
  480         return (0);
  481 }
  482 
  483 /*
  484  * early-drop probability is calculated as follows:
  485  *   prob = p_max * (avg - th_min) / (th_max - th_min)
  486  *   prob_a = prob / (2 - count*prob)
  487  *          = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
  488  * here prob_a increases as successive undrop count increases.
  489  * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
  490  * becomes 1 when (count >= (2 / prob))).
  491  */
  492 int
  493 drop_early(int fp_len, int fp_probd, int count)
  494 {
  495         int     d;              /* denominator of drop-probability */
  496 
  497         d = fp_probd - count * fp_len;
  498         if (d <= 0)
  499                 /* count exceeds the hard limit: drop or mark */
  500                 return (1);
  501 
  502         /*
  503          * now the range of d is [1..600] in fixed-point. (when
  504          * th_max-th_min=10 and p_max=1/30)
  505          * drop probability = (avg - TH_MIN) / d
  506          */
  507 
  508         if ((arc4random() % d) < fp_len) {
  509                 /* drop or mark */
  510                 return (1);
  511         }
  512         /* no drop/mark */
  513         return (0);
  514 }
  515 
  516 /*
  517  * try to mark CE bit to the packet.
  518  *    returns 1 if successfully marked, 0 otherwise.
  519  */
  520 int
  521 mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags)
  522 {
  523         struct mbuf     *m0;
  524         struct m_tag    *t;
  525         struct altq_tag *at;
  526         void            *hdr;
  527         int              af;
  528 
  529         t = m_tag_find(m, PACKET_TAG_ALTQ_QID, NULL);
  530         if (t != NULL) {
  531                 at = (struct altq_tag *)(t + 1);
  532                 if (at == NULL)
  533                         return (0);
  534                 af = at->af;
  535                 hdr = at->hdr;
  536 #ifdef ALTQ3_COMPAT
  537         } else if (pktattr != NULL) {
  538                 af = pktattr->pattr_af;
  539                 hdr = pktattr->pattr_hdr;
  540 #endif /* ALTQ3_COMPAT */
  541         } else
  542                 return (0);
  543 
  544         if (af != AF_INET && af != AF_INET6)
  545                 return (0);
  546 
  547         /* verify that pattr_hdr is within the mbuf data */
  548         for (m0 = m; m0 != NULL; m0 = m0->m_next)
  549                 if (((char *)hdr >= m0->m_data) &&
  550                     ((char *)hdr < m0->m_data + m0->m_len))
  551                         break;
  552         if (m0 == NULL) {
  553                 /* ick, tag info is stale */
  554                 return (0);
  555         }
  556 
  557         switch (af) {
  558         case AF_INET:
  559                 if (flags & REDF_ECN4) {
  560                         struct ip *ip = hdr;
  561                         u_int8_t otos;
  562                         int sum;
  563 
  564                         if (ip->ip_v != 4)
  565                                 return (0);     /* version mismatch! */
  566 
  567                         if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
  568                                 return (0);     /* not-ECT */
  569                         if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
  570                                 return (1);     /* already marked */
  571 
  572                         /*
  573                          * ecn-capable but not marked,
  574                          * mark CE and update checksum
  575                          */
  576                         otos = ip->ip_tos;
  577                         ip->ip_tos |= IPTOS_ECN_CE;
  578                         /*
  579                          * update checksum (from RFC1624)
  580                          *         HC' = ~(~HC + ~m + m')
  581                          */
  582                         sum = ~ntohs(ip->ip_sum) & 0xffff;
  583                         sum += (~otos & 0xffff) + ip->ip_tos;
  584                         sum = (sum >> 16) + (sum & 0xffff);
  585                         sum += (sum >> 16);  /* add carry */
  586                         ip->ip_sum = htons(~sum & 0xffff);
  587                         return (1);
  588                 }
  589                 break;
  590 #ifdef INET6
  591         case AF_INET6:
  592                 if (flags & REDF_ECN6) {
  593                         struct ip6_hdr *ip6 = hdr;
  594                         u_int32_t flowlabel;
  595 
  596                         flowlabel = ntohl(ip6->ip6_flow);
  597                         if ((flowlabel >> 28) != 6)
  598                                 return (0);     /* version mismatch! */
  599                         if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
  600                             (IPTOS_ECN_NOTECT << 20))
  601                                 return (0);     /* not-ECT */
  602                         if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
  603                             (IPTOS_ECN_CE << 20))
  604                                 return (1);     /* already marked */
  605                         /*
  606                          * ecn-capable but not marked,  mark CE
  607                          */
  608                         flowlabel |= (IPTOS_ECN_CE << 20);
  609                         ip6->ip6_flow = htonl(flowlabel);
  610                         return (1);
  611                 }
  612                 break;
  613 #endif  /* INET6 */
  614         }
  615 
  616         /* not marked */
  617         return (0);
  618 }
  619 
  620 struct mbuf *
  621 red_getq(red_t *rp, class_queue_t *q)
  622 {
  623         struct mbuf *m;
  624 
  625         if ((m = _getq(q)) == NULL) {
  626                 if (rp->red_idle == 0) {
  627                         rp->red_idle = 1;
  628                         microtime(&rp->red_last);
  629                 }
  630                 return NULL;
  631         }
  632 
  633         rp->red_idle = 0;
  634         return (m);
  635 }
  636 
  637 /*
  638  * helper routine to calibrate avg during idle.
  639  * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
  640  * here Wq = 1/weight and the code assumes Wq is close to zero.
  641  *
  642  * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
  643  */
  644 static struct wtab *wtab_list = NULL;   /* pointer to wtab list */
  645 
  646 struct wtab *
  647 wtab_alloc(int weight)
  648 {
  649         struct wtab     *w;
  650         int              i;
  651 
  652         for (w = wtab_list; w != NULL; w = w->w_next)
  653                 if (w->w_weight == weight) {
  654                         w->w_refcount++;
  655                         return (w);
  656                 }
  657 
  658         w = malloc(sizeof(struct wtab), M_DEVBUF, M_WAITOK|M_ZERO);
  659         if (w == NULL)
  660                 panic("wtab_alloc: malloc failed!");
  661         w->w_weight = weight;
  662         w->w_refcount = 1;
  663         w->w_next = wtab_list;
  664         wtab_list = w;
  665 
  666         /* initialize the weight table */
  667         w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
  668         for (i = 1; i < 32; i++) {
  669                 w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
  670                 if (w->w_tab[i] == 0 && w->w_param_max == 0)
  671                         w->w_param_max = 1 << i;
  672         }
  673 
  674         return (w);
  675 }
  676 
  677 int
  678 wtab_destroy(struct wtab *w)
  679 {
  680         struct wtab     *prev;
  681 
  682         if (--w->w_refcount > 0)
  683                 return (0);
  684 
  685         if (wtab_list == w)
  686                 wtab_list = w->w_next;
  687         else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
  688                 if (prev->w_next == w) {
  689                         prev->w_next = w->w_next;
  690                         break;
  691                 }
  692 
  693         free(w, M_DEVBUF);
  694         return (0);
  695 }
  696 
  697 int32_t
  698 pow_w(struct wtab *w, int n)
  699 {
  700         int     i, bit;
  701         int32_t val;
  702 
  703         if (n >= w->w_param_max)
  704                 return (0);
  705 
  706         val = 1 << FP_SHIFT;
  707         if (n <= 0)
  708                 return (val);
  709 
  710         bit = 1;
  711         i = 0;
  712         while (n) {
  713                 if (n & bit) {
  714                         val = (val * w->w_tab[i]) >> FP_SHIFT;
  715                         n &= ~bit;
  716                 }
  717                 i++;
  718                 bit <<=  1;
  719         }
  720         return (val);
  721 }
  722 
  723 #ifdef ALTQ3_COMPAT
  724 /*
  725  * red device interface
  726  */
  727 altqdev_decl(red);
  728 
  729 int
  730 redopen(dev_t dev, int flag, int fmt,
  731     struct lwp *l)
  732 {
  733         /* everything will be done when the queueing scheme is attached. */
  734         return 0;
  735 }
  736 
  737 int
  738 redclose(dev_t dev, int flag, int fmt,
  739     struct lwp *l)
  740 {
  741         red_queue_t *rqp;
  742         int err, error = 0;
  743 
  744         while ((rqp = red_list) != NULL) {
  745                 /* destroy all */
  746                 err = red_detach(rqp);
  747                 if (err != 0 && error == 0)
  748                         error = err;
  749         }
  750 
  751         return error;
  752 }
  753 
  754 int
  755 redioctl(dev_t dev, ioctlcmd_t cmd, void *addr, int flag,
  756     struct lwp *l)
  757 {
  758         red_queue_t *rqp;
  759         struct red_interface *ifacep;
  760         struct ifnet *ifp;
  761         struct proc *p = l->l_proc;
  762         int     error = 0;
  763 
  764         /* check super-user privilege */
  765         switch (cmd) {
  766         case RED_GETSTATS:
  767                 break;
  768         default:
  769 #if (__FreeBSD_version > 400000)
  770                 if ((error = suser(p)) != 0)
  771 #else
  772                 if ((error = kauth_authorize_network(p->p_cred,
  773                     KAUTH_NETWORK_ALTQ, KAUTH_REQ_NETWORK_ALTQ_RED, NULL,
  774                     NULL, NULL)) != 0)
  775 #endif
  776                         return (error);
  777                 break;
  778         }
  779 
  780         switch (cmd) {
  781 
  782         case RED_ENABLE:
  783                 ifacep = (struct red_interface *)addr;
  784                 if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
  785                         error = EBADF;
  786                         break;
  787                 }
  788                 error = altq_enable(rqp->rq_ifq);
  789                 break;
  790 
  791         case RED_DISABLE:
  792                 ifacep = (struct red_interface *)addr;
  793                 if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
  794                         error = EBADF;
  795                         break;
  796                 }
  797                 error = altq_disable(rqp->rq_ifq);
  798                 break;
  799 
  800         case RED_IF_ATTACH:
  801                 ifp = ifunit(((struct red_interface *)addr)->red_ifname);
  802                 if (ifp == NULL) {
  803                         error = ENXIO;
  804                         break;
  805                 }
  806 
  807                 /* allocate and initialize red_queue_t */
  808                 rqp = malloc(sizeof(red_queue_t), M_DEVBUF, M_WAITOK|M_ZERO);
  809                 if (rqp == NULL) {
  810                         error = ENOMEM;
  811                         break;
  812                 }
  813 
  814                 rqp->rq_q = malloc(sizeof(class_queue_t), M_DEVBUF,
  815                     M_WAITOK|M_ZERO);
  816                 if (rqp->rq_q == NULL) {
  817                         free(rqp, M_DEVBUF);
  818                         error = ENOMEM;
  819                         break;
  820                 }
  821 
  822                 rqp->rq_red = red_alloc(0, 0, 0, 0, 0, 0);
  823                 if (rqp->rq_red == NULL) {
  824                         free(rqp->rq_q, M_DEVBUF);
  825                         free(rqp, M_DEVBUF);
  826                         error = ENOMEM;
  827                         break;
  828                 }
  829 
  830                 rqp->rq_ifq = &ifp->if_snd;
  831                 qtail(rqp->rq_q) = NULL;
  832                 qlen(rqp->rq_q) = 0;
  833                 qlimit(rqp->rq_q) = RED_LIMIT;
  834                 qtype(rqp->rq_q) = Q_RED;
  835 
  836                 /*
  837                  * set RED to this ifnet structure.
  838                  */
  839                 error = altq_attach(rqp->rq_ifq, ALTQT_RED, rqp,
  840                                     red_enqueue, red_dequeue, red_request,
  841                                     NULL, NULL);
  842                 if (error) {
  843                         red_destroy(rqp->rq_red);
  844                         free(rqp->rq_q, M_DEVBUF);
  845                         free(rqp, M_DEVBUF);
  846                         break;
  847                 }
  848 
  849                 /* add this state to the red list */
  850                 rqp->rq_next = red_list;
  851                 red_list = rqp;
  852                 break;
  853 
  854         case RED_IF_DETACH:
  855                 ifacep = (struct red_interface *)addr;
  856                 if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
  857                         error = EBADF;
  858                         break;
  859                 }
  860                 error = red_detach(rqp);
  861                 break;
  862 
  863         case RED_GETSTATS:
  864                 do {
  865                         struct red_stats *q_stats;
  866                         red_t *rp;
  867 
  868                         q_stats = (struct red_stats *)addr;
  869                         if ((rqp = altq_lookup(q_stats->iface.red_ifname,
  870                                              ALTQT_RED)) == NULL) {
  871                                 error = EBADF;
  872                                 break;
  873                         }
  874 
  875                         q_stats->q_len     = qlen(rqp->rq_q);
  876                         q_stats->q_limit   = qlimit(rqp->rq_q);
  877 
  878                         rp = rqp->rq_red;
  879                         q_stats->q_avg     = rp->red_avg >> rp->red_wshift;
  880                         q_stats->xmit_cnt  = rp->red_stats.xmit_cnt;
  881                         q_stats->drop_cnt  = rp->red_stats.drop_cnt;
  882                         q_stats->drop_forced   = rp->red_stats.drop_forced;
  883                         q_stats->drop_unforced = rp->red_stats.drop_unforced;
  884                         q_stats->marked_packets = rp->red_stats.marked_packets;
  885 
  886                         q_stats->weight         = rp->red_weight;
  887                         q_stats->inv_pmax       = rp->red_inv_pmax;
  888                         q_stats->th_min         = rp->red_thmin;
  889                         q_stats->th_max         = rp->red_thmax;
  890 
  891 #ifdef ALTQ_FLOWVALVE
  892                         if (rp->red_flowvalve != NULL) {
  893                                 struct flowvalve *fv = rp->red_flowvalve;
  894                                 q_stats->fv_flows    = fv->fv_flows;
  895                                 q_stats->fv_pass     = fv->fv_stats.pass;
  896                                 q_stats->fv_predrop  = fv->fv_stats.predrop;
  897                                 q_stats->fv_alloc    = fv->fv_stats.alloc;
  898                                 q_stats->fv_escape   = fv->fv_stats.escape;
  899                         } else {
  900 #endif /* ALTQ_FLOWVALVE */
  901                                 q_stats->fv_flows    = 0;
  902                                 q_stats->fv_pass     = 0;
  903                                 q_stats->fv_predrop  = 0;
  904                                 q_stats->fv_alloc    = 0;
  905                                 q_stats->fv_escape   = 0;
  906 #ifdef ALTQ_FLOWVALVE
  907                         }
  908 #endif /* ALTQ_FLOWVALVE */
  909                 } while (/*CONSTCOND*/ 0);
  910                 break;
  911 
  912         case RED_CONFIG:
  913                 do {
  914                         struct red_conf *fc;
  915                         red_t *new;
  916                         int s, limit;
  917 
  918                         fc = (struct red_conf *)addr;
  919                         if ((rqp = altq_lookup(fc->iface.red_ifname,
  920                                                ALTQT_RED)) == NULL) {
  921                                 error = EBADF;
  922                                 break;
  923                         }
  924                         new = red_alloc(fc->red_weight,
  925                                         fc->red_inv_pmax,
  926                                         fc->red_thmin,
  927                                         fc->red_thmax,
  928                                         fc->red_flags,
  929                                         fc->red_pkttime);
  930                         if (new == NULL) {
  931                                 error = ENOMEM;
  932                                 break;
  933                         }
  934 
  935                         s = splnet();
  936                         red_purgeq(rqp);
  937                         limit = fc->red_limit;
  938                         if (limit < fc->red_thmax)
  939                                 limit = fc->red_thmax;
  940                         qlimit(rqp->rq_q) = limit;
  941                         fc->red_limit = limit;  /* write back the new value */
  942 
  943                         red_destroy(rqp->rq_red);
  944                         rqp->rq_red = new;
  945 
  946                         splx(s);
  947 
  948                         /* write back new values */
  949                         fc->red_limit = limit;
  950                         fc->red_inv_pmax = rqp->rq_red->red_inv_pmax;
  951                         fc->red_thmin = rqp->rq_red->red_thmin;
  952                         fc->red_thmax = rqp->rq_red->red_thmax;
  953 
  954                 } while (/*CONSTCOND*/ 0);
  955                 break;
  956 
  957         case RED_SETDEFAULTS:
  958                 do {
  959                         struct redparams *rp;
  960 
  961                         rp = (struct redparams *)addr;
  962 
  963                         default_th_min = rp->th_min;
  964                         default_th_max = rp->th_max;
  965                         default_inv_pmax = rp->inv_pmax;
  966                 } while (/*CONSTCOND*/ 0);
  967                 break;
  968 
  969         default:
  970                 error = EINVAL;
  971                 break;
  972         }
  973         return error;
  974 }
  975 
  976 static int
  977 red_detach(red_queue_t *rqp)
  978 {
  979         red_queue_t *tmp;
  980         int error = 0;
  981 
  982         if (ALTQ_IS_ENABLED(rqp->rq_ifq))
  983                 altq_disable(rqp->rq_ifq);
  984 
  985         if ((error = altq_detach(rqp->rq_ifq)))
  986                 return (error);
  987 
  988         if (red_list == rqp)
  989                 red_list = rqp->rq_next;
  990         else {
  991                 for (tmp = red_list; tmp != NULL; tmp = tmp->rq_next)
  992                         if (tmp->rq_next == rqp) {
  993                                 tmp->rq_next = rqp->rq_next;
  994                                 break;
  995                         }
  996                 if (tmp == NULL)
  997                         printf("red_detach: no state found in red_list!\n");
  998         }
  999 
 1000         red_destroy(rqp->rq_red);
 1001         free(rqp->rq_q, M_DEVBUF);
 1002         free(rqp, M_DEVBUF);
 1003         return (error);
 1004 }
 1005 
 1006 /*
 1007  * enqueue routine:
 1008  *
 1009  *      returns: 0 when successfully queued.
 1010  *               ENOBUFS when drop occurs.
 1011  */
 1012 static int
 1013 red_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr)
 1014 {
 1015         red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
 1016 
 1017         if (red_addq(rqp->rq_red, rqp->rq_q, m, pktattr) < 0)
 1018                 return ENOBUFS;
 1019         ifq->ifq_len++;
 1020         return 0;
 1021 }
 1022 
 1023 /*
 1024  * dequeue routine:
 1025  *      must be called in splnet.
 1026  *
 1027  *      returns: mbuf dequeued.
 1028  *               NULL when no packet is available in the queue.
 1029  */
 1030 
 1031 static struct mbuf *
 1032 red_dequeue(struct ifaltq *ifq, int op)
 1033 {
 1034         red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
 1035         struct mbuf *m;
 1036 
 1037         if (op == ALTDQ_POLL)
 1038                 return qhead(rqp->rq_q);
 1039 
 1040         /* op == ALTDQ_REMOVE */
 1041         m =  red_getq(rqp->rq_red, rqp->rq_q);
 1042         if (m != NULL)
 1043                 ifq->ifq_len--;
 1044         return (m);
 1045 }
 1046 
 1047 static int
 1048 red_request(struct ifaltq *ifq, int req, void *arg)
 1049 {
 1050         red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
 1051 
 1052         switch (req) {
 1053         case ALTRQ_PURGE:
 1054                 red_purgeq(rqp);
 1055                 break;
 1056         }
 1057         return (0);
 1058 }
 1059 
 1060 static void
 1061 red_purgeq(red_queue_t *rqp)
 1062 {
 1063         _flushq(rqp->rq_q);
 1064         if (ALTQ_IS_ENABLED(rqp->rq_ifq))
 1065                 rqp->rq_ifq->ifq_len = 0;
 1066 }
 1067 
 1068 #ifdef ALTQ_FLOWVALVE
 1069 
 1070 #define FV_PSHIFT       7       /* weight of average drop rate -- 1/128 */
 1071 #define FV_PSCALE(x)    ((x) << FV_PSHIFT)
 1072 #define FV_PUNSCALE(x)  ((x) >> FV_PSHIFT)
 1073 #define FV_FSHIFT       5       /* weight of average fraction -- 1/32 */
 1074 #define FV_FSCALE(x)    ((x) << FV_FSHIFT)
 1075 #define FV_FUNSCALE(x)  ((x) >> FV_FSHIFT)
 1076 
 1077 #define FV_TIMER        (3 * hz)        /* timer value for garbage collector */
 1078 #define FV_FLOWLISTSIZE         64      /* how many flows in flowlist */
 1079 
 1080 #define FV_N                    10      /* update fve_f every FV_N packets */
 1081 
 1082 #define FV_BACKOFFTHRESH        1  /* backoff threshold interval in second */
 1083 #define FV_TTHRESH              3  /* time threshold to delete fve */
 1084 #define FV_ALPHA                5  /* extra packet count */
 1085 
 1086 #define FV_STATS
 1087 
 1088 #define FV_TIMESTAMP(tp)        getmicrotime(tp)
 1089 
 1090 /*
 1091  * Brtt table: 127 entry table to convert drop rate (p) to
 1092  * the corresponding bandwidth fraction (f)
 1093  * the following equation is implemented to use scaled values,
 1094  * fve_p and fve_f, in the fixed point format.
 1095  *
 1096  *   Brtt(p) = 1 /(sqrt(4*p/3) + min(1,3*sqrt(p*6/8)) * p * (1+32 * p*p))
 1097  *   f = Brtt(p) / (max_th + alpha)
 1098  */
 1099 #define BRTT_SIZE       128
 1100 #define BRTT_SHIFT      12
 1101 #define BRTT_MASK       0x0007f000
 1102 #define BRTT_PMAX       (1 << (FV_PSHIFT + FP_SHIFT))
 1103 
 1104 const int brtt_tab[BRTT_SIZE] = {
 1105         0, 1262010, 877019, 703694, 598706, 525854, 471107, 427728,
 1106         392026, 361788, 335598, 312506, 291850, 273158, 256081, 240361,
 1107         225800, 212247, 199585, 187788, 178388, 169544, 161207, 153333,
 1108         145888, 138841, 132165, 125836, 119834, 114141, 108739, 103612,
 1109         98747, 94129, 89746, 85585, 81637, 77889, 74333, 70957,
 1110         67752, 64711, 61824, 59084, 56482, 54013, 51667, 49440,
 1111         47325, 45315, 43406, 41591, 39866, 38227, 36667, 35184,
 1112         33773, 32430, 31151, 29933, 28774, 27668, 26615, 25611,
 1113         24653, 23740, 22868, 22035, 21240, 20481, 19755, 19062,
 1114         18399, 17764, 17157, 16576, 16020, 15487, 14976, 14487,
 1115         14017, 13567, 13136, 12721, 12323, 11941, 11574, 11222,
 1116         10883, 10557, 10243, 9942, 9652, 9372, 9103, 8844,
 1117         8594, 8354, 8122, 7898, 7682, 7474, 7273, 7079,
 1118         6892, 6711, 6536, 6367, 6204, 6046, 5893, 5746,
 1119         5603, 5464, 5330, 5201, 5075, 4954, 4836, 4722,
 1120         4611, 4504, 4400, 4299, 4201, 4106, 4014, 3924
 1121 };
 1122 
 1123 static inline struct fve *
 1124 flowlist_lookup(struct flowvalve *fv, struct altq_pktattr *pktattr,
 1125     struct timeval *now)
 1126 {
 1127         struct fve *fve;
 1128         int flows;
 1129         struct ip *ip;
 1130 #ifdef INET6
 1131         struct ip6_hdr *ip6;
 1132 #endif
 1133         struct timeval tthresh;
 1134 
 1135         if (pktattr == NULL)
 1136                 return (NULL);
 1137 
 1138         tthresh.tv_sec = now->tv_sec - FV_TTHRESH;
 1139         flows = 0;
 1140         /*
 1141          * search the flow list
 1142          */
 1143         switch (pktattr->pattr_af) {
 1144         case AF_INET:
 1145                 ip = (struct ip *)pktattr->pattr_hdr;
 1146                 TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
 1147                         if (fve->fve_lastdrop.tv_sec == 0)
 1148                                 break;
 1149                         if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
 1150                                 fve->fve_lastdrop.tv_sec = 0;
 1151                                 break;
 1152                         }
 1153                         if (fve->fve_flow.flow_af == AF_INET &&
 1154                             fve->fve_flow.flow_ip.ip_src.s_addr ==
 1155                             ip->ip_src.s_addr &&
 1156                             fve->fve_flow.flow_ip.ip_dst.s_addr ==
 1157                             ip->ip_dst.s_addr)
 1158                                 return (fve);
 1159                         flows++;
 1160                 }
 1161                 break;
 1162 #ifdef INET6
 1163         case AF_INET6:
 1164                 ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
 1165                 TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
 1166                         if (fve->fve_lastdrop.tv_sec == 0)
 1167                                 break;
 1168                         if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
 1169                                 fve->fve_lastdrop.tv_sec = 0;
 1170                                 break;
 1171                         }
 1172                         if (fve->fve_flow.flow_af == AF_INET6 &&
 1173                             IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_src,
 1174                                                &ip6->ip6_src) &&
 1175                             IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_dst,
 1176                                                &ip6->ip6_dst))
 1177                                 return (fve);
 1178                         flows++;
 1179                 }
 1180                 break;
 1181 #endif /* INET6 */
 1182 
 1183         default:
 1184                 /* unknown protocol.  no drop. */
 1185                 return (NULL);
 1186         }
 1187         fv->fv_flows = flows;   /* save the number of active fve's */
 1188         return (NULL);
 1189 }
 1190 
 1191 static inline struct fve *
 1192 flowlist_reclaim(struct flowvalve *fv, struct altq_pktattr *pktattr)
 1193 {
 1194         struct fve *fve;
 1195         struct ip *ip;
 1196 #ifdef INET6
 1197         struct ip6_hdr *ip6;
 1198 #endif
 1199 
 1200         /*
 1201          * get an entry from the tail of the LRU list.
 1202          */
 1203         fve = TAILQ_LAST(&fv->fv_flowlist, fv_flowhead);
 1204 
 1205         switch (pktattr->pattr_af) {
 1206         case AF_INET:
 1207                 ip = (struct ip *)pktattr->pattr_hdr;
 1208                 fve->fve_flow.flow_af = AF_INET;
 1209                 fve->fve_flow.flow_ip.ip_src = ip->ip_src;
 1210                 fve->fve_flow.flow_ip.ip_dst = ip->ip_dst;
 1211                 break;
 1212 #ifdef INET6
 1213         case AF_INET6:
 1214                 ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
 1215                 fve->fve_flow.flow_af = AF_INET6;
 1216                 fve->fve_flow.flow_ip6.ip6_src = ip6->ip6_src;
 1217                 fve->fve_flow.flow_ip6.ip6_dst = ip6->ip6_dst;
 1218                 break;
 1219 #endif
 1220         }
 1221 
 1222         fve->fve_state = Green;
 1223         fve->fve_p = 0.0;
 1224         fve->fve_f = 0.0;
 1225         fve->fve_ifseq = fv->fv_ifseq - 1;
 1226         fve->fve_count = 0;
 1227 
 1228         fv->fv_flows++;
 1229 #ifdef FV_STATS
 1230         fv->fv_stats.alloc++;
 1231 #endif
 1232         return (fve);
 1233 }
 1234 
 1235 static inline void
 1236 flowlist_move_to_head(struct flowvalve *fv, struct fve *fve)
 1237 {
 1238         if (TAILQ_FIRST(&fv->fv_flowlist) != fve) {
 1239                 TAILQ_REMOVE(&fv->fv_flowlist, fve, fve_lru);
 1240                 TAILQ_INSERT_HEAD(&fv->fv_flowlist, fve, fve_lru);
 1241         }
 1242 }
 1243 
 1244 /*
 1245  * allocate flowvalve structure
 1246  */
 1247 static struct flowvalve *
 1248 fv_alloc(struct red *rp)
 1249 {
 1250         struct flowvalve *fv;
 1251         struct fve *fve;
 1252         int i, num;
 1253 
 1254         num = FV_FLOWLISTSIZE;
 1255         fv = malloc(sizeof(struct flowvalve), M_DEVBUF, M_WAITOK|M_ZERO);
 1256         if (fv == NULL)
 1257                 return (NULL);
 1258 
 1259         fv->fv_fves = malloc(sizeof(struct fve) * num, M_DEVBUF,
 1260             M_WAITOK|M_ZERO);
 1261         if (fv->fv_fves == NULL) {
 1262                 free(fv, M_DEVBUF);
 1263                 return (NULL);
 1264         }
 1265 
 1266         fv->fv_flows = 0;
 1267         TAILQ_INIT(&fv->fv_flowlist);
 1268         for (i = 0; i < num; i++) {
 1269                 fve = &fv->fv_fves[i];
 1270                 fve->fve_lastdrop.tv_sec = 0;
 1271                 TAILQ_INSERT_TAIL(&fv->fv_flowlist, fve, fve_lru);
 1272         }
 1273 
 1274         /* initialize drop rate threshold in scaled fixed-point */
 1275         fv->fv_pthresh = (FV_PSCALE(1) << FP_SHIFT) / rp->red_inv_pmax;
 1276 
 1277         /* initialize drop rate to fraction table */
 1278         fv->fv_p2ftab = malloc(sizeof(int) * BRTT_SIZE, M_DEVBUF, M_WAITOK);
 1279         if (fv->fv_p2ftab == NULL) {
 1280                 free(fv->fv_fves, M_DEVBUF);
 1281                 free(fv, M_DEVBUF);
 1282                 return (NULL);
 1283         }
 1284         /*
 1285          * create the p2f table.
 1286          * (shift is used to keep the precision)
 1287          */
 1288         for (i = 1; i < BRTT_SIZE; i++) {
 1289                 int f;
 1290 
 1291                 f = brtt_tab[i] << 8;
 1292                 fv->fv_p2ftab[i] = (f / (rp->red_thmax + FV_ALPHA)) >> 8;
 1293         }
 1294 
 1295         return (fv);
 1296 }
 1297 
 1298 static void
 1299 fv_destroy(struct flowvalve *fv)
 1300 {
 1301         free(fv->fv_p2ftab, M_DEVBUF);
 1302         free(fv->fv_fves, M_DEVBUF);
 1303         free(fv, M_DEVBUF);
 1304 }
 1305 
 1306 static inline int
 1307 fv_p2f(struct flowvalve *fv, int p)
 1308 {
 1309         int val, f;
 1310 
 1311         if (p >= BRTT_PMAX)
 1312                 f = fv->fv_p2ftab[BRTT_SIZE-1];
 1313         else if ((val = (p & BRTT_MASK)))
 1314                 f = fv->fv_p2ftab[(val >> BRTT_SHIFT)];
 1315         else
 1316                 f = fv->fv_p2ftab[1];
 1317         return (f);
 1318 }
 1319 
 1320 /*
 1321  * check if an arriving packet should be pre-dropped.
 1322  * called from red_addq() when a packet arrives.
 1323  * returns 1 when the packet should be pre-dropped.
 1324  * should be called in splnet.
 1325  */
 1326 static int
 1327 fv_checkflow(struct flowvalve *fv, struct altq_pktattr *pktattr,
 1328     struct fve **fcache)
 1329 {
 1330         struct fve *fve;
 1331         struct timeval now;
 1332 
 1333         fv->fv_ifseq++;
 1334         FV_TIMESTAMP(&now);
 1335 
 1336         if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
 1337                 /* no matching entry in the flowlist */
 1338                 return (0);
 1339 
 1340         *fcache = fve;
 1341 
 1342         /* update fraction f for every FV_N packets */
 1343         if (++fve->fve_count == FV_N) {
 1344                 /*
 1345                  * f = Wf * N / (fv_ifseq - fve_ifseq) + (1 - Wf) * f
 1346                  */
 1347                 fve->fve_f =
 1348                         (FV_N << FP_SHIFT) / (fv->fv_ifseq - fve->fve_ifseq)
 1349                         + fve->fve_f - FV_FUNSCALE(fve->fve_f);
 1350                 fve->fve_ifseq = fv->fv_ifseq;
 1351                 fve->fve_count = 0;
 1352         }
 1353 
 1354         /*
 1355          * overpumping test
 1356          */
 1357         if (fve->fve_state == Green && fve->fve_p > fv->fv_pthresh) {
 1358                 int fthresh;
 1359 
 1360                 /* calculate a threshold */
 1361                 fthresh = fv_p2f(fv, fve->fve_p);
 1362                 if (fve->fve_f > fthresh)
 1363                         fve->fve_state = Red;
 1364         }
 1365 
 1366         if (fve->fve_state == Red) {
 1367                 /*
 1368                  * backoff test
 1369                  */
 1370                 if (now.tv_sec - fve->fve_lastdrop.tv_sec > FV_BACKOFFTHRESH) {
 1371                         /* no drop for at least FV_BACKOFFTHRESH sec */
 1372                         fve->fve_p = 0;
 1373                         fve->fve_state = Green;
 1374 #ifdef FV_STATS
 1375                         fv->fv_stats.escape++;
 1376 #endif
 1377                 } else {
 1378                         /* block this flow */
 1379                         flowlist_move_to_head(fv, fve);
 1380                         fve->fve_lastdrop = now;
 1381 #ifdef FV_STATS
 1382                         fv->fv_stats.predrop++;
 1383 #endif
 1384                         return (1);
 1385                 }
 1386         }
 1387 
 1388         /*
 1389          * p = (1 - Wp) * p
 1390          */
 1391         fve->fve_p -= FV_PUNSCALE(fve->fve_p);
 1392         if (fve->fve_p < 0)
 1393                 fve->fve_p = 0;
 1394 #ifdef FV_STATS
 1395         fv->fv_stats.pass++;
 1396 #endif
 1397         return (0);
 1398 }
 1399 
 1400 /*
 1401  * called from red_addq when a packet is dropped by red.
 1402  * should be called in splnet.
 1403  */
 1404 static void
 1405 fv_dropbyred(struct flowvalve *fv, struct altq_pktattr *pktattr,
 1406     struct fve *fcache)
 1407 {
 1408         struct fve *fve;
 1409         struct timeval now;
 1410 
 1411         if (pktattr == NULL)
 1412                 return;
 1413         FV_TIMESTAMP(&now);
 1414 
 1415         if (fcache != NULL)
 1416                 /* the fve of this packet is already cached */
 1417                 fve = fcache;
 1418         else if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
 1419                 fve = flowlist_reclaim(fv, pktattr);
 1420 
 1421         flowlist_move_to_head(fv, fve);
 1422 
 1423         /*
 1424          * update p:  the following line cancels the update
 1425          *            in fv_checkflow() and calculate
 1426          *      p = Wp + (1 - Wp) * p
 1427          */
 1428         fve->fve_p = (1 << FP_SHIFT) + fve->fve_p;
 1429 
 1430         fve->fve_lastdrop = now;
 1431 }
 1432 
 1433 #endif /* ALTQ_FLOWVALVE */
 1434 
 1435 #ifdef KLD_MODULE
 1436 
 1437 static struct altqsw red_sw =
 1438         {"red", redopen, redclose, redioctl};
 1439 
 1440 ALTQ_MODULE(altq_red, ALTQT_RED, &red_sw);
 1441 MODULE_VERSION(altq_red, 1);
 1442 
 1443 #endif /* KLD_MODULE */
 1444 #endif /* ALTQ3_COMPAT */
 1445 
 1446 #endif /* ALTQ_RED */

Cache object: 2cf2fd3632ce734b882be6d3e8c306a1


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.