The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/altq/altq_red.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*      $NetBSD: altq_red.c,v 1.24 2006/11/16 01:32:37 christos Exp $   */
    2 /*      $KAME: altq_red.c,v 1.20 2005/04/13 03:44:25 suz Exp $  */
    3 
    4 /*
    5  * Copyright (C) 1997-2003
    6  *      Sony Computer Science Laboratories Inc.  All rights reserved.
    7  *
    8  * Redistribution and use in source and binary forms, with or without
    9  * modification, are permitted provided that the following conditions
   10  * are met:
   11  * 1. Redistributions of source code must retain the above copyright
   12  *    notice, this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in the
   15  *    documentation and/or other materials provided with the distribution.
   16  *
   17  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
   18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   20  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
   21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   27  * SUCH DAMAGE.
   28  *
   29  */
   30 /*
   31  * Copyright (c) 1990-1994 Regents of the University of California.
   32  * All rights reserved.
   33  *
   34  * Redistribution and use in source and binary forms, with or without
   35  * modification, are permitted provided that the following conditions
   36  * are met:
   37  * 1. Redistributions of source code must retain the above copyright
   38  *    notice, this list of conditions and the following disclaimer.
   39  * 2. Redistributions in binary form must reproduce the above copyright
   40  *    notice, this list of conditions and the following disclaimer in the
   41  *    documentation and/or other materials provided with the distribution.
   42  * 3. All advertising materials mentioning features or use of this software
   43  *    must display the following acknowledgement:
   44  *      This product includes software developed by the Computer Systems
   45  *      Engineering Group at Lawrence Berkeley Laboratory.
   46  * 4. Neither the name of the University nor of the Laboratory may be used
   47  *    to endorse or promote products derived from this software without
   48  *    specific prior written permission.
   49  *
   50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   60  * SUCH DAMAGE.
   61  */
   62 
   63 #include <sys/cdefs.h>
   64 __KERNEL_RCSID(0, "$NetBSD: altq_red.c,v 1.24 2006/11/16 01:32:37 christos Exp $");
   65 
   66 #ifdef _KERNEL_OPT
   67 #include "opt_altq.h"
   68 #include "opt_inet.h"
   69 #include "pf.h"
   70 #endif
   71 
   72 #ifdef ALTQ_RED /* red is enabled by ALTQ_RED option in opt_altq.h */
   73 
   74 #include <sys/param.h>
   75 #include <sys/malloc.h>
   76 #include <sys/mbuf.h>
   77 #include <sys/socket.h>
   78 #include <sys/systm.h>
   79 #include <sys/errno.h>
   80 #include <sys/kauth.h>
   81 #if 1 /* ALTQ3_COMPAT */
   82 #include <sys/sockio.h>
   83 #include <sys/proc.h>
   84 #include <sys/kernel.h>
   85 #include <sys/kauth.h>
   86 #ifdef ALTQ_FLOWVALVE
   87 #include <sys/queue.h>
   88 #include <sys/time.h>
   89 #endif
   90 #endif /* ALTQ3_COMPAT */
   91 
   92 #include <net/if.h>
   93 
   94 #include <netinet/in.h>
   95 #include <netinet/in_systm.h>
   96 #include <netinet/ip.h>
   97 #ifdef INET6
   98 #include <netinet/ip6.h>
   99 #endif
  100 
  101 #if NPF > 0
  102 #include <net/pfvar.h>
  103 #endif
  104 #include <altq/altq.h>
  105 #include <altq/altq_red.h>
  106 #ifdef ALTQ3_COMPAT
  107 #include <altq/altq_conf.h>
  108 #ifdef ALTQ_FLOWVALVE
  109 #include <altq/altq_flowvalve.h>
  110 #endif
  111 #endif
  112 
  113 /*
  114  * ALTQ/RED (Random Early Detection) implementation using 32-bit
  115  * fixed-point calculation.
  116  *
  117  * written by kjc using the ns code as a reference.
  118  * you can learn more about red and ns from Sally's home page at
  119  * http://www-nrg.ee.lbl.gov/floyd/
  120  *
  121  * most of the red parameter values are fixed in this implementation
  122  * to prevent fixed-point overflow/underflow.
  123  * if you change the parameters, watch out for overflow/underflow!
  124  *
  125  * the parameters used are recommended values by Sally.
  126  * the corresponding ns config looks:
  127  *      q_weight=0.00195
  128  *      minthresh=5 maxthresh=15 queue-size=60
  129  *      linterm=30
  130  *      dropmech=drop-tail
  131  *      bytes=false (can't be handled by 32-bit fixed-point)
  132  *      doubleq=false dqthresh=false
  133  *      wait=true
  134  */
  135 /*
  136  * alternative red parameters for a slow link.
  137  *
  138  * assume the queue length becomes from zero to L and keeps L, it takes
  139  * N packets for q_avg to reach 63% of L.
  140  * when q_weight is 0.002, N is about 500 packets.
  141  * for a slow link like dial-up, 500 packets takes more than 1 minute!
  142  * when q_weight is 0.008, N is about 127 packets.
  143  * when q_weight is 0.016, N is about 63 packets.
  144  * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
  145  * are allowed for 0.016.
  146  * see Sally's paper for more details.
  147  */
  148 /* normal red parameters */
  149 #define W_WEIGHT        512     /* inverse of weight of EWMA (511/512) */
  150                                 /* q_weight = 0.00195 */
  151 
  152 /* red parameters for a slow link */
  153 #define W_WEIGHT_1      128     /* inverse of weight of EWMA (127/128) */
  154                                 /* q_weight = 0.0078125 */
  155 
  156 /* red parameters for a very slow link (e.g., dialup) */
  157 #define W_WEIGHT_2      64      /* inverse of weight of EWMA (63/64) */
  158                                 /* q_weight = 0.015625 */
  159 
  160 /* fixed-point uses 12-bit decimal places */
  161 #define FP_SHIFT        12      /* fixed-point shift */
  162 
  163 /* red parameters for drop probability */
  164 #define INV_P_MAX       10      /* inverse of max drop probability */
  165 #define TH_MIN          5       /* min threshold */
  166 #define TH_MAX          15      /* max threshold */
  167 
  168 #define RED_LIMIT       60      /* default max queue lenght */
  169 #define RED_STATS               /* collect statistics */
  170 
  171 /*
  172  * our default policy for forced-drop is drop-tail.
  173  * (in altq-1.1.2 or earlier, the default was random-drop.
  174  * but it makes more sense to punish the cause of the surge.)
  175  * to switch to the random-drop policy, define "RED_RANDOM_DROP".
  176  */
  177 
  178 #ifdef ALTQ3_COMPAT
  179 #ifdef ALTQ_FLOWVALVE
  180 /*
  181  * flow-valve is an extention to protect red from unresponsive flows
  182  * and to promote end-to-end congestion control.
  183  * flow-valve observes the average drop rates of the flows that have
  184  * experienced packet drops in the recent past.
  185  * when the average drop rate exceeds the threshold, the flow is
  186  * blocked by the flow-valve.  the trapped flow should back off
  187  * exponentially to escape from the flow-valve.
  188  */
  189 #ifdef RED_RANDOM_DROP
  190 #error "random-drop can't be used with flow-valve!"
  191 #endif
  192 #endif /* ALTQ_FLOWVALVE */
  193 
  194 /* red_list keeps all red_queue_t's allocated. */
  195 static red_queue_t *red_list = NULL;
  196 
  197 #endif /* ALTQ3_COMPAT */
  198 
  199 /* default red parameter values */
  200 static int default_th_min = TH_MIN;
  201 static int default_th_max = TH_MAX;
  202 static int default_inv_pmax = INV_P_MAX;
  203 
  204 #ifdef ALTQ3_COMPAT
  205 /* internal function prototypes */
  206 static int red_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *);
  207 static struct mbuf *red_dequeue(struct ifaltq *, int);
  208 static int red_request(struct ifaltq *, int, void *);
  209 static void red_purgeq(red_queue_t *);
  210 static int red_detach(red_queue_t *);
  211 #ifdef ALTQ_FLOWVALVE
  212 static inline struct fve *flowlist_lookup(struct flowvalve *,
  213                          struct altq_pktattr *, struct timeval *);
  214 static inline struct fve *flowlist_reclaim(struct flowvalve *,
  215                                              struct altq_pktattr *);
  216 static inline void flowlist_move_to_head(struct flowvalve *, struct fve *);
  217 static inline int fv_p2f(struct flowvalve *, int);
  218 static struct flowvalve *fv_alloc(struct red *);
  219 static void fv_destroy(struct flowvalve *);
  220 static int fv_checkflow(struct flowvalve *, struct altq_pktattr *,
  221                         struct fve **);
  222 static void fv_dropbyred(struct flowvalve *fv, struct altq_pktattr *,
  223                          struct fve *);
  224 #endif
  225 #endif /* ALTQ3_COMPAT */
  226 
  227 /*
  228  * red support routines
  229  */
  230 red_t *
  231 red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags,
  232    int pkttime)
  233 {
  234         red_t   *rp;
  235         int      w, i;
  236         int      npkts_per_sec;
  237 
  238         rp = malloc(sizeof(red_t), M_DEVBUF, M_WAITOK|M_ZERO);
  239         if (rp == NULL)
  240                 return (NULL);
  241 
  242         rp->red_avg = 0;
  243         rp->red_idle = 1;
  244 
  245         if (weight == 0)
  246                 rp->red_weight = W_WEIGHT;
  247         else
  248                 rp->red_weight = weight;
  249         if (inv_pmax == 0)
  250                 rp->red_inv_pmax = default_inv_pmax;
  251         else
  252                 rp->red_inv_pmax = inv_pmax;
  253         if (th_min == 0)
  254                 rp->red_thmin = default_th_min;
  255         else
  256                 rp->red_thmin = th_min;
  257         if (th_max == 0)
  258                 rp->red_thmax = default_th_max;
  259         else
  260                 rp->red_thmax = th_max;
  261 
  262         rp->red_flags = flags;
  263 
  264         if (pkttime == 0)
  265                 /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
  266                 rp->red_pkttime = 800;
  267         else
  268                 rp->red_pkttime = pkttime;
  269 
  270         if (weight == 0) {
  271                 /* when the link is very slow, adjust red parameters */
  272                 npkts_per_sec = 1000000 / rp->red_pkttime;
  273                 if (npkts_per_sec < 50) {
  274                         /* up to about 400Kbps */
  275                         rp->red_weight = W_WEIGHT_2;
  276                 } else if (npkts_per_sec < 300) {
  277                         /* up to about 2.4Mbps */
  278                         rp->red_weight = W_WEIGHT_1;
  279                 }
  280         }
  281 
  282         /* calculate wshift.  weight must be power of 2 */
  283         w = rp->red_weight;
  284         for (i = 0; w > 1; i++)
  285                 w = w >> 1;
  286         rp->red_wshift = i;
  287         w = 1 << rp->red_wshift;
  288         if (w != rp->red_weight) {
  289                 printf("invalid weight value %d for red! use %d\n",
  290                        rp->red_weight, w);
  291                 rp->red_weight = w;
  292         }
  293 
  294         /*
  295          * thmin_s and thmax_s are scaled versions of th_min and th_max
  296          * to be compared with avg.
  297          */
  298         rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
  299         rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
  300 
  301         /*
  302          * precompute probability denominator
  303          *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
  304          */
  305         rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
  306                          * rp->red_inv_pmax) << FP_SHIFT;
  307 
  308         /* allocate weight table */
  309         rp->red_wtab = wtab_alloc(rp->red_weight);
  310 
  311         microtime(&rp->red_last);
  312 #ifdef ALTQ3_COMPAT
  313 #ifdef ALTQ_FLOWVALVE
  314         if (flags & REDF_FLOWVALVE)
  315                 rp->red_flowvalve = fv_alloc(rp);
  316         /* if fv_alloc failes, flowvalve is just disabled */
  317 #endif
  318 #endif /* ALTQ3_COMPAT */
  319         return (rp);
  320 }
  321 
  322 void
  323 red_destroy(red_t *rp)
  324 {
  325 #ifdef ALTQ3_COMPAT
  326 #ifdef ALTQ_FLOWVALVE
  327         if (rp->red_flowvalve != NULL)
  328                 fv_destroy(rp->red_flowvalve);
  329 #endif
  330 #endif /* ALTQ3_COMPAT */
  331         wtab_destroy(rp->red_wtab);
  332         free(rp, M_DEVBUF);
  333 }
  334 
  335 void
  336 red_getstats(red_t *rp, struct redstats *sp)
  337 {
  338         sp->q_avg               = rp->red_avg >> rp->red_wshift;
  339         sp->xmit_cnt            = rp->red_stats.xmit_cnt;
  340         sp->drop_cnt            = rp->red_stats.drop_cnt;
  341         sp->drop_forced         = rp->red_stats.drop_forced;
  342         sp->drop_unforced       = rp->red_stats.drop_unforced;
  343         sp->marked_packets      = rp->red_stats.marked_packets;
  344 }
  345 
  346 int
  347 red_addq(red_t *rp, class_queue_t *q, struct mbuf *m,
  348     struct altq_pktattr *pktattr)
  349 {
  350         int avg, droptype;
  351         int n;
  352 #ifdef ALTQ3_COMPAT
  353 #ifdef ALTQ_FLOWVALVE
  354         struct fve *fve = NULL;
  355 
  356         if (rp->red_flowvalve != NULL && rp->red_flowvalve->fv_flows > 0)
  357                 if (fv_checkflow(rp->red_flowvalve, pktattr, &fve)) {
  358                         m_freem(m);
  359                         return (-1);
  360                 }
  361 #endif
  362 #endif /* ALTQ3_COMPAT */
  363 
  364         avg = rp->red_avg;
  365 
  366         /*
  367          * if we were idle, we pretend that n packets arrived during
  368          * the idle period.
  369          */
  370         if (rp->red_idle) {
  371                 struct timeval now;
  372                 int t;
  373 
  374                 rp->red_idle = 0;
  375                 microtime(&now);
  376                 t = (now.tv_sec - rp->red_last.tv_sec);
  377                 if (t > 60) {
  378                         /*
  379                          * being idle for more than 1 minute, set avg to zero.
  380                          * this prevents t from overflow.
  381                          */
  382                         avg = 0;
  383                 } else {
  384                         t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
  385                         n = t / rp->red_pkttime - 1;
  386 
  387                         /* the following line does (avg = (1 - Wq)^n * avg) */
  388                         if (n > 0)
  389                                 avg = (avg >> FP_SHIFT) *
  390                                     pow_w(rp->red_wtab, n);
  391                 }
  392         }
  393 
  394         /* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
  395         avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
  396         rp->red_avg = avg;              /* save the new value */
  397 
  398         /*
  399          * red_count keeps a tally of arriving traffic that has not
  400          * been dropped.
  401          */
  402         rp->red_count++;
  403 
  404         /* see if we drop early */
  405         droptype = DTYPE_NODROP;
  406         if (avg >= rp->red_thmin_s && qlen(q) > 1) {
  407                 if (avg >= rp->red_thmax_s) {
  408                         /* avg >= th_max: forced drop */
  409                         droptype = DTYPE_FORCED;
  410                 } else if (rp->red_old == 0) {
  411                         /* first exceeds th_min */
  412                         rp->red_count = 1;
  413                         rp->red_old = 1;
  414                 } else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
  415                                       rp->red_probd, rp->red_count)) {
  416                         /* mark or drop by red */
  417                         if ((rp->red_flags & REDF_ECN) &&
  418                             mark_ecn(m, pktattr, rp->red_flags)) {
  419                                 /* successfully marked.  do not drop. */
  420                                 rp->red_count = 0;
  421 #ifdef RED_STATS
  422                                 rp->red_stats.marked_packets++;
  423 #endif
  424                         } else {
  425                                 /* unforced drop by red */
  426                                 droptype = DTYPE_EARLY;
  427                         }
  428                 }
  429         } else {
  430                 /* avg < th_min */
  431                 rp->red_old = 0;
  432         }
  433 
  434         /*
  435          * if the queue length hits the hard limit, it's a forced drop.
  436          */
  437         if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
  438                 droptype = DTYPE_FORCED;
  439 
  440 #ifdef RED_RANDOM_DROP
  441         /* if successful or forced drop, enqueue this packet. */
  442         if (droptype != DTYPE_EARLY)
  443                 _addq(q, m);
  444 #else
  445         /* if successful, enqueue this packet. */
  446         if (droptype == DTYPE_NODROP)
  447                 _addq(q, m);
  448 #endif
  449         if (droptype != DTYPE_NODROP) {
  450                 if (droptype == DTYPE_EARLY) {
  451                         /* drop the incoming packet */
  452 #ifdef RED_STATS
  453                         rp->red_stats.drop_unforced++;
  454 #endif
  455                 } else {
  456                         /* forced drop, select a victim packet in the queue. */
  457 #ifdef RED_RANDOM_DROP
  458                         m = _getq_random(q);
  459 #endif
  460 #ifdef RED_STATS
  461                         rp->red_stats.drop_forced++;
  462 #endif
  463                 }
  464 #ifdef RED_STATS
  465                 PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
  466 #endif
  467                 rp->red_count = 0;
  468 #ifdef ALTQ3_COMPAT
  469 #ifdef ALTQ_FLOWVALVE
  470                 if (rp->red_flowvalve != NULL)
  471                         fv_dropbyred(rp->red_flowvalve, pktattr, fve);
  472 #endif
  473 #endif /* ALTQ3_COMPAT */
  474                 m_freem(m);
  475                 return (-1);
  476         }
  477         /* successfully queued */
  478 #ifdef RED_STATS
  479         PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
  480 #endif
  481         return (0);
  482 }
  483 
  484 /*
  485  * early-drop probability is calculated as follows:
  486  *   prob = p_max * (avg - th_min) / (th_max - th_min)
  487  *   prob_a = prob / (2 - count*prob)
  488  *          = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
  489  * here prob_a increases as successive undrop count increases.
  490  * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
  491  * becomes 1 when (count >= (2 / prob))).
  492  */
  493 int
  494 drop_early(int fp_len, int fp_probd, int count)
  495 {
  496         int     d;              /* denominator of drop-probability */
  497 
  498         d = fp_probd - count * fp_len;
  499         if (d <= 0)
  500                 /* count exceeds the hard limit: drop or mark */
  501                 return (1);
  502 
  503         /*
  504          * now the range of d is [1..600] in fixed-point. (when
  505          * th_max-th_min=10 and p_max=1/30)
  506          * drop probability = (avg - TH_MIN) / d
  507          */
  508 
  509         if ((arc4random() % d) < fp_len) {
  510                 /* drop or mark */
  511                 return (1);
  512         }
  513         /* no drop/mark */
  514         return (0);
  515 }
  516 
  517 /*
  518  * try to mark CE bit to the packet.
  519  *    returns 1 if successfully marked, 0 otherwise.
  520  */
  521 int
  522 mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags)
  523 {
  524         struct mbuf     *m0;
  525         struct m_tag    *t;
  526         struct altq_tag *at;
  527         void            *hdr;
  528         int              af;
  529 
  530         t = m_tag_find(m, PACKET_TAG_PF_QID, NULL);
  531         if (t != NULL) {
  532                 at = (struct altq_tag *)(t + 1);
  533                 if (at == NULL)
  534                         return (0);
  535                 af = at->af;
  536                 hdr = at->hdr;
  537 #ifdef ALTQ3_COMPAT
  538         } else if (pktattr != NULL) {
  539                 af = pktattr->pattr_af;
  540                 hdr = pktattr->pattr_hdr;
  541 #endif /* ALTQ3_COMPAT */
  542         } else
  543                 return (0);
  544 
  545         if (af != AF_INET && af != AF_INET6)
  546                 return (0);
  547 
  548         /* verify that pattr_hdr is within the mbuf data */
  549         for (m0 = m; m0 != NULL; m0 = m0->m_next)
  550                 if (((caddr_t)hdr >= m0->m_data) &&
  551                     ((caddr_t)hdr < m0->m_data + m0->m_len))
  552                         break;
  553         if (m0 == NULL) {
  554                 /* ick, tag info is stale */
  555                 return (0);
  556         }
  557 
  558         switch (af) {
  559         case AF_INET:
  560                 if (flags & REDF_ECN4) {
  561                         struct ip *ip = hdr;
  562                         u_int8_t otos;
  563                         int sum;
  564 
  565                         if (ip->ip_v != 4)
  566                                 return (0);     /* version mismatch! */
  567 
  568                         if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
  569                                 return (0);     /* not-ECT */
  570                         if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
  571                                 return (1);     /* already marked */
  572 
  573                         /*
  574                          * ecn-capable but not marked,
  575                          * mark CE and update checksum
  576                          */
  577                         otos = ip->ip_tos;
  578                         ip->ip_tos |= IPTOS_ECN_CE;
  579                         /*
  580                          * update checksum (from RFC1624)
  581                          *         HC' = ~(~HC + ~m + m')
  582                          */
  583                         sum = ~ntohs(ip->ip_sum) & 0xffff;
  584                         sum += (~otos & 0xffff) + ip->ip_tos;
  585                         sum = (sum >> 16) + (sum & 0xffff);
  586                         sum += (sum >> 16);  /* add carry */
  587                         ip->ip_sum = htons(~sum & 0xffff);
  588                         return (1);
  589                 }
  590                 break;
  591 #ifdef INET6
  592         case AF_INET6:
  593                 if (flags & REDF_ECN6) {
  594                         struct ip6_hdr *ip6 = hdr;
  595                         u_int32_t flowlabel;
  596 
  597                         flowlabel = ntohl(ip6->ip6_flow);
  598                         if ((flowlabel >> 28) != 6)
  599                                 return (0);     /* version mismatch! */
  600                         if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
  601                             (IPTOS_ECN_NOTECT << 20))
  602                                 return (0);     /* not-ECT */
  603                         if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
  604                             (IPTOS_ECN_CE << 20))
  605                                 return (1);     /* already marked */
  606                         /*
  607                          * ecn-capable but not marked,  mark CE
  608                          */
  609                         flowlabel |= (IPTOS_ECN_CE << 20);
  610                         ip6->ip6_flow = htonl(flowlabel);
  611                         return (1);
  612                 }
  613                 break;
  614 #endif  /* INET6 */
  615         }
  616 
  617         /* not marked */
  618         return (0);
  619 }
  620 
  621 struct mbuf *
  622 red_getq(red_t *rp, class_queue_t *q)
  623 {
  624         struct mbuf *m;
  625 
  626         if ((m = _getq(q)) == NULL) {
  627                 if (rp->red_idle == 0) {
  628                         rp->red_idle = 1;
  629                         microtime(&rp->red_last);
  630                 }
  631                 return NULL;
  632         }
  633 
  634         rp->red_idle = 0;
  635         return (m);
  636 }
  637 
  638 /*
  639  * helper routine to calibrate avg during idle.
  640  * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
  641  * here Wq = 1/weight and the code assumes Wq is close to zero.
  642  *
  643  * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
  644  */
  645 static struct wtab *wtab_list = NULL;   /* pointer to wtab list */
  646 
  647 struct wtab *
  648 wtab_alloc(int weight)
  649 {
  650         struct wtab     *w;
  651         int              i;
  652 
  653         for (w = wtab_list; w != NULL; w = w->w_next)
  654                 if (w->w_weight == weight) {
  655                         w->w_refcount++;
  656                         return (w);
  657                 }
  658 
  659         w = malloc(sizeof(struct wtab), M_DEVBUF, M_WAITOK|M_ZERO);
  660         if (w == NULL)
  661                 panic("wtab_alloc: malloc failed!");
  662         w->w_weight = weight;
  663         w->w_refcount = 1;
  664         w->w_next = wtab_list;
  665         wtab_list = w;
  666 
  667         /* initialize the weight table */
  668         w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
  669         for (i = 1; i < 32; i++) {
  670                 w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
  671                 if (w->w_tab[i] == 0 && w->w_param_max == 0)
  672                         w->w_param_max = 1 << i;
  673         }
  674 
  675         return (w);
  676 }
  677 
  678 int
  679 wtab_destroy(struct wtab *w)
  680 {
  681         struct wtab     *prev;
  682 
  683         if (--w->w_refcount > 0)
  684                 return (0);
  685 
  686         if (wtab_list == w)
  687                 wtab_list = w->w_next;
  688         else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
  689                 if (prev->w_next == w) {
  690                         prev->w_next = w->w_next;
  691                         break;
  692                 }
  693 
  694         free(w, M_DEVBUF);
  695         return (0);
  696 }
  697 
  698 int32_t
  699 pow_w(struct wtab *w, int n)
  700 {
  701         int     i, bit;
  702         int32_t val;
  703 
  704         if (n >= w->w_param_max)
  705                 return (0);
  706 
  707         val = 1 << FP_SHIFT;
  708         if (n <= 0)
  709                 return (val);
  710 
  711         bit = 1;
  712         i = 0;
  713         while (n) {
  714                 if (n & bit) {
  715                         val = (val * w->w_tab[i]) >> FP_SHIFT;
  716                         n &= ~bit;
  717                 }
  718                 i++;
  719                 bit <<=  1;
  720         }
  721         return (val);
  722 }
  723 
  724 #ifdef ALTQ3_COMPAT
  725 /*
  726  * red device interface
  727  */
  728 altqdev_decl(red);
  729 
  730 int
  731 redopen(dev_t dev, int flag, int fmt,
  732     struct lwp *l)
  733 {
  734         /* everything will be done when the queueing scheme is attached. */
  735         return 0;
  736 }
  737 
  738 int
  739 redclose(dev_t dev, int flag, int fmt,
  740     struct lwp *l)
  741 {
  742         red_queue_t *rqp;
  743         int err, error = 0;
  744 
  745         while ((rqp = red_list) != NULL) {
  746                 /* destroy all */
  747                 err = red_detach(rqp);
  748                 if (err != 0 && error == 0)
  749                         error = err;
  750         }
  751 
  752         return error;
  753 }
  754 
  755 int
  756 redioctl(dev_t dev, ioctlcmd_t cmd, caddr_t addr, int flag,
  757     struct lwp *l)
  758 {
  759         red_queue_t *rqp;
  760         struct red_interface *ifacep;
  761         struct ifnet *ifp;
  762         struct proc *p = l->l_proc;
  763         int     error = 0;
  764 
  765         /* check super-user privilege */
  766         switch (cmd) {
  767         case RED_GETSTATS:
  768                 break;
  769         default:
  770 #if (__FreeBSD_version > 400000)
  771                 if ((error = suser(p)) != 0)
  772 #else
  773                 if ((error = kauth_authorize_network(p->p_cred,
  774                     KAUTH_NETWORK_ALTQ, KAUTH_REQ_NETWORK_ALTQ_RED, NULL,
  775                     NULL, NULL)) != 0)
  776 #endif
  777                         return (error);
  778                 break;
  779         }
  780 
  781         switch (cmd) {
  782 
  783         case RED_ENABLE:
  784                 ifacep = (struct red_interface *)addr;
  785                 if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
  786                         error = EBADF;
  787                         break;
  788                 }
  789                 error = altq_enable(rqp->rq_ifq);
  790                 break;
  791 
  792         case RED_DISABLE:
  793                 ifacep = (struct red_interface *)addr;
  794                 if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
  795                         error = EBADF;
  796                         break;
  797                 }
  798                 error = altq_disable(rqp->rq_ifq);
  799                 break;
  800 
  801         case RED_IF_ATTACH:
  802                 ifp = ifunit(((struct red_interface *)addr)->red_ifname);
  803                 if (ifp == NULL) {
  804                         error = ENXIO;
  805                         break;
  806                 }
  807 
  808                 /* allocate and initialize red_queue_t */
  809                 rqp = malloc(sizeof(red_queue_t), M_DEVBUF, M_WAITOK|M_ZERO);
  810                 if (rqp == NULL) {
  811                         error = ENOMEM;
  812                         break;
  813                 }
  814 
  815                 rqp->rq_q = malloc(sizeof(class_queue_t), M_DEVBUF,
  816                     M_WAITOK|M_ZERO);
  817                 if (rqp->rq_q == NULL) {
  818                         free(rqp, M_DEVBUF);
  819                         error = ENOMEM;
  820                         break;
  821                 }
  822 
  823                 rqp->rq_red = red_alloc(0, 0, 0, 0, 0, 0);
  824                 if (rqp->rq_red == NULL) {
  825                         free(rqp->rq_q, M_DEVBUF);
  826                         free(rqp, M_DEVBUF);
  827                         error = ENOMEM;
  828                         break;
  829                 }
  830 
  831                 rqp->rq_ifq = &ifp->if_snd;
  832                 qtail(rqp->rq_q) = NULL;
  833                 qlen(rqp->rq_q) = 0;
  834                 qlimit(rqp->rq_q) = RED_LIMIT;
  835                 qtype(rqp->rq_q) = Q_RED;
  836 
  837                 /*
  838                  * set RED to this ifnet structure.
  839                  */
  840                 error = altq_attach(rqp->rq_ifq, ALTQT_RED, rqp,
  841                                     red_enqueue, red_dequeue, red_request,
  842                                     NULL, NULL);
  843                 if (error) {
  844                         red_destroy(rqp->rq_red);
  845                         free(rqp->rq_q, M_DEVBUF);
  846                         free(rqp, M_DEVBUF);
  847                         break;
  848                 }
  849 
  850                 /* add this state to the red list */
  851                 rqp->rq_next = red_list;
  852                 red_list = rqp;
  853                 break;
  854 
  855         case RED_IF_DETACH:
  856                 ifacep = (struct red_interface *)addr;
  857                 if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
  858                         error = EBADF;
  859                         break;
  860                 }
  861                 error = red_detach(rqp);
  862                 break;
  863 
  864         case RED_GETSTATS:
  865                 do {
  866                         struct red_stats *q_stats;
  867                         red_t *rp;
  868 
  869                         q_stats = (struct red_stats *)addr;
  870                         if ((rqp = altq_lookup(q_stats->iface.red_ifname,
  871                                              ALTQT_RED)) == NULL) {
  872                                 error = EBADF;
  873                                 break;
  874                         }
  875 
  876                         q_stats->q_len     = qlen(rqp->rq_q);
  877                         q_stats->q_limit   = qlimit(rqp->rq_q);
  878 
  879                         rp = rqp->rq_red;
  880                         q_stats->q_avg     = rp->red_avg >> rp->red_wshift;
  881                         q_stats->xmit_cnt  = rp->red_stats.xmit_cnt;
  882                         q_stats->drop_cnt  = rp->red_stats.drop_cnt;
  883                         q_stats->drop_forced   = rp->red_stats.drop_forced;
  884                         q_stats->drop_unforced = rp->red_stats.drop_unforced;
  885                         q_stats->marked_packets = rp->red_stats.marked_packets;
  886 
  887                         q_stats->weight         = rp->red_weight;
  888                         q_stats->inv_pmax       = rp->red_inv_pmax;
  889                         q_stats->th_min         = rp->red_thmin;
  890                         q_stats->th_max         = rp->red_thmax;
  891 
  892 #ifdef ALTQ_FLOWVALVE
  893                         if (rp->red_flowvalve != NULL) {
  894                                 struct flowvalve *fv = rp->red_flowvalve;
  895                                 q_stats->fv_flows    = fv->fv_flows;
  896                                 q_stats->fv_pass     = fv->fv_stats.pass;
  897                                 q_stats->fv_predrop  = fv->fv_stats.predrop;
  898                                 q_stats->fv_alloc    = fv->fv_stats.alloc;
  899                                 q_stats->fv_escape   = fv->fv_stats.escape;
  900                         } else {
  901 #endif /* ALTQ_FLOWVALVE */
  902                                 q_stats->fv_flows    = 0;
  903                                 q_stats->fv_pass     = 0;
  904                                 q_stats->fv_predrop  = 0;
  905                                 q_stats->fv_alloc    = 0;
  906                                 q_stats->fv_escape   = 0;
  907 #ifdef ALTQ_FLOWVALVE
  908                         }
  909 #endif /* ALTQ_FLOWVALVE */
  910                 } while (/*CONSTCOND*/ 0);
  911                 break;
  912 
  913         case RED_CONFIG:
  914                 do {
  915                         struct red_conf *fc;
  916                         red_t *new;
  917                         int s, limit;
  918 
  919                         fc = (struct red_conf *)addr;
  920                         if ((rqp = altq_lookup(fc->iface.red_ifname,
  921                                                ALTQT_RED)) == NULL) {
  922                                 error = EBADF;
  923                                 break;
  924                         }
  925                         new = red_alloc(fc->red_weight,
  926                                         fc->red_inv_pmax,
  927                                         fc->red_thmin,
  928                                         fc->red_thmax,
  929                                         fc->red_flags,
  930                                         fc->red_pkttime);
  931                         if (new == NULL) {
  932                                 error = ENOMEM;
  933                                 break;
  934                         }
  935 
  936                         s = splnet();
  937                         red_purgeq(rqp);
  938                         limit = fc->red_limit;
  939                         if (limit < fc->red_thmax)
  940                                 limit = fc->red_thmax;
  941                         qlimit(rqp->rq_q) = limit;
  942                         fc->red_limit = limit;  /* write back the new value */
  943 
  944                         red_destroy(rqp->rq_red);
  945                         rqp->rq_red = new;
  946 
  947                         splx(s);
  948 
  949                         /* write back new values */
  950                         fc->red_limit = limit;
  951                         fc->red_inv_pmax = rqp->rq_red->red_inv_pmax;
  952                         fc->red_thmin = rqp->rq_red->red_thmin;
  953                         fc->red_thmax = rqp->rq_red->red_thmax;
  954 
  955                 } while (/*CONSTCOND*/ 0);
  956                 break;
  957 
  958         case RED_SETDEFAULTS:
  959                 do {
  960                         struct redparams *rp;
  961 
  962                         rp = (struct redparams *)addr;
  963 
  964                         default_th_min = rp->th_min;
  965                         default_th_max = rp->th_max;
  966                         default_inv_pmax = rp->inv_pmax;
  967                 } while (/*CONSTCOND*/ 0);
  968                 break;
  969 
  970         default:
  971                 error = EINVAL;
  972                 break;
  973         }
  974         return error;
  975 }
  976 
  977 static int
  978 red_detach(red_queue_t *rqp)
  979 {
  980         red_queue_t *tmp;
  981         int error = 0;
  982 
  983         if (ALTQ_IS_ENABLED(rqp->rq_ifq))
  984                 altq_disable(rqp->rq_ifq);
  985 
  986         if ((error = altq_detach(rqp->rq_ifq)))
  987                 return (error);
  988 
  989         if (red_list == rqp)
  990                 red_list = rqp->rq_next;
  991         else {
  992                 for (tmp = red_list; tmp != NULL; tmp = tmp->rq_next)
  993                         if (tmp->rq_next == rqp) {
  994                                 tmp->rq_next = rqp->rq_next;
  995                                 break;
  996                         }
  997                 if (tmp == NULL)
  998                         printf("red_detach: no state found in red_list!\n");
  999         }
 1000 
 1001         red_destroy(rqp->rq_red);
 1002         free(rqp->rq_q, M_DEVBUF);
 1003         free(rqp, M_DEVBUF);
 1004         return (error);
 1005 }
 1006 
 1007 /*
 1008  * enqueue routine:
 1009  *
 1010  *      returns: 0 when successfully queued.
 1011  *               ENOBUFS when drop occurs.
 1012  */
 1013 static int
 1014 red_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr)
 1015 {
 1016         red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
 1017 
 1018         if (red_addq(rqp->rq_red, rqp->rq_q, m, pktattr) < 0)
 1019                 return ENOBUFS;
 1020         ifq->ifq_len++;
 1021         return 0;
 1022 }
 1023 
 1024 /*
 1025  * dequeue routine:
 1026  *      must be called in splnet.
 1027  *
 1028  *      returns: mbuf dequeued.
 1029  *               NULL when no packet is available in the queue.
 1030  */
 1031 
 1032 static struct mbuf *
 1033 red_dequeue(struct ifaltq *ifq, int op)
 1034 {
 1035         red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
 1036         struct mbuf *m;
 1037 
 1038         if (op == ALTDQ_POLL)
 1039                 return qhead(rqp->rq_q);
 1040 
 1041         /* op == ALTDQ_REMOVE */
 1042         m =  red_getq(rqp->rq_red, rqp->rq_q);
 1043         if (m != NULL)
 1044                 ifq->ifq_len--;
 1045         return (m);
 1046 }
 1047 
 1048 static int
 1049 red_request(struct ifaltq *ifq, int req, void *arg)
 1050 {
 1051         red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
 1052 
 1053         switch (req) {
 1054         case ALTRQ_PURGE:
 1055                 red_purgeq(rqp);
 1056                 break;
 1057         }
 1058         return (0);
 1059 }
 1060 
 1061 static void
 1062 red_purgeq(red_queue_t *rqp)
 1063 {
 1064         _flushq(rqp->rq_q);
 1065         if (ALTQ_IS_ENABLED(rqp->rq_ifq))
 1066                 rqp->rq_ifq->ifq_len = 0;
 1067 }
 1068 
 1069 #ifdef ALTQ_FLOWVALVE
 1070 
 1071 #define FV_PSHIFT       7       /* weight of average drop rate -- 1/128 */
 1072 #define FV_PSCALE(x)    ((x) << FV_PSHIFT)
 1073 #define FV_PUNSCALE(x)  ((x) >> FV_PSHIFT)
 1074 #define FV_FSHIFT       5       /* weight of average fraction -- 1/32 */
 1075 #define FV_FSCALE(x)    ((x) << FV_FSHIFT)
 1076 #define FV_FUNSCALE(x)  ((x) >> FV_FSHIFT)
 1077 
 1078 #define FV_TIMER        (3 * hz)        /* timer value for garbage collector */
 1079 #define FV_FLOWLISTSIZE         64      /* how many flows in flowlist */
 1080 
 1081 #define FV_N                    10      /* update fve_f every FV_N packets */
 1082 
 1083 #define FV_BACKOFFTHRESH        1  /* backoff threshold interval in second */
 1084 #define FV_TTHRESH              3  /* time threshold to delete fve */
 1085 #define FV_ALPHA                5  /* extra packet count */
 1086 
 1087 #define FV_STATS
 1088 
 1089 #if (__FreeBSD_version > 300000) || defined(__HAVE_TIMECOUNTER)
 1090 #define FV_TIMESTAMP(tp)        getmicrotime(tp)
 1091 #else
 1092 #define FV_TIMESTAMP(tp)        { (*(tp)) = time; }
 1093 #endif
 1094 
 1095 /*
 1096  * Brtt table: 127 entry table to convert drop rate (p) to
 1097  * the corresponding bandwidth fraction (f)
 1098  * the following equation is implemented to use scaled values,
 1099  * fve_p and fve_f, in the fixed point format.
 1100  *
 1101  *   Brtt(p) = 1 /(sqrt(4*p/3) + min(1,3*sqrt(p*6/8)) * p * (1+32 * p*p))
 1102  *   f = Brtt(p) / (max_th + alpha)
 1103  */
 1104 #define BRTT_SIZE       128
 1105 #define BRTT_SHIFT      12
 1106 #define BRTT_MASK       0x0007f000
 1107 #define BRTT_PMAX       (1 << (FV_PSHIFT + FP_SHIFT))
 1108 
 1109 const int brtt_tab[BRTT_SIZE] = {
 1110         0, 1262010, 877019, 703694, 598706, 525854, 471107, 427728,
 1111         392026, 361788, 335598, 312506, 291850, 273158, 256081, 240361,
 1112         225800, 212247, 199585, 187788, 178388, 169544, 161207, 153333,
 1113         145888, 138841, 132165, 125836, 119834, 114141, 108739, 103612,
 1114         98747, 94129, 89746, 85585, 81637, 77889, 74333, 70957,
 1115         67752, 64711, 61824, 59084, 56482, 54013, 51667, 49440,
 1116         47325, 45315, 43406, 41591, 39866, 38227, 36667, 35184,
 1117         33773, 32430, 31151, 29933, 28774, 27668, 26615, 25611,
 1118         24653, 23740, 22868, 22035, 21240, 20481, 19755, 19062,
 1119         18399, 17764, 17157, 16576, 16020, 15487, 14976, 14487,
 1120         14017, 13567, 13136, 12721, 12323, 11941, 11574, 11222,
 1121         10883, 10557, 10243, 9942, 9652, 9372, 9103, 8844,
 1122         8594, 8354, 8122, 7898, 7682, 7474, 7273, 7079,
 1123         6892, 6711, 6536, 6367, 6204, 6046, 5893, 5746,
 1124         5603, 5464, 5330, 5201, 5075, 4954, 4836, 4722,
 1125         4611, 4504, 4400, 4299, 4201, 4106, 4014, 3924
 1126 };
 1127 
 1128 static inline struct fve *
 1129 flowlist_lookup(struct flowvalve *fv, struct altq_pktattr *pktattr,
 1130     struct timeval *now)
 1131 {
 1132         struct fve *fve;
 1133         int flows;
 1134         struct ip *ip;
 1135 #ifdef INET6
 1136         struct ip6_hdr *ip6;
 1137 #endif
 1138         struct timeval tthresh;
 1139 
 1140         if (pktattr == NULL)
 1141                 return (NULL);
 1142 
 1143         tthresh.tv_sec = now->tv_sec - FV_TTHRESH;
 1144         flows = 0;
 1145         /*
 1146          * search the flow list
 1147          */
 1148         switch (pktattr->pattr_af) {
 1149         case AF_INET:
 1150                 ip = (struct ip *)pktattr->pattr_hdr;
 1151                 TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
 1152                         if (fve->fve_lastdrop.tv_sec == 0)
 1153                                 break;
 1154                         if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
 1155                                 fve->fve_lastdrop.tv_sec = 0;
 1156                                 break;
 1157                         }
 1158                         if (fve->fve_flow.flow_af == AF_INET &&
 1159                             fve->fve_flow.flow_ip.ip_src.s_addr ==
 1160                             ip->ip_src.s_addr &&
 1161                             fve->fve_flow.flow_ip.ip_dst.s_addr ==
 1162                             ip->ip_dst.s_addr)
 1163                                 return (fve);
 1164                         flows++;
 1165                 }
 1166                 break;
 1167 #ifdef INET6
 1168         case AF_INET6:
 1169                 ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
 1170                 TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
 1171                         if (fve->fve_lastdrop.tv_sec == 0)
 1172                                 break;
 1173                         if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
 1174                                 fve->fve_lastdrop.tv_sec = 0;
 1175                                 break;
 1176                         }
 1177                         if (fve->fve_flow.flow_af == AF_INET6 &&
 1178                             IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_src,
 1179                                                &ip6->ip6_src) &&
 1180                             IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_dst,
 1181                                                &ip6->ip6_dst))
 1182                                 return (fve);
 1183                         flows++;
 1184                 }
 1185                 break;
 1186 #endif /* INET6 */
 1187 
 1188         default:
 1189                 /* unknown protocol.  no drop. */
 1190                 return (NULL);
 1191         }
 1192         fv->fv_flows = flows;   /* save the number of active fve's */
 1193         return (NULL);
 1194 }
 1195 
 1196 static inline struct fve *
 1197 flowlist_reclaim(struct flowvalve *fv, struct altq_pktattr *pktattr)
 1198 {
 1199         struct fve *fve;
 1200         struct ip *ip;
 1201 #ifdef INET6
 1202         struct ip6_hdr *ip6;
 1203 #endif
 1204 
 1205         /*
 1206          * get an entry from the tail of the LRU list.
 1207          */
 1208         fve = TAILQ_LAST(&fv->fv_flowlist, fv_flowhead);
 1209 
 1210         switch (pktattr->pattr_af) {
 1211         case AF_INET:
 1212                 ip = (struct ip *)pktattr->pattr_hdr;
 1213                 fve->fve_flow.flow_af = AF_INET;
 1214                 fve->fve_flow.flow_ip.ip_src = ip->ip_src;
 1215                 fve->fve_flow.flow_ip.ip_dst = ip->ip_dst;
 1216                 break;
 1217 #ifdef INET6
 1218         case AF_INET6:
 1219                 ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
 1220                 fve->fve_flow.flow_af = AF_INET6;
 1221                 fve->fve_flow.flow_ip6.ip6_src = ip6->ip6_src;
 1222                 fve->fve_flow.flow_ip6.ip6_dst = ip6->ip6_dst;
 1223                 break;
 1224 #endif
 1225         }
 1226 
 1227         fve->fve_state = Green;
 1228         fve->fve_p = 0.0;
 1229         fve->fve_f = 0.0;
 1230         fve->fve_ifseq = fv->fv_ifseq - 1;
 1231         fve->fve_count = 0;
 1232 
 1233         fv->fv_flows++;
 1234 #ifdef FV_STATS
 1235         fv->fv_stats.alloc++;
 1236 #endif
 1237         return (fve);
 1238 }
 1239 
 1240 static inline void
 1241 flowlist_move_to_head(struct flowvalve *fv, struct fve *fve)
 1242 {
 1243         if (TAILQ_FIRST(&fv->fv_flowlist) != fve) {
 1244                 TAILQ_REMOVE(&fv->fv_flowlist, fve, fve_lru);
 1245                 TAILQ_INSERT_HEAD(&fv->fv_flowlist, fve, fve_lru);
 1246         }
 1247 }
 1248 
 1249 /*
 1250  * allocate flowvalve structure
 1251  */
 1252 static struct flowvalve *
 1253 fv_alloc(struct red *rp)
 1254 {
 1255         struct flowvalve *fv;
 1256         struct fve *fve;
 1257         int i, num;
 1258 
 1259         num = FV_FLOWLISTSIZE;
 1260         fv = malloc(sizeof(struct flowvalve), M_DEVBUF, M_WAITOK|M_ZERO);
 1261         if (fv == NULL)
 1262                 return (NULL);
 1263 
 1264         fv->fv_fves = malloc(sizeof(struct fve) * num, M_DEVBUF,
 1265             M_WAITOK|M_ZERO);
 1266         if (fv->fv_fves == NULL) {
 1267                 free(fv, M_DEVBUF);
 1268                 return (NULL);
 1269         }
 1270 
 1271         fv->fv_flows = 0;
 1272         TAILQ_INIT(&fv->fv_flowlist);
 1273         for (i = 0; i < num; i++) {
 1274                 fve = &fv->fv_fves[i];
 1275                 fve->fve_lastdrop.tv_sec = 0;
 1276                 TAILQ_INSERT_TAIL(&fv->fv_flowlist, fve, fve_lru);
 1277         }
 1278 
 1279         /* initialize drop rate threshold in scaled fixed-point */
 1280         fv->fv_pthresh = (FV_PSCALE(1) << FP_SHIFT) / rp->red_inv_pmax;
 1281 
 1282         /* initialize drop rate to fraction table */
 1283         fv->fv_p2ftab = malloc(sizeof(int) * BRTT_SIZE, M_DEVBUF, M_WAITOK);
 1284         if (fv->fv_p2ftab == NULL) {
 1285                 free(fv->fv_fves, M_DEVBUF);
 1286                 free(fv, M_DEVBUF);
 1287                 return (NULL);
 1288         }
 1289         /*
 1290          * create the p2f table.
 1291          * (shift is used to keep the precision)
 1292          */
 1293         for (i = 1; i < BRTT_SIZE; i++) {
 1294                 int f;
 1295 
 1296                 f = brtt_tab[i] << 8;
 1297                 fv->fv_p2ftab[i] = (f / (rp->red_thmax + FV_ALPHA)) >> 8;
 1298         }
 1299 
 1300         return (fv);
 1301 }
 1302 
 1303 static void
 1304 fv_destroy(struct flowvalve *fv)
 1305 {
 1306         free(fv->fv_p2ftab, M_DEVBUF);
 1307         free(fv->fv_fves, M_DEVBUF);
 1308         free(fv, M_DEVBUF);
 1309 }
 1310 
 1311 static inline int
 1312 fv_p2f(struct flowvalve *fv, int p)
 1313 {
 1314         int val, f;
 1315 
 1316         if (p >= BRTT_PMAX)
 1317                 f = fv->fv_p2ftab[BRTT_SIZE-1];
 1318         else if ((val = (p & BRTT_MASK)))
 1319                 f = fv->fv_p2ftab[(val >> BRTT_SHIFT)];
 1320         else
 1321                 f = fv->fv_p2ftab[1];
 1322         return (f);
 1323 }
 1324 
 1325 /*
 1326  * check if an arriving packet should be pre-dropped.
 1327  * called from red_addq() when a packet arrives.
 1328  * returns 1 when the packet should be pre-dropped.
 1329  * should be called in splnet.
 1330  */
 1331 static int
 1332 fv_checkflow(struct flowvalve *fv, struct altq_pktattr *pktattr,
 1333     struct fve **fcache)
 1334 {
 1335         struct fve *fve;
 1336         struct timeval now;
 1337 
 1338         fv->fv_ifseq++;
 1339         FV_TIMESTAMP(&now);
 1340 
 1341         if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
 1342                 /* no matching entry in the flowlist */
 1343                 return (0);
 1344 
 1345         *fcache = fve;
 1346 
 1347         /* update fraction f for every FV_N packets */
 1348         if (++fve->fve_count == FV_N) {
 1349                 /*
 1350                  * f = Wf * N / (fv_ifseq - fve_ifseq) + (1 - Wf) * f
 1351                  */
 1352                 fve->fve_f =
 1353                         (FV_N << FP_SHIFT) / (fv->fv_ifseq - fve->fve_ifseq)
 1354                         + fve->fve_f - FV_FUNSCALE(fve->fve_f);
 1355                 fve->fve_ifseq = fv->fv_ifseq;
 1356                 fve->fve_count = 0;
 1357         }
 1358 
 1359         /*
 1360          * overpumping test
 1361          */
 1362         if (fve->fve_state == Green && fve->fve_p > fv->fv_pthresh) {
 1363                 int fthresh;
 1364 
 1365                 /* calculate a threshold */
 1366                 fthresh = fv_p2f(fv, fve->fve_p);
 1367                 if (fve->fve_f > fthresh)
 1368                         fve->fve_state = Red;
 1369         }
 1370 
 1371         if (fve->fve_state == Red) {
 1372                 /*
 1373                  * backoff test
 1374                  */
 1375                 if (now.tv_sec - fve->fve_lastdrop.tv_sec > FV_BACKOFFTHRESH) {
 1376                         /* no drop for at least FV_BACKOFFTHRESH sec */
 1377                         fve->fve_p = 0;
 1378                         fve->fve_state = Green;
 1379 #ifdef FV_STATS
 1380                         fv->fv_stats.escape++;
 1381 #endif
 1382                 } else {
 1383                         /* block this flow */
 1384                         flowlist_move_to_head(fv, fve);
 1385                         fve->fve_lastdrop = now;
 1386 #ifdef FV_STATS
 1387                         fv->fv_stats.predrop++;
 1388 #endif
 1389                         return (1);
 1390                 }
 1391         }
 1392 
 1393         /*
 1394          * p = (1 - Wp) * p
 1395          */
 1396         fve->fve_p -= FV_PUNSCALE(fve->fve_p);
 1397         if (fve->fve_p < 0)
 1398                 fve->fve_p = 0;
 1399 #ifdef FV_STATS
 1400         fv->fv_stats.pass++;
 1401 #endif
 1402         return (0);
 1403 }
 1404 
 1405 /*
 1406  * called from red_addq when a packet is dropped by red.
 1407  * should be called in splnet.
 1408  */
 1409 static void
 1410 fv_dropbyred(struct flowvalve *fv, struct altq_pktattr *pktattr,
 1411     struct fve *fcache)
 1412 {
 1413         struct fve *fve;
 1414         struct timeval now;
 1415 
 1416         if (pktattr == NULL)
 1417                 return;
 1418         FV_TIMESTAMP(&now);
 1419 
 1420         if (fcache != NULL)
 1421                 /* the fve of this packet is already cached */
 1422                 fve = fcache;
 1423         else if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
 1424                 fve = flowlist_reclaim(fv, pktattr);
 1425 
 1426         flowlist_move_to_head(fv, fve);
 1427 
 1428         /*
 1429          * update p:  the following line cancels the update
 1430          *            in fv_checkflow() and calculate
 1431          *      p = Wp + (1 - Wp) * p
 1432          */
 1433         fve->fve_p = (1 << FP_SHIFT) + fve->fve_p;
 1434 
 1435         fve->fve_lastdrop = now;
 1436 }
 1437 
 1438 #endif /* ALTQ_FLOWVALVE */
 1439 
 1440 #ifdef KLD_MODULE
 1441 
 1442 static struct altqsw red_sw =
 1443         {"red", redopen, redclose, redioctl};
 1444 
 1445 ALTQ_MODULE(altq_red, ALTQT_RED, &red_sw);
 1446 MODULE_VERSION(altq_red, 1);
 1447 
 1448 #endif /* KLD_MODULE */
 1449 #endif /* ALTQ3_COMPAT */
 1450 
 1451 #endif /* ALTQ_RED */

Cache object: ecddd6cde00cb513a8cbd532b1b6c86b


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.