ip_dn_io.c

Version: - FREEBSD - FREEBSD-13-STABLE - FREEBSD-13-0 - FREEBSD-12-STABLE - FREEBSD-12-0 - FREEBSD-11-STABLE - FREEBSD-11-0 - FREEBSD-10-STABLE - FREEBSD-10-0 - FREEBSD-9-STABLE - FREEBSD-9-0 - FREEBSD-8-STABLE - FREEBSD-8-0 - FREEBSD-7-STABLE - FREEBSD-7-0 - FREEBSD-6-STABLE - FREEBSD-6-0 - FREEBSD-5-STABLE - FREEBSD-5-0 - FREEBSD-4-STABLE - FREEBSD-3-STABLE - FREEBSD22 - l41 - OPENBSD - linux-2.6 - MK84 - PLAN9 - xnu-8792
SearchContext: - none - 3 - 10
    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa
    5  * All rights reserved
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  * 1. Redistributions of source code must retain the above copyright
   11  *    notice, this list of conditions and the following disclaimer.
   12  * 2. Redistributions in binary form must reproduce the above copyright
   13  *    notice, this list of conditions and the following disclaimer in the
   14  *    documentation and/or other materials provided with the distribution.
   15  *
   16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   26  * SUCH DAMAGE.
   27  */
   28 
   29 /*
   30  * Dummynet portions related to packet handling.
   31  */
   32 #include <sys/cdefs.h>
   33 __FBSDID("$FreeBSD$");
   34 
   35 #include "opt_inet6.h"
   36 
   37 #include <sys/param.h>
   38 #include <sys/systm.h>
   39 #include <sys/malloc.h>
   40 #include <sys/mbuf.h>
   41 #include <sys/kernel.h>
   42 #include <sys/lock.h>
   43 #include <sys/module.h>
   44 #include <sys/mutex.h>
   45 #include <sys/priv.h>
   46 #include <sys/proc.h>
   47 #include <sys/rwlock.h>
   48 #include <sys/socket.h>
   49 #include <sys/time.h>
   50 #include <sys/sysctl.h>
   51 
   52 #include <net/if.h>     /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
   53 #include <net/if_var.h> /* NET_EPOCH_... */
   54 #include <net/netisr.h>
   55 #include <net/vnet.h>
   56 
   57 #include <netinet/in.h>
   58 #include <netinet/ip.h>         /* ip_len, ip_off */
   59 #include <netinet/ip_var.h>     /* ip_output(), IP_FORWARDING */
   60 #include <netinet/ip_fw.h>
   61 #include <netinet/ip_dummynet.h>
   62 #include <netinet/if_ether.h> /* various ether_* routines */
   63 #include <netinet/ip6.h>       /* for ip6_input, ip6_output prototypes */
   64 #include <netinet6/ip6_var.h>
   65 
   66 #include <netpfil/ipfw/ip_fw_private.h>
   67 #include <netpfil/ipfw/dn_heap.h>
   68 #include <netpfil/ipfw/ip_dn_private.h>
   69 #ifdef NEW_AQM
   70 #include <netpfil/ipfw/dn_aqm.h>
   71 #endif
   72 #include <netpfil/ipfw/dn_sched.h>
   73 
   74 /*
   75  * We keep a private variable for the simulation time, but we could
   76  * probably use an existing one ("softticks" in sys/kern/kern_timeout.c)
   77  * instead of V_dn_cfg.curr_time
   78  */
   79 VNET_DEFINE(struct dn_parms, dn_cfg);
   80 #define V_dn_cfg VNET(dn_cfg)
   81 
   82 /*
   83  * We use a heap to store entities for which we have pending timer events.
   84  * The heap is checked at every tick and all entities with expired events
   85  * are extracted.
   86  */
   87   
   88 MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap");
   89 
   90 extern  void (*bridge_dn_p)(struct mbuf *, struct ifnet *);
   91 
   92 #ifdef SYSCTL_NODE
   93 
   94 /*
   95  * Because of the way the SYSBEGIN/SYSEND macros work on other
   96  * platforms, there should not be functions between them.
   97  * So keep the handlers outside the block.
   98  */
   99 static int
  100 sysctl_hash_size(SYSCTL_HANDLER_ARGS)
  101 {
  102         int error, value;
  103 
  104         value = V_dn_cfg.hash_size;
  105         error = sysctl_handle_int(oidp, &value, 0, req);
  106         if (error != 0 || req->newptr == NULL)
  107                 return (error);
  108         if (value < 16 || value > 65536)
  109                 return (EINVAL);
  110         V_dn_cfg.hash_size = value;
  111         return (0);
  112 }
  113 
  114 static int
  115 sysctl_limits(SYSCTL_HANDLER_ARGS)
  116 {
  117         int error;
  118         long value;
  119 
  120         if (arg2 != 0)
  121                 value = V_dn_cfg.slot_limit;
  122         else
  123                 value = V_dn_cfg.byte_limit;
  124         error = sysctl_handle_long(oidp, &value, 0, req);
  125 
  126         if (error != 0 || req->newptr == NULL)
  127                 return (error);
  128         if (arg2 != 0) {
  129                 if (value < 1)
  130                         return (EINVAL);
  131                 V_dn_cfg.slot_limit = value;
  132         } else {
  133                 if (value < 1500)
  134                         return (EINVAL);
  135                 V_dn_cfg.byte_limit = value;
  136         }
  137         return (0);
  138 }
  139 
  140 SYSBEGIN(f4)
  141 
  142 SYSCTL_DECL(_net_inet);
  143 SYSCTL_DECL(_net_inet_ip);
  144 #ifdef NEW_AQM
  145 SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  146     "Dummynet");
  147 #else
  148 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet,
  149     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  150     "Dummynet");
  151 #endif
  152 
  153 /* wrapper to pass V_dn_cfg fields to SYSCTL_* */
  154 #define DC(x)   (&(VNET_NAME(dn_cfg).x))
  155 
  156 /* parameters */
  157 
  158 SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, hash_size,
  159     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
  160     0, 0, sysctl_hash_size, "I",
  161     "Default hash table size");
  162 
  163 SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit,
  164     CTLTYPE_LONG | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
  165     0, 1, sysctl_limits, "L",
  166     "Upper limit in slots for pipe queue.");
  167 SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit,
  168     CTLTYPE_LONG | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
  169     0, 0, sysctl_limits, "L",
  170     "Upper limit in bytes for pipe queue.");
  171 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast,
  172     CTLFLAG_RW | CTLFLAG_VNET, DC(io_fast), 0, "Enable fast dummynet io.");
  173 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug,
  174     CTLFLAG_RW | CTLFLAG_VNET, DC(debug), 0, "Dummynet debug level");
  175 
  176 /* RED parameters */
  177 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
  178     CTLFLAG_RD | CTLFLAG_VNET, DC(red_lookup_depth), 0, "Depth of RED lookup table");
  179 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
  180     CTLFLAG_RD | CTLFLAG_VNET, DC(red_avg_pkt_size), 0, "RED Medium packet size");
  181 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
  182     CTLFLAG_RD | CTLFLAG_VNET, DC(red_max_pkt_size), 0, "RED Max packet size");
  183 
  184 /* time adjustment */
  185 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta,
  186     CTLFLAG_RD | CTLFLAG_VNET, DC(tick_delta), 0, "Last vs standard tick difference (usec).");
  187 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum,
  188     CTLFLAG_RD | CTLFLAG_VNET, DC(tick_delta_sum), 0, "Accumulated tick difference (usec).");
  189 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment,
  190     CTLFLAG_RD | CTLFLAG_VNET, DC(tick_adjustment), 0, "Tick adjustments done.");
  191 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff,
  192     CTLFLAG_RD | CTLFLAG_VNET, DC(tick_diff), 0,
  193     "Adjusted vs non-adjusted curr_time difference (ticks).");
  194 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost,
  195     CTLFLAG_RD | CTLFLAG_VNET, DC(tick_lost), 0,
  196     "Number of ticks coalesced by dummynet taskqueue.");
  197 
  198 /* Drain parameters */
  199 SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire,
  200     CTLFLAG_RW | CTLFLAG_VNET, DC(expire), 0, "Expire empty queues/pipes");
  201 SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire_cycle,
  202     CTLFLAG_RD | CTLFLAG_VNET, DC(expire_cycle), 0, "Expire cycle for queues/pipes");
  203 
  204 /* statistics */
  205 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, schk_count,
  206     CTLFLAG_RD | CTLFLAG_VNET, DC(schk_count), 0, "Number of schedulers");
  207 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, si_count,
  208     CTLFLAG_RD | CTLFLAG_VNET, DC(si_count), 0, "Number of scheduler instances");
  209 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, fsk_count,
  210     CTLFLAG_RD | CTLFLAG_VNET, DC(fsk_count), 0, "Number of flowsets");
  211 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, queue_count,
  212     CTLFLAG_RD | CTLFLAG_VNET, DC(queue_count), 0, "Number of queues");
  213 SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt,
  214     CTLFLAG_RD | CTLFLAG_VNET, DC(io_pkt), 0,
  215     "Number of packets passed to dummynet.");
  216 SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast,
  217     CTLFLAG_RD | CTLFLAG_VNET, DC(io_pkt_fast), 0,
  218     "Number of packets bypassed dummynet scheduler.");
  219 SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop,
  220     CTLFLAG_RD | CTLFLAG_VNET, DC(io_pkt_drop), 0,
  221     "Number of packets dropped by dummynet.");
  222 #undef DC
  223 SYSEND
  224 
  225 #endif
  226 
  227 static void     dummynet_send(struct mbuf *);
  228 
  229 /*
  230  * Return the mbuf tag holding the dummynet state (it should
  231  * be the first one on the list).
  232  */
  233 struct dn_pkt_tag *
  234 dn_tag_get(struct mbuf *m)
  235 {
  236         struct m_tag *mtag = m_tag_first(m);
  237 #ifdef NEW_AQM
  238         /* XXX: to skip ts m_tag. For Debugging only*/
  239         if (mtag != NULL && mtag->m_tag_id == DN_AQM_MTAG_TS) {
  240                 m_tag_delete(m,mtag); 
  241                 mtag = m_tag_first(m);
  242                 D("skip TS tag");
  243         }
  244 #endif
  245         KASSERT(mtag != NULL &&
  246             mtag->m_tag_cookie == MTAG_ABI_COMPAT &&
  247             mtag->m_tag_id == PACKET_TAG_DUMMYNET,
  248             ("packet on dummynet queue w/o dummynet tag!"));
  249         return (struct dn_pkt_tag *)(mtag+1);
  250 }
  251 
  252 #ifndef NEW_AQM
  253 static inline void
  254 mq_append(struct mq *q, struct mbuf *m)
  255 {
  256 #ifdef USERSPACE
  257         // buffers from netmap need to be copied
  258         // XXX note that the routine is not expected to fail
  259         ND("append %p to %p", m, q);
  260         if (m->m_flags & M_STACK) {
  261                 struct mbuf *m_new;
  262                 void *p;
  263                 int l, ofs;
  264 
  265                 ofs = m->m_data - m->__m_extbuf;
  266                 // XXX allocate
  267                 MGETHDR(m_new, M_NOWAIT, MT_DATA);
  268                 ND("*** WARNING, volatile buf %p ext %p %d dofs %d m_new %p",
  269                         m, m->__m_extbuf, m->__m_extlen, ofs, m_new);
  270                 p = m_new->__m_extbuf;  /* new pointer */
  271                 l = m_new->__m_extlen;  /* new len */
  272                 if (l <= m->__m_extlen) {
  273                         panic("extlen too large");
  274                 }
  275 
  276                 *m_new = *m;    // copy
  277                 m_new->m_flags &= ~M_STACK;
  278                 m_new->__m_extbuf = p; // point to new buffer
  279                 _pkt_copy(m->__m_extbuf, p, m->__m_extlen);
  280                 m_new->m_data = p + ofs;
  281                 m = m_new;
  282         }
  283 #endif /* USERSPACE */
  284         if (q->head == NULL)
  285                 q->head = m;
  286         else
  287                 q->tail->m_nextpkt = m;
  288         q->count++;
  289         q->tail = m;
  290         m->m_nextpkt = NULL;
  291 }
  292 #endif
  293 
  294 /*
  295  * Dispose a list of packet. Use a functions so if we need to do
  296  * more work, this is a central point to do it.
  297  */
  298 void dn_free_pkts(struct mbuf *mnext)
  299 {
  300         struct mbuf *m;
  301     
  302         while ((m = mnext) != NULL) {
  303                 mnext = m->m_nextpkt;
  304                 FREE_PKT(m);
  305         }
  306 }
  307 
  308 static int
  309 red_drops (struct dn_queue *q, int len)
  310 {
  311         /*
  312          * RED algorithm
  313          *
  314          * RED calculates the average queue size (avg) using a low-pass filter
  315          * with an exponential weighted (w_q) moving average:
  316          *      avg  <-  (1-w_q) * avg + w_q * q_size
  317          * where q_size is the queue length (measured in bytes or * packets).
  318          *
  319          * If q_size == 0, we compute the idle time for the link, and set
  320          *      avg = (1 - w_q)^(idle/s)
  321          * where s is the time needed for transmitting a medium-sized packet.
  322          *
  323          * Now, if avg < min_th the packet is enqueued.
  324          * If avg > max_th the packet is dropped. Otherwise, the packet is
  325          * dropped with probability P function of avg.
  326          */
  327 
  328         struct dn_fsk *fs = q->fs;
  329         int64_t p_b = 0;
  330 
  331         /* Queue in bytes or packets? */
  332         uint32_t q_size = (fs->fs.flags & DN_QSIZE_BYTES) ?
  333             q->ni.len_bytes : q->ni.length;
  334 
  335         /* Average queue size estimation. */
  336         if (q_size != 0) {
  337                 /* Queue is not empty, avg <- avg + (q_size - avg) * w_q */
  338                 int diff = SCALE(q_size) - q->avg;
  339                 int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q);
  340 
  341                 q->avg += (int)v;
  342         } else {
  343                 /*
  344                  * Queue is empty, find for how long the queue has been
  345                  * empty and use a lookup table for computing
  346                  * (1 - * w_q)^(idle_time/s) where s is the time to send a
  347                  * (small) packet.
  348                  * XXX check wraps...
  349                  */
  350                 if (q->avg) {
  351                         u_int t = div64((V_dn_cfg.curr_time - q->q_time), fs->lookup_step);
  352 
  353                         q->avg = (t < fs->lookup_depth) ?
  354                             SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
  355                 }
  356         }
  357 
  358         /* Should i drop? */
  359         if (q->avg < fs->min_th) {
  360                 q->count = -1;
  361                 return (0);     /* accept packet */
  362         }
  363         if (q->avg >= fs->max_th) {     /* average queue >=  max threshold */
  364                 if (fs->fs.flags & DN_IS_ECN)
  365                         return (1);
  366                 if (fs->fs.flags & DN_IS_GENTLE_RED) {
  367                         /*
  368                          * According to Gentle-RED, if avg is greater than
  369                          * max_th the packet is dropped with a probability
  370                          *       p_b = c_3 * avg - c_4
  371                          * where c_3 = (1 - max_p) / max_th
  372                          *       c_4 = 1 - 2 * max_p
  373                          */
  374                         p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) -
  375                             fs->c_4;
  376                 } else {
  377                         q->count = -1;
  378                         return (1);
  379                 }
  380         } else if (q->avg > fs->min_th) {
  381                 if (fs->fs.flags & DN_IS_ECN)
  382                         return (1);
  383                 /*
  384                  * We compute p_b using the linear dropping function
  385                  *       p_b = c_1 * avg - c_2
  386                  * where c_1 = max_p / (max_th - min_th)
  387                  *       c_2 = max_p * min_th / (max_th - min_th)
  388                  */
  389                 p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2;
  390         }
  391 
  392         if (fs->fs.flags & DN_QSIZE_BYTES)
  393                 p_b = div64((p_b * len) , fs->max_pkt_size);
  394         if (++q->count == 0)
  395                 q->random = random() & 0xffff;
  396         else {
  397                 /*
  398                  * q->count counts packets arrived since last drop, so a greater
  399                  * value of q->count means a greater packet drop probability.
  400                  */
  401                 if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) {
  402                         q->count = 0;
  403                         /* After a drop we calculate a new random value. */
  404                         q->random = random() & 0xffff;
  405                         return (1);     /* drop */
  406                 }
  407         }
  408         /* End of RED algorithm. */
  409 
  410         return (0);     /* accept */
  411 
  412 }
  413 
  414 /*
  415  * ECN/ECT Processing (partially adopted from altq)
  416  */
  417 #ifndef NEW_AQM
  418 static
  419 #endif
  420 int
  421 ecn_mark(struct mbuf* m)
  422 {
  423         struct ip *ip;
  424         ip = (struct ip *)mtodo(m, dn_tag_get(m)->iphdr_off);
  425 
  426         switch (ip->ip_v) {
  427         case IPVERSION:
  428         {
  429                 uint16_t old;
  430 
  431                 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
  432                         return (0);     /* not-ECT */
  433                 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
  434                         return (1);     /* already marked */
  435 
  436                 /*
  437                  * ecn-capable but not marked,
  438                  * mark CE and update checksum
  439                  */
  440                 old = *(uint16_t *)ip;
  441                 ip->ip_tos |= IPTOS_ECN_CE;
  442                 ip->ip_sum = cksum_adjust(ip->ip_sum, old, *(uint16_t *)ip);
  443                 return (1);
  444         }
  445 #ifdef INET6
  446         case (IPV6_VERSION >> 4):
  447         {
  448                 struct ip6_hdr *ip6 = (struct ip6_hdr *)ip;
  449                 u_int32_t flowlabel;
  450 
  451                 flowlabel = ntohl(ip6->ip6_flow);
  452                 if ((flowlabel >> 28) != 6)
  453                         return (0);     /* version mismatch! */
  454                 if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
  455                     (IPTOS_ECN_NOTECT << 20))
  456                         return (0);     /* not-ECT */
  457                 if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
  458                     (IPTOS_ECN_CE << 20))
  459                         return (1);     /* already marked */
  460                 /*
  461                  * ecn-capable but not marked, mark CE
  462                  */
  463                 flowlabel |= (IPTOS_ECN_CE << 20);
  464                 ip6->ip6_flow = htonl(flowlabel);
  465                 return (1);
  466         }
  467 #endif
  468         }
  469         return (0);
  470 }
  471 
  472 /*
  473  * Enqueue a packet in q, subject to space and queue management policy
  474  * (whose parameters are in q->fs).
  475  * Update stats for the queue and the scheduler.
  476  * Return 0 on success, 1 on drop. The packet is consumed anyways.
  477  */
  478 int
  479 dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop)
  480 {   
  481         struct dn_fs *f;
  482         struct dn_flow *ni;     /* stats for scheduler instance */
  483         uint64_t len;
  484 
  485         if (q->fs == NULL || q->_si == NULL) {
  486                 printf("%s fs %p si %p, dropping\n",
  487                         __FUNCTION__, q->fs, q->_si);
  488                 FREE_PKT(m);
  489                 return 1;
  490         }
  491         f = &(q->fs->fs);
  492         ni = &q->_si->ni;
  493         len = m->m_pkthdr.len;
  494         /* Update statistics, then check reasons to drop pkt. */
  495         q->ni.tot_bytes += len;
  496         q->ni.tot_pkts++;
  497         ni->tot_bytes += len;
  498         ni->tot_pkts++;
  499         if (drop)
  500                 goto drop;
  501         if (f->plr && random() < f->plr)
  502                 goto drop;
  503         if (m->m_pkthdr.rcvif != NULL)
  504                 m_rcvif_serialize(m);
  505 #ifdef NEW_AQM
  506         /* Call AQM enqueue function */
  507         if (q->fs->aqmfp)
  508                 return q->fs->aqmfp->enqueue(q ,m);
  509 #endif
  510         if (f->flags & DN_IS_RED && red_drops(q, m->m_pkthdr.len)) {
  511                 if (!(f->flags & DN_IS_ECN) || !ecn_mark(m))
  512                         goto drop;
  513         }
  514         if (f->flags & DN_QSIZE_BYTES) {
  515                 if (q->ni.len_bytes > f->qsize)
  516                         goto drop;
  517         } else if (q->ni.length >= f->qsize) {
  518                 goto drop;
  519         }
  520         mq_append(&q->mq, m);
  521         q->ni.length++;
  522         q->ni.len_bytes += len;
  523         ni->length++;
  524         ni->len_bytes += len;
  525         return (0);
  526 
  527 drop:
  528         V_dn_cfg.io_pkt_drop++;
  529         q->ni.drops++;
  530         ni->drops++;
  531         FREE_PKT(m);
  532         return (1);
  533 }
  534 
  535 /*
  536  * Fetch packets from the delay line which are due now. If there are
  537  * leftover packets, reinsert the delay line in the heap.
  538  * Runs under scheduler lock.
  539  */
  540 static void
  541 transmit_event(struct mq *q, struct delay_line *dline, uint64_t now)
  542 {
  543         struct mbuf *m;
  544         struct dn_pkt_tag *pkt = NULL;
  545 
  546         dline->oid.subtype = 0; /* not in heap */
  547         while ((m = dline->mq.head) != NULL) {
  548                 pkt = dn_tag_get(m);
  549                 if (!DN_KEY_LEQ(pkt->output_time, now))
  550                         break;
  551                 dline->mq.head = m->m_nextpkt;
  552                 dline->mq.count--;
  553                 if (m->m_pkthdr.rcvif != NULL &&
  554                   __predict_false(m_rcvif_restore(m) == NULL))
  555                         m_freem(m);
  556                 else
  557                         mq_append(q, m);
  558         }
  559         if (m != NULL) {
  560                 dline->oid.subtype = 1; /* in heap */
  561                 heap_insert(&V_dn_cfg.evheap, pkt->output_time, dline);
  562         }
  563 }
  564 
  565 /*
  566  * Convert the additional MAC overheads/delays into an equivalent
  567  * number of bits for the given data rate. The samples are
  568  * in milliseconds so we need to divide by 1000.
  569  */
  570 static uint64_t
  571 extra_bits(struct mbuf *m, struct dn_schk *s)
  572 {
  573         int index;
  574         uint64_t bits;
  575         struct dn_profile *pf = s->profile;
  576 
  577         if (!pf || pf->samples_no == 0)
  578                 return 0;
  579         index  = random() % pf->samples_no;
  580         bits = div64((uint64_t)pf->samples[index] * s->link.bandwidth, 1000);
  581         if (index >= pf->loss_level) {
  582                 struct dn_pkt_tag *dt = dn_tag_get(m);
  583                 if (dt)
  584                         dt->dn_dir = DIR_DROP;
  585         }
  586         return bits;
  587 }
  588 
  589 /*
  590  * Send traffic from a scheduler instance due by 'now'.
  591  * Return a pointer to the head of the queue.
  592  */
  593 static struct mbuf *
  594 serve_sched(struct mq *q, struct dn_sch_inst *si, uint64_t now)
  595 {
  596         struct mq def_q;
  597         struct dn_schk *s = si->sched;
  598         struct mbuf *m = NULL;
  599         int delay_line_idle = (si->dline.mq.head == NULL);
  600         int done;
  601         uint32_t bw;
  602 
  603         if (q == NULL) {
  604                 q = &def_q;
  605                 q->head = NULL;
  606         }
  607 
  608         bw = s->link.bandwidth;
  609         si->kflags &= ~DN_ACTIVE;
  610 
  611         if (bw > 0)
  612                 si->credit += (now - si->sched_time) * bw;
  613         else
  614                 si->credit = 0;
  615         si->sched_time = now;
  616         done = 0;
  617         while (si->credit >= 0 && (m = s->fp->dequeue(si)) != NULL) {
  618                 uint64_t len_scaled;
  619 
  620                 done++;
  621                 len_scaled = (bw == 0) ? 0 : hz *
  622                         (m->m_pkthdr.len * 8 + extra_bits(m, s));
  623                 si->credit -= len_scaled;
  624                 /* Move packet in the delay line */
  625                 dn_tag_get(m)->output_time = V_dn_cfg.curr_time + s->link.delay ;
  626                 if (m->m_pkthdr.rcvif != NULL)
  627                         m_rcvif_serialize(m);
  628                 mq_append(&si->dline.mq, m);
  629         }
  630 
  631         /*
  632          * If credit >= 0 the instance is idle, mark time.
  633          * Otherwise put back in the heap, and adjust the output
  634          * time of the last inserted packet, m, which was too early.
  635          */
  636         if (si->credit >= 0) {
  637                 si->idle_time = now;
  638         } else {
  639                 uint64_t t;
  640                 KASSERT (bw > 0, ("bw=0 and credit<0 ?"));
  641                 t = div64(bw - 1 - si->credit, bw);
  642                 if (m)
  643                         dn_tag_get(m)->output_time += t;
  644                 si->kflags |= DN_ACTIVE;
  645                 heap_insert(&V_dn_cfg.evheap, now + t, si);
  646         }
  647         if (delay_line_idle && done)
  648                 transmit_event(q, &si->dline, now);
  649         return q->head;
  650 }
  651 
  652 /*
  653  * The timer handler for dummynet. Time is computed in ticks, but
  654  * but the code is tolerant to the actual rate at which this is called.
  655  * Once complete, the function reschedules itself for the next tick.
  656  */
  657 void
  658 dummynet_task(void *context, int pending)
  659 {
  660         struct timeval t;
  661         struct mq q = { NULL, NULL }; /* queue to accumulate results */
  662         struct epoch_tracker et;
  663 
  664         VNET_ITERATOR_DECL(vnet_iter);
  665         VNET_LIST_RLOCK();
  666         NET_EPOCH_ENTER(et);
  667 
  668         VNET_FOREACH(vnet_iter) {
  669                 memset(&q, 0, sizeof(struct mq));
  670                 CURVNET_SET(vnet_iter);
  671 
  672                 if (! V_dn_cfg.init_done) {
  673                         CURVNET_RESTORE();
  674                         continue;
  675                 }
  676 
  677                 DN_BH_WLOCK();
  678 
  679                 /* Update number of lost(coalesced) ticks. */
  680                 V_dn_cfg.tick_lost += pending - 1;
  681 
  682                 getmicrouptime(&t);
  683                 /* Last tick duration (usec). */
  684                 V_dn_cfg.tick_last = (t.tv_sec - V_dn_cfg.prev_t.tv_sec) * 1000000 +
  685                 (t.tv_usec - V_dn_cfg.prev_t.tv_usec);
  686                 /* Last tick vs standard tick difference (usec). */
  687                 V_dn_cfg.tick_delta = (V_dn_cfg.tick_last * hz - 1000000) / hz;
  688                 /* Accumulated tick difference (usec). */
  689                 V_dn_cfg.tick_delta_sum += V_dn_cfg.tick_delta;
  690 
  691                 V_dn_cfg.prev_t = t;
  692 
  693                 /*
  694                 * Adjust curr_time if the accumulated tick difference is
  695                 * greater than the 'standard' tick. Since curr_time should
  696                 * be monotonically increasing, we do positive adjustments
  697                 * as required, and throttle curr_time in case of negative
  698                 * adjustment.
  699                 */
  700                 V_dn_cfg.curr_time++;
  701                 if (V_dn_cfg.tick_delta_sum - tick >= 0) {
  702                         int diff = V_dn_cfg.tick_delta_sum / tick;
  703 
  704                         V_dn_cfg.curr_time += diff;
  705                         V_dn_cfg.tick_diff += diff;
  706                         V_dn_cfg.tick_delta_sum %= tick;
  707                         V_dn_cfg.tick_adjustment++;
  708                 } else if (V_dn_cfg.tick_delta_sum + tick <= 0) {
  709                         V_dn_cfg.curr_time--;
  710                         V_dn_cfg.tick_diff--;
  711                         V_dn_cfg.tick_delta_sum += tick;
  712                         V_dn_cfg.tick_adjustment++;
  713                 }
  714 
  715                 /* serve pending events, accumulate in q */
  716                 for (;;) {
  717                         struct dn_id *p;    /* generic parameter to handler */
  718 
  719                         if (V_dn_cfg.evheap.elements == 0 ||
  720                             DN_KEY_LT(V_dn_cfg.curr_time, HEAP_TOP(&V_dn_cfg.evheap)->key))
  721                                 break;
  722                         p = HEAP_TOP(&V_dn_cfg.evheap)->object;
  723                         heap_extract(&V_dn_cfg.evheap, NULL);
  724                         if (p->type == DN_SCH_I) {
  725                                 serve_sched(&q, (struct dn_sch_inst *)p, V_dn_cfg.curr_time);
  726                         } else { /* extracted a delay line */
  727                                 transmit_event(&q, (struct delay_line *)p, V_dn_cfg.curr_time);
  728                         }
  729                 }
  730                 if (V_dn_cfg.expire && ++V_dn_cfg.expire_cycle >= V_dn_cfg.expire) {
  731                         V_dn_cfg.expire_cycle = 0;
  732                         dn_drain_scheduler();
  733                         dn_drain_queue();
  734                 }
  735                 DN_BH_WUNLOCK();
  736                 if (q.head != NULL)
  737                         dummynet_send(q.head);
  738 
  739                 CURVNET_RESTORE();
  740         }
  741         NET_EPOCH_EXIT(et);
  742         VNET_LIST_RUNLOCK();
  743 
  744         /* Schedule our next run. */
  745         dn_reschedule();
  746 }
  747 
  748 /*
  749  * forward a chain of packets to the proper destination.
  750  * This runs outside the dummynet lock.
  751  */
  752 static void
  753 dummynet_send(struct mbuf *m)
  754 {
  755         struct mbuf *n;
  756 
  757         NET_EPOCH_ASSERT();
  758 
  759         for (; m != NULL; m = n) {
  760                 struct ifnet *ifp = NULL;       /* gcc 3.4.6 complains */
  761                 struct m_tag *tag;
  762                 int dst;
  763 
  764                 n = m->m_nextpkt;
  765                 m->m_nextpkt = NULL;
  766                 tag = m_tag_first(m);
  767                 if (tag == NULL) { /* should not happen */
  768                         dst = DIR_DROP;
  769                 } else {
  770                         struct dn_pkt_tag *pkt = dn_tag_get(m);
  771                         /* extract the dummynet info, rename the tag
  772                          * to carry reinject info.
  773                          */
  774                         ifp = ifnet_byindexgen(pkt->if_index, pkt->if_idxgen);
  775                         if (((pkt->dn_dir == (DIR_OUT | PROTO_LAYER2)) ||
  776                             (pkt->dn_dir == (DIR_OUT | PROTO_LAYER2 | PROTO_IPV6))) &&
  777                                 ifp == NULL) {
  778                                 dst = DIR_DROP;
  779                         } else {
  780                                 dst = pkt->dn_dir;
  781                                 tag->m_tag_cookie = MTAG_IPFW_RULE;
  782                                 tag->m_tag_id = 0;
  783                         }
  784                 }
  785 
  786                 switch (dst) {
  787                 case DIR_OUT:
  788                         ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
  789                         break ;
  790 
  791                 case DIR_IN :
  792                         netisr_dispatch(NETISR_IP, m);
  793                         break;
  794 
  795 #ifdef INET6
  796                 case DIR_IN | PROTO_IPV6:
  797                         netisr_dispatch(NETISR_IPV6, m);
  798                         break;
  799 
  800                 case DIR_OUT | PROTO_IPV6:
  801                         ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL);
  802                         break;
  803 #endif
  804 
  805                 case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */
  806                         if (bridge_dn_p != NULL)
  807                                 ((*bridge_dn_p)(m, ifp));
  808                         else
  809                                 printf("dummynet: if_bridge not loaded\n");
  810 
  811                         break;
  812 
  813                 case DIR_IN | PROTO_LAYER2 | PROTO_IPV6:
  814                 case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */
  815                         /*
  816                          * The Ethernet code assumes the Ethernet header is
  817                          * contiguous in the first mbuf header.
  818                          * Insure this is true.
  819                          */
  820                         if (m->m_len < ETHER_HDR_LEN &&
  821                             (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
  822                                 printf("dummynet/ether: pullup failed, "
  823                                     "dropping packet\n");
  824                                 break;
  825                         }
  826                         ether_demux(m->m_pkthdr.rcvif, m);
  827                         break;
  828 
  829                 case DIR_OUT | PROTO_LAYER2 | PROTO_IPV6:
  830                 case DIR_OUT | PROTO_LAYER2: /* DN_TO_ETH_OUT: */
  831                         MPASS(ifp != NULL);
  832                         ether_output_frame(ifp, m);
  833                         break;
  834 
  835                 case DIR_DROP:
  836                         /* drop the packet after some time */
  837                         FREE_PKT(m);
  838                         break;
  839 
  840                 default:
  841                         printf("dummynet: bad switch %d!\n", dst);
  842                         FREE_PKT(m);
  843                         break;
  844                 }
  845         }
  846 }
  847 
  848 static inline int
  849 tag_mbuf(struct mbuf *m, int dir, struct ip_fw_args *fwa)
  850 {
  851         struct dn_pkt_tag *dt;
  852         struct m_tag *mtag;
  853 
  854         mtag = m_tag_get(PACKET_TAG_DUMMYNET,
  855                     sizeof(*dt), M_NOWAIT | M_ZERO);
  856         if (mtag == NULL)
  857                 return 1;               /* Cannot allocate packet header. */
  858         m_tag_prepend(m, mtag);         /* Attach to mbuf chain. */
  859         dt = (struct dn_pkt_tag *)(mtag + 1);
  860         dt->rule = fwa->rule;
  861         /* only keep this info */
  862         dt->rule.info &= (IPFW_ONEPASS | IPFW_IS_DUMMYNET);
  863         dt->dn_dir = dir;
  864         if (fwa->flags & IPFW_ARGS_OUT && fwa->ifp != NULL) {
  865                 NET_EPOCH_ASSERT();
  866                 dt->if_index = fwa->ifp->if_index;
  867                 dt->if_idxgen = fwa->ifp->if_idxgen;
  868         }
  869         /* dt->output tame is updated as we move through */
  870         dt->output_time = V_dn_cfg.curr_time;
  871         dt->iphdr_off = (dir & PROTO_LAYER2) ? ETHER_HDR_LEN : 0;
  872         return 0;
  873 }
  874 
  875 /*
  876  * dummynet hook for packets.
  877  * We use the argument to locate the flowset fs and the sched_set sch
  878  * associated to it. The we apply flow_mask and sched_mask to
  879  * determine the queue and scheduler instances.
  880  */
  881 int
  882 dummynet_io(struct mbuf **m0, struct ip_fw_args *fwa)
  883 {
  884         struct mbuf *m = *m0;
  885         struct dn_fsk *fs = NULL;
  886         struct dn_sch_inst *si;
  887         struct dn_queue *q = NULL;      /* default */
  888         int fs_id, dir;
  889 
  890         fs_id = (fwa->rule.info & IPFW_INFO_MASK) +
  891                 ((fwa->rule.info & IPFW_IS_PIPE) ? 2*DN_MAX_ID : 0);
  892         /* XXXGL: convert args to dir */
  893         if (fwa->flags & IPFW_ARGS_IN)
  894                 dir = DIR_IN;
  895         else
  896                 dir = DIR_OUT;
  897         if (fwa->flags & IPFW_ARGS_ETHER)
  898                 dir |= PROTO_LAYER2;
  899         else if (fwa->flags & IPFW_ARGS_IP6)
  900                 dir |= PROTO_IPV6;
  901         DN_BH_WLOCK();
  902         V_dn_cfg.io_pkt++;
  903         /* we could actually tag outside the lock, but who cares... */
  904         if (tag_mbuf(m, dir, fwa))
  905                 goto dropit;
  906         /* XXX locate_flowset could be optimised with a direct ref. */
  907         fs = dn_ht_find(V_dn_cfg.fshash, fs_id, 0, NULL);
  908         if (fs == NULL)
  909                 goto dropit;    /* This queue/pipe does not exist! */
  910         if (fs->sched == NULL)  /* should not happen */
  911                 goto dropit;
  912         /* find scheduler instance, possibly applying sched_mask */
  913         si = ipdn_si_find(fs->sched, &(fwa->f_id));
  914         if (si == NULL)
  915                 goto dropit;
  916         /*
  917          * If the scheduler supports multiple queues, find the right one
  918          * (otherwise it will be ignored by enqueue).
  919          */
  920         if (fs->sched->fp->flags & DN_MULTIQUEUE) {
  921                 q = ipdn_q_find(fs, si, &(fwa->f_id));
  922                 if (q == NULL)
  923                         goto dropit;
  924         }
  925         if (fs->sched->fp->enqueue(si, q, m)) {
  926                 /* packet was dropped by enqueue() */
  927                 m = *m0 = NULL;
  928 
  929                 /* dn_enqueue already increases io_pkt_drop */
  930                 V_dn_cfg.io_pkt_drop--;
  931 
  932                 goto dropit;
  933         }
  934 
  935         if (si->kflags & DN_ACTIVE) {
  936                 m = *m0 = NULL; /* consumed */
  937                 goto done; /* already active, nothing to do */
  938         }
  939 
  940         /* compute the initial allowance */
  941         if (si->idle_time < V_dn_cfg.curr_time) {
  942             /* Do this only on the first packet on an idle pipe */
  943             struct dn_link *p = &fs->sched->link;
  944 
  945             si->sched_time = V_dn_cfg.curr_time;
  946             si->credit = V_dn_cfg.io_fast ? p->bandwidth : 0;
  947             if (p->burst) {
  948                 uint64_t burst = (V_dn_cfg.curr_time - si->idle_time) * p->bandwidth;
  949                 if (burst > p->burst)
  950                         burst = p->burst;
  951                 si->credit += burst;
  952             }
  953         }
  954         /* pass through scheduler and delay line */
  955         m = serve_sched(NULL, si, V_dn_cfg.curr_time);
  956 
  957         /* optimization -- pass it back to ipfw for immediate send */
  958         /* XXX Don't call dummynet_send() if scheduler return the packet
  959          *     just enqueued. This avoid a lock order reversal.
  960          *     
  961          */
  962         if (/*V_dn_cfg.io_fast &&*/ m == *m0 && (dir & PROTO_LAYER2) == 0 ) {
  963                 /* fast io, rename the tag * to carry reinject info. */
  964                 struct m_tag *tag = m_tag_first(m);
  965 
  966                 tag->m_tag_cookie = MTAG_IPFW_RULE;
  967                 tag->m_tag_id = 0;
  968                 V_dn_cfg.io_pkt_fast++;
  969                 if (m->m_nextpkt != NULL) {
  970                         printf("dummynet: fast io: pkt chain detected!\n");
  971                         m->m_nextpkt = NULL;
  972                 }
  973                 m = NULL;
  974         } else {
  975                 *m0 = NULL;
  976         }
  977 done:
  978         DN_BH_WUNLOCK();
  979         if (m)
  980                 dummynet_send(m);
  981         return 0;
  982 
  983 dropit:
  984         V_dn_cfg.io_pkt_drop++;
  985         DN_BH_WUNLOCK();
  986         if (m)
  987                 FREE_PKT(m);
  988         *m0 = NULL;
  989         return (fs && (fs->fs.flags & DN_NOERROR)) ? 0 : ENOBUFS;
  990 }
Cache object: dd00c7e152e546dc2ebf1108c45741d5
[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]
This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.
FreeBSD/Linux Kernel Cross Reference sys/netpfil/ipfw/ip_dn_io.c

FreeBSD/Linux Kernel Cross Reference
sys/netpfil/ipfw/ip_dn_io.c