FreeBSD/Linux Kernel Cross Reference
sys/kern/kern_poll.c


/*-
 * Copyright (c) 2001-2002 Luigi Rizzo
 *
 * Supported by: the Xorp Project (www.xorp.org)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: releng/5.1/sys/kern/kern_poll.c 111888 2003-03-04 23:19:55Z jlemon $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/socket.h>                 /* needed by net/if.h           */
#include <sys/sysctl.h>

#include <net/if.h>                     /* for IFF_* flags              */
#include <net/netisr.h>                 /* for NETISR_POLL              */

#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/kthread.h>

#ifdef SMP
#ifndef COMPILING_LINT
#error DEVICE_POLLING is not compatible with SMP
#endif
#endif

static void netisr_poll(void);          /* the two netisr handlers      */
static void netisr_pollmore(void);

void hardclock_device_poll(void);       /* hook from hardclock          */
void ether_poll(int);                   /* polling while in trap        */
/*
 * Polling support for [network] device drivers.
 *
 * Drivers which support this feature try to register with the
 * polling code.
 *
 * If registration is successful, the driver must disable interrupts,
 * and further I/O is performed through the handler, which is invoked
 * (at least once per clock tick) with 3 arguments: the "arg" passed at
 * register time (a struct ifnet pointer), a command, and a "count" limit.
 *
 * The command can be one of the following:
 *  POLL_ONLY: quick move of "count" packets from input/output queues.
 *  POLL_AND_CHECK_STATUS: as above, plus check status registers or do
 *      other more expensive operations. This command is issued periodically
 *      but less frequently than POLL_ONLY.
 *  POLL_DEREGISTER: deregister and return to interrupt mode.
 *
 * The first two commands are only issued if the interface is marked as
 * 'IFF_UP and IFF_RUNNING', the last one only if IFF_RUNNING is set.
 *
 * The count limit specifies how much work the handler can do during the
 * call -- typically this is the number of packets to be received, or
 * transmitted, etc. (drivers are free to interpret this number, as long
 * as the max time spent in the function grows roughly linearly with the
 * count).
 *
 * Deregistration can be requested by the driver itself (typically in the
 * *_stop() routine), or by the polling code, by invoking the handler.
 *
 * Polling can be globally enabled or disabled with the sysctl variable
 * kern.polling.enable (default is 0, disabled).
 *
 * A second variable controls the sharing of CPU between polling/kernel
 * network processing, and other activities (typically userlevel tasks):
 * kern.polling.user_frac (between 0 and 100, default 50) sets the share
 * of CPU allocated to user tasks. CPU is allocated proportionally to the
 * shares, by dynamically adjusting the "count" (poll_burst).
 *
 * Other parameters should be left at their default values.
 * The following constraints hold:
 *
 *      1 <= poll_each_burst <= poll_burst <= poll_burst_max
 *      0 <= poll_in_trap <= poll_each_burst
 *      MIN_POLL_BURST_MAX <= poll_burst_max <= MAX_POLL_BURST_MAX
 */
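
/*
 * A minimal sketch (not part of this file) of what a driver-side poll
 * handler following the above contract might look like.  The "foo" driver,
 * its softc and its foo_rxeof()/foo_txeof()/foo_check_status()/
 * foo_enable_intr() helpers are hypothetical; only the handler signature
 * and the command values come from the interface described above.
 */
#if 0
static void
foo_poll(struct ifnet *ifp, enum poll_cmd cmd, int count)
{
        struct foo_softc *sc = ifp->if_softc;   /* hypothetical softc */

        if (cmd == POLL_DEREGISTER) {   /* final call, back to interrupt mode */
                foo_enable_intr(sc);
                return;
        }
        /* POLL_ONLY and POLL_AND_CHECK_STATUS: move at most "count" packets */
        foo_rxeof(sc, count);
        foo_txeof(sc);
        if (cmd == POLL_AND_CHECK_STATUS)       /* less frequent, more expensive */
                foo_check_status(sc);
}
#endif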

#define MIN_POLL_BURST_MAX      10
#define MAX_POLL_BURST_MAX      1000

SYSCTL_NODE(_kern, OID_AUTO, polling, CTLFLAG_RW, 0,
        "Device polling parameters");

static u_int32_t poll_burst = 5;
SYSCTL_UINT(_kern_polling, OID_AUTO, burst, CTLFLAG_RW,
        &poll_burst, 0, "Current polling burst size");

static u_int32_t poll_each_burst = 5;
SYSCTL_UINT(_kern_polling, OID_AUTO, each_burst, CTLFLAG_RW,
        &poll_each_burst, 0, "Max size of each burst");

static u_int32_t poll_burst_max = 150;  /* good for 100Mbit net and HZ=1000 */
SYSCTL_UINT(_kern_polling, OID_AUTO, burst_max, CTLFLAG_RW,
        &poll_burst_max, 0, "Max Polling burst size");

static u_int32_t poll_in_idle_loop=0;   /* do we poll in idle loop ? */
SYSCTL_UINT(_kern_polling, OID_AUTO, idle_poll, CTLFLAG_RW,
        &poll_in_idle_loop, 0, "Enable device polling in idle loop");

u_int32_t poll_in_trap;                 /* used in trap.c */
SYSCTL_UINT(_kern_polling, OID_AUTO, poll_in_trap, CTLFLAG_RW,
        &poll_in_trap, 0, "Poll burst size during a trap");

static u_int32_t user_frac = 50;
SYSCTL_UINT(_kern_polling, OID_AUTO, user_frac, CTLFLAG_RW,
        &user_frac, 0, "Desired user fraction of cpu time");

static u_int32_t reg_frac = 20 ;
SYSCTL_UINT(_kern_polling, OID_AUTO, reg_frac, CTLFLAG_RW,
        &reg_frac, 0, "Every this many cycles poll register");

static u_int32_t short_ticks;
SYSCTL_UINT(_kern_polling, OID_AUTO, short_ticks, CTLFLAG_RW,
        &short_ticks, 0, "Hardclock ticks shorter than they should be");

static u_int32_t lost_polls;
SYSCTL_UINT(_kern_polling, OID_AUTO, lost_polls, CTLFLAG_RW,
        &lost_polls, 0, "How many times we would have lost a poll tick");

static u_int32_t pending_polls;
SYSCTL_UINT(_kern_polling, OID_AUTO, pending_polls, CTLFLAG_RW,
        &pending_polls, 0, "Do we need to poll again");

static int residual_burst = 0;
SYSCTL_INT(_kern_polling, OID_AUTO, residual_burst, CTLFLAG_RW,
        &residual_burst, 0, "# of residual cycles in burst");

static u_int32_t poll_handlers; /* next free entry in pr[]. */
SYSCTL_UINT(_kern_polling, OID_AUTO, handlers, CTLFLAG_RD,
        &poll_handlers, 0, "Number of registered poll handlers");

static int polling = 0;         /* global polling enable */
SYSCTL_UINT(_kern_polling, OID_AUTO, enable, CTLFLAG_RW,
        &polling, 0, "Polling enabled");

static u_int32_t phase;
SYSCTL_UINT(_kern_polling, OID_AUTO, phase, CTLFLAG_RW,
        &phase, 0, "Polling phase");

static u_int32_t suspect;
SYSCTL_UINT(_kern_polling, OID_AUTO, suspect, CTLFLAG_RW,
        &suspect, 0, "suspect event");

static u_int32_t stalled;
SYSCTL_UINT(_kern_polling, OID_AUTO, stalled, CTLFLAG_RW,
        &stalled, 0, "potential stalls");

static u_int32_t idlepoll_sleeping; /* idlepoll is sleeping */
SYSCTL_UINT(_kern_polling, OID_AUTO, idlepoll_sleeping, CTLFLAG_RD,
        &idlepoll_sleeping, 0, "idlepoll is sleeping");


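/*
 * For illustration only: these knobs are normally adjusted from userland
 * with sysctl(8), e.g. (assuming a kernel built with options DEVICE_POLLING):
 *
 *      sysctl kern.polling.enable=1       # turn polling on
 *      sysctl kern.polling.user_frac=50   # reserve ~50% of the CPU for userlevel
 *      sysctl kern.polling.idle_poll=1    # also poll from the idle loop
 */
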
#define POLL_LIST_LEN  128
struct pollrec {
        poll_handler_t  *handler;
        struct ifnet    *ifp;
};

static struct pollrec pr[POLL_LIST_LEN];

static void
init_device_poll(void)
{

        netisr_register(NETISR_POLL, (netisr_t *)netisr_poll, NULL);
        netisr_register(NETISR_POLLMORE, (netisr_t *)netisr_pollmore, NULL);
}
SYSINIT(device_poll, SI_SUB_CLOCKS, SI_ORDER_MIDDLE, init_device_poll, NULL)


/*
 * Hook from hardclock. Tries to schedule a netisr, but keeps track
 * of lost ticks due to the previous handler taking too long.
 * Normally, this should not happen, because the polling handler should
 * run only for a short time. However, in some cases (e.g. when there are
 * changes in link status etc.) the drivers take a very long time
 * (even on the order of milliseconds) to reset and reconfigure the
 * device, causing apparent lost polls.
 *
 * The first part of the code is just for debugging purposes, and tries
 * to count how often hardclock ticks are shorter than they should be,
 * meaning either stray interrupts or delayed events.
 */
void
hardclock_device_poll(void)
{
        static struct timeval prev_t, t;
        int delta;

        if (poll_handlers == 0)
                return;

        microuptime(&t);
        delta = (t.tv_usec - prev_t.tv_usec) +
                (t.tv_sec - prev_t.tv_sec)*1000000;
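        /*
         * delta is in microseconds, so delta * hz < 500000 means that
         * less than half a tick has elapsed since the previous call.
         */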
        if (delta * hz < 500000)
                short_ticks++;
        else
                prev_t = t;

        if (pending_polls > 100) {
                /*
                 * Too much, assume it has stalled (not always true,
                 * see comment above).
                 */
                stalled++;
                pending_polls = 0;
                phase = 0;
        }

        if (phase <= 2) {
                if (phase != 0)
                        suspect++;
                phase = 1;
                schednetisrbits(1 << NETISR_POLL | 1 << NETISR_POLLMORE);
                phase = 2;
        }
        if (pending_polls++ > 0)
                lost_polls++;
}


/*
 * ether_poll is called from the idle loop or from the trap handler.
 */
void
ether_poll(int count)
{
        int i;

        mtx_lock(&Giant);

        if (count > poll_each_burst)
                count = poll_each_burst;
        for (i = 0 ; i < poll_handlers ; i++)
                if (pr[i].handler && (IFF_UP|IFF_RUNNING) ==
                    (pr[i].ifp->if_flags & (IFF_UP|IFF_RUNNING)) )
                        pr[i].handler(pr[i].ifp, 0, count); /* quick check */
        mtx_unlock(&Giant);
}

/*
 * netisr_pollmore is called after other netisr's, possibly scheduling
 * another NETISR_POLL call, or adapting the burst size for the next cycle.
 *
 * It is very bad to fetch large bursts of packets from a single card at once,
 * because the burst could take a long time to be completely processed, or
 * could saturate the intermediate queue (ipintrq or similar) leading to
 * losses or unfairness. To reduce the problem, and also to account better for
 * time spent in network-related processing, we split the burst into smaller
 * chunks of fixed size, giving control to the other netisr's between chunks.
 * This helps improve fairness, reduces livelock (because we more closely
 * emulate the "process to completion" behaviour that we have with
 * fastforwarding), and accounts for the work performed in low level
 * handling and forwarding.
 */

static struct timeval poll_start_t;

void
netisr_pollmore()
{
        struct timeval t;
        int kern_load;
        /* XXX run at splhigh() or equivalent */

        phase = 5;
        if (residual_burst > 0) {
                schednetisrbits(1 << NETISR_POLL | 1 << NETISR_POLLMORE);
                /* will run immediately on return, followed by netisrs */
                return;
        }
        /* here we can account time spent in netisr's in this tick */
        microuptime(&t);
        kern_load = (t.tv_usec - poll_start_t.tv_usec) +
                (t.tv_sec - poll_start_t.tv_sec)*1000000;       /* us */
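        /*
         * Scale to a percentage of one tick: a full tick lasts 1000000 / hz
         * microseconds, so (us * hz) / 10000 gives the kernel/netisr share
         * of the tick in the range 0..100.
         */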
        kern_load = (kern_load * hz) / 10000;                   /* 0..100 */
        if (kern_load > (100 - user_frac)) { /* try to decrease ticks */
                if (poll_burst > 1)
                        poll_burst--;
        } else {
                if (poll_burst < poll_burst_max)
                        poll_burst++;
        }

        pending_polls--;
        if (pending_polls == 0) /* we are done */
                phase = 0;
        else {
                /*
                 * Last cycle was long and caused us to miss one or more
                 * hardclock ticks. Restart processing again, but slightly
                 * reduce the burst size to prevent this from happening again.
                 */
                poll_burst -= (poll_burst / 8);
                if (poll_burst < 1)
                        poll_burst = 1;
                schednetisrbits(1 << NETISR_POLL | 1 << NETISR_POLLMORE);
                phase = 6;
        }
}

/*
 * netisr_poll is scheduled by schednetisr when appropriate, typically once
 * per tick. It acquires Giant and then calls all registered handlers.
 */
static void
netisr_poll(void)
{
        static int reg_frac_count;
        int i, cycles;
        enum poll_cmd arg = POLL_ONLY;
        mtx_lock(&Giant);

        phase = 3;
        if (residual_burst == 0) { /* first call in this tick */
                microuptime(&poll_start_t);
                /*
                 * Check that parameters are consistent with runtime
                 * variables. Some of these tests could be done at sysctl
                 * time, but the savings would be very limited because we
                 * still have to check against reg_frac_count and
                 * poll_each_burst. So, instead of writing separate sysctl
                 * handlers, we do it all here.
                 */

                if (reg_frac > hz)
                        reg_frac = hz;
                else if (reg_frac < 1)
                        reg_frac = 1;
                if (reg_frac_count > reg_frac)
                        reg_frac_count = reg_frac - 1;
                if (reg_frac_count-- == 0) {
                        arg = POLL_AND_CHECK_STATUS;
                        reg_frac_count = reg_frac - 1;
                }
                if (poll_burst_max < MIN_POLL_BURST_MAX)
                        poll_burst_max = MIN_POLL_BURST_MAX;
                else if (poll_burst_max > MAX_POLL_BURST_MAX)
                        poll_burst_max = MAX_POLL_BURST_MAX;

                if (poll_each_burst < 1)
                        poll_each_burst = 1;
                else if (poll_each_burst > poll_burst_max)
                        poll_each_burst = poll_burst_max;

                residual_burst = poll_burst;
        }
        cycles = (residual_burst < poll_each_burst) ?
                residual_burst : poll_each_burst;
        residual_burst -= cycles;

        if (polling) {
                for (i = 0 ; i < poll_handlers ; i++)
                        if (pr[i].handler && (IFF_UP|IFF_RUNNING) ==
                            (pr[i].ifp->if_flags & (IFF_UP|IFF_RUNNING)) )
                                pr[i].handler(pr[i].ifp, arg, cycles);
        } else {        /* unregister */
                for (i = 0 ; i < poll_handlers ; i++) {
                        if (pr[i].handler &&
                            pr[i].ifp->if_flags & IFF_RUNNING) {
                                pr[i].ifp->if_flags &= ~IFF_POLLING;
                                pr[i].handler(pr[i].ifp, POLL_DEREGISTER, 1);
                        }
                        pr[i].handler=NULL;
                }
                residual_burst = 0;
                poll_handlers = 0;
        }
        /* on -stable, schednetisr(NETISR_POLLMORE); */
        phase = 4;
        mtx_unlock(&Giant);
}

/*
 * Try to register a routine for polling. Returns 1 if successful
 * (and polling should be enabled), 0 otherwise.
 * A device is not supposed to register itself multiple times.
 *
 * This is called from within the *_intr() functions, so we do not need
 * further locking.
 */
int
ether_poll_register(poll_handler_t *h, struct ifnet *ifp)
{
        int s;

        if (polling == 0) /* polling disabled, cannot register */
                return 0;
        if (h == NULL || ifp == NULL)           /* bad arguments        */
                return 0;
        if ( !(ifp->if_flags & IFF_UP) )        /* must be up           */
                return 0;
        if (ifp->if_flags & IFF_POLLING)        /* already polling      */
                return 0;

        s = splhigh();
        if (poll_handlers >= POLL_LIST_LEN) {
                /*
                 * List full, cannot register more entries.
                 * This should never happen; if it does, it is probably a
                 * broken driver trying to register multiple times. Checking
                 * this at runtime is expensive, and won't solve the problem
                 * anyway, so just report a few times and then give up.
                 */
                static int verbose = 10 ;
                splx(s);
                if (verbose >0) {
                        printf("poll handlers list full, "
                                "maybe a broken driver ?\n");
                        verbose--;
                }
                return 0; /* no polling for you */
        }

        pr[poll_handlers].handler = h;
        pr[poll_handlers].ifp = ifp;
        poll_handlers++;
        ifp->if_flags |= IFF_POLLING;
        splx(s);
        if (idlepoll_sleeping)
                wakeup(&idlepoll_sleeping);
        return 1; /* polling enabled in next call */
}
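
/*
 * Illustrative sketch only: the usual pattern for drivers of this era is
 * to try to switch over to polling from the interrupt handler.  The "foo"
 * driver, its foo_poll() handler and foo_disable_intr() are hypothetical;
 * only ether_poll_register(), the IFF_* flags and the poll commands come
 * from this file.
 */
#if 0
static void
foo_intr(void *arg)
{
        struct foo_softc *sc = arg;
        struct ifnet *ifp = &sc->arpcom.ac_if;  /* hypothetical ifnet */

        if ((ifp->if_flags & IFF_POLLING) == 0 &&
            ether_poll_register(foo_poll, ifp)) {
                foo_disable_intr(sc);           /* stop interrupt generation */
                foo_poll(ifp, POLL_ONLY, 1);    /* drain anything pending */
                return;
        }
        /* ... normal interrupt processing ... */
}
#endif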

/*
 * Remove interface from the polling list. Normally called by *_stop().
 * It is not an error to call it with IFF_POLLING clear: the call is
 * sufficiently rare that it is preferable to save the space of an extra
 * test in each driver in exchange for one additional function call.
 */
int
ether_poll_deregister(struct ifnet *ifp)
{
        int i;

        mtx_lock(&Giant);
        if ( !ifp || !(ifp->if_flags & IFF_POLLING) ) {
                mtx_unlock(&Giant);
                return 0;
        }
        for (i = 0 ; i < poll_handlers ; i++)
                if (pr[i].ifp == ifp) /* found it */
                        break;
        ifp->if_flags &= ~IFF_POLLING; /* found or not... */
        if (i == poll_handlers) {
                mtx_unlock(&Giant);
                printf("ether_poll_deregister: ifp not found!!!\n");
                return 0;
        }
        poll_handlers--;
        if (i < poll_handlers) { /* Last entry replaces this one. */
                pr[i].handler = pr[poll_handlers].handler;
                pr[i].ifp = pr[poll_handlers].ifp;
        }
        mtx_unlock(&Giant);
        return 1;
}

static void
poll_idle(void)
{
        struct thread *td = curthread;
        struct rtprio rtp;
        int pri;

        rtp.prio = RTP_PRIO_MAX;        /* lowest priority */
        rtp.type = RTP_PRIO_IDLE;
        mtx_lock_spin(&sched_lock);
        rtp_to_pri(&rtp, td->td_ksegrp);
        pri = td->td_priority;
        mtx_unlock_spin(&sched_lock);

        for (;;) {
                if (poll_in_idle_loop && poll_handlers > 0) {
                        idlepoll_sleeping = 0;
                        mtx_lock(&Giant);
                        ether_poll(poll_each_burst);
                        mtx_unlock(&Giant);
                        mtx_assert(&Giant, MA_NOTOWNED);
                        mtx_lock_spin(&sched_lock);
                        td->td_proc->p_stats->p_ru.ru_nvcsw++;
                        mi_switch();
                        mtx_unlock_spin(&sched_lock);
                } else {
                        idlepoll_sleeping = 1;
                        tsleep(&idlepoll_sleeping, pri, "pollid", hz * 3);
                }
        }
}

static struct proc *idlepoll;
static struct kproc_desc idlepoll_kp = {
         "idlepoll",
         poll_idle,
         &idlepoll
};
SYSINIT(idlepoll, SI_SUB_KTHREAD_VM, SI_ORDER_ANY, kproc_start, &idlepoll_kp)
