FreeBSD/Linux Kernel Cross Reference
sys/kernel/rcutree.c


    1 /*
    2  * Read-Copy Update mechanism for mutual exclusion
    3  *
    4  * This program is free software; you can redistribute it and/or modify
    5  * it under the terms of the GNU General Public License as published by
    6  * the Free Software Foundation; either version 2 of the License, or
    7  * (at your option) any later version.
    8  *
    9  * This program is distributed in the hope that it will be useful,
   10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
   11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   12  * GNU General Public License for more details.
   13  *
   14  * You should have received a copy of the GNU General Public License
   15  * along with this program; if not, write to the Free Software
   16  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
   17  *
   18  * Copyright IBM Corporation, 2008
   19  *
   20  * Authors: Dipankar Sarma <dipankar@in.ibm.com>
   21  *          Manfred Spraul <manfred@colorfullife.com>
   22  *          Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
   23  *
   24  * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
   25  * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
   26  *
   27  * For detailed explanation of Read-Copy Update mechanism see -
   28  *      Documentation/RCU
   29  */
   30 #include <linux/types.h>
   31 #include <linux/kernel.h>
   32 #include <linux/init.h>
   33 #include <linux/spinlock.h>
   34 #include <linux/smp.h>
   35 #include <linux/rcupdate.h>
   36 #include <linux/interrupt.h>
   37 #include <linux/sched.h>
   38 #include <linux/nmi.h>
   39 #include <linux/atomic.h>
   40 #include <linux/bitops.h>
   41 #include <linux/export.h>
   42 #include <linux/completion.h>
   43 #include <linux/moduleparam.h>
   44 #include <linux/percpu.h>
   45 #include <linux/notifier.h>
   46 #include <linux/cpu.h>
   47 #include <linux/mutex.h>
   48 #include <linux/time.h>
   49 #include <linux/kernel_stat.h>
   50 #include <linux/wait.h>
   51 #include <linux/kthread.h>
   52 #include <linux/prefetch.h>
   53 #include <linux/delay.h>
   54 #include <linux/stop_machine.h>
   55 #include <linux/random.h>
   56 
   57 #include "rcutree.h"
   58 #include <trace/events/rcu.h>
   59 
   60 #include "rcu.h"
   61 
   62 /* Data structures. */
   63 
   64 static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
   65 static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
   66 
   67 #define RCU_STATE_INITIALIZER(sname, cr) { \
   68         .level = { &sname##_state.node[0] }, \
   69         .call = cr, \
   70         .fqs_state = RCU_GP_IDLE, \
   71         .gpnum = 0UL - 300UL, \
   72         .completed = 0UL - 300UL, \
   73         .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
   74         .orphan_nxttail = &sname##_state.orphan_nxtlist, \
   75         .orphan_donetail = &sname##_state.orphan_donelist, \
   76         .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
   77         .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
   78         .name = #sname, \
   79 }
   80 
   81 struct rcu_state rcu_sched_state =
   82         RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched);
   83 DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
   84 
   85 struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh);
   86 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
   87 
   88 static struct rcu_state *rcu_state;
   89 LIST_HEAD(rcu_struct_flavors);
   90 
   91 /* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
   92 static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF;
   93 module_param(rcu_fanout_leaf, int, 0444);
   94 int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
   95 static int num_rcu_lvl[] = {  /* Number of rcu_nodes at specified level. */
   96         NUM_RCU_LVL_0,
   97         NUM_RCU_LVL_1,
   98         NUM_RCU_LVL_2,
   99         NUM_RCU_LVL_3,
  100         NUM_RCU_LVL_4,
  101 };
  102 int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
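/*
 * Editor's note: a worked example (not part of the original file) of the
 * geometry described by rcu_num_lvls, num_rcu_lvl[] and rcu_num_nodes.
 * Assuming CONFIG_RCU_FANOUT_LEAF = 16 on a 64-CPU build, the tree needs
 * two levels: one root rcu_node plus DIV_ROUND_UP(64, 16) = 4 leaf
 * rcu_node structures, giving rcu_num_lvls == 2 and rcu_num_nodes == 5.
 */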
  103 
  104 /*
  105  * The rcu_scheduler_active variable transitions from zero to one just
  106  * before the first task is spawned.  So when this variable is zero, RCU
  107  * can assume that there is but one task, allowing RCU to (for example)
  108  * optimize synchronize_sched() to a simple barrier().  When this variable
  109  * is one, RCU must actually do all the hard work required to detect real
  110  * grace periods.  This variable is also used to suppress boot-time false
  111  * positives from lockdep-RCU error checking.
  112  */
  113 int rcu_scheduler_active __read_mostly;
  114 EXPORT_SYMBOL_GPL(rcu_scheduler_active);
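/*
 * Editor's note: a minimal sketch (not part of the original file) of the
 * single-task optimization described above.  Before the scheduler spawns
 * its first task there can be no concurrent readers, so a grace-period
 * wait can degenerate to a barrier; demo_wait_for_readers() is a
 * hypothetical caller used only for illustration.
 */
static void __maybe_unused demo_wait_for_readers(void)
{
        if (!rcu_scheduler_active)
                barrier();              /* Single task: no readers to wait for. */
        else
                synchronize_sched();    /* Must detect a real grace period. */
}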
  115 
  116 /*
  117  * The rcu_scheduler_fully_active variable transitions from zero to one
  118  * during the early_initcall() processing, which is after the scheduler
  119  * is capable of creating new tasks.  So RCU processing (for example,
  120  * creating tasks for RCU priority boosting) must be delayed until after
  121  * rcu_scheduler_fully_active transitions from zero to one.  We also
  122  * currently delay invocation of any RCU callbacks until after this point.
  123  *
  124  * It might later prove better for people registering RCU callbacks during
  125  * early boot to take responsibility for these callbacks, but one step at
  126  * a time.
  127  */
  128 static int rcu_scheduler_fully_active __read_mostly;
  129 
  130 #ifdef CONFIG_RCU_BOOST
  131 
  132 /*
  133  * Control variables for per-CPU and per-rcu_node kthreads.  These
  134  * handle all flavors of RCU.
  135  */
  136 static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
  137 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
  138 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
  139 DEFINE_PER_CPU(char, rcu_cpu_has_work);
  140 
  141 #endif /* #ifdef CONFIG_RCU_BOOST */
  142 
  143 static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
  144 static void invoke_rcu_core(void);
  145 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
  146 
  147 /*
  148  * Track the rcutorture test sequence number and the update version
  149  * number within a given test.  The rcutorture_testseq is incremented
  150  * on every rcutorture module load and unload, so has an odd value
  151  * when a test is running.  The rcutorture_vernum is set to zero
  152  * when rcutorture starts and is incremented on each rcutorture update.
  153  * These variables enable correlating rcutorture output with the
  154  * RCU tracing information.
  155  */
  156 unsigned long rcutorture_testseq;
  157 unsigned long rcutorture_vernum;
  158 
  159 /*
  160  * Return true if an RCU grace period is in progress.  The ACCESS_ONCE()s
  161  * permit this function to be invoked without holding the root rcu_node
  162  * structure's ->lock, but of course results can be subject to change.
  163  */
  164 static int rcu_gp_in_progress(struct rcu_state *rsp)
  165 {
  166         return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum);
  167 }
  168 
  169 /*
  170  * Note a quiescent state.  Because we do not need to know
  171  * how many quiescent states passed, just if there was at least
  172  * one since the start of the grace period, this just sets a flag.
  173  * The caller must have disabled preemption.
  174  */
  175 void rcu_sched_qs(int cpu)
  176 {
  177         struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
  178 
  179         if (rdp->passed_quiesce == 0)
  180                 trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs");
  181         rdp->passed_quiesce = 1;
  182 }
  183 
  184 void rcu_bh_qs(int cpu)
  185 {
  186         struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
  187 
  188         if (rdp->passed_quiesce == 0)
  189                 trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs");
  190         rdp->passed_quiesce = 1;
  191 }
  192 
  193 /*
  194  * Note a context switch.  This is a quiescent state for RCU-sched,
  195  * and requires special handling for preemptible RCU.
  196  * The caller must have disabled preemption.
  197  */
  198 void rcu_note_context_switch(int cpu)
  199 {
  200         trace_rcu_utilization("Start context switch");
  201         rcu_sched_qs(cpu);
  202         rcu_preempt_note_context_switch(cpu);
  203         trace_rcu_utilization("End context switch");
  204 }
  205 EXPORT_SYMBOL_GPL(rcu_note_context_switch);
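/*
 * Editor's note: a hedged reader-side sketch (not part of the original
 * file) of why a context switch is an RCU-sched quiescent state: RCU-sched
 * readers run with preemption disabled, so a CPU that has just context
 * switched cannot be inside one.  struct demo_cfg and demo_cfg_p are
 * hypothetical and exist only for illustration.
 */
struct demo_cfg {
        int threshold;
};
static struct demo_cfg __rcu *demo_cfg_p;

static int __maybe_unused demo_read_threshold(void)
{
        struct demo_cfg *cfg;
        int val = -1;

        rcu_read_lock_sched();                  /* Disables preemption. */
        cfg = rcu_dereference_sched(demo_cfg_p);
        if (cfg)
                val = cfg->threshold;
        rcu_read_unlock_sched();                /* Re-enables preemption. */
        return val;
}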
  206 
  207 DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
  208         .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
  209         .dynticks = ATOMIC_INIT(1),
  210 };
  211 
  212 static long blimit = 10;        /* Maximum callbacks per rcu_do_batch. */
  213 static long qhimark = 10000;    /* If this many pending, ignore blimit. */
  214 static long qlowmark = 100;     /* Once only this many pending, use blimit. */
  215 
  216 module_param(blimit, long, 0444);
  217 module_param(qhimark, long, 0444);
  218 module_param(qlowmark, long, 0444);
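/*
 * Editor's note (not part of the original file): since this code is built
 * into the kernel, these module parameters are normally set on the boot
 * command line with the "rcutree." prefix, for example
 * "rcutree.blimit=20 rcutree.qhimark=5000".
 */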
  219 
  220 int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
  221 int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
  222 
  223 module_param(rcu_cpu_stall_suppress, int, 0644);
  224 module_param(rcu_cpu_stall_timeout, int, 0644);
  225 
  226 static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS;
  227 static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;
  228 
  229 module_param(jiffies_till_first_fqs, ulong, 0644);
  230 module_param(jiffies_till_next_fqs, ulong, 0644);
  231 
  232 static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *));
  233 static void force_quiescent_state(struct rcu_state *rsp);
  234 static int rcu_pending(int cpu);
  235 
  236 /*
  237  * Return the number of RCU-sched batches processed thus far for debug & stats.
  238  */
  239 long rcu_batches_completed_sched(void)
  240 {
  241         return rcu_sched_state.completed;
  242 }
  243 EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
  244 
  245 /*
  246  * Return the number of RCU BH batches processed thus far for debug & stats.
  247  */
  248 long rcu_batches_completed_bh(void)
  249 {
  250         return rcu_bh_state.completed;
  251 }
  252 EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
  253 
  254 /*
  255  * Force a quiescent state for RCU BH.
  256  */
  257 void rcu_bh_force_quiescent_state(void)
  258 {
  259         force_quiescent_state(&rcu_bh_state);
  260 }
  261 EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
  262 
  263 /*
  264  * Record the number of times rcutorture tests have been initiated and
  265  * terminated.  This information allows the debugfs tracing stats to be
  266  * correlated to the rcutorture messages, even when the rcutorture module
  267  * is being repeatedly loaded and unloaded.  In other words, we cannot
  268  * store this state in rcutorture itself.
  269  */
  270 void rcutorture_record_test_transition(void)
  271 {
  272         rcutorture_testseq++;
  273         rcutorture_vernum = 0;
  274 }
  275 EXPORT_SYMBOL_GPL(rcutorture_record_test_transition);
  276 
  277 /*
  278  * Record the number of writer passes through the current rcutorture test.
  279  * This is also used to correlate debugfs tracing stats with the rcutorture
  280  * messages.
  281  */
  282 void rcutorture_record_progress(unsigned long vernum)
  283 {
  284         rcutorture_vernum++;
  285 }
  286 EXPORT_SYMBOL_GPL(rcutorture_record_progress);
  287 
  288 /*
  289  * Force a quiescent state for RCU-sched.
  290  */
  291 void rcu_sched_force_quiescent_state(void)
  292 {
  293         force_quiescent_state(&rcu_sched_state);
  294 }
  295 EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
  296 
  297 /*
  298  * Does the CPU have callbacks ready to be invoked?
  299  */
  300 static int
  301 cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
  302 {
  303         return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] &&
  304                rdp->nxttail[RCU_DONE_TAIL] != NULL;
  305 }
  306 
  307 /*
  308  * Does the current CPU require an as-yet-unscheduled grace period?
  309  */
  310 static int
  311 cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
  312 {
  313         struct rcu_head **ntp;
  314 
  315         ntp = rdp->nxttail[RCU_DONE_TAIL +
  316                            (ACCESS_ONCE(rsp->completed) != rdp->completed)];
  317         return rdp->nxttail[RCU_DONE_TAIL] && ntp && *ntp &&
  318                !rcu_gp_in_progress(rsp);
  319 }
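/*
 * Editor's note: an illustrative sketch (not part of the original file) of
 * the segmented callback list that the two helpers above inspect.  All
 * callbacks live on the single ->nxtlist; ->nxttail[i] points to the
 * ->next pointer that terminates segment i:
 *
 *   ->nxtlist -> [DONE] -> [WAIT] -> [NEXT_READY] -> [NEXT] -> NULL
 *
 * demo_segment_empty() below is a hypothetical helper showing how an
 * empty segment is recognized.
 */
static bool __maybe_unused demo_segment_empty(struct rcu_data *rdp, int seg)
{
        if (seg == RCU_DONE_TAIL)
                return rdp->nxttail[RCU_DONE_TAIL] == &rdp->nxtlist;
        return rdp->nxttail[seg] == rdp->nxttail[seg - 1];
}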
  320 
  321 /*
  322  * Return the root node of the specified rcu_state structure.
  323  */
  324 static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
  325 {
  326         return &rsp->node[0];
  327 }
  328 
  329 /*
  330  * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state
  331  *
  332  * If the new value of the ->dynticks_nesting counter is now zero,
  333  * we really have entered idle, and must do the appropriate accounting.
  334  * The caller must have disabled interrupts.
  335  */
  336 static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
  337                                 bool user)
  338 {
  339         trace_rcu_dyntick("Start", oldval, 0);
  340         if (!user && !is_idle_task(current)) {
  341                 struct task_struct *idle = idle_task(smp_processor_id());
  342 
  343                 trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
  344                 ftrace_dump(DUMP_ORIG);
  345                 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
  346                           current->pid, current->comm,
  347                           idle->pid, idle->comm); /* must be idle task! */
  348         }
  349         rcu_prepare_for_idle(smp_processor_id());
  350         /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
  351         smp_mb__before_atomic_inc();  /* See above. */
  352         atomic_inc(&rdtp->dynticks);
  353         smp_mb__after_atomic_inc();  /* Force ordering with next sojourn. */
  354         WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
  355 
  356         /*
  357          * It is illegal to enter an extended quiescent state while
  358          * in an RCU read-side critical section.
  359          */
  360         rcu_lockdep_assert(!lock_is_held(&rcu_lock_map),
  361                            "Illegal idle entry in RCU read-side critical section.");
  362         rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map),
  363                            "Illegal idle entry in RCU-bh read-side critical section.");
  364         rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map),
  365                            "Illegal idle entry in RCU-sched read-side critical section.");
  366 }
  367 
  368 /*
  369  * Enter an RCU extended quiescent state, which can be either the
  370  * idle loop or adaptive-tickless usermode execution.
  371  */
  372 static void rcu_eqs_enter(bool user)
  373 {
  374         long long oldval;
  375         struct rcu_dynticks *rdtp;
  376 
  377         rdtp = &__get_cpu_var(rcu_dynticks);
  378         oldval = rdtp->dynticks_nesting;
  379         WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
  380         if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
  381                 rdtp->dynticks_nesting = 0;
  382         else
  383                 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
  384         rcu_eqs_enter_common(rdtp, oldval, user);
  385 }
  386 
  387 /**
  388  * rcu_idle_enter - inform RCU that current CPU is entering idle
  389  *
  390  * Enter idle mode, in other words, -leave- the mode in which RCU
  391  * read-side critical sections can occur.  (Though RCU read-side
  392  * critical sections can occur in irq handlers in idle, a possibility
  393  * handled by irq_enter() and irq_exit().)
  394  *
  395  * We crowbar the ->dynticks_nesting field to zero to allow for
  396  * the possibility of usermode upcalls having messed up our count
  397  * of interrupt nesting level during the prior busy period.
  398  */
  399 void rcu_idle_enter(void)
  400 {
  401         unsigned long flags;
  402 
  403         local_irq_save(flags);
  404         rcu_eqs_enter(false);
  405         local_irq_restore(flags);
  406 }
  407 EXPORT_SYMBOL_GPL(rcu_idle_enter);
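/*
 * Editor's note: a hedged sketch (not part of the original file) of the
 * usage pattern rcu_idle_enter()/rcu_idle_exit() expect from an
 * architecture idle loop; cpu_relax() stands in for the real low-power
 * wait instruction.
 */
static void __maybe_unused demo_idle_loop_iteration(void)
{
        rcu_idle_enter();       /* RCU may now ignore this CPU entirely. */
        cpu_relax();            /* Arch-specific low-power wait would go here. */
        rcu_idle_exit();        /* RCU read-side sections are legal again. */
}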
  408 
  409 #ifdef CONFIG_RCU_USER_QS
  410 /**
  411  * rcu_user_enter - inform RCU that we are resuming userspace.
  412  *
  413  * Enter RCU idle mode right before resuming userspace.  No use of RCU
  414  * is permitted between this call and rcu_user_exit(). This way the
  415  * CPU doesn't need to maintain the tick for RCU maintenance purposes
  416  * when the CPU runs in userspace.
  417  */
  418 void rcu_user_enter(void)
  419 {
  420         rcu_eqs_enter(1);
  421 }
  422 
  423 /**
  424  * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace
  425  * after the current irq returns.
  426  *
  427  * This is similar to rcu_user_enter() but in the context of a non-nesting
  428  * irq. After this call, RCU enters into idle mode when the interrupt
  429  * returns.
  430  */
  431 void rcu_user_enter_after_irq(void)
  432 {
  433         unsigned long flags;
  434         struct rcu_dynticks *rdtp;
  435 
  436         local_irq_save(flags);
  437         rdtp = &__get_cpu_var(rcu_dynticks);
  438         /* Ensure this irq is interrupting a non-idle RCU state.  */
  439         WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK));
  440         rdtp->dynticks_nesting = 1;
  441         local_irq_restore(flags);
  442 }
  443 #endif /* CONFIG_RCU_USER_QS */
  444 
  445 /**
  446  * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
  447  *
  448  * Exit from an interrupt handler, which might possibly result in entering
  449  * idle mode, in other words, leaving the mode in which read-side critical
  450  * sections can occur.
  451  *
  452  * This code assumes that the idle loop never does anything that might
  453  * result in unbalanced calls to irq_enter() and irq_exit().  If your
  454  * architecture violates this assumption, RCU will give you what you
  455  * deserve, good and hard.  But very infrequently and irreproducibly.
  456  *
  457  * Use things like work queues to work around this limitation.
  458  *
  459  * You have been warned.
  460  */
  461 void rcu_irq_exit(void)
  462 {
  463         unsigned long flags;
  464         long long oldval;
  465         struct rcu_dynticks *rdtp;
  466 
  467         local_irq_save(flags);
  468         rdtp = &__get_cpu_var(rcu_dynticks);
  469         oldval = rdtp->dynticks_nesting;
  470         rdtp->dynticks_nesting--;
  471         WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
  472         if (rdtp->dynticks_nesting)
  473                 trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting);
  474         else
  475                 rcu_eqs_enter_common(rdtp, oldval, true);
  476         local_irq_restore(flags);
  477 }
  478 
  479 /*
  480  * rcu_eqs_exit_common - current CPU moving away from extended quiescent state
  481  *
  482  * If the old value of the ->dynticks_nesting counter was zero, we really
  483  * have exited idle, and must do the appropriate accounting.
  484  * The caller must have disabled interrupts.
  485  */
  486 static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
  487                                int user)
  488 {
  489         smp_mb__before_atomic_inc();  /* Force ordering w/previous sojourn. */
  490         atomic_inc(&rdtp->dynticks);
  491         /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
  492         smp_mb__after_atomic_inc();  /* See above. */
  493         WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
  494         rcu_cleanup_after_idle(smp_processor_id());
  495         trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting);
  496         if (!user && !is_idle_task(current)) {
  497                 struct task_struct *idle = idle_task(smp_processor_id());
  498 
  499                 trace_rcu_dyntick("Error on exit: not idle task",
  500                                   oldval, rdtp->dynticks_nesting);
  501                 ftrace_dump(DUMP_ORIG);
  502                 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
  503                           current->pid, current->comm,
  504                           idle->pid, idle->comm); /* must be idle task! */
  505         }
  506 }
  507 
  508 /*
  509  * Exit an RCU extended quiescent state, which can be either the
  510  * idle loop or adaptive-tickless usermode execution.
  511  */
  512 static void rcu_eqs_exit(bool user)
  513 {
  514         struct rcu_dynticks *rdtp;
  515         long long oldval;
  516 
  517         rdtp = &__get_cpu_var(rcu_dynticks);
  518         oldval = rdtp->dynticks_nesting;
  519         WARN_ON_ONCE(oldval < 0);
  520         if (oldval & DYNTICK_TASK_NEST_MASK)
  521                 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
  522         else
  523                 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
  524         rcu_eqs_exit_common(rdtp, oldval, user);
  525 }
  526 
  527 /**
  528  * rcu_idle_exit - inform RCU that current CPU is leaving idle
  529  *
  530  * Exit idle mode, in other words, -enter- the mode in which RCU
  531  * read-side critical sections can occur.
  532  *
  533  * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to
  534  * allow for the possibility of usermode upcalls messing up our count
  535  * of interrupt nesting level during the busy period that is just
  536  * now starting.
  537  */
  538 void rcu_idle_exit(void)
  539 {
  540         unsigned long flags;
  541 
  542         local_irq_save(flags);
  543         rcu_eqs_exit(false);
  544         local_irq_restore(flags);
  545 }
  546 EXPORT_SYMBOL_GPL(rcu_idle_exit);
  547 
  548 #ifdef CONFIG_RCU_USER_QS
  549 /**
  550  * rcu_user_exit - inform RCU that we are exiting userspace.
  551  *
  552  * Exit RCU idle mode while entering the kernel because it can
  553  * run an RCU read-side critical section at any time.
  554  */
  555 void rcu_user_exit(void)
  556 {
  557         rcu_eqs_exit(1);
  558 }
  559 
  560 /**
  561  * rcu_user_exit_after_irq - inform RCU that we won't resume to userspace
  562  * idle mode after the current non-nesting irq returns.
  563  *
  564  * This is similar to rcu_user_exit() but in the context of an irq.
  565  * This is called when the irq has interrupted a userspace RCU idle mode
  566  * context. When the current non-nesting interrupt returns after this call,
  567  * the CPU won't restore the RCU idle mode.
  568  */
  569 void rcu_user_exit_after_irq(void)
  570 {
  571         unsigned long flags;
  572         struct rcu_dynticks *rdtp;
  573 
  574         local_irq_save(flags);
  575         rdtp = &__get_cpu_var(rcu_dynticks);
  576         /* Ensure we are interrupting an RCU idle mode. */
  577         WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK);
  578         rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE;
  579         local_irq_restore(flags);
  580 }
  581 #endif /* CONFIG_RCU_USER_QS */
  582 
  583 /**
  584  * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
  585  *
  586  * Enter an interrupt handler, which might possibly result in exiting
  587  * idle mode, in other words, entering the mode in which read-side critical
  588  * sections can occur.
  589  *
  590  * Note that the Linux kernel is fully capable of entering an interrupt
  591  * handler that it never exits, for example when doing upcalls to
  592  * user mode!  This code assumes that the idle loop never does upcalls to
  593  * user mode.  If your architecture does do upcalls from the idle loop (or
  594  * does anything else that results in unbalanced calls to the irq_enter()
  595  * and irq_exit() functions), RCU will give you what you deserve, good
  596  * and hard.  But very infrequently and irreproducibly.
  597  *
  598  * Use things like work queues to work around this limitation.
  599  *
  600  * You have been warned.
  601  */
  602 void rcu_irq_enter(void)
  603 {
  604         unsigned long flags;
  605         struct rcu_dynticks *rdtp;
  606         long long oldval;
  607 
  608         local_irq_save(flags);
  609         rdtp = &__get_cpu_var(rcu_dynticks);
  610         oldval = rdtp->dynticks_nesting;
  611         rdtp->dynticks_nesting++;
  612         WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
  613         if (oldval)
  614                 trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting);
  615         else
  616                 rcu_eqs_exit_common(rdtp, oldval, true);
  617         local_irq_restore(flags);
  618 }
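/*
 * Editor's note: a hedged sketch (not part of the original file) of the
 * balanced pairing the comments above demand.  In mainline these calls are
 * made from irq_enter()/irq_exit(); demo_low_level_irq() is hypothetical
 * and only illustrates that every rcu_irq_enter() must be matched by
 * exactly one rcu_irq_exit() on the same CPU.
 */
static void __maybe_unused demo_low_level_irq(void (*handler)(void))
{
        rcu_irq_enter();        /* This CPU may have been idle; wake RCU up. */
        handler();              /* Run the actual interrupt handler. */
        rcu_irq_exit();         /* Possibly drop back into RCU-idle state. */
}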
  619 
  620 /**
  621  * rcu_nmi_enter - inform RCU of entry to NMI context
  622  *
  623  * If the CPU was idle with dynamic ticks active, and there is no
  624  * irq handler running, this updates rdtp->dynticks_nmi to let the
  625  * RCU grace-period handling know that the CPU is active.
  626  */
  627 void rcu_nmi_enter(void)
  628 {
  629         struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
  630 
  631         if (rdtp->dynticks_nmi_nesting == 0 &&
  632             (atomic_read(&rdtp->dynticks) & 0x1))
  633                 return;
  634         rdtp->dynticks_nmi_nesting++;
  635         smp_mb__before_atomic_inc();  /* Force delay from prior write. */
  636         atomic_inc(&rdtp->dynticks);
  637         /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
  638         smp_mb__after_atomic_inc();  /* See above. */
  639         WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
  640 }
  641 
  642 /**
  643  * rcu_nmi_exit - inform RCU of exit from NMI context
  644  *
  645  * If the CPU was idle with dynamic ticks active, and there is no
  646  * irq handler running, this updates rdtp->dynticks_nmi to let the
  647  * RCU grace-period handling know that the CPU is no longer active.
  648  */
  649 void rcu_nmi_exit(void)
  650 {
  651         struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
  652 
  653         if (rdtp->dynticks_nmi_nesting == 0 ||
  654             --rdtp->dynticks_nmi_nesting != 0)
  655                 return;
  656         /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
  657         smp_mb__before_atomic_inc();  /* See above. */
  658         atomic_inc(&rdtp->dynticks);
  659         smp_mb__after_atomic_inc();  /* Force delay to next write. */
  660         WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
  661 }
  662 
  663 /**
  664  * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle
  665  *
  666  * If the current CPU is in its idle loop and is not in an interrupt
  667  * handler or an NMI handler, return true.
  668  */
  669 int rcu_is_cpu_idle(void)
  670 {
  671         int ret;
  672 
  673         preempt_disable();
  674         ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0;
  675         preempt_enable();
  676         return ret;
  677 }
  678 EXPORT_SYMBOL(rcu_is_cpu_idle);
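/*
 * Editor's note: a hypothetical debug helper (not part of the original
 * file) showing the kind of check rcu_is_cpu_idle() enables: asserting
 * that RCU is actually watching this CPU before entering a read-side
 * critical section.
 */
static void __maybe_unused demo_assert_rcu_watching(void)
{
        WARN_ON_ONCE(rcu_is_cpu_idle());        /* Readers are illegal in idle. */
        rcu_read_lock();
        /* ... dereference RCU-protected data here ... */
        rcu_read_unlock();
}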
  679 
  680 #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
  681 
  682 /*
  683  * Is the current CPU online?  Disable preemption to avoid false positives
  684  * that could otherwise happen due to the current CPU number being sampled,
  685  * this task being preempted, its old CPU being taken offline, resuming
  686  * on some other CPU, then determining that its old CPU is now offline.
  687  * It is OK to use RCU on an offline processor during initial boot, hence
  688  * the check for rcu_scheduler_fully_active.  Note also that it is OK
  689  * for a CPU coming online to use RCU for one jiffy prior to marking itself
  690  * online in the cpu_online_mask.  Similarly, it is OK for a CPU going
  691  * offline to continue to use RCU for one jiffy after marking itself
  692  * offline in the cpu_online_mask.  This leniency is necessary given the
  693  * non-atomic nature of the online and offline processing, for example,
  694  * the fact that a CPU enters the scheduler after completing the CPU_DYING
  695  * notifiers.
  696  *
  697  * This is also why RCU internally marks CPUs online during the
  698  * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase.
  699  *
  700  * Disable checking if in an NMI handler because we cannot safely report
  701  * errors from NMI handlers anyway.
  702  */
  703 bool rcu_lockdep_current_cpu_online(void)
  704 {
  705         struct rcu_data *rdp;
  706         struct rcu_node *rnp;
  707         bool ret;
  708 
  709         if (in_nmi())
  710                 return 1;
  711         preempt_disable();
  712         rdp = &__get_cpu_var(rcu_sched_data);
  713         rnp = rdp->mynode;
  714         ret = (rdp->grpmask & rnp->qsmaskinit) ||
  715               !rcu_scheduler_fully_active;
  716         preempt_enable();
  717         return ret;
  718 }
  719 EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
  720 
  721 #endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
  722 
  723 /**
  724  * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
  725  *
  726  * If the current CPU is idle or running at a first-level (not nested)
  727  * interrupt from idle, return true.  The caller must have at least
  728  * disabled preemption.
  729  */
  730 int rcu_is_cpu_rrupt_from_idle(void)
  731 {
  732         return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
  733 }
  734 
  735 /*
  736  * Snapshot the specified CPU's dynticks counter so that we can later
  737  * credit them with an implicit quiescent state.  Return 1 if this CPU
  738  * is in dynticks idle mode, which is an extended quiescent state.
  739  */
  740 static int dyntick_save_progress_counter(struct rcu_data *rdp)
  741 {
  742         rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
  743         return (rdp->dynticks_snap & 0x1) == 0;
  744 }
  745 
  746 /*
  747  * Return true if the specified CPU has passed through a quiescent
  748  * state by virtue of being in or having passed through a dynticks
  749  * idle state since the last call to dyntick_save_progress_counter()
  750  * for this same CPU, or by virtue of having been offline.
  751  */
  752 static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
  753 {
  754         unsigned int curr;
  755         unsigned int snap;
  756 
  757         curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
  758         snap = (unsigned int)rdp->dynticks_snap;
  759 
  760         /*
  761          * If the CPU passed through or entered a dynticks idle phase with
  762          * no active irq/NMI handlers, then we can safely pretend that the CPU
  763          * already acknowledged the request to pass through a quiescent
  764          * state.  Either way, that CPU cannot possibly be in an RCU
  765          * read-side critical section that started before the beginning
  766          * of the current RCU grace period.
  767          */
  768         if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) {
  769                 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti");
  770                 rdp->dynticks_fqs++;
  771                 return 1;
  772         }
  773 
  774         /*
  775          * Check for the CPU being offline, but only if the grace period
  776          * is old enough.  We don't need to worry about the CPU changing
  777          * state: If we see it offline even once, it has been through a
  778          * quiescent state.
  779          *
  780          * The reason for insisting that the grace period be at least
  781          * one jiffy old is that CPUs that are not quite online and that
  782          * have just gone offline can still execute RCU read-side critical
  783          * sections.
  784          */
  785         if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies))
  786                 return 0;  /* Grace period is not old enough. */
  787         barrier();
  788         if (cpu_is_offline(rdp->cpu)) {
  789                 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
  790                 rdp->offline_fqs++;
  791                 return 1;
  792         }
  793         return 0;
  794 }
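/*
 * Editor's note: a minimal sketch (not part of the original file) of the
 * ->dynticks counter convention used above.  The counter is incremented on
 * every transition into or out of dynticks-idle, so an even value means
 * the CPU is idle now, and an advance of at least two since the snapshot
 * means it passed through idle in the meantime; either way it has been in
 * a quiescent state.  demo_counter_implies_qs() is purely illustrative.
 */
static bool __maybe_unused demo_counter_implies_qs(unsigned int snap,
                                                   unsigned int curr)
{
        return (curr & 0x1) == 0 ||             /* Idle right now, or...   */
               UINT_CMP_GE(curr, snap + 2);     /* ...was idle since snap. */
}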
  795 
  796 static int jiffies_till_stall_check(void)
  797 {
  798         int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
  799 
  800         /*
  801          * Limit check must be consistent with the Kconfig limits
  802          * for CONFIG_RCU_CPU_STALL_TIMEOUT.
  803          */
  804         if (till_stall_check < 3) {
  805                 ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
  806                 till_stall_check = 3;
  807         } else if (till_stall_check > 300) {
  808                 ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
  809                 till_stall_check = 300;
  810         }
  811         return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
  812 }
  813 
  814 static void record_gp_stall_check_time(struct rcu_state *rsp)
  815 {
  816         rsp->gp_start = jiffies;
  817         rsp->jiffies_stall = jiffies + jiffies_till_stall_check();
  818 }
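/*
 * Editor's note (not part of the original file): with the common default
 * of CONFIG_RCU_CPU_STALL_TIMEOUT = 21 and no RCU_STALL_DELAY_DELTA
 * debugging slack, jiffies_till_stall_check() returns 21 * HZ jiffies, so
 * the first stall warning fires roughly 21 seconds after gp_start.
 */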
  819 
  820 /*
  821  * Dump stacks of all tasks running on stalled CPUs.  This is a fallback
  822  * for architectures that do not implement trigger_all_cpu_backtrace().
  823  * The NMI-triggered stack traces are more accurate because they are
  824  * printed by the target CPU.
  825  */
  826 static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
  827 {
  828         int cpu;
  829         unsigned long flags;
  830         struct rcu_node *rnp;
  831 
  832         rcu_for_each_leaf_node(rsp, rnp) {
  833                 raw_spin_lock_irqsave(&rnp->lock, flags);
  834                 if (rnp->qsmask != 0) {
  835                         for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
  836                                 if (rnp->qsmask & (1UL << cpu))
  837                                         dump_cpu_task(rnp->grplo + cpu);
  838                 }
  839                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
  840         }
  841 }
  842 
  843 static void print_other_cpu_stall(struct rcu_state *rsp)
  844 {
  845         int cpu;
  846         long delta;
  847         unsigned long flags;
  848         int ndetected = 0;
  849         struct rcu_node *rnp = rcu_get_root(rsp);
  850         long totqlen = 0;
  851 
  852         /* Only let one CPU complain about others per time interval. */
  853 
  854         raw_spin_lock_irqsave(&rnp->lock, flags);
  855         delta = jiffies - rsp->jiffies_stall;
  856         if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
  857                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
  858                 return;
  859         }
  860         rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3;
  861         raw_spin_unlock_irqrestore(&rnp->lock, flags);
  862 
  863         /*
  864          * OK, time to rat on our buddy...
  865          * See Documentation/RCU/stallwarn.txt for info on how to debug
  866          * RCU CPU stall warnings.
  867          */
  868         printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks:",
  869                rsp->name);
  870         print_cpu_stall_info_begin();
  871         rcu_for_each_leaf_node(rsp, rnp) {
  872                 raw_spin_lock_irqsave(&rnp->lock, flags);
  873                 ndetected += rcu_print_task_stall(rnp);
  874                 if (rnp->qsmask != 0) {
  875                         for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
  876                                 if (rnp->qsmask & (1UL << cpu)) {
  877                                         print_cpu_stall_info(rsp,
  878                                                              rnp->grplo + cpu);
  879                                         ndetected++;
  880                                 }
  881                 }
  882                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
  883         }
  884 
  885         /*
  886          * Now rat on any tasks that got kicked up to the root rcu_node
  887          * due to CPU offlining.
  888          */
  889         rnp = rcu_get_root(rsp);
  890         raw_spin_lock_irqsave(&rnp->lock, flags);
  891         ndetected += rcu_print_task_stall(rnp);
  892         raw_spin_unlock_irqrestore(&rnp->lock, flags);
  893 
  894         print_cpu_stall_info_end();
  895         for_each_possible_cpu(cpu)
  896                 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
  897         pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n",
  898                smp_processor_id(), (long)(jiffies - rsp->gp_start),
  899                rsp->gpnum, rsp->completed, totqlen);
  900         if (ndetected == 0)
  901                 printk(KERN_ERR "INFO: Stall ended before state dump start\n");
  902         else if (!trigger_all_cpu_backtrace())
  903                 rcu_dump_cpu_stacks(rsp);
  904 
  905         /* Complain about tasks blocking the grace period. */
  906 
  907         rcu_print_detail_task_stall(rsp);
  908 
  909         force_quiescent_state(rsp);  /* Kick them all. */
  910 }
  911 
  912 static void print_cpu_stall(struct rcu_state *rsp)
  913 {
  914         int cpu;
  915         unsigned long flags;
  916         struct rcu_node *rnp = rcu_get_root(rsp);
  917         long totqlen = 0;
  918 
  919         /*
  920          * OK, time to rat on ourselves...
  921          * See Documentation/RCU/stallwarn.txt for info on how to debug
  922          * RCU CPU stall warnings.
  923          */
  924         printk(KERN_ERR "INFO: %s self-detected stall on CPU", rsp->name);
  925         print_cpu_stall_info_begin();
  926         print_cpu_stall_info(rsp, smp_processor_id());
  927         print_cpu_stall_info_end();
  928         for_each_possible_cpu(cpu)
  929                 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
  930         pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n",
  931                 jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen);
  932         if (!trigger_all_cpu_backtrace())
  933                 dump_stack();
  934 
  935         raw_spin_lock_irqsave(&rnp->lock, flags);
  936         if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
  937                 rsp->jiffies_stall = jiffies +
  938                                      3 * jiffies_till_stall_check() + 3;
  939         raw_spin_unlock_irqrestore(&rnp->lock, flags);
  940 
  941         set_need_resched();  /* kick ourselves to get things going. */
  942 }
  943 
  944 static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
  945 {
  946         unsigned long j;
  947         unsigned long js;
  948         struct rcu_node *rnp;
  949 
  950         if (rcu_cpu_stall_suppress)
  951                 return;
  952         j = ACCESS_ONCE(jiffies);
  953         js = ACCESS_ONCE(rsp->jiffies_stall);
  954         rnp = rdp->mynode;
  955         if (rcu_gp_in_progress(rsp) &&
  956             (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {
  957 
  958                 /* We haven't checked in, so go dump stack. */
  959                 print_cpu_stall(rsp);
  960 
  961         } else if (rcu_gp_in_progress(rsp) &&
  962                    ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
  963 
  964                 /* They had a few time units to dump stack, so complain. */
  965                 print_other_cpu_stall(rsp);
  966         }
  967 }
  968 
  969 static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
  970 {
  971         rcu_cpu_stall_suppress = 1;
  972         return NOTIFY_DONE;
  973 }
  974 
  975 /**
  976  * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
  977  *
  978  * Set the stall-warning timeout way off into the future, thus preventing
  979  * any RCU CPU stall-warning messages from appearing in the current set of
  980  * RCU grace periods.
  981  *
  982  * The caller must disable hard irqs.
  983  */
  984 void rcu_cpu_stall_reset(void)
  985 {
  986         struct rcu_state *rsp;
  987 
  988         for_each_rcu_flavor(rsp)
  989                 rsp->jiffies_stall = jiffies + ULONG_MAX / 2;
  990 }
  991 
  992 static struct notifier_block rcu_panic_block = {
  993         .notifier_call = rcu_panic,
  994 };
  995 
  996 static void __init check_cpu_stall_init(void)
  997 {
  998         atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
  999 }
 1000 
 1001 /*
 1002  * Update CPU-local rcu_data state to record the newly noticed grace period.
 1003  * This is used both when we started the grace period and when we notice
 1004  * that someone else started the grace period.  The caller must hold the
 1005  * ->lock of the leaf rcu_node structure corresponding to the current CPU,
 1006  *  and must have irqs disabled.
 1007  */
 1008 static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
 1009 {
 1010         if (rdp->gpnum != rnp->gpnum) {
 1011                 /*
 1012                  * If the current grace period is waiting for this CPU,
 1013                  * set up to detect a quiescent state, otherwise don't
 1014                  * go looking for one.
 1015                  */
 1016                 rdp->gpnum = rnp->gpnum;
 1017                 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart");
 1018                 rdp->passed_quiesce = 0;
 1019                 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
 1020                 zero_cpu_stall_ticks(rdp);
 1021         }
 1022 }
 1023 
 1024 static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
 1025 {
 1026         unsigned long flags;
 1027         struct rcu_node *rnp;
 1028 
 1029         local_irq_save(flags);
 1030         rnp = rdp->mynode;
 1031         if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
 1032             !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
 1033                 local_irq_restore(flags);
 1034                 return;
 1035         }
 1036         __note_new_gpnum(rsp, rnp, rdp);
 1037         raw_spin_unlock_irqrestore(&rnp->lock, flags);
 1038 }
 1039 
 1040 /*
 1041  * Did someone else start a new RCU grace period since we last
 1042  * checked?  Update local state appropriately if so.  Must be called
 1043  * on the CPU corresponding to rdp.
 1044  */
 1045 static int
 1046 check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
 1047 {
 1048         unsigned long flags;
 1049         int ret = 0;
 1050 
 1051         local_irq_save(flags);
 1052         if (rdp->gpnum != rsp->gpnum) {
 1053                 note_new_gpnum(rsp, rdp);
 1054                 ret = 1;
 1055         }
 1056         local_irq_restore(flags);
 1057         return ret;
 1058 }
 1059 
 1060 /*
 1061  * Initialize the specified rcu_data structure's callback list to empty.
 1062  */
 1063 static void init_callback_list(struct rcu_data *rdp)
 1064 {
 1065         int i;
 1066 
 1067         rdp->nxtlist = NULL;
 1068         for (i = 0; i < RCU_NEXT_SIZE; i++)
 1069                 rdp->nxttail[i] = &rdp->nxtlist;
 1070         init_nocb_callback_list(rdp);
 1071 }
 1072 
 1073 /*
 1074  * Advance this CPU's callbacks, but only if the current grace period
 1075  * has ended.  This may be called only from the CPU to whom the rdp
 1076  * belongs.  In addition, the corresponding leaf rcu_node structure's
 1077  * ->lock must be held by the caller, with irqs disabled.
 1078  */
 1079 static void
 1080 __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
 1081 {
 1082         /* Did another grace period end? */
 1083         if (rdp->completed != rnp->completed) {
 1084 
 1085                 /* Advance callbacks.  No harm if list empty. */
 1086                 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
 1087                 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
 1088                 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
 1089 
 1090                 /* Remember that we saw this grace-period completion. */
 1091                 rdp->completed = rnp->completed;
 1092                 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend");
 1093 
 1094                 /*
 1095                  * If we were in an extended quiescent state, we may have
 1096                  * missed some grace periods that other CPUs handled on
 1097                  * our behalf. Catch up with this state to avoid noting
 1098                  * spurious new grace periods.  If another grace period
 1099                  * has started, then rnp->gpnum will have advanced, so
 1100                  * we will detect this later on.  Of course, any quiescent
 1101                  * states we found for the old GP are now invalid.
 1102                  */
 1103                 if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) {
 1104                         rdp->gpnum = rdp->completed;
 1105                         rdp->passed_quiesce = 0;
 1106                 }
 1107 
 1108                 /*
 1109                  * If RCU does not need a quiescent state from this CPU,
 1110                  * then make sure that this CPU doesn't go looking for one.
 1111                  */
 1112                 if ((rnp->qsmask & rdp->grpmask) == 0)
 1113                         rdp->qs_pending = 0;
 1114         }
 1115 }
 1116 
 1117 /*
 1118  * Advance this CPU's callbacks, but only if the current grace period
 1119  * has ended.  This may be called only from the CPU to whom the rdp
 1120  * belongs.
 1121  */
 1122 static void
 1123 rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
 1124 {
 1125         unsigned long flags;
 1126         struct rcu_node *rnp;
 1127 
 1128         local_irq_save(flags);
 1129         rnp = rdp->mynode;
 1130         if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */
 1131             !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
 1132                 local_irq_restore(flags);
 1133                 return;
 1134         }
 1135         __rcu_process_gp_end(rsp, rnp, rdp);
 1136         raw_spin_unlock_irqrestore(&rnp->lock, flags);
 1137 }
 1138 
 1139 /*
 1140  * Do per-CPU grace-period initialization for running CPU.  The caller
 1141  * must hold the lock of the leaf rcu_node structure corresponding to
 1142  * this CPU.
 1143  */
 1144 static void
 1145 rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
 1146 {
 1147         /* Prior grace period ended, so advance callbacks for current CPU. */
 1148         __rcu_process_gp_end(rsp, rnp, rdp);
 1149 
 1150         /* Set state so that this CPU will detect the next quiescent state. */
 1151         __note_new_gpnum(rsp, rnp, rdp);
 1152 }
 1153 
 1154 /*
 1155  * Initialize a new grace period.
 1156  */
 1157 static int rcu_gp_init(struct rcu_state *rsp)
 1158 {
 1159         struct rcu_data *rdp;
 1160         struct rcu_node *rnp = rcu_get_root(rsp);
 1161 
 1162         raw_spin_lock_irq(&rnp->lock);
 1163         rsp->gp_flags = 0; /* Clear all flags: New grace period. */
 1164 
 1165         if (rcu_gp_in_progress(rsp)) {
 1166                 /* Grace period already in progress, don't start another.  */
 1167                 raw_spin_unlock_irq(&rnp->lock);
 1168                 return 0;
 1169         }
 1170 
 1171         /* Advance to a new grace period and initialize state. */
 1172         rsp->gpnum++;
 1173         trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
 1174         record_gp_stall_check_time(rsp);
 1175         raw_spin_unlock_irq(&rnp->lock);
 1176 
 1177         /* Exclude any concurrent CPU-hotplug operations. */
 1178         mutex_lock(&rsp->onoff_mutex);
 1179 
 1180         /*
 1181          * Set the quiescent-state-needed bits in all the rcu_node
 1182          * structures for all currently online CPUs in breadth-first order,
 1183          * starting from the root rcu_node structure, relying on the layout
 1184          * of the tree within the rsp->node[] array.  Note that other CPUs
 1185          * will access only the leaves of the hierarchy, thus seeing that no
 1186          * grace period is in progress, at least until the corresponding
 1187          * leaf node has been initialized.  In addition, we have excluded
 1188          * CPU-hotplug operations.
 1189          *
 1190          * The grace period cannot complete until the initialization
 1191          * process finishes, because this kthread handles both.
 1192          */
 1193         rcu_for_each_node_breadth_first(rsp, rnp) {
 1194                 raw_spin_lock_irq(&rnp->lock);
 1195                 rdp = this_cpu_ptr(rsp->rda);
 1196                 rcu_preempt_check_blocked_tasks(rnp);
 1197                 rnp->qsmask = rnp->qsmaskinit;
 1198                 rnp->gpnum = rsp->gpnum;
 1199                 WARN_ON_ONCE(rnp->completed != rsp->completed);
 1200                 rnp->completed = rsp->completed;
 1201                 if (rnp == rdp->mynode)
 1202                         rcu_start_gp_per_cpu(rsp, rnp, rdp);
 1203                 rcu_preempt_boost_start_gp(rnp);
 1204                 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
 1205                                             rnp->level, rnp->grplo,
 1206                                             rnp->grphi, rnp->qsmask);
 1207                 raw_spin_unlock_irq(&rnp->lock);
 1208 #ifdef CONFIG_PROVE_RCU_DELAY
 1209                 if ((random32() % (rcu_num_nodes * 8)) == 0)
 1210                         schedule_timeout_uninterruptible(2);
 1211 #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
 1212                 cond_resched();
 1213         }
 1214 
 1215         mutex_unlock(&rsp->onoff_mutex);
 1216         return 1;
 1217 }
 1218 
 1219 /*
 1220  * Do one round of quiescent-state forcing.
 1221  */
 1222 int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
 1223 {
 1224         int fqs_state = fqs_state_in;
 1225         struct rcu_node *rnp = rcu_get_root(rsp);
 1226 
 1227         rsp->n_force_qs++;
 1228         if (fqs_state == RCU_SAVE_DYNTICK) {
 1229                 /* Collect dyntick-idle snapshots. */
 1230                 force_qs_rnp(rsp, dyntick_save_progress_counter);
 1231                 fqs_state = RCU_FORCE_QS;
 1232         } else {
 1233                 /* Handle dyntick-idle and offline CPUs. */
 1234                 force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
 1235         }
 1236         /* Clear flag to prevent immediate re-entry. */
 1237         if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
 1238                 raw_spin_lock_irq(&rnp->lock);
 1239                 rsp->gp_flags &= ~RCU_GP_FLAG_FQS;
 1240                 raw_spin_unlock_irq(&rnp->lock);
 1241         }
 1242         return fqs_state;
 1243 }
 1244 
 1245 /*
 1246  * Clean up after the old grace period.
 1247  */
 1248 static void rcu_gp_cleanup(struct rcu_state *rsp)
 1249 {
 1250         unsigned long gp_duration;
 1251         struct rcu_data *rdp;
 1252         struct rcu_node *rnp = rcu_get_root(rsp);
 1253 
 1254         raw_spin_lock_irq(&rnp->lock);
 1255         gp_duration = jiffies - rsp->gp_start;
 1256         if (gp_duration > rsp->gp_max)
 1257                 rsp->gp_max = gp_duration;
 1258 
 1259         /*
 1260          * We know the grace period is complete, but to everyone else
 1261          * it appears to still be ongoing.  But it is also the case
 1262          * that to everyone else it looks like there is nothing that
 1263          * they can do to advance the grace period.  It is therefore
 1264          * safe for us to drop the lock in order to mark the grace
 1265          * period as completed in all of the rcu_node structures.
 1266          */
 1267         raw_spin_unlock_irq(&rnp->lock);
 1268 
 1269         /*
 1270          * Propagate new ->completed value to rcu_node structures so
 1271          * that other CPUs don't have to wait until the start of the next
 1272          * grace period to process their callbacks.  This also avoids
 1273          * some nasty RCU grace-period initialization races by forcing
 1274          * the end of the current grace period to be completely recorded in
 1275          * all of the rcu_node structures before the beginning of the next
 1276          * grace period is recorded in any of the rcu_node structures.
 1277          */
 1278         rcu_for_each_node_breadth_first(rsp, rnp) {
 1279                 raw_spin_lock_irq(&rnp->lock);
 1280                 rnp->completed = rsp->gpnum;
 1281                 raw_spin_unlock_irq(&rnp->lock);
 1282                 cond_resched();
 1283         }
 1284         rnp = rcu_get_root(rsp);
 1285         raw_spin_lock_irq(&rnp->lock);
 1286 
 1287         rsp->completed = rsp->gpnum; /* Declare grace period done. */
 1288         trace_rcu_grace_period(rsp->name, rsp->completed, "end");
 1289         rsp->fqs_state = RCU_GP_IDLE;
 1290         rdp = this_cpu_ptr(rsp->rda);
 1291         if (cpu_needs_another_gp(rsp, rdp))
 1292                 rsp->gp_flags = RCU_GP_FLAG_INIT;
 1293         raw_spin_unlock_irq(&rnp->lock);
 1294 }
 1295 
 1296 /*
 1297  * Body of kthread that handles grace periods.
 1298  */
 1299 static int __noreturn rcu_gp_kthread(void *arg)
 1300 {
 1301         int fqs_state;
 1302         unsigned long j;
 1303         int ret;
 1304         struct rcu_state *rsp = arg;
 1305         struct rcu_node *rnp = rcu_get_root(rsp);
 1306 
 1307         for (;;) {
 1308 
 1309                 /* Handle grace-period start. */
 1310                 for (;;) {
 1311                         wait_event_interruptible(rsp->gp_wq,
 1312                                                  rsp->gp_flags &
 1313                                                  RCU_GP_FLAG_INIT);
 1314                         if ((rsp->gp_flags & RCU_GP_FLAG_INIT) &&
 1315                             rcu_gp_init(rsp))
 1316                                 break;
 1317                         cond_resched();
 1318                         flush_signals(current);
 1319                 }
 1320 
 1321                 /* Handle quiescent-state forcing. */
 1322                 fqs_state = RCU_SAVE_DYNTICK;
 1323                 j = jiffies_till_first_fqs;
 1324                 if (j > HZ) {
 1325                         j = HZ;
 1326                         jiffies_till_first_fqs = HZ;
 1327                 }
 1328                 for (;;) {
 1329                         rsp->jiffies_force_qs = jiffies + j;
 1330                         ret = wait_event_interruptible_timeout(rsp->gp_wq,
 1331                                         (rsp->gp_flags & RCU_GP_FLAG_FQS) ||
 1332                                         (!ACCESS_ONCE(rnp->qsmask) &&
 1333                                          !rcu_preempt_blocked_readers_cgp(rnp)),
 1334                                         j);
 1335                         /* If grace period done, leave loop. */
 1336                         if (!ACCESS_ONCE(rnp->qsmask) &&
 1337                             !rcu_preempt_blocked_readers_cgp(rnp))
 1338                                 break;
 1339                         /* If time for quiescent-state forcing, do it. */
 1340                         if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) {
 1341                                 fqs_state = rcu_gp_fqs(rsp, fqs_state);
 1342                                 cond_resched();
 1343                         } else {
 1344                                 /* Deal with stray signal. */
 1345                                 cond_resched();
 1346                                 flush_signals(current);
 1347                         }
 1348                         j = jiffies_till_next_fqs;
 1349                         if (j > HZ) {
 1350                                 j = HZ;
 1351                                 jiffies_till_next_fqs = HZ;
 1352                         } else if (j < 1) {
 1353                                 j = 1;
 1354                                 jiffies_till_next_fqs = 1;
 1355                         }
 1356                 }
 1357 
 1358                 /* Handle grace-period end. */
 1359                 rcu_gp_cleanup(rsp);
 1360         }
 1361 }
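
/*
 * [Editorial sketch -- not part of the original rcutree.c.]  A minimal
 * illustration of the wait/timeout pattern the kthread above uses for
 * quiescent-state forcing: sleep on a waitqueue until either an explicit
 * request arrives or the timeout expires, treat a timeout as "time for the
 * periodic work", and flush stray signals.  The demo_* names are invented.
 */
static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static unsigned long demo_flags;                /* bit 0: explicit request */

static int __noreturn demo_periodic_kthread(void *unused)
{
        int ret;

        for (;;) {
                ret = wait_event_interruptible_timeout(demo_wq,
                                                       demo_flags & 0x1, HZ);
                if (ret == 0 || (demo_flags & 0x1)) {
                        demo_flags &= ~0x1;
                        /* ... do the periodic or explicitly requested work ... */
                } else {
                        flush_signals(current); /* stray signal, as above */
                }
                cond_resched();
        }
}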
 1362 
 1363 /*
 1364  * Start a new RCU grace period if warranted, re-initializing the hierarchy
 1365  * in preparation for detecting the next grace period.  The caller must hold
 1366  * the root node's ->lock, which is released before return.  Hard irqs must
 1367  * be disabled.
 1368  *
 1369  * Note that it is legal for a dying CPU (which is marked as offline) to
 1370  * invoke this function.  This can happen when the dying CPU reports its
 1371  * quiescent state.
 1372  */
 1373 static void
 1374 rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
 1375         __releases(rcu_get_root(rsp)->lock)
 1376 {
 1377         struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
 1378         struct rcu_node *rnp = rcu_get_root(rsp);
 1379 
 1380         if (!rsp->gp_kthread ||
 1381             !cpu_needs_another_gp(rsp, rdp)) {
 1382                 /*
 1383                  * Either we have not yet spawned the grace-period
 1384                  * task, this CPU does not need another grace period,
 1385                  * or a grace period is already in progress.
 1386                  * Either way, don't start a new grace period.
 1387                  */
 1388                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
 1389                 return;
 1390         }
 1391 
 1392         /*
 1393          * Because there is no grace period in progress right now,
 1394          * any callbacks we have up to this point will be satisfied
 1395          * by the next grace period.  So promote all callbacks to be
 1396          * handled after the end of the next grace period.  If the
 1397          * CPU is not yet aware of the end of the previous grace period,
 1398          * we need to allow for the callback advancement that will
 1399          * occur when it does become aware.  Deadlock prevents us from
 1400          * making it aware at this point: We cannot acquire a leaf
 1401          * rcu_node ->lock while holding the root rcu_node ->lock.
 1402          */
 1403         rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
 1404         if (rdp->completed == rsp->completed)
 1405                 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
 1406 
 1407         rsp->gp_flags = RCU_GP_FLAG_INIT;
 1408         raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
 1409 
 1410         /* Ensure that CPU is aware of completion of last grace period. */
 1411         rcu_process_gp_end(rsp, rdp);
 1412         local_irq_restore(flags);
 1413 
 1414         /* Wake up rcu_gp_kthread() to start the grace period. */
 1415         wake_up(&rsp->gp_wq);
 1416 }
 1417 
 1418 /*
 1419  * Report a full set of quiescent states to the specified rcu_state
 1420  * data structure.  This involves cleaning up after the prior grace
 1421  * period and letting rcu_start_gp() start up the next grace period
 1422  * if one is needed.  Note that the caller must hold rnp->lock, as
 1423  * required by rcu_start_gp(), which will release it.
 1424  */
 1425 static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
 1426         __releases(rcu_get_root(rsp)->lock)
 1427 {
 1428         WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
 1429         raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
 1430         wake_up(&rsp->gp_wq);  /* Memory barrier implied by wake_up() path. */
 1431 }
 1432 
 1433 /*
 1434  * Similar to rcu_report_qs_rdp(), for which it is a helper function.
 1435  * Allows quiescent states for a group of CPUs to be reported at one go
 1436  * to the specified rcu_node structure, though all the CPUs in the group
 1437  * must be represented by the same rcu_node structure (which need not be
 1438  * a leaf rcu_node structure, though it often will be).  That structure's
 1439  * lock must be held upon entry, and it is released before return.
 1440  */
 1441 static void
 1442 rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
 1443                   struct rcu_node *rnp, unsigned long flags)
 1444         __releases(rnp->lock)
 1445 {
 1446         struct rcu_node *rnp_c;
 1447 
 1448         /* Walk up the rcu_node hierarchy. */
 1449         for (;;) {
 1450                 if (!(rnp->qsmask & mask)) {
 1451 
 1452                         /* Our bit has already been cleared, so done. */
 1453                         raw_spin_unlock_irqrestore(&rnp->lock, flags);
 1454                         return;
 1455                 }
 1456                 rnp->qsmask &= ~mask;
 1457                 trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
 1458                                                  mask, rnp->qsmask, rnp->level,
 1459                                                  rnp->grplo, rnp->grphi,
 1460                                                  !!rnp->gp_tasks);
 1461                 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
 1462 
 1463                         /* Other bits still set at this level, so done. */
 1464                         raw_spin_unlock_irqrestore(&rnp->lock, flags);
 1465                         return;
 1466                 }
 1467                 mask = rnp->grpmask;
 1468                 if (rnp->parent == NULL) {
 1469 
 1470                         /* No more levels.  Exit loop holding root lock. */
 1471 
 1472                         break;
 1473                 }
 1474                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
 1475                 rnp_c = rnp;
 1476                 rnp = rnp->parent;
 1477                 raw_spin_lock_irqsave(&rnp->lock, flags);
 1478                 WARN_ON_ONCE(rnp_c->qsmask);
 1479         }
 1480 
 1481         /*
 1482          * Get here if we are the last CPU to pass through a quiescent
 1483          * state for this grace period.  Invoke rcu_report_qs_rsp()
 1484          * to clean up and start the next grace period if one is needed.
 1485          */
 1486         rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */
 1487 }
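
/*
 * [Editorial sketch -- not part of the original rcutree.c.]  The loop above
 * walks up the tree, clearing one bit per level and stopping as soon as a
 * sibling's bit is still set at some level.  Stripped of RCU specifics, the
 * lock-per-level walk looks roughly like this; demo_node is invented and
 * carries none of rcu_node's other state.
 */
struct demo_node {
        spinlock_t lock;
        unsigned long qsmask;           /* children not yet checked in */
        unsigned long grpmask;          /* this node's bit in its parent */
        struct demo_node *parent;
};

static void demo_report_qs(struct demo_node *np, unsigned long mask)
{
        unsigned long flags;

        for (;;) {
                spin_lock_irqsave(&np->lock, flags);
                np->qsmask &= ~mask;
                if (np->qsmask != 0 || np->parent == NULL) {
                        /* Siblings still pending, or we reached the root. */
                        spin_unlock_irqrestore(&np->lock, flags);
                        return;
                }
                mask = np->grpmask;     /* propagate our subtree's bit upward */
                spin_unlock_irqrestore(&np->lock, flags);
                np = np->parent;
        }
}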
 1488 
 1489 /*
 1490  * Record a quiescent state for the specified CPU to that CPU's rcu_data
 1491  * structure.  This must be either called from the specified CPU, or
 1492  * called when the specified CPU is known to be offline (and when it is
 1493  * also known that no other CPU is concurrently trying to help the offline
 1494  * CPU).  The checks against ->gpnum and ->completed ensure that we are
 1495  * still in the grace period of interest.  We don't want to end the current
 1496  * grace period based on quiescent states detected in an earlier grace period!
 1497  */
 1498 static void
 1499 rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
 1500 {
 1501         unsigned long flags;
 1502         unsigned long mask;
 1503         struct rcu_node *rnp;
 1504 
 1505         rnp = rdp->mynode;
 1506         raw_spin_lock_irqsave(&rnp->lock, flags);
 1507         if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum ||
 1508             rnp->completed == rnp->gpnum) {
 1509 
 1510                 /*
 1511                  * The grace period in which this quiescent state was
 1512                  * recorded has ended, so don't report it upwards.
 1513                  * We will instead need a new quiescent state that lies
 1514                  * within the current grace period.
 1515                  */
 1516                 rdp->passed_quiesce = 0;        /* need qs for new gp. */
 1517                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
 1518                 return;
 1519         }
 1520         mask = rdp->grpmask;
 1521         if ((rnp->qsmask & mask) == 0) {
 1522                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
 1523         } else {
 1524                 rdp->qs_pending = 0;
 1525 
 1526                 /*
 1527          * This GP can't end until this CPU checks in, so all of our
 1528                  * callbacks can be processed during the next GP.
 1529                  */
 1530                 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
 1531 
 1532                 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* releases rnp->lock */
 1533         }
 1534 }
 1535 
 1536 /*
 1537  * Check to see if there is a new grace period of which this CPU
 1538  * is not yet aware, and if so, set up local rcu_data state for it.
 1539  * Otherwise, see if this CPU has just passed through its first
 1540  * quiescent state for this grace period, and record that fact if so.
 1541  */
 1542 static void
 1543 rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
 1544 {
 1545         /* If there is now a new grace period, record and return. */
 1546         if (check_for_new_grace_period(rsp, rdp))
 1547                 return;
 1548 
 1549         /*
 1550          * Does this CPU still need to do its part for current grace period?
 1551          * If no, return and let the other CPUs do their part as well.
 1552          */
 1553         if (!rdp->qs_pending)
 1554                 return;
 1555 
 1556         /*
 1557          * Was there a quiescent state since the beginning of the grace
 1558          * period? If no, then exit and wait for the next call.
 1559          */
 1560         if (!rdp->passed_quiesce)
 1561                 return;
 1562 
 1563         /*
 1564          * Tell RCU we are done (but rcu_report_qs_rdp() will be the
 1565          * judge of that).
 1566          */
 1567         rcu_report_qs_rdp(rdp->cpu, rsp, rdp);
 1568 }
 1569 
 1570 #ifdef CONFIG_HOTPLUG_CPU
 1571 
 1572 /*
 1573  * Send the specified CPU's RCU callbacks to the orphanage.  The
 1574  * specified CPU must be offline, and the caller must hold the
 1575  * ->orphan_lock.
 1576  */
 1577 static void
 1578 rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
 1579                           struct rcu_node *rnp, struct rcu_data *rdp)
 1580 {
 1581         /* No-CBs CPUs do not have orphanable callbacks. */
 1582         if (is_nocb_cpu(rdp->cpu))
 1583                 return;
 1584 
 1585         /*
 1586          * Orphan the callbacks.  First adjust the counts.  This is safe
 1587          * because _rcu_barrier() excludes CPU-hotplug operations, so it
 1588          * cannot be running now.  Thus no memory barrier is required.
 1589          */
 1590         if (rdp->nxtlist != NULL) {
 1591                 rsp->qlen_lazy += rdp->qlen_lazy;
 1592                 rsp->qlen += rdp->qlen;
 1593                 rdp->n_cbs_orphaned += rdp->qlen;
 1594                 rdp->qlen_lazy = 0;
 1595                 ACCESS_ONCE(rdp->qlen) = 0;
 1596         }
 1597 
 1598         /*
 1599          * Next, move those callbacks still needing a grace period to
 1600          * the orphanage, where some other CPU will pick them up.
 1601          * Some of the callbacks might have gone partway through a grace
 1602          * period, but that is too bad.  They get to start over because we
 1603          * cannot assume that grace periods are synchronized across CPUs.
 1604          * We don't bother updating the ->nxttail[] array yet; instead,
 1605          * we just reset the whole thing later on.
 1606          */
 1607         if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) {
 1608                 *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL];
 1609                 rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL];
 1610                 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
 1611         }
 1612 
 1613         /*
 1614          * Then move the ready-to-invoke callbacks to the orphanage,
 1615          * where some other CPU will pick them up.  These will not be
 1616          * required to pass through another grace period: They are done.
 1617          */
 1618         if (rdp->nxtlist != NULL) {
 1619                 *rsp->orphan_donetail = rdp->nxtlist;
 1620                 rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
 1621         }
 1622 
 1623         /* Finally, initialize the rcu_data structure's list to empty.  */
 1624         init_callback_list(rdp);
 1625 }
 1626 
 1627 /*
 1628  * Adopt the RCU callbacks from the specified rcu_state structure's
 1629  * orphanage.  The caller must hold the ->orphan_lock.
 1630  */
 1631 static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
 1632 {
 1633         int i;
 1634         struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
 1635 
 1636         /* No-CBs CPUs are handled specially. */
 1637         if (rcu_nocb_adopt_orphan_cbs(rsp, rdp))
 1638                 return;
 1639 
 1640         /* Do the accounting first. */
 1641         rdp->qlen_lazy += rsp->qlen_lazy;
 1642         rdp->qlen += rsp->qlen;
 1643         rdp->n_cbs_adopted += rsp->qlen;
 1644         if (rsp->qlen_lazy != rsp->qlen)
 1645                 rcu_idle_count_callbacks_posted();
 1646         rsp->qlen_lazy = 0;
 1647         rsp->qlen = 0;
 1648 
 1649         /*
 1650          * We do not need a memory barrier here because the only way we
 1651          * can get here if there is an rcu_barrier() in flight is if
 1652          * we are the task doing the rcu_barrier().
 1653          */
 1654 
 1655         /* First adopt the ready-to-invoke callbacks. */
 1656         if (rsp->orphan_donelist != NULL) {
 1657                 *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL];
 1658                 *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist;
 1659                 for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--)
 1660                         if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
 1661                                 rdp->nxttail[i] = rsp->orphan_donetail;
 1662                 rsp->orphan_donelist = NULL;
 1663                 rsp->orphan_donetail = &rsp->orphan_donelist;
 1664         }
 1665 
 1666         /* And then adopt the callbacks that still need a grace period. */
 1667         if (rsp->orphan_nxtlist != NULL) {
 1668                 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist;
 1669                 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail;
 1670                 rsp->orphan_nxtlist = NULL;
 1671                 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
 1672         }
 1673 }
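
/*
 * [Editorial sketch -- not part of the original rcutree.c.]  Both functions
 * above splice singly linked callback chains by tracking a pointer to the
 * chain's terminating NULL pointer ("tail") rather than to its last element,
 * so empty and non-empty lists are appended to identically.  A minimal
 * version of the idiom, with invented demo_* types:
 */
struct demo_cb {
        struct demo_cb *next;
};

struct demo_cblist {
        struct demo_cb *head;
        struct demo_cb **tail;          /* == &head while the list is empty */
};

static void demo_cblist_init(struct demo_cblist *l)
{
        l->head = NULL;
        l->tail = &l->head;
}

static void demo_cblist_splice_tail(struct demo_cblist *dst,
                                    struct demo_cblist *src)
{
        if (src->head == NULL)
                return;                 /* nothing to move */
        *dst->tail = src->head;         /* hook src's chain onto dst's end */
        dst->tail = src->tail;          /* dst now ends where src ended */
        demo_cblist_init(src);          /* leave src empty but well formed */
}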
 1674 
 1675 /*
 1676  * Trace the fact that this CPU is going offline.
 1677  */
 1678 static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
 1679 {
 1680         RCU_TRACE(unsigned long mask);
 1681         RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda));
 1682         RCU_TRACE(struct rcu_node *rnp = rdp->mynode);
 1683 
 1684         RCU_TRACE(mask = rdp->grpmask);
 1685         trace_rcu_grace_period(rsp->name,
 1686                                rnp->gpnum + 1 - !!(rnp->qsmask & mask),
 1687                                "cpuofl");
 1688 }
 1689 
 1690 /*
 1691  * The CPU has been completely removed, and some other CPU is reporting
 1692  * this fact from process context.  Do the remainder of the cleanup,
 1693  * including orphaning the outgoing CPU's RCU callbacks, and also
 1694  * adopting them.  There can only be one CPU hotplug operation at a time,
 1695  * so no other CPU can be attempting to update rcu_cpu_kthread_task.
 1696  */
 1697 static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 1698 {
 1699         unsigned long flags;
 1700         unsigned long mask;
 1701         int need_report = 0;
 1702         struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 1703         struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
 1704 
 1705         /* Adjust any no-longer-needed kthreads. */
 1706         rcu_boost_kthread_setaffinity(rnp, -1);
 1707 
 1708         /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
 1709 
 1710         /* Exclude any attempts to start a new grace period. */
 1711         mutex_lock(&rsp->onoff_mutex);
 1712         raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
 1713 
 1714         /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
 1715         rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
 1716         rcu_adopt_orphan_cbs(rsp);
 1717 
 1718         /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
 1719         mask = rdp->grpmask;    /* rnp->grplo is constant. */
 1720         do {
 1721                 raw_spin_lock(&rnp->lock);      /* irqs already disabled. */
 1722                 rnp->qsmaskinit &= ~mask;
 1723                 if (rnp->qsmaskinit != 0) {
 1724                         if (rnp != rdp->mynode)
 1725                                 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
 1726                         break;
 1727                 }
 1728                 if (rnp == rdp->mynode)
 1729                         need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
 1730                 else
 1731                         raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
 1732                 mask = rnp->grpmask;
 1733                 rnp = rnp->parent;
 1734         } while (rnp != NULL);
 1735 
 1736         /*
 1737          * We still hold the leaf rcu_node structure lock here, and
 1738          * irqs are still disabled.  The reason for this subterfuge is
 1739          * that invoking rcu_report_unblock_qs_rnp() with ->orphan_lock
 1740          * held leads to deadlock.
 1741          */
 1742         raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */
 1743         rnp = rdp->mynode;
 1744         if (need_report & RCU_OFL_TASKS_NORM_GP)
 1745                 rcu_report_unblock_qs_rnp(rnp, flags);
 1746         else
 1747                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
 1748         if (need_report & RCU_OFL_TASKS_EXP_GP)
 1749                 rcu_report_exp_rnp(rsp, rnp, true);
 1750         WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
 1751                   "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
 1752                   cpu, rdp->qlen, rdp->nxtlist);
 1753         init_callback_list(rdp);
 1754         /* Disallow further callbacks on this CPU. */
 1755         rdp->nxttail[RCU_NEXT_TAIL] = NULL;
 1756         mutex_unlock(&rsp->onoff_mutex);
 1757 }
 1758 
 1759 #else /* #ifdef CONFIG_HOTPLUG_CPU */
 1760 
 1761 static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
 1762 {
 1763 }
 1764 
 1765 static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 1766 {
 1767 }
 1768 
 1769 #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
 1770 
 1771 /*
 1772  * Invoke any RCU callbacks that have made it to the end of their grace
 1773  * period.  Throttle as specified by rdp->blimit.
 1774  */
 1775 static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 1776 {
 1777         unsigned long flags;
 1778         struct rcu_head *next, *list, **tail;
 1779         long bl, count, count_lazy;
 1780         int i;
 1781 
 1782         /* If no callbacks are ready, just return. */
 1783         if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
 1784                 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
 1785                 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
 1786                                     need_resched(), is_idle_task(current),
 1787                                     rcu_is_callbacks_kthread());
 1788                 return;
 1789         }
 1790 
 1791         /*
 1792          * Extract the list of ready callbacks, disabling interrupts to
 1793          * prevent races with call_rcu() from interrupt handlers.
 1794          */
 1795         local_irq_save(flags);
 1796         WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
 1797         bl = rdp->blimit;
 1798         trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl);
 1799         list = rdp->nxtlist;
 1800         rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
 1801         *rdp->nxttail[RCU_DONE_TAIL] = NULL;
 1802         tail = rdp->nxttail[RCU_DONE_TAIL];
 1803         for (i = RCU_NEXT_SIZE - 1; i >= 0; i--)
 1804                 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
 1805                         rdp->nxttail[i] = &rdp->nxtlist;
 1806         local_irq_restore(flags);
 1807 
 1808         /* Invoke callbacks. */
 1809         count = count_lazy = 0;
 1810         while (list) {
 1811                 next = list->next;
 1812                 prefetch(next);
 1813                 debug_rcu_head_unqueue(list);
 1814                 if (__rcu_reclaim(rsp->name, list))
 1815                         count_lazy++;
 1816                 list = next;
 1817                 /* Stop only if limit reached and CPU has something to do. */
 1818                 if (++count >= bl &&
 1819                     (need_resched() ||
 1820                      (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
 1821                         break;
 1822         }
 1823 
 1824         local_irq_save(flags);
 1825         trace_rcu_batch_end(rsp->name, count, !!list, need_resched(),
 1826                             is_idle_task(current),
 1827                             rcu_is_callbacks_kthread());
 1828 
 1829         /* Update count, and requeue any remaining callbacks. */
 1830         if (list != NULL) {
 1831                 *tail = rdp->nxtlist;
 1832                 rdp->nxtlist = list;
 1833                 for (i = 0; i < RCU_NEXT_SIZE; i++)
 1834                         if (&rdp->nxtlist == rdp->nxttail[i])
 1835                                 rdp->nxttail[i] = tail;
 1836                         else
 1837                                 break;
 1838         }
 1839         smp_mb(); /* List handling before counting for rcu_barrier(). */
 1840         rdp->qlen_lazy -= count_lazy;
 1841         ACCESS_ONCE(rdp->qlen) -= count;
 1842         rdp->n_cbs_invoked += count;
 1843 
 1844         /* Reinstate batch limit if we have worked down the excess. */
 1845         if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
 1846                 rdp->blimit = blimit;
 1847 
 1848         /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
 1849         if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) {
 1850                 rdp->qlen_last_fqs_check = 0;
 1851                 rdp->n_force_qs_snap = rsp->n_force_qs;
 1852         } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
 1853                 rdp->qlen_last_fqs_check = rdp->qlen;
 1854         WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0));
 1855 
 1856         local_irq_restore(flags);
 1857 
 1858         /* Re-invoke RCU core processing if there are callbacks remaining. */
 1859         if (cpu_has_callbacks_ready_to_invoke(rdp))
 1860                 invoke_rcu_core();
 1861 }
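
/*
 * [Editorial sketch -- not part of the original rcutree.c.]  The ->blimit
 * handling above amounts to "invoke at most 'limit' ready callbacks, then
 * hand the remainder back for a later pass".  Reusing the invented demo_cb
 * type from the earlier sketch:
 */
static struct demo_cb *demo_invoke_batch(struct demo_cb *list, long limit,
                                         void (*invoke)(struct demo_cb *))
{
        long n = 0;

        while (list != NULL) {
                struct demo_cb *next = list->next;

                invoke(list);           /* callback may free its element */
                list = next;
                if (++n >= limit)
                        break;          /* throttle: finish on a later pass */
        }
        return list;                    /* unprocessed remainder, if any */
}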
 1862 
 1863 /*
 1864  * Check to see if this CPU is in a non-context-switch quiescent state
 1865  * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
 1866  * Also schedule RCU core processing.
 1867  *
 1868  * This function must be called from hardirq context.  It is normally
 1869  * invoked from the scheduling-clock interrupt.  If rcu_pending returns
 1870  * false, there is no point in invoking rcu_check_callbacks().
 1871  */
 1872 void rcu_check_callbacks(int cpu, int user)
 1873 {
 1874         trace_rcu_utilization("Start scheduler-tick");
 1875         increment_cpu_stall_ticks();
 1876         if (user || rcu_is_cpu_rrupt_from_idle()) {
 1877 
 1878                 /*
 1879                  * Get here if this CPU took its interrupt from user
 1880                  * mode or from the idle loop, and if this is not a
 1881                  * nested interrupt.  In this case, the CPU is in
 1882                  * a quiescent state, so note it.
 1883                  *
 1884                  * No memory barrier is required here because both
 1885                  * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local
 1886                  * variables that other CPUs neither access nor modify,
 1887                  * at least not while the corresponding CPU is online.
 1888                  */
 1889 
 1890                 rcu_sched_qs(cpu);
 1891                 rcu_bh_qs(cpu);
 1892 
 1893         } else if (!in_softirq()) {
 1894 
 1895                 /*
 1896                  * Get here if this CPU did not take its interrupt from
 1897                  * softirq, in other words, if it is not interrupting
 1898                  * an rcu_bh read-side critical section.  This is therefore
 1899                  * a quiescent state for rcu_bh, so note it.
 1900                  */
 1901 
 1902                 rcu_bh_qs(cpu);
 1903         }
 1904         rcu_preempt_check_callbacks(cpu);
 1905         if (rcu_pending(cpu))
 1906                 invoke_rcu_core();
 1907         trace_rcu_utilization("End scheduler-tick");
 1908 }
 1909 
 1910 /*
 1911  * Scan the leaf rcu_node structures, processing dyntick state for any that
 1912  * have not yet encountered a quiescent state, using the function specified.
 1913  * Also initiate boosting for any threads blocked on the root rcu_node.
 1914  *
 1915  * The caller must have suppressed start of new grace periods.
 1916  */
 1917 static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
 1918 {
 1919         unsigned long bit;
 1920         int cpu;
 1921         unsigned long flags;
 1922         unsigned long mask;
 1923         struct rcu_node *rnp;
 1924 
 1925         rcu_for_each_leaf_node(rsp, rnp) {
 1926                 cond_resched();
 1927                 mask = 0;
 1928                 raw_spin_lock_irqsave(&rnp->lock, flags);
 1929                 if (!rcu_gp_in_progress(rsp)) {
 1930                         raw_spin_unlock_irqrestore(&rnp->lock, flags);
 1931                         return;
 1932                 }
 1933                 if (rnp->qsmask == 0) {
 1934                         rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
 1935                         continue;
 1936                 }
 1937                 cpu = rnp->grplo;
 1938                 bit = 1;
 1939                 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
 1940                         if ((rnp->qsmask & bit) != 0 &&
 1941                             f(per_cpu_ptr(rsp->rda, cpu)))
 1942                                 mask |= bit;
 1943                 }
 1944                 if (mask != 0) {
 1945 
 1946                         /* rcu_report_qs_rnp() releases rnp->lock. */
 1947                         rcu_report_qs_rnp(mask, rsp, rnp, flags);
 1948                         continue;
 1949                 }
 1950                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
 1951         }
 1952         rnp = rcu_get_root(rsp);
 1953         if (rnp->qsmask == 0) {
 1954                 raw_spin_lock_irqsave(&rnp->lock, flags);
 1955                 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
 1956         }
 1957 }
 1958 
 1959 /*
 1960  * Force quiescent states on reluctant CPUs, and also detect which
 1961  * CPUs are in dyntick-idle mode.
 1962  */
 1963 static void force_quiescent_state(struct rcu_state *rsp)
 1964 {
 1965         unsigned long flags;
 1966         bool ret;
 1967         struct rcu_node *rnp;
 1968         struct rcu_node *rnp_old = NULL;
 1969 
 1970         /* Funnel through hierarchy to reduce memory contention. */
 1971         rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode;
 1972         for (; rnp != NULL; rnp = rnp->parent) {
 1973                 ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
 1974                       !raw_spin_trylock(&rnp->fqslock);
 1975                 if (rnp_old != NULL)
 1976                         raw_spin_unlock(&rnp_old->fqslock);
 1977                 if (ret) {
 1978                         rsp->n_force_qs_lh++;
 1979                         return;
 1980                 }
 1981                 rnp_old = rnp;
 1982         }
 1983         /* rnp_old == rcu_get_root(rsp), rnp == NULL. */
 1984 
 1985         /* Reached the root of the rcu_node tree, acquire lock. */
 1986         raw_spin_lock_irqsave(&rnp_old->lock, flags);
 1987         raw_spin_unlock(&rnp_old->fqslock);
 1988         if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
 1989                 rsp->n_force_qs_lh++;
 1990                 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
 1991                 return;  /* Someone beat us to it. */
 1992         }
 1993         rsp->gp_flags |= RCU_GP_FLAG_FQS;
 1994         raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
 1995         wake_up(&rsp->gp_wq);  /* Memory barrier implied by wake_up() path. */
 1996 }
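
/*
 * [Editorial sketch -- not part of the original rcutree.c.]  The loop above
 * is a "funnel lock": each contender trylocks its leaf's fqslock, hands the
 * lock off level by level toward the root, and drops out as soon as it finds
 * someone else already ahead of it.  A stripped-down version, with an
 * invented demo_fnode type standing in for rcu_node:
 */
struct demo_fnode {
        raw_spinlock_t fqslock;
        struct demo_fnode *parent;
};

/* Returns true if the caller won and now holds the root's fqslock. */
static bool demo_funnel_to_root(struct demo_fnode *leaf)
{
        struct demo_fnode *np, *np_old = NULL;

        for (np = leaf; np != NULL; np = np->parent) {
                if (!raw_spin_trylock(&np->fqslock)) {
                        if (np_old != NULL)
                                raw_spin_unlock(&np_old->fqslock);
                        return false;   /* someone else is already funneling */
                }
                if (np_old != NULL)
                        raw_spin_unlock(&np_old->fqslock);
                np_old = np;
        }
        return true;                    /* caller later unlocks the root */
}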
 1997 
 1998 /*
 1999  * This does the RCU core processing work for the specified rcu_state
 2000  * and rcu_data structures.  This may be called only from the CPU to
 2001  * which the rdp belongs.
 2002  */
 2003 static void
 2004 __rcu_process_callbacks(struct rcu_state *rsp)
 2005 {
 2006         unsigned long flags;
 2007         struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
 2008 
 2009         WARN_ON_ONCE(rdp->beenonline == 0);
 2010 
 2011         /*
 2012          * Advance callbacks in response to end of earlier grace
 2013          * period that some other CPU ended.
 2014          */
 2015         rcu_process_gp_end(rsp, rdp);
 2016 
 2017         /* Update RCU state based on any recent quiescent states. */
 2018         rcu_check_quiescent_state(rsp, rdp);
 2019 
 2020         /* Does this CPU require a not-yet-started grace period? */
 2021         if (cpu_needs_another_gp(rsp, rdp)) {
 2022                 raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
 2023                 rcu_start_gp(rsp, flags);  /* releases above lock */
 2024         }
 2025 
 2026         /* If there are callbacks ready, invoke them. */
 2027         if (cpu_has_callbacks_ready_to_invoke(rdp))
 2028                 invoke_rcu_callbacks(rsp, rdp);
 2029 }
 2030 
 2031 /*
 2032  * Do RCU core processing for the current CPU.
 2033  */
 2034 static void rcu_process_callbacks(struct softirq_action *unused)
 2035 {
 2036         struct rcu_state *rsp;
 2037 
 2038         if (cpu_is_offline(smp_processor_id()))
 2039                 return;
 2040         trace_rcu_utilization("Start RCU core");
 2041         for_each_rcu_flavor(rsp)
 2042                 __rcu_process_callbacks(rsp);
 2043         trace_rcu_utilization("End RCU core");
 2044 }
 2045 
 2046 /*
 2047  * Schedule RCU callback invocation.  If the specified type of RCU
 2048  * does not support RCU priority boosting, just do a direct call,
 2049  * otherwise wake up the per-CPU kernel kthread.  Note that because we
 2050  * are running on the current CPU with interrupts disabled, the
 2051  * rcu_cpu_kthread_task cannot disappear out from under us.
 2052  */
 2053 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
 2054 {
 2055         if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active)))
 2056                 return;
 2057         if (likely(!rsp->boost)) {
 2058                 rcu_do_batch(rsp, rdp);
 2059                 return;
 2060         }
 2061         invoke_rcu_callbacks_kthread();
 2062 }
 2063 
 2064 static void invoke_rcu_core(void)
 2065 {
 2066         raise_softirq(RCU_SOFTIRQ);
 2067 }
 2068 
 2069 /*
 2070  * Handle any core-RCU processing required by a call_rcu() invocation.
 2071  */
 2072 static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
 2073                             struct rcu_head *head, unsigned long flags)
 2074 {
 2075         /*
 2076          * If called from an extended quiescent state, invoke the RCU
 2077          * core in order to force a re-evaluation of RCU's idleness.
 2078          */
 2079         if (rcu_is_cpu_idle() && cpu_online(smp_processor_id()))
 2080                 invoke_rcu_core();
 2081 
 2082         /* If interrupts were disabled or CPU offline, don't invoke RCU core. */
 2083         if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id()))
 2084                 return;
 2085 
 2086         /*
 2087          * Force the grace period if too many callbacks or too long waiting.
 2088          * Enforce hysteresis, and don't invoke force_quiescent_state()
 2089          * if some other CPU has recently done so.  Also, don't bother
 2090          * invoking force_quiescent_state() if the newly enqueued callback
 2091          * is the only one waiting for a grace period to complete.
 2092          */
 2093         if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
 2094 
 2095                 /* Are we ignoring a completed grace period? */
 2096                 rcu_process_gp_end(rsp, rdp);
 2097                 check_for_new_grace_period(rsp, rdp);
 2098 
 2099                 /* Start a new grace period if one not already started. */
 2100                 if (!rcu_gp_in_progress(rsp)) {
 2101                         unsigned long nestflag;
 2102                         struct rcu_node *rnp_root = rcu_get_root(rsp);
 2103 
 2104                         raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
 2105                         rcu_start_gp(rsp, nestflag);  /* releases rnp_root->lock */
 2106                 } else {
 2107                         /* Give the grace period a kick. */
 2108                         rdp->blimit = LONG_MAX;
 2109                         if (rsp->n_force_qs == rdp->n_force_qs_snap &&
 2110                             *rdp->nxttail[RCU_DONE_TAIL] != head)
 2111                                 force_quiescent_state(rsp);
 2112                         rdp->n_force_qs_snap = rsp->n_force_qs;
 2113                         rdp->qlen_last_fqs_check = rdp->qlen;
 2114                 }
 2115         }
 2116 }
 2117 
 2118 /*
 2119  * Helper function for call_rcu() and friends.  The cpu argument will
 2120  * normally be -1, indicating "currently running CPU".  It may specify
 2121  * a CPU only if that CPU is a no-CBs CPU.  Currently, only _rcu_barrier()
 2122  * is expected to specify a CPU.
 2123  */
 2124 static void
 2125 __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
 2126            struct rcu_state *rsp, int cpu, bool lazy)
 2127 {
 2128         unsigned long flags;
 2129         struct rcu_data *rdp;
 2130 
 2131         WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */
 2132         debug_rcu_head_queue(head);
 2133         head->func = func;
 2134         head->next = NULL;
 2135 
 2136         /*
 2137          * Opportunistically note grace-period endings and beginnings.
 2138          * Note that we might see a beginning right after we see an
 2139          * end, but never vice versa, since this CPU has to pass through
 2140          * a quiescent state betweentimes.
 2141          */
 2142         local_irq_save(flags);
 2143         rdp = this_cpu_ptr(rsp->rda);
 2144 
 2145         /* Add the callback to our list. */
 2146         if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) {
 2147                 int offline;
 2148 
 2149                 if (cpu != -1)
 2150                         rdp = per_cpu_ptr(rsp->rda, cpu);
 2151                 offline = !__call_rcu_nocb(rdp, head, lazy);
 2152                 WARN_ON_ONCE(offline);
 2153                 /* __call_rcu() is illegal on an offline CPU; leak the callback. */
 2154                 local_irq_restore(flags);
 2155                 return;
 2156         }
 2157         ACCESS_ONCE(rdp->qlen)++;
 2158         if (lazy)
 2159                 rdp->qlen_lazy++;
 2160         else
 2161                 rcu_idle_count_callbacks_posted();
 2162         smp_mb();  /* Count before adding callback for rcu_barrier(). */
 2163         *rdp->nxttail[RCU_NEXT_TAIL] = head;
 2164         rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
 2165 
 2166         if (__is_kfree_rcu_offset((unsigned long)func))
 2167                 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
 2168                                          rdp->qlen_lazy, rdp->qlen);
 2169         else
 2170                 trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen);
 2171 
 2172         /* Go handle any RCU core processing required. */
 2173         __call_rcu_core(rsp, rdp, head, flags);
 2174         local_irq_restore(flags);
 2175 }
 2176 
 2177 /*
 2178  * Queue an RCU-sched callback for invocation after a grace period.
 2179  */
 2180 void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 2181 {
 2182         __call_rcu(head, func, &rcu_sched_state, -1, 0);
 2183 }
 2184 EXPORT_SYMBOL_GPL(call_rcu_sched);
 2185 
 2186 /*
 2187  * Queue an RCU callback for invocation after a quicker grace period.
 2188  */
 2189 void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 2190 {
 2191         __call_rcu(head, func, &rcu_bh_state, -1, 0);
 2192 }
 2193 EXPORT_SYMBOL_GPL(call_rcu_bh);
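
/*
 * [Editorial sketch -- not part of the original rcutree.c.]  A typical
 * caller of call_rcu_sched() embeds an rcu_head in its own structure and
 * frees that structure from the callback.  The foo type and foo_delete()
 * are invented for illustration and assume <linux/slab.h> for kfree().
 */
struct foo {
        int key;
        struct rcu_head rh;
};

static void foo_free_rcu(struct rcu_head *rhp)
{
        struct foo *fp = container_of(rhp, struct foo, rh);

        kfree(fp);      /* safe: all pre-existing readers have finished */
}

static void foo_delete(struct foo *fp)
{
        /* ... unlink fp from its data structure under the update-side lock ... */
        call_rcu_sched(&fp->rh, foo_free_rcu);
}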
 2194 
 2195 /*
 2196  * Because a context switch is a grace period for RCU-sched and RCU-bh,
 2197  * any blocking grace-period wait automatically implies a grace period
 2198  * if there is only one CPU online at any point in time during execution
 2199  * of either synchronize_sched() or synchronize_rcu_bh().  It is OK to
 2200  * occasionally incorrectly indicate that there are multiple CPUs online
 2201  * when there was in fact only one the whole time, as this just adds
 2202  * some overhead: RCU still operates correctly.
 2203  */
 2204 static inline int rcu_blocking_is_gp(void)
 2205 {
 2206         int ret;
 2207 
 2208         might_sleep();  /* Check for RCU read-side critical section. */
 2209         preempt_disable();
 2210         ret = num_online_cpus() <= 1;
 2211         preempt_enable();
 2212         return ret;
 2213 }
 2214 
 2215 /**
 2216  * synchronize_sched - wait until an rcu-sched grace period has elapsed.
 2217  *
 2218  * Control will return to the caller some time after a full rcu-sched
 2219  * grace period has elapsed, in other words after all currently executing
 2220  * rcu-sched read-side critical sections have completed.   These read-side
 2221  * critical sections are delimited by rcu_read_lock_sched() and
 2222  * rcu_read_unlock_sched(), and may be nested.  Note that preempt_disable(),
 2223  * local_irq_disable(), and so on may be used in place of
 2224  * rcu_read_lock_sched().
 2225  *
 2226  * This means that all preempt_disable code sequences, including NMI and
 2227  * non-threaded hardware-interrupt handlers, in progress on entry will
 2228  * have completed before this primitive returns.  However, this does not
 2229  * guarantee that softirq handlers will have completed, since in some
 2230  * kernels, these handlers can run in process context, and can block.
 2231  *
 2232  * Note that this guarantee implies further memory-ordering guarantees.
 2233  * On systems with more than one CPU, when synchronize_sched() returns,
 2234  * each CPU is guaranteed to have executed a full memory barrier since the
 2235  * end of its last RCU-sched read-side critical section whose beginning
 2236  * preceded the call to synchronize_sched().  In addition, each CPU having
 2237  * an RCU read-side critical section that extends beyond the return from
 2238  * synchronize_sched() is guaranteed to have executed a full memory barrier
 2239  * after the beginning of synchronize_sched() and before the beginning of
 2240  * that RCU read-side critical section.  Note that these guarantees include
 2241  * CPUs that are offline, idle, or executing in user mode, as well as CPUs
 2242  * that are executing in the kernel.
 2243  *
 2244  * Furthermore, if CPU A invoked synchronize_sched(), which returned
 2245  * to its caller on CPU B, then both CPU A and CPU B are guaranteed
 2246  * to have executed a full memory barrier during the execution of
 2247  * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but
 2248  * again only if the system has more than one CPU).
 2249  *
 2250  * This primitive provides the guarantees made by the (now removed)
 2251  * synchronize_kernel() API.  In contrast, synchronize_rcu() only
 2252  * guarantees that rcu_read_lock() sections will have completed.
 2253  * In "classic RCU", these two guarantees happen to be one and
 2254  * the same, but can differ in realtime RCU implementations.
 2255  */
 2256 void synchronize_sched(void)
 2257 {
 2258         rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
 2259                            !lock_is_held(&rcu_lock_map) &&
 2260                            !lock_is_held(&rcu_sched_lock_map),
 2261                            "Illegal synchronize_sched() in RCU-sched read-side critical section");
 2262         if (rcu_blocking_is_gp())
 2263                 return;
 2264         if (rcu_expedited)
 2265                 synchronize_sched_expedited();
 2266         else
 2267                 wait_rcu_gp(call_rcu_sched);
 2268 }
 2269 EXPORT_SYMBOL_GPL(synchronize_sched);
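
/*
 * [Editorial sketch -- not part of the original rcutree.c.]  A caller that
 * may block can skip the callback entirely and wait in place.  This reuses
 * the invented foo type from the sketch above; foo_global is likewise
 * hypothetical.
 */
static struct foo __rcu *foo_global;

static void foo_replace(struct foo *newp)
{
        struct foo *oldp;

        oldp = rcu_dereference_protected(foo_global, 1); /* update side only */
        rcu_assign_pointer(foo_global, newp);   /* publish the new version */
        synchronize_sched();    /* wait out all rcu_read_lock_sched() readers */
        kfree(oldp);            /* no reader can still reference oldp */
}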
 2270 
 2271 /**
 2272  * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
 2273  *
 2274  * Control will return to the caller some time after a full rcu_bh grace
 2275  * period has elapsed, in other words after all currently executing rcu_bh
 2276  * read-side critical sections have completed.  RCU read-side critical
 2277  * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
 2278  * and may be nested.
 2279  *
 2280  * See the description of synchronize_sched() for more detailed information
 2281  * on memory ordering guarantees.
 2282  */
 2283 void synchronize_rcu_bh(void)
 2284 {
 2285         rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
 2286                            !lock_is_held(&rcu_lock_map) &&
 2287                            !lock_is_held(&rcu_sched_lock_map),
 2288                            "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
 2289         if (rcu_blocking_is_gp())
 2290                 return;
 2291         if (rcu_expedited)
 2292                 synchronize_rcu_bh_expedited();
 2293         else
 2294                 wait_rcu_gp(call_rcu_bh);
 2295 }
 2296 EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
 2297 
 2298 static int synchronize_sched_expedited_cpu_stop(void *data)
 2299 {
 2300         /*
 2301          * There must be a full memory barrier on each affected CPU
 2302          * between the time that try_stop_cpus() is called and the
 2303          * time that it returns.
 2304          *
 2305          * In the current initial implementation of cpu_stop, the
 2306          * above condition is already met when the control reaches
 2307          * this point and the following smp_mb() is not strictly
 2308          * necessary.  Do smp_mb() anyway for documentation and
 2309          * robustness against future implementation changes.
 2310          */
 2311         smp_mb(); /* See above comment block. */
 2312         return 0;
 2313 }
 2314 
 2315 /**
 2316  * synchronize_sched_expedited - Brute-force RCU-sched grace period
 2317  *
 2318  * Wait for an RCU-sched grace period to elapse, but use a "big hammer"
 2319  * approach to force the grace period to end quickly.  This consumes
 2320  * significant time on all CPUs and is unfriendly to real-time workloads,
 2321  * so is thus not recommended for any sort of common-case code.  In fact,
 2322  * so it is not recommended for any sort of common-case code.  In fact,
 2323  * restructure your code to batch your updates, and then use a single
 2324  * synchronize_sched() instead.
 2325  *
 2326  * Note that it is illegal to call this function while holding any lock
 2327  * that is acquired by a CPU-hotplug notifier.  And yes, it is also illegal
 2328  * to call this function from a CPU-hotplug notifier.  Failing to observe
 2329  * these restrictions will result in deadlock.
 2330  *
 2331  * This implementation can be thought of as an application of ticket
 2332  * locking to RCU, with the rcu_state structure's ->expedited_start and
 2333  * ->expedited_done counters taking on the roles of the halves
 2334  * of the ticket-lock word.  Each task atomically increments
 2335  * ->expedited_start upon entry, snapshotting the resulting value,
 2336  * then attempts to stop all the CPUs.  If this succeeds, then each
 2337  * CPU will have executed a context switch, resulting in an RCU-sched
 2338  * grace period.  We are then done, so we use atomic_long_cmpxchg() to
 2339  * update ->expedited_done to match our snapshot -- but
 2340  * only if someone else has not already advanced past our snapshot.
 2341  *
 2342  * On the other hand, if try_stop_cpus() fails, we check the value
 2343  * of ->expedited_done.  If it has advanced past our
 2344  * initial snapshot, then someone else must have forced a grace period
 2345  * some time after we took our snapshot.  In this case, our work is
 2346  * done for us, and we can simply return.  Otherwise, we try again,
 2347  * but keep our initial snapshot for purposes of checking for someone
 2348  * doing our work for us.
 2349  *
 2350  * If we fail too many times in a row, we fall back to synchronize_sched().
 2351  */
 2352 void synchronize_sched_expedited(void)
 2353 {
 2354         long firstsnap, s, snap;
 2355         int trycount = 0;
 2356         struct rcu_state *rsp = &rcu_sched_state;
 2357 
 2358         /*
 2359          * If we are in danger of counter wrap, just do synchronize_sched().
 2360          * By allowing ->expedited_start to advance no more than
 2361          * ULONG_MAX/8 ahead of ->expedited_done, we are ensuring
 2362          * that more than 3.5 billion CPUs would be required to force a
 2363          * counter wrap on a 32-bit system.  Quite a few more CPUs would of
 2364          * course be required on a 64-bit system.
 2365          */
 2366         if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
 2367                          (ulong)atomic_long_read(&rsp->expedited_done) +
 2368                          ULONG_MAX / 8)) {
 2369                 synchronize_sched();
 2370                 atomic_long_inc(&rsp->expedited_wrap);
 2371                 return;
 2372         }
 2373 
 2374         /*
 2375          * Take a ticket.  Note that atomic_inc_return() implies a
 2376          * full memory barrier.
 2377          */
 2378         snap = atomic_long_inc_return(&rsp->expedited_start);
 2379         firstsnap = snap;
 2380         get_online_cpus();
 2381         WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
 2382 
 2383         /*
 2384          * Each pass through the following loop attempts to force a
 2385          * context switch on each CPU.
 2386          */
 2387         while (try_stop_cpus(cpu_online_mask,
 2388                              synchronize_sched_expedited_cpu_stop,
 2389                              NULL) == -EAGAIN) {
 2390                 put_online_cpus();
 2391                 atomic_long_inc(&rsp->expedited_tryfail);
 2392 
 2393                 /* Check to see if someone else did our work for us. */
 2394                 s = atomic_long_read(&rsp->expedited_done);
 2395                 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
 2396                         /* ensure test happens before caller kfree */
 2397                         smp_mb__before_atomic_inc(); /* ^^^ */
 2398                         atomic_long_inc(&rsp->expedited_workdone1);
 2399                         return;
 2400                 }
 2401 
 2402                 /* No joy, try again later.  Or just synchronize_sched(). */
 2403                 if (trycount++ < 10) {
 2404                         udelay(trycount * num_online_cpus());
 2405                 } else {
 2406                         wait_rcu_gp(call_rcu_sched);
 2407                         atomic_long_inc(&rsp->expedited_normal);
 2408                         return;
 2409                 }
 2410 
 2411                 /* Recheck to see if someone else did our work for us. */
 2412                 s = atomic_long_read(&rsp->expedited_done);
 2413                 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
 2414                         /* ensure test happens before caller kfree */
 2415                         smp_mb__before_atomic_inc(); /* ^^^ */
 2416                         atomic_long_inc(&rsp->expedited_workdone2);
 2417                         return;
 2418                 }
 2419 
 2420                 /*
 2421                  * Refetching ->expedited_start allows later
 2422                  * callers to piggyback on our grace period.  We retry
 2423                  * after they started, so our grace period works for them,
 2424                  * and they started after our first try, so their grace
 2425                  * period works for us.
 2426                  */
 2427                 get_online_cpus();
 2428                 snap = atomic_long_read(&rsp->expedited_start);
 2429                 smp_mb(); /* ensure read is before try_stop_cpus(). */
 2430         }
 2431         atomic_long_inc(&rsp->expedited_stoppedcpus);
 2432 
 2433         /*
 2434          * Everyone up to our most recent fetch is covered by our grace
 2435          * period.  Update the counter, but only if our work is still
 2436          * relevant -- which it won't be if someone who started later
 2437          * than we did already did their update.
 2438          */
 2439         do {
 2440                 atomic_long_inc(&rsp->expedited_done_tries);
 2441                 s = atomic_long_read(&rsp->expedited_done);
 2442                 if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
 2443                         /* ensure test happens before caller kfree */
 2444                         smp_mb__before_atomic_inc(); /* ^^^ */
 2445                         atomic_long_inc(&rsp->expedited_done_lost);
 2446                         break;
 2447                 }
 2448         } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
 2449         atomic_long_inc(&rsp->expedited_done_exit);
 2450 
 2451         put_online_cpus();
 2452 }
 2453 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
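
/*
 * [Editorial sketch -- not part of the original rcutree.c.]  The ticket
 * scheme above, reduced to two invented counters: take a ticket, and later
 * advance the "done" counter only if nobody else has already advanced it
 * past our ticket.
 */
static atomic_long_t demo_start;
static atomic_long_t demo_done;

static void demo_expedite(void (*force_event)(void))
{
        long snap = atomic_long_inc_return(&demo_start);        /* our ticket */
        long s;

        for (;;) {
                s = atomic_long_read(&demo_done);
                if (ULONG_CMP_GE((ulong)s, (ulong)snap))
                        return;         /* someone else covered our request */
                force_event();          /* do the expensive work ourselves */
                if (atomic_long_cmpxchg(&demo_done, s, snap) == s)
                        return;         /* we recorded the completion */
                /* Lost the race; recheck whether our ticket is now covered. */
        }
}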
 2454 
 2455 /*
 2456  * Check to see if there is any immediate RCU-related work to be done
 2457  * by the current CPU, for the specified type of RCU, returning 1 if so.
 2458  * The checks are in order of increasing expense: checks that can be
 2459  * carried out against CPU-local state are performed first.  However,
 2460  * we must check for CPU stalls first, else we might not get a chance.
 2461  */
 2462 static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 2463 {
 2464         struct rcu_node *rnp = rdp->mynode;
 2465 
 2466         rdp->n_rcu_pending++;
 2467 
 2468         /* Check for CPU stalls, if enabled. */
 2469         check_cpu_stall(rsp, rdp);
 2470 
 2471         /* Is the RCU core waiting for a quiescent state from this CPU? */
 2472         if (rcu_scheduler_fully_active &&
 2473             rdp->qs_pending && !rdp->passed_quiesce) {
 2474                 rdp->n_rp_qs_pending++;
 2475         } else if (rdp->qs_pending && rdp->passed_quiesce) {
 2476                 rdp->n_rp_report_qs++;
 2477                 return 1;
 2478         }
 2479 
 2480         /* Does this CPU have callbacks ready to invoke? */
 2481         if (cpu_has_callbacks_ready_to_invoke(rdp)) {
 2482                 rdp->n_rp_cb_ready++;
 2483                 return 1;
 2484         }
 2485 
 2486         /* Has RCU gone idle with this CPU needing another grace period? */
 2487         if (cpu_needs_another_gp(rsp, rdp)) {
 2488                 rdp->n_rp_cpu_needs_gp++;
 2489                 return 1;
 2490         }
 2491 
 2492         /* Has another RCU grace period completed?  */
 2493         if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
 2494                 rdp->n_rp_gp_completed++;
 2495                 return 1;
 2496         }
 2497 
 2498         /* Has a new RCU grace period started? */
 2499         if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */
 2500                 rdp->n_rp_gp_started++;
 2501                 return 1;
 2502         }
 2503 
 2504         /* nothing to do */
 2505         rdp->n_rp_need_nothing++;
 2506         return 0;
 2507 }
 2508 
 2509 /*
 2510  * Check to see if there is any immediate RCU-related work to be done
 2511  * by the current CPU, returning 1 if so.  This function is part of the
 2512  * RCU implementation; it is -not- an exported member of the RCU API.
 2513  */
 2514 static int rcu_pending(int cpu)
 2515 {
 2516         struct rcu_state *rsp;
 2517 
 2518         for_each_rcu_flavor(rsp)
 2519                 if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu)))
 2520                         return 1;
 2521         return 0;
 2522 }
 2523 
 2524 /*
 2525  * Check to see if any future RCU-related work will need to be done
 2526  * by the current CPU, even if none need be done immediately, returning
 2527  * 1 if so.
 2528  */
 2529 static int rcu_cpu_has_callbacks(int cpu)
 2530 {
 2531         struct rcu_state *rsp;
 2532 
 2533         /* RCU callbacks either ready or pending? */
 2534         for_each_rcu_flavor(rsp)
 2535                 if (per_cpu_ptr(rsp->rda, cpu)->nxtlist)
 2536                         return 1;
 2537         return 0;
 2538 }
 2539 
 2540 /*
 2541  * Helper function for _rcu_barrier() tracing.  If tracing is disabled,
 2542  * the compiler is expected to optimize this away.
 2543  */
 2544 static void _rcu_barrier_trace(struct rcu_state *rsp, char *s,
 2545                                int cpu, unsigned long done)
 2546 {
 2547         trace_rcu_barrier(rsp->name, s, cpu,
 2548                           atomic_read(&rsp->barrier_cpu_count), done);
 2549 }
 2550 
 2551 /*
 2552  * RCU callback function for _rcu_barrier().  If we are last, wake
 2553  * up the task executing _rcu_barrier().
 2554  */
 2555 static void rcu_barrier_callback(struct rcu_head *rhp)
 2556 {
 2557         struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head);
 2558         struct rcu_state *rsp = rdp->rsp;
 2559 
 2560         if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
 2561                 _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done);
 2562                 complete(&rsp->barrier_completion);
 2563         } else {
 2564                 _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done);
 2565         }
 2566 }
 2567 
 2568 /*
 2569  * Called with preemption disabled, and from cross-cpu IRQ context.
 2570  */
 2571 static void rcu_barrier_func(void *type)
 2572 {
 2573         struct rcu_state *rsp = type;
 2574         struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
 2575 
 2576         _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done);
 2577         atomic_inc(&rsp->barrier_cpu_count);
 2578         rsp->call(&rdp->barrier_head, rcu_barrier_callback);
 2579 }
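
/*
 * [Editorial sketch -- not part of the original rcutree.c.]  _rcu_barrier()
 * below pairs an atomic count with a completion, starting the count at 1 so
 * the completion cannot fire while CPUs are still being enlisted.  A generic
 * version of that pattern, with invented demo_* names:
 */
static atomic_t demo_count;
static struct completion demo_all_done;

static void demo_piece_finished(void)           /* drops one reference */
{
        if (atomic_dec_and_test(&demo_count))
                complete(&demo_all_done);
}

static void demo_wait_for_pieces(int npieces)
{
        int i;

        init_completion(&demo_all_done);
        atomic_set(&demo_count, 1);             /* hold our own reference */
        for (i = 0; i < npieces; i++) {
                atomic_inc(&demo_count);
                /* ... dispatch piece i; it ends by calling demo_piece_finished() ... */
        }
        demo_piece_finished();                  /* drop our own reference */
        wait_for_completion(&demo_all_done);    /* all pieces have finished */
}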
 2580 
 2581 /*
 2582  * Orchestrate the specified type of RCU barrier, waiting for all
 2583  * RCU callbacks of the specified type to complete.
 2584  */
 2585 static void _rcu_barrier(struct rcu_state *rsp)
 2586 {
 2587         int cpu;
 2588         struct rcu_data *rdp;
 2589         unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
 2590         unsigned long snap_done;
 2591 
 2592         _rcu_barrier_trace(rsp, "Begin", -1, snap);
 2593 
 2594         /* Take mutex to serialize concurrent rcu_barrier() requests. */
 2595         mutex_lock(&rsp->barrier_mutex);
 2596 
 2597         /*
 2598          * Ensure that all prior references, including to ->n_barrier_done,
 2599          * are ordered before the _rcu_barrier() machinery.
 2600          */
 2601         smp_mb();  /* See above block comment. */
 2602 
 2603         /*
 2604          * Recheck ->n_barrier_done to see if others did our work for us.
 2605          * This means checking ->n_barrier_done for an even-to-odd-to-even
 2606          * transition.  The "if" expression below therefore rounds the old
 2607          * value up to the next even number and adds two before comparing.
 2608          */
 2609         snap_done = ACCESS_ONCE(rsp->n_barrier_done);
 2610         _rcu_barrier_trace(rsp, "Check", -1, snap_done);
 2611         if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) {
 2612                 _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
 2613                 smp_mb(); /* caller's subsequent code after above check. */
 2614                 mutex_unlock(&rsp->barrier_mutex);
 2615                 return;
 2616         }
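        /*
         * Worked example of the check above (values illustrative only):
         * with snap == 4 (even: no barrier in flight at our snapshot),
         * ((4 + 1) & ~0x1) + 2 == 6, so the early exit is taken only if
         * some other _rcu_barrier() has pushed ->n_barrier_done through
         * 5 (its Inc1) to 6 (its Inc2) entirely after our snapshot.  With
         * snap == 5 (odd: a barrier was already in flight), the threshold
         * becomes 8, since the in-flight barrier's completion at 6 might
         * not cover callbacks queued before our caller was invoked.
         */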
 2617 
 2618         /*
 2619          * Increment ->n_barrier_done to avoid duplicate work.  Use
 2620          * ACCESS_ONCE() to prevent the compiler from speculatively
 2621          * moving the increment ahead of the early-exit check.
 2622          */
 2623         ACCESS_ONCE(rsp->n_barrier_done)++;
 2624         WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
 2625         _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
 2626         smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
 2627 
 2628         /*
 2629          * Initialize the count to one rather than to zero in order to
 2630          * avoid a too-soon return to zero in case of a short grace period
 2631          * (or preemption of this task).  Exclude CPU-hotplug operations
 2632          * to ensure that no offline CPU has callbacks queued.
 2633          */
 2634         init_completion(&rsp->barrier_completion);
 2635         atomic_set(&rsp->barrier_cpu_count, 1);
 2636         get_online_cpus();
 2637 
 2638         /*
 2639          * Force each CPU with callbacks to register a new callback.
 2640          * When that callback is invoked, we will know that all of the
 2641          * corresponding CPU's preceding callbacks have been invoked.
 2642          */
 2643         for_each_possible_cpu(cpu) {
 2644                 if (!cpu_online(cpu) && !is_nocb_cpu(cpu))
 2645                         continue;
 2646                 rdp = per_cpu_ptr(rsp->rda, cpu);
 2647                 if (is_nocb_cpu(cpu)) {
 2648                         _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
 2649                                            rsp->n_barrier_done);
 2650                         atomic_inc(&rsp->barrier_cpu_count);
 2651                         __call_rcu(&rdp->barrier_head, rcu_barrier_callback,
 2652                                    rsp, cpu, 0);
 2653                 } else if (ACCESS_ONCE(rdp->qlen)) {
 2654                         _rcu_barrier_trace(rsp, "OnlineQ", cpu,
 2655                                            rsp->n_barrier_done);
 2656                         smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
 2657                 } else {
 2658                         _rcu_barrier_trace(rsp, "OnlineNQ", cpu,
 2659                                            rsp->n_barrier_done);
 2660                 }
 2661         }
 2662         put_online_cpus();
 2663 
 2664         /*
 2665          * Now that we have a counted rcu_barrier_callback() callback on
 2666          * each CPU that needs one, remove the initial count.
 2667          */
 2668         if (atomic_dec_and_test(&rsp->barrier_cpu_count))
 2669                 complete(&rsp->barrier_completion);
 2670 
 2671         /* Increment ->n_barrier_done to prevent duplicate work. */
 2672         smp_mb(); /* Keep increment after above mechanism. */
 2673         ACCESS_ONCE(rsp->n_barrier_done)++;
 2674         WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
 2675         _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
 2676         smp_mb(); /* Keep increment before caller's subsequent code. */
 2677 
 2678         /* Wait for all rcu_barrier_callback() callbacks to be invoked. */
 2679         wait_for_completion(&rsp->barrier_completion);
 2680 
 2681         /* Other rcu_barrier() invocations can now safely proceed. */
 2682         mutex_unlock(&rsp->barrier_mutex);
 2683 }
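The initialize-to-one trick used by _rcu_barrier() above (bias ->barrier_cpu_count to 1, add one per posted callback, drop the bias only after everything has been posted) is what keeps a quickly-invoked callback from declaring the barrier complete too soon. Below is a minimal user-space sketch of the same bookkeeping, using C11 atomics in place of atomic_t and struct completion; the names and the synchronous loop that stands in for callback invocation are purely illustrative and not taken from this file.

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int  count;               /* plays the role of ->barrier_cpu_count */
static atomic_bool done;                /* plays the role of ->barrier_completion */

static void barrier_cb(void)            /* models rcu_barrier_callback() */
{
        if (atomic_fetch_sub(&count, 1) == 1)   /* last one out... */
                atomic_store(&done, true);      /* ...signals the waiter */
}

int main(void)
{
        int posted = 0;

        atomic_store(&count, 1);                /* the bias: cannot hit zero early */
        for (int cpu = 0; cpu < 4; cpu++) {     /* "post" one callback per busy CPU */
                atomic_fetch_add(&count, 1);
                posted++;
        }
        for (int i = 0; i < posted; i++)        /* callbacks run sometime later... */
                barrier_cb();
        if (atomic_fetch_sub(&count, 1) == 1)   /* ...then the initial bias is dropped */
                atomic_store(&done, true);
        assert(atomic_load(&done));             /* barrier has completed */
        return 0;
}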
 2684 
 2685 /**
 2686  * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
 2687  */
 2688 void rcu_barrier_bh(void)
 2689 {
 2690         _rcu_barrier(&rcu_bh_state);
 2691 }
 2692 EXPORT_SYMBOL_GPL(rcu_barrier_bh);
 2693 
 2694 /**
 2695  * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
 2696  */
 2697 void rcu_barrier_sched(void)
 2698 {
 2699         _rcu_barrier(&rcu_sched_state);
 2700 }
 2701 EXPORT_SYMBOL_GPL(rcu_barrier_sched);
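A typical user of these exported barriers is a module-unload path: once the module has unhooked itself so that no further callbacks can be posted, the matching rcu_barrier*() call guarantees that callbacks already posted with the corresponding call_rcu*() variant have finished, after which their memory (and the callback code itself) may safely go away. The following is a hedged sketch of that idiom for a hypothetical module; none of it comes from this file.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
        struct rcu_head rcu;
        int data;
};

static void foo_reclaim(struct rcu_head *rhp)
{
        kfree(container_of(rhp, struct foo, rcu));  /* runs after a grace period */
}

static void foo_del(struct foo *victim)
{
        /* Readers may still hold references; defer the free. */
        call_rcu(&victim->rcu, foo_reclaim);
}

static void __exit foo_exit(void)
{
        /* 1. Unhook everything so no new foo_del() calls can happen.        */
        /* 2. Wait for every foo_reclaim() already posted to finish running. */
        rcu_barrier();
        /* 3. Only now is it safe for this module's code and data to vanish. */
}
module_exit(foo_exit);
MODULE_LICENSE("GPL");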
 2702 
 2703 /*
 2704  * Do boot-time initialization of a CPU's per-CPU RCU data.
 2705  */
 2706 static void __init
 2707 rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
 2708 {
 2709         unsigned long flags;
 2710         struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 2711         struct rcu_node *rnp = rcu_get_root(rsp);
 2712 
 2713         /* Set up local state, ensuring consistent view of global state. */
 2714         raw_spin_lock_irqsave(&rnp->lock, flags);
 2715         rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
 2716         init_callback_list(rdp);
 2717         rdp->qlen_lazy = 0;
 2718         ACCESS_ONCE(rdp->qlen) = 0;
 2719         rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
 2720         WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
 2721         WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
 2722 #ifdef CONFIG_RCU_USER_QS
 2723         WARN_ON_ONCE(rdp->dynticks->in_user);
 2724 #endif
 2725         rdp->cpu = cpu;
 2726         rdp->rsp = rsp;
 2727         rcu_boot_init_nocb_percpu_data(rdp);
 2728         raw_spin_unlock_irqrestore(&rnp->lock, flags);
 2729 }
 2730 
 2731 /*
 2732  * Initialize a CPU's per-CPU RCU data.  Note that only one online or
 2733  * offline event can be happening at a given time.  Note also that we
 2734  * can accept some slop in the rsp->completed access due to the fact
 2735  * that this CPU cannot possibly have any RCU callbacks in flight yet.
 2736  */
 2737 static void __cpuinit
 2738 rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
 2739 {
 2740         unsigned long flags;
 2741         unsigned long mask;
 2742         struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 2743         struct rcu_node *rnp = rcu_get_root(rsp);
 2744 
 2745         /* Exclude new grace periods. */
 2746         mutex_lock(&rsp->onoff_mutex);
 2747 
 2748         /* Set up local state, ensuring consistent view of global state. */
 2749         raw_spin_lock_irqsave(&rnp->lock, flags);
 2750         rdp->beenonline = 1;     /* We have now been online. */
 2751         rdp->preemptible = preemptible;
 2752         rdp->qlen_last_fqs_check = 0;
 2753         rdp->n_force_qs_snap = rsp->n_force_qs;
 2754         rdp->blimit = blimit;
 2755         init_callback_list(rdp);  /* Re-enable callbacks on this CPU. */
 2756         rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
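        /*
         * Force ->dynticks to an odd value (for example, 4 -> 5 or 7 -> 7):
         * by convention an odd value means "not in dyntick-idle", so RCU
         * treats the incoming CPU as non-idle from the outset.
         */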
 2757         atomic_set(&rdp->dynticks->dynticks,
 2758                    (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
 2759         rcu_prepare_for_idle_init(cpu);
 2760         raw_spin_unlock(&rnp->lock);            /* irqs remain disabled. */
 2761 
 2762         /* Add CPU to rcu_node bitmasks. */
 2763         rnp = rdp->mynode;
 2764         mask = rdp->grpmask;
 2765         do {
 2766                 /* Exclude any attempts to start a new GP on small systems. */
 2767                 raw_spin_lock(&rnp->lock);      /* irqs already disabled. */
 2768                 rnp->qsmaskinit |= mask;
 2769                 mask = rnp->grpmask;
 2770                 if (rnp == rdp->mynode) {
 2771                         /*
 2772                          * If there is a grace period in progress, we will
 2773                          * set up to wait for it next time we run the
 2774                          * RCU core code.
 2775                          */
 2776                         rdp->gpnum = rnp->completed;
 2777                         rdp->completed = rnp->completed;
 2778                         rdp->passed_quiesce = 0;
 2779                         rdp->qs_pending = 0;
 2780                         trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl");
 2781                 }
 2782                 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
 2783                 rnp = rnp->parent;
 2784         } while (rnp != NULL && !(rnp->qsmaskinit & mask));
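        /*
         * The loop above climbs from this CPU's leaf rcu_node toward the
         * root, setting the appropriate bit in ->qsmaskinit at each level,
         * and stops once it reaches a level where the bit is already set
         * (some other CPU in that subtree came online earlier).
         */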
 2785         local_irq_restore(flags);
 2786 
 2787         mutex_unlock(&rsp->onoff_mutex);
 2788 }
 2789 
 2790 static void __cpuinit rcu_prepare_cpu(int cpu)
 2791 {
 2792         struct rcu_state *rsp;
 2793 
 2794         for_each_rcu_flavor(rsp)
 2795                 rcu_init_percpu_data(cpu, rsp,
 2796                                      strcmp(rsp->name, "rcu_preempt") == 0);
 2797 }
 2798 
 2799 /*
 2800  * Handle CPU online/offline notification events.
 2801  */
 2802 static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 2803                                     unsigned long action, void *hcpu)
 2804 {
 2805         long cpu = (long)hcpu;
 2806         struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
 2807         struct rcu_node *rnp = rdp->mynode;
 2808         struct rcu_state *rsp;
 2809         int ret = NOTIFY_OK;
 2810 
 2811         trace_rcu_utilization("Start CPU hotplug");
 2812         switch (action) {
 2813         case CPU_UP_PREPARE:
 2814         case CPU_UP_PREPARE_FROZEN:
 2815                 rcu_prepare_cpu(cpu);
 2816                 rcu_prepare_kthreads(cpu);
 2817                 break;
 2818         case CPU_ONLINE:
 2819         case CPU_DOWN_FAILED:
 2820                 rcu_boost_kthread_setaffinity(rnp, -1);
 2821                 break;
 2822         case CPU_DOWN_PREPARE:
 2823                 if (nocb_cpu_expendable(cpu))
 2824                         rcu_boost_kthread_setaffinity(rnp, cpu);
 2825                 else
 2826                         ret = NOTIFY_BAD;
 2827                 break;
 2828         case CPU_DYING:
 2829         case CPU_DYING_FROZEN:
 2830                 /*
 2831                  * The whole machine is "stopped" except this CPU, so we can
 2832                  * touch any data without introducing corruption. We send the
 2833                  * dying CPU's callbacks to an arbitrarily chosen online CPU.
 2834                  */
 2835                 for_each_rcu_flavor(rsp)
 2836                         rcu_cleanup_dying_cpu(rsp);
 2837                 rcu_cleanup_after_idle(cpu);
 2838                 break;
 2839         case CPU_DEAD:
 2840         case CPU_DEAD_FROZEN:
 2841         case CPU_UP_CANCELED:
 2842         case CPU_UP_CANCELED_FROZEN:
 2843                 for_each_rcu_flavor(rsp)
 2844                         rcu_cleanup_dead_cpu(cpu, rsp);
 2845                 break;
 2846         default:
 2847                 break;
 2848         }
 2849         trace_rcu_utilization("End CPU hotplug");
 2850         return ret;
 2851 }
 2852 
 2853 /*
 2854  * Spawn the kthread that handles this RCU flavor's grace periods.
 2855  */
 2856 static int __init rcu_spawn_gp_kthread(void)
 2857 {
 2858         unsigned long flags;
 2859         struct rcu_node *rnp;
 2860         struct rcu_state *rsp;
 2861         struct task_struct *t;
 2862 
 2863         for_each_rcu_flavor(rsp) {
 2864                 t = kthread_run(rcu_gp_kthread, rsp, rsp->name);
 2865                 BUG_ON(IS_ERR(t));
 2866                 rnp = rcu_get_root(rsp);
 2867                 raw_spin_lock_irqsave(&rnp->lock, flags);
 2868                 rsp->gp_kthread = t;
 2869                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
 2870                 rcu_spawn_nocb_kthreads(rsp);
 2871         }
 2872         return 0;
 2873 }
 2874 early_initcall(rcu_spawn_gp_kthread);
 2875 
 2876 /*
 2877  * This function is invoked towards the end of the scheduler's initialization
 2878  * process.  Before this is called, the idle task might contain
 2879  * RCU read-side critical sections (during which time, this idle
 2880  * task is booting the system).  After this function is called, the
 2881  * idle tasks are prohibited from containing RCU read-side critical
 2882  * sections.  This function also enables RCU lockdep checking.
 2883  */
 2884 void rcu_scheduler_starting(void)
 2885 {
 2886         WARN_ON(num_online_cpus() != 1);
 2887         WARN_ON(nr_context_switches() > 0);
 2888         rcu_scheduler_active = 1;
 2889 }
 2890 
 2891 /*
 2892  * Compute the per-level fanout, either using the exact fanout specified
 2893  * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
 2894  */
 2895 #ifdef CONFIG_RCU_FANOUT_EXACT
 2896 static void __init rcu_init_levelspread(struct rcu_state *rsp)
 2897 {
 2898         int i;
 2899 
 2900         for (i = rcu_num_lvls - 1; i > 0; i--)
 2901                 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
 2902         rsp->levelspread[0] = rcu_fanout_leaf;
 2903 }
 2904 #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
 2905 static void __init rcu_init_levelspread(struct rcu_state *rsp)
 2906 {
 2907         int ccur;
 2908         int cprv;
 2909         int i;
 2910 
 2911         cprv = nr_cpu_ids;
 2912         for (i = rcu_num_lvls - 1; i >= 0; i--) {
 2913                 ccur = rsp->levelcnt[i];
 2914                 rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
 2915                 cprv = ccur;
 2916         }
 2917 }
 2918 #endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
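/*
 * Worked example for the tree-balancing (non-EXACT) version: with
 * nr_cpu_ids == 96 and a two-level tree whose ->levelcnt is {1, 6}
 * (one root node, six leaf nodes), the loop computes
 *
 *        levelspread[1] = DIV_ROUND_UP(96, 6) = 16   CPUs per leaf node
 *        levelspread[0] = DIV_ROUND_UP(6, 1)  = 6    leaf nodes under the root
 *
 * i.e. the fanout is balanced across levels rather than pinned at the
 * configured maximum.  (Values chosen for illustration only.)
 */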
 2919 
 2920 /*
 2921  * Helper function for rcu_init() that initializes one rcu_state structure.
 2922  */
 2923 static void __init rcu_init_one(struct rcu_state *rsp,
 2924                 struct rcu_data __percpu *rda)
 2925 {
 2926         static char *buf[] = { "rcu_node_0",
 2927                                "rcu_node_1",
 2928                                "rcu_node_2",
 2929                                "rcu_node_3" };  /* Match MAX_RCU_LVLS */
 2930         static char *fqs[] = { "rcu_node_fqs_0",
 2931                                "rcu_node_fqs_1",
 2932                                "rcu_node_fqs_2",
 2933                                "rcu_node_fqs_3" };  /* Match MAX_RCU_LVLS */
 2934         int cpustride = 1;
 2935         int i;
 2936         int j;
 2937         struct rcu_node *rnp;
 2938 
 2939         BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf));  /* Fix buf[] init! */
 2940 
 2941         /* Initialize the level-tracking arrays. */
 2942 
 2943         for (i = 0; i < rcu_num_lvls; i++)
 2944                 rsp->levelcnt[i] = num_rcu_lvl[i];
 2945         for (i = 1; i < rcu_num_lvls; i++)
 2946                 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
 2947         rcu_init_levelspread(rsp);
 2948 
 2949         /* Initialize the elements themselves, starting from the leaves. */
 2950 
 2951         for (i = rcu_num_lvls - 1; i >= 0; i--) {
 2952                 cpustride *= rsp->levelspread[i];
 2953                 rnp = rsp->level[i];
 2954                 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
 2955                         raw_spin_lock_init(&rnp->lock);
 2956                         lockdep_set_class_and_name(&rnp->lock,
 2957                                                    &rcu_node_class[i], buf[i]);
 2958                         raw_spin_lock_init(&rnp->fqslock);
 2959                         lockdep_set_class_and_name(&rnp->fqslock,
 2960                                                    &rcu_fqs_class[i], fqs[i]);
 2961                         rnp->gpnum = rsp->gpnum;
 2962                         rnp->completed = rsp->completed;
 2963                         rnp->qsmask = 0;
 2964                         rnp->qsmaskinit = 0;
 2965                         rnp->grplo = j * cpustride;
 2966                         rnp->grphi = (j + 1) * cpustride - 1;
 2967                         if (rnp->grphi >= NR_CPUS)
 2968                                 rnp->grphi = NR_CPUS - 1;
 2969                         if (i == 0) {
 2970                                 rnp->grpnum = 0;
 2971                                 rnp->grpmask = 0;
 2972                                 rnp->parent = NULL;
 2973                         } else {
 2974                                 rnp->grpnum = j % rsp->levelspread[i - 1];
 2975                                 rnp->grpmask = 1UL << rnp->grpnum;
 2976                                 rnp->parent = rsp->level[i - 1] +
 2977                                               j / rsp->levelspread[i - 1];
 2978                         }
 2979                         rnp->level = i;
 2980                         INIT_LIST_HEAD(&rnp->blkd_tasks);
 2981                 }
 2982         }
 2983 
 2984         rsp->rda = rda;
 2985         init_waitqueue_head(&rsp->gp_wq);
 2986         rnp = rsp->level[rcu_num_lvls - 1];
 2987         for_each_possible_cpu(i) {
 2988                 while (i > rnp->grphi)
 2989                         rnp++;
 2990                 per_cpu_ptr(rsp->rda, i)->mynode = rnp;
 2991                 rcu_boot_init_percpu_data(i, rsp);
 2992         }
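        /*
         * With the illustrative 96-CPU, two-level geometry discussed above
         * (sixteen CPUs per leaf), the loop above assigns CPUs 0-15 to the
         * first leaf rcu_node, CPUs 16-31 to the second, and so on,
         * advancing rnp whenever the CPU number passes the current leaf's
         * ->grphi.
         */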
 2993         list_add(&rsp->flavors, &rcu_struct_flavors);
 2994 }
 2995 
 2996 /*
 2997  * Compute the rcu_node tree geometry from kernel parameters.  This cannot
 2998  * replace the definitions in rcutree.h because those are needed to size
 2999  * the ->node array in the rcu_state structure.
 3000  */
 3001 static void __init rcu_init_geometry(void)
 3002 {
 3003         int i;
 3004         int j;
 3005         int n = nr_cpu_ids;
 3006         int rcu_capacity[MAX_RCU_LVLS + 1];
 3007 
 3008         /* If the compile-time values are accurate, just leave. */
 3009         if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF &&
 3010             nr_cpu_ids == NR_CPUS)
 3011                 return;
 3012 
 3013         /*
 3014          * Compute the number of CPUs that can be handled by an rcu_node tree
 3015          * with the given number of levels.  Setting rcu_capacity[0] makes
 3016          * some of the arithmetic easier.
 3017          */
 3018         rcu_capacity[0] = 1;
 3019         rcu_capacity[1] = rcu_fanout_leaf;
 3020         for (i = 2; i <= MAX_RCU_LVLS; i++)
 3021                 rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT;
 3022 
 3023         /*
 3024          * The boot-time rcu_fanout_leaf parameter is only permitted
 3025          * to increase the leaf-level fanout, not decrease it.  Of course,
 3026          * the leaf-level fanout cannot exceed the number of bits in
 3027          * the rcu_node masks.  Finally, the tree must be able to accommodate
 3028          * the configured number of CPUs.  Complain and fall back to the
 3029          * compile-time values if these limits are exceeded.
 3030          */
 3031         if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF ||
 3032             rcu_fanout_leaf > sizeof(unsigned long) * 8 ||
 3033             n > rcu_capacity[MAX_RCU_LVLS]) {
 3034                 WARN_ON(1);
 3035                 return;
 3036         }
 3037 
 3038         /* Calculate the number of rcu_nodes at each level of the tree. */
 3039         for (i = 1; i <= MAX_RCU_LVLS; i++)
 3040                 if (n <= rcu_capacity[i]) {
 3041                         for (j = 0; j <= i; j++)
 3042                                 num_rcu_lvl[j] =
 3043                                         DIV_ROUND_UP(n, rcu_capacity[i - j]);
 3044                         rcu_num_lvls = i;
 3045                         for (j = i + 1; j <= MAX_RCU_LVLS; j++)
 3046                                 num_rcu_lvl[j] = 0;
 3047                         break;
 3048                 }
 3049 
 3050         /* Calculate the total number of rcu_node structures. */
 3051         rcu_num_nodes = 0;
 3052         for (i = 0; i <= MAX_RCU_LVLS; i++)
 3053                 rcu_num_nodes += num_rcu_lvl[i];
 3054         rcu_num_nodes -= n;
 3055 }
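To make the geometry computation concrete, the stand-alone user-space sketch below re-derives num_rcu_lvl[], rcu_num_lvls, and rcu_num_nodes for a hypothetical 96-CPU system, assuming the common 64-bit defaults CONFIG_RCU_FANOUT=64 and CONFIG_RCU_FANOUT_LEAF=16, and MAX_RCU_LVLS == 4 (matching the four-entry buf[] arrays above); all values are illustrative.

#include <stdio.h>

#define MAX_RCU_LVLS    4
#define FANOUT          64              /* stands in for CONFIG_RCU_FANOUT */
#define FANOUT_LEAF     16              /* stands in for rcu_fanout_leaf */
#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

int main(void)
{
        int n = 96;                     /* pretend nr_cpu_ids */
        int cap[MAX_RCU_LVLS + 1] = { 1, FANOUT_LEAF };
        int num[MAX_RCU_LVLS + 1] = { 0 };
        int lvls = 0, nodes = 0;

        for (int i = 2; i <= MAX_RCU_LVLS; i++)
                cap[i] = cap[i - 1] * FANOUT;
        for (int i = 1; i <= MAX_RCU_LVLS; i++)
                if (n <= cap[i]) {
                        for (int j = 0; j <= i; j++)
                                num[j] = DIV_ROUND_UP(n, cap[i - j]);
                        lvls = i;
                        break;
                }
        for (int i = 0; i <= MAX_RCU_LVLS; i++)
                nodes += num[i];
        nodes -= n;                     /* the last level counts CPUs, not rcu_node structures */

        /* Prints: lvls=2 nodes=7 num={1,6,96} -- one root plus six leaves. */
        printf("lvls=%d nodes=%d num={%d,%d,%d}\n",
               lvls, nodes, num[0], num[1], num[2]);
        return 0;
}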
 3056 
 3057 void __init rcu_init(void)
 3058 {
 3059         int cpu;
 3060 
 3061         rcu_bootup_announce();
 3062         rcu_init_geometry();
 3063         rcu_init_one(&rcu_sched_state, &rcu_sched_data);
 3064         rcu_init_one(&rcu_bh_state, &rcu_bh_data);
 3065         __rcu_init_preempt();
 3066         rcu_init_nocb();
 3067         open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 3068 
 3069         /*
 3070          * We don't need protection against CPU-hotplug here because
 3071          * this is called early in boot, before either interrupts
 3072          * or the scheduler are operational.
 3073          */
 3074         cpu_notifier(rcu_cpu_notify, 0);
 3075         for_each_online_cpu(cpu)
 3076                 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
 3077         check_cpu_stall_init();
 3078 }
 3079 
 3080 #include "rcutree_plugin.h"
