FreeBSD/Linux Kernel Cross Reference
sys/kernel/rcutree_plugin.h


    1 /*
    2  * Read-Copy Update mechanism for mutual exclusion (tree-based version)
    3  * Internal non-public definitions that provide either classic
    4  * or preemptible semantics.
    5  *
    6  * This program is free software; you can redistribute it and/or modify
    7  * it under the terms of the GNU General Public License as published by
    8  * the Free Software Foundation; either version 2 of the License, or
    9  * (at your option) any later version.
   10  *
   11  * This program is distributed in the hope that it will be useful,
   12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
   13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   14  * GNU General Public License for more details.
   15  *
   16  * You should have received a copy of the GNU General Public License
   17  * along with this program; if not, write to the Free Software
   18  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
   19  *
   20  * Copyright Red Hat, 2009
   21  * Copyright IBM Corporation, 2009
   22  *
   23  * Author: Ingo Molnar <mingo@elte.hu>
   24  *         Paul E. McKenney <paulmck@linux.vnet.ibm.com>
   25  */
   26 
   27 #include <linux/delay.h>
   28 #include <linux/gfp.h>
   29 #include <linux/oom.h>
   30 #include <linux/smpboot.h>
   31 
   32 #define RCU_KTHREAD_PRIO 1
   33 
   34 #ifdef CONFIG_RCU_BOOST
   35 #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
   36 #else
   37 #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
   38 #endif
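
/*
 * Note: these two values are SCHED_FIFO priorities.  RCU_KTHREAD_PRIO is
 * applied to the per-CPU callback kthreads in rcu_cpu_kthread_setup(), and
 * RCU_BOOST_PRIO to the per-rcu_node boost kthreads in
 * rcu_spawn_one_boost_kthread(), both later in this file.
 */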
   39 
   40 #ifdef CONFIG_RCU_NOCB_CPU
   41 static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
   42 static bool have_rcu_nocb_mask;     /* Was rcu_nocb_mask allocated? */
   43 static bool rcu_nocb_poll;          /* Offload kthreads are to poll. */
   44 module_param(rcu_nocb_poll, bool, 0444);
   45 static char __initdata nocb_buf[NR_CPUS * 5];
   46 #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
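
/*
 * With CONFIG_RCU_NOCB_CPU, the CPUs in rcu_nocb_mask have their RCU
 * callbacks invoked by dedicated "no-CBs" kthreads instead of in softirq
 * context on the CPU that queued them; rcu_nocb_poll additionally makes
 * those kthreads poll for new callbacks rather than wait to be woken.
 * The resulting CPU set is reported at boot by
 * rcu_bootup_announce_oddness() below.
 */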
   47 
   48 /*
   49  * Check the RCU kernel configuration parameters and print informative
   50  * messages about anything out of the ordinary.  If you like #ifdef, you
   51  * will love this function.
   52  */
   53 static void __init rcu_bootup_announce_oddness(void)
   54 {
   55 #ifdef CONFIG_RCU_TRACE
   56         printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n");
   57 #endif
   58 #if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
   59         printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
   60                CONFIG_RCU_FANOUT);
   61 #endif
   62 #ifdef CONFIG_RCU_FANOUT_EXACT
   63         printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n");
   64 #endif
   65 #ifdef CONFIG_RCU_FAST_NO_HZ
   66         printk(KERN_INFO
   67                "\tRCU dyntick-idle grace-period acceleration is enabled.\n");
   68 #endif
   69 #ifdef CONFIG_PROVE_RCU
   70         printk(KERN_INFO "\tRCU lockdep checking is enabled.\n");
   71 #endif
   72 #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
   73         printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
   74 #endif
   75 #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
   76         printk(KERN_INFO "\tDump stacks of tasks blocking RCU-preempt GP.\n");
   77 #endif
   78 #if defined(CONFIG_RCU_CPU_STALL_INFO)
   79         printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n");
   80 #endif
   81 #if NUM_RCU_LVL_4 != 0
   82         printk(KERN_INFO "\tFour-level hierarchy is enabled.\n");
   83 #endif
   84         if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
   85                 printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
   86         if (nr_cpu_ids != NR_CPUS)
   87                 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
   88 #ifdef CONFIG_RCU_NOCB_CPU
   89         if (have_rcu_nocb_mask) {
   90                 if (cpumask_test_cpu(0, rcu_nocb_mask)) {
   91                         cpumask_clear_cpu(0, rcu_nocb_mask);
   92                         pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n");
   93                 }
   94                 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
   95                 pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf);
   96                 if (rcu_nocb_poll)
   97                         pr_info("\tExperimental polled no-CBs CPUs.\n");
   98         }
   99 #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
  100 }
  101 
  102 #ifdef CONFIG_TREE_PREEMPT_RCU
  103 
  104 struct rcu_state rcu_preempt_state =
  105         RCU_STATE_INITIALIZER(rcu_preempt, call_rcu);
  106 DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
  107 static struct rcu_state *rcu_state = &rcu_preempt_state;
  108 
  109 static int rcu_preempted_readers_exp(struct rcu_node *rnp);
  110 
  111 /*
  112  * Tell them what RCU they are running.
  113  */
  114 static void __init rcu_bootup_announce(void)
  115 {
  116         printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n");
  117         rcu_bootup_announce_oddness();
  118 }
  119 
  120 /*
  121  * Return the number of RCU-preempt batches processed thus far
  122  * for debug and statistics.
  123  */
  124 long rcu_batches_completed_preempt(void)
  125 {
  126         return rcu_preempt_state.completed;
  127 }
  128 EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt);
  129 
  130 /*
  131  * Return the number of RCU batches processed thus far for debug & stats.
  132  */
  133 long rcu_batches_completed(void)
  134 {
  135         return rcu_batches_completed_preempt();
  136 }
  137 EXPORT_SYMBOL_GPL(rcu_batches_completed);
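
/*
 * Usage sketch (illustrative only; example_full_gp_since() is hypothetical):
 * a debug/statistics consumer can snapshot the completed-batches counter and
 * later check whether a full grace period has elapsed.  An advance of at
 * least two is required, because an advance of one might correspond to a
 * grace period that was already in progress when the snapshot was taken.
 */
static bool example_full_gp_since(long snap)
{
        return rcu_batches_completed() - snap >= 2;
}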
  138 
  139 /*
  140  * Force a quiescent state for preemptible RCU.
  141  */
  142 void rcu_force_quiescent_state(void)
  143 {
  144         force_quiescent_state(&rcu_preempt_state);
  145 }
  146 EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
  147 
  148 /*
  149  * Record a preemptible-RCU quiescent state for the specified CPU.  Note
  150  * that this just means that the task currently running on the CPU is
  151  * not in an RCU read-side critical section.  There might be any number
  152  * of tasks blocked while in an RCU read-side critical section.
  153  *
  154  * Unlike the other rcu_*_qs() functions, callers to this function
  155  * must disable irqs in order to protect the assignment to
  156  * ->rcu_read_unlock_special.
  157  */
  158 static void rcu_preempt_qs(int cpu)
  159 {
  160         struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
  161 
  162         if (rdp->passed_quiesce == 0)
  163                 trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs");
  164         rdp->passed_quiesce = 1;
  165         current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
  166 }
  167 
  168 /*
  169  * We have entered the scheduler, and the current task might soon be
  170  * context-switched away from.  If this task is in an RCU read-side
  171  * critical section, we will no longer be able to rely on the CPU to
  172  * record that fact, so we enqueue the task on the blkd_tasks list.
  173  * The task will dequeue itself when it exits the outermost enclosing
  174  * RCU read-side critical section.  Therefore, the current grace period
  175  * cannot be permitted to complete until the blkd_tasks list entries
  176  * predating the current grace period drain, in other words, until
  177  * rnp->gp_tasks becomes NULL.
  178  *
  179  * Caller must disable preemption.
  180  */
  181 static void rcu_preempt_note_context_switch(int cpu)
  182 {
  183         struct task_struct *t = current;
  184         unsigned long flags;
  185         struct rcu_data *rdp;
  186         struct rcu_node *rnp;
  187 
  188         if (t->rcu_read_lock_nesting > 0 &&
  189             (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
  190 
  191                 /* Possibly blocking in an RCU read-side critical section. */
  192                 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
  193                 rnp = rdp->mynode;
  194                 raw_spin_lock_irqsave(&rnp->lock, flags);
  195                 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
  196                 t->rcu_blocked_node = rnp;
  197 
  198                 /*
  199                  * If this CPU has already checked in, then this task
  200                  * will hold up the next grace period rather than the
  201                  * current grace period.  Queue the task accordingly.
  202                  * If the task is queued for the current grace period
  203                  * (i.e., this CPU has not yet passed through a quiescent
  204                  * state for the current grace period), then as long
  205                  * as that task remains queued, the current grace period
  206                  * cannot end.  Note that there is some uncertainty as
  207                  * to exactly when the current grace period started.
  208                  * We take a conservative approach, which can result
  209                  * in unnecessarily waiting on tasks that started very
  210                  * slightly after the current grace period began.  C'est
  211                  * la vie!!!
  212                  *
  213                  * But first, note that the current CPU must still be
  214                  * on line!
  215                  */
  216                 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
  217                 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
  218                 if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
  219                         list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
  220                         rnp->gp_tasks = &t->rcu_node_entry;
  221 #ifdef CONFIG_RCU_BOOST
  222                         if (rnp->boost_tasks != NULL)
  223                                 rnp->boost_tasks = rnp->gp_tasks;
  224 #endif /* #ifdef CONFIG_RCU_BOOST */
  225                 } else {
  226                         list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
  227                         if (rnp->qsmask & rdp->grpmask)
  228                                 rnp->gp_tasks = &t->rcu_node_entry;
  229                 }
  230                 trace_rcu_preempt_task(rdp->rsp->name,
  231                                        t->pid,
  232                                        (rnp->qsmask & rdp->grpmask)
  233                                        ? rnp->gpnum
  234                                        : rnp->gpnum + 1);
  235                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
  236         } else if (t->rcu_read_lock_nesting < 0 &&
  237                    t->rcu_read_unlock_special) {
  238 
  239                 /*
  240                  * Complete exit from RCU read-side critical section on
  241                  * behalf of preempted instance of __rcu_read_unlock().
  242                  */
  243                 rcu_read_unlock_special(t);
  244         }
  245 
  246         /*
  247          * Either we were not in an RCU read-side critical section to
  248          * begin with, or we have now recorded that critical section
  249          * globally.  Either way, we can now note a quiescent state
  250          * for this CPU.  Again, if we were in an RCU read-side critical
  251          * section, and if that critical section was blocking the current
  252          * grace period, then the fact that the task has been enqueued
  253          * means that we continue to block the current grace period.
  254          */
  255         local_irq_save(flags);
  256         rcu_preempt_qs(cpu);
  257         local_irq_restore(flags);
  258 }
  259 
  260 /*
  261  * Check for preempted RCU readers blocking the current grace period
  262  * for the specified rcu_node structure.  If the caller needs a reliable
  263  * answer, it must hold the rcu_node's ->lock.
  264  */
  265 static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
  266 {
  267         return rnp->gp_tasks != NULL;
  268 }
  269 
  270 /*
  271  * Record a quiescent state for all tasks that were previously queued
  272  * on the specified rcu_node structure and that were blocking the current
  273  * RCU grace period.  The caller must hold the specified rnp->lock with
  274  * irqs disabled, and this lock is released upon return, but irqs remain
  275  * disabled.
  276  */
  277 static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
  278         __releases(rnp->lock)
  279 {
  280         unsigned long mask;
  281         struct rcu_node *rnp_p;
  282 
  283         if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
  284                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
  285                 return;  /* Still need more quiescent states! */
  286         }
  287 
  288         rnp_p = rnp->parent;
  289         if (rnp_p == NULL) {
  290                 /*
  291                  * Either there is only one rcu_node in the tree,
  292                  * or tasks were kicked up to root rcu_node due to
  293                  * CPUs going offline.
  294                  */
  295                 rcu_report_qs_rsp(&rcu_preempt_state, flags);
  296                 return;
  297         }
  298 
  299         /* Report up the rest of the hierarchy. */
  300         mask = rnp->grpmask;
  301         raw_spin_unlock(&rnp->lock);    /* irqs remain disabled. */
  302         raw_spin_lock(&rnp_p->lock);    /* irqs already disabled. */
  303         rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
  304 }
  305 
  306 /*
  307  * Advance a ->blkd_tasks-list pointer to the next entry, returning
  308  * NULL instead if at the end of the list.
  309  */
  310 static struct list_head *rcu_next_node_entry(struct task_struct *t,
  311                                              struct rcu_node *rnp)
  312 {
  313         struct list_head *np;
  314 
  315         np = t->rcu_node_entry.next;
  316         if (np == &rnp->blkd_tasks)
  317                 np = NULL;
  318         return np;
  319 }
  320 
  321 /*
  322  * Handle special cases during rcu_read_unlock(), such as needing to
  323  * notify RCU core processing or task having blocked during the RCU
  324  * read-side critical section.
  325  */
  326 void rcu_read_unlock_special(struct task_struct *t)
  327 {
  328         int empty;
  329         int empty_exp;
  330         int empty_exp_now;
  331         unsigned long flags;
  332         struct list_head *np;
  333 #ifdef CONFIG_RCU_BOOST
  334         struct rt_mutex *rbmp = NULL;
  335 #endif /* #ifdef CONFIG_RCU_BOOST */
  336         struct rcu_node *rnp;
  337         int special;
  338 
  339         /* NMI handlers cannot block and cannot safely manipulate state. */
  340         if (in_nmi())
  341                 return;
  342 
  343         local_irq_save(flags);
  344 
  345         /*
  346          * If RCU core is waiting for this CPU to exit critical section,
  347          * let it know that we have done so.
  348          */
  349         special = t->rcu_read_unlock_special;
  350         if (special & RCU_READ_UNLOCK_NEED_QS) {
  351                 rcu_preempt_qs(smp_processor_id());
  352         }
  353 
  354         /* Hardware IRQ handlers cannot block. */
  355         if (in_irq() || in_serving_softirq()) {
  356                 local_irq_restore(flags);
  357                 return;
  358         }
  359 
  360         /* Clean up if blocked during RCU read-side critical section. */
  361         if (special & RCU_READ_UNLOCK_BLOCKED) {
  362                 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
  363 
  364                 /*
  365                  * Remove this task from the list it blocked on.  The
  366                  * task can migrate while we acquire the lock, but at
  367  * most one time, so at most two passes through the loop.
  368                  */
  369                 for (;;) {
  370                         rnp = t->rcu_blocked_node;
  371                         raw_spin_lock(&rnp->lock);  /* irqs already disabled. */
  372                         if (rnp == t->rcu_blocked_node)
  373                                 break;
  374                         raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
  375                 }
  376                 empty = !rcu_preempt_blocked_readers_cgp(rnp);
  377                 empty_exp = !rcu_preempted_readers_exp(rnp);
  378                 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
  379                 np = rcu_next_node_entry(t, rnp);
  380                 list_del_init(&t->rcu_node_entry);
  381                 t->rcu_blocked_node = NULL;
  382                 trace_rcu_unlock_preempted_task("rcu_preempt",
  383                                                 rnp->gpnum, t->pid);
  384                 if (&t->rcu_node_entry == rnp->gp_tasks)
  385                         rnp->gp_tasks = np;
  386                 if (&t->rcu_node_entry == rnp->exp_tasks)
  387                         rnp->exp_tasks = np;
  388 #ifdef CONFIG_RCU_BOOST
  389                 if (&t->rcu_node_entry == rnp->boost_tasks)
  390                         rnp->boost_tasks = np;
  391                 /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */
  392                 if (t->rcu_boost_mutex) {
  393                         rbmp = t->rcu_boost_mutex;
  394                         t->rcu_boost_mutex = NULL;
  395                 }
  396 #endif /* #ifdef CONFIG_RCU_BOOST */
  397 
  398                 /*
  399                  * If this was the last task on the current list, and if
  400                  * we aren't waiting on any CPUs, report the quiescent state.
  401                  * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
  402                  * so we must take a snapshot of the expedited state.
  403                  */
  404                 empty_exp_now = !rcu_preempted_readers_exp(rnp);
  405                 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
  406                         trace_rcu_quiescent_state_report("preempt_rcu",
  407                                                          rnp->gpnum,
  408                                                          0, rnp->qsmask,
  409                                                          rnp->level,
  410                                                          rnp->grplo,
  411                                                          rnp->grphi,
  412                                                          !!rnp->gp_tasks);
  413                         rcu_report_unblock_qs_rnp(rnp, flags);
  414                 } else {
  415                         raw_spin_unlock_irqrestore(&rnp->lock, flags);
  416                 }
  417 
  418 #ifdef CONFIG_RCU_BOOST
  419                 /* Unboost if we were boosted. */
  420                 if (rbmp)
  421                         rt_mutex_unlock(rbmp);
  422 #endif /* #ifdef CONFIG_RCU_BOOST */
  423 
  424                 /*
  425                  * If this was the last task on the expedited lists,
  426                  * then we need to report up the rcu_node hierarchy.
  427                  */
  428                 if (!empty_exp && empty_exp_now)
  429                         rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
  430         } else {
  431                 local_irq_restore(flags);
  432         }
  433 }
  434 
  435 #ifdef CONFIG_RCU_CPU_STALL_VERBOSE
  436 
  437 /*
  438  * Dump detailed information for all tasks blocking the current RCU
  439  * grace period on the specified rcu_node structure.
  440  */
  441 static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
  442 {
  443         unsigned long flags;
  444         struct task_struct *t;
  445 
  446         raw_spin_lock_irqsave(&rnp->lock, flags);
  447         if (!rcu_preempt_blocked_readers_cgp(rnp)) {
  448                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
  449                 return;
  450         }
  451         t = list_entry(rnp->gp_tasks,
  452                        struct task_struct, rcu_node_entry);
  453         list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
  454                 sched_show_task(t);
  455         raw_spin_unlock_irqrestore(&rnp->lock, flags);
  456 }
  457 
  458 /*
  459  * Dump detailed information for all tasks blocking the current RCU
  460  * grace period.
  461  */
  462 static void rcu_print_detail_task_stall(struct rcu_state *rsp)
  463 {
  464         struct rcu_node *rnp = rcu_get_root(rsp);
  465 
  466         rcu_print_detail_task_stall_rnp(rnp);
  467         rcu_for_each_leaf_node(rsp, rnp)
  468                 rcu_print_detail_task_stall_rnp(rnp);
  469 }
  470 
  471 #else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
  472 
  473 static void rcu_print_detail_task_stall(struct rcu_state *rsp)
  474 {
  475 }
  476 
  477 #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
  478 
  479 #ifdef CONFIG_RCU_CPU_STALL_INFO
  480 
  481 static void rcu_print_task_stall_begin(struct rcu_node *rnp)
  482 {
  483         printk(KERN_ERR "\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
  484                rnp->level, rnp->grplo, rnp->grphi);
  485 }
  486 
  487 static void rcu_print_task_stall_end(void)
  488 {
  489         printk(KERN_CONT "\n");
  490 }
  491 
  492 #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
  493 
  494 static void rcu_print_task_stall_begin(struct rcu_node *rnp)
  495 {
  496 }
  497 
  498 static void rcu_print_task_stall_end(void)
  499 {
  500 }
  501 
  502 #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
  503 
  504 /*
  505  * Scan the current list of tasks blocked within RCU read-side critical
  506  * sections, printing out the tid of each.
  507  */
  508 static int rcu_print_task_stall(struct rcu_node *rnp)
  509 {
  510         struct task_struct *t;
  511         int ndetected = 0;
  512 
  513         if (!rcu_preempt_blocked_readers_cgp(rnp))
  514                 return 0;
  515         rcu_print_task_stall_begin(rnp);
  516         t = list_entry(rnp->gp_tasks,
  517                        struct task_struct, rcu_node_entry);
  518         list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
  519                 printk(KERN_CONT " P%d", t->pid);
  520                 ndetected++;
  521         }
  522         rcu_print_task_stall_end();
  523         return ndetected;
  524 }
  525 
  526 /*
  527  * Check that the list of blocked tasks for the newly completed grace
  528  * period is in fact empty.  It is a serious bug to complete a grace
  529  * period that still has RCU readers blocked!  This function must be
  530  * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
  531  * must be held by the caller.
  532  *
  533  * Also, if there are blocked tasks on the list, they automatically
  534  * block the newly created grace period, so set up ->gp_tasks accordingly.
  535  */
  536 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
  537 {
  538         WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
  539         if (!list_empty(&rnp->blkd_tasks))
  540                 rnp->gp_tasks = rnp->blkd_tasks.next;
  541         WARN_ON_ONCE(rnp->qsmask);
  542 }
  543 
  544 #ifdef CONFIG_HOTPLUG_CPU
  545 
  546 /*
  547  * Handle tasklist migration for case in which all CPUs covered by the
  548  * specified rcu_node have gone offline.  Move them up to the root
  549  * rcu_node.  The reason for not just moving them to the immediate
  550  * parent is to remove the need for rcu_read_unlock_special() to
  551  * make more than two attempts to acquire the target rcu_node's lock.
  552  * Returns nonzero if there was previously at least one task blocking
  553  * the current grace period on the specified rcu_node structure; the
  554  * return value is a combination of the RCU_OFL_TASKS_NORM_GP and
  555  * RCU_OFL_TASKS_EXP_GP bits, indicating whether a normal and/or an
  556  * expedited grace period was being blocked.
  557  *
  558  * The caller must hold rnp->lock with irqs disabled.
  559  */
  560 static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
  561                                      struct rcu_node *rnp,
  562                                      struct rcu_data *rdp)
  563 {
  564         struct list_head *lp;
  565         struct list_head *lp_root;
  566         int retval = 0;
  567         struct rcu_node *rnp_root = rcu_get_root(rsp);
  568         struct task_struct *t;
  569 
  570         if (rnp == rnp_root) {
  571                 WARN_ONCE(1, "Last CPU thought to be offlined?");
  572                 return 0;  /* Shouldn't happen: at least one CPU online. */
  573         }
  574 
  575         /* If we are on an internal node, complain bitterly. */
  576         WARN_ON_ONCE(rnp != rdp->mynode);
  577 
  578         /*
  579          * Move tasks up to root rcu_node.  Don't try to get fancy for
  580          * this corner-case operation -- just put this node's tasks
  581          * at the head of the root node's list, and update the root node's
  582          * ->gp_tasks and ->exp_tasks pointers to those of this node's,
  583          * if non-NULL.  This might result in waiting for more tasks than
  584          * absolutely necessary, but this is a good performance/complexity
  585          * tradeoff.
  586          */
  587         if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0)
  588                 retval |= RCU_OFL_TASKS_NORM_GP;
  589         if (rcu_preempted_readers_exp(rnp))
  590                 retval |= RCU_OFL_TASKS_EXP_GP;
  591         lp = &rnp->blkd_tasks;
  592         lp_root = &rnp_root->blkd_tasks;
  593         while (!list_empty(lp)) {
  594                 t = list_entry(lp->next, typeof(*t), rcu_node_entry);
  595                 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
  596                 list_del(&t->rcu_node_entry);
  597                 t->rcu_blocked_node = rnp_root;
  598                 list_add(&t->rcu_node_entry, lp_root);
  599                 if (&t->rcu_node_entry == rnp->gp_tasks)
  600                         rnp_root->gp_tasks = rnp->gp_tasks;
  601                 if (&t->rcu_node_entry == rnp->exp_tasks)
  602                         rnp_root->exp_tasks = rnp->exp_tasks;
  603 #ifdef CONFIG_RCU_BOOST
  604                 if (&t->rcu_node_entry == rnp->boost_tasks)
  605                         rnp_root->boost_tasks = rnp->boost_tasks;
  606 #endif /* #ifdef CONFIG_RCU_BOOST */
  607                 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
  608         }
  609 
  610         rnp->gp_tasks = NULL;
  611         rnp->exp_tasks = NULL;
  612 #ifdef CONFIG_RCU_BOOST
  613         rnp->boost_tasks = NULL;
  614         /*
  615          * In case root is being boosted and leaf was not.  Make sure
  616          * that we boost the tasks blocking the current grace period
  617          * in this case.
  618          */
  619         raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
  620         if (rnp_root->boost_tasks != NULL &&
  621             rnp_root->boost_tasks != rnp_root->gp_tasks &&
  622             rnp_root->boost_tasks != rnp_root->exp_tasks)
  623                 rnp_root->boost_tasks = rnp_root->gp_tasks;
  624         raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
  625 #endif /* #ifdef CONFIG_RCU_BOOST */
  626 
  627         return retval;
  628 }
  629 
  630 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
  631 
  632 /*
  633  * Check for a quiescent state from the current CPU.  When a task blocks,
  634  * the task is recorded in the corresponding CPU's rcu_node structure,
  635  * which is checked elsewhere.
  636  *
  637  * Caller must disable hard irqs.
  638  */
  639 static void rcu_preempt_check_callbacks(int cpu)
  640 {
  641         struct task_struct *t = current;
  642 
  643         if (t->rcu_read_lock_nesting == 0) {
  644                 rcu_preempt_qs(cpu);
  645                 return;
  646         }
  647         if (t->rcu_read_lock_nesting > 0 &&
  648             per_cpu(rcu_preempt_data, cpu).qs_pending)
  649                 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
  650 }
  651 
  652 #ifdef CONFIG_RCU_BOOST
  653 
  654 static void rcu_preempt_do_callbacks(void)
  655 {
  656         rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data));
  657 }
  658 
  659 #endif /* #ifdef CONFIG_RCU_BOOST */
  660 
  661 /*
  662  * Queue a preemptible-RCU callback for invocation after a grace period.
  663  */
  664 void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
  665 {
  666         __call_rcu(head, func, &rcu_preempt_state, -1, 0);
  667 }
  668 EXPORT_SYMBOL_GPL(call_rcu);
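
/*
 * Usage sketch (illustrative only; "struct foo", foo_reclaim(), and
 * foo_remove() are hypothetical): the canonical call_rcu() pattern is to
 * unpublish the element first, then let the callback free it once all
 * pre-existing readers are done.
 */
struct foo {
        struct list_head list;
        struct rcu_head rcu;
        int data;
};

static void foo_reclaim(struct rcu_head *rcu)
{
        struct foo *fp = container_of(rcu, struct foo, rcu);

        kfree(fp);
}

/* Caller holds the update-side lock protecting the list. */
static void foo_remove(struct foo *fp)
{
        list_del_rcu(&fp->list);         /* Unpublish... */
        call_rcu(&fp->rcu, foo_reclaim); /* ...then free after a grace period. */
}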
  669 
  670 /*
  671  * Queue an RCU callback for lazy invocation after a grace period.
  672  * This will likely be later named something like "call_rcu_lazy()",
  673  * but this change will require some way of tagging the lazy RCU
  674  * callbacks in the list of pending callbacks.  Until then, this
  675  * function may only be called from __kfree_rcu().
  676  */
  677 void kfree_call_rcu(struct rcu_head *head,
  678                     void (*func)(struct rcu_head *rcu))
  679 {
  680         __call_rcu(head, func, &rcu_preempt_state, -1, 1);
  681 }
  682 EXPORT_SYMBOL_GPL(kfree_call_rcu);
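
/*
 * Usage sketch (illustrative only; reuses the hypothetical "struct foo"
 * from the call_rcu() sketch above): callers do not invoke kfree_call_rcu()
 * directly, they use the kfree_rcu() wrapper, naming the embedded rcu_head
 * field.
 */
/* Caller holds the update-side lock protecting the list. */
static void foo_remove_lazy(struct foo *fp)
{
        list_del_rcu(&fp->list);
        kfree_rcu(fp, rcu);     /* Like call_rcu() with a kfree() callback. */
}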
  683 
  684 /**
  685  * synchronize_rcu - wait until a grace period has elapsed.
  686  *
  687  * Control will return to the caller some time after a full grace
  688  * period has elapsed, in other words after all currently executing RCU
  689  * read-side critical sections have completed.  Note, however, that
  690  * upon return from synchronize_rcu(), the caller might well be executing
  691  * concurrently with new RCU read-side critical sections that began while
  692  * synchronize_rcu() was waiting.  RCU read-side critical sections are
  693  * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
  694  *
  695  * See the description of synchronize_sched() for more detailed information
  696  * on memory ordering guarantees.
  697  */
  698 void synchronize_rcu(void)
  699 {
  700         rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
  701                            !lock_is_held(&rcu_lock_map) &&
  702                            !lock_is_held(&rcu_sched_lock_map),
  703                            "Illegal synchronize_rcu() in RCU read-side critical section");
  704         if (!rcu_scheduler_active)
  705                 return;
  706         if (rcu_expedited)
  707                 synchronize_rcu_expedited();
  708         else
  709                 wait_rcu_gp(call_rcu);
  710 }
  711 EXPORT_SYMBOL_GPL(synchronize_rcu);
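
/*
 * Usage sketch (illustrative only; gbl_foo, gbl_foo_lock, and the accessors
 * are hypothetical): the classic reader/updater pairing that
 * synchronize_rcu() supports, again using the hypothetical "struct foo"
 * from the call_rcu() sketch above.
 */
static struct foo __rcu *gbl_foo;
static DEFINE_SPINLOCK(gbl_foo_lock);

static int foo_read_data(void)
{
        struct foo *fp;
        int val;

        rcu_read_lock();
        fp = rcu_dereference(gbl_foo);
        val = fp ? fp->data : -1;
        rcu_read_unlock();
        return val;
}

static void foo_replace(struct foo *newfp)
{
        struct foo *old;

        spin_lock(&gbl_foo_lock);
        old = rcu_dereference_protected(gbl_foo,
                                        lockdep_is_held(&gbl_foo_lock));
        rcu_assign_pointer(gbl_foo, newfp);
        spin_unlock(&gbl_foo_lock);
        synchronize_rcu();      /* Wait for pre-existing readers to finish. */
        kfree(old);
}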
  712 
  713 static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
  714 static unsigned long sync_rcu_preempt_exp_count;
  715 static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
  716 
  717 /*
  718  * Return non-zero if there are any tasks in RCU read-side critical
  719  * sections blocking the current preemptible-RCU expedited grace period.
  720  * If there is no preemptible-RCU expedited grace period currently in
  721  * progress, returns zero unconditionally.
  722  */
  723 static int rcu_preempted_readers_exp(struct rcu_node *rnp)
  724 {
  725         return rnp->exp_tasks != NULL;
  726 }
  727 
  728 /*
  729  * Return non-zero if there is no RCU expedited grace period in progress
  730  * for the specified rcu_node structure, in other words, if all CPUs and
  731  * tasks covered by the specified rcu_node structure have done their bit
  732  * for the current expedited grace period.  Works only for preemptible
  733  * RCU -- other RCU implementation use other means.
  734  *
  735  * Caller must hold sync_rcu_preempt_exp_mutex.
  736  */
  737 static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
  738 {
  739         return !rcu_preempted_readers_exp(rnp) &&
  740                ACCESS_ONCE(rnp->expmask) == 0;
  741 }
  742 
  743 /*
  744  * Report the exit from RCU read-side critical section for the last task
  745  * that queued itself during or before the current expedited preemptible-RCU
  746  * grace period.  This event is reported either to the rcu_node structure on
  747  * which the task was queued or to one of that rcu_node structure's ancestors,
  748  * recursively up the tree.  (Calm down, calm down, we do the recursion
  749  * iteratively!)
  750  *
  751  * Most callers will set the "wake" flag, but the task initiating the
  752  * expedited grace period need not wake itself.
  753  *
  754  * Caller must hold sync_rcu_preempt_exp_mutex.
  755  */
  756 static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
  757                                bool wake)
  758 {
  759         unsigned long flags;
  760         unsigned long mask;
  761 
  762         raw_spin_lock_irqsave(&rnp->lock, flags);
  763         for (;;) {
  764                 if (!sync_rcu_preempt_exp_done(rnp)) {
  765                         raw_spin_unlock_irqrestore(&rnp->lock, flags);
  766                         break;
  767                 }
  768                 if (rnp->parent == NULL) {
  769                         raw_spin_unlock_irqrestore(&rnp->lock, flags);
  770                         if (wake)
  771                                 wake_up(&sync_rcu_preempt_exp_wq);
  772                         break;
  773                 }
  774                 mask = rnp->grpmask;
  775                 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
  776                 rnp = rnp->parent;
  777                 raw_spin_lock(&rnp->lock); /* irqs already disabled */
  778                 rnp->expmask &= ~mask;
  779         }
  780 }
  781 
  782 /*
  783  * Snapshot the tasks blocking the newly started preemptible-RCU expedited
  784  * grace period for the specified rcu_node structure.  If there are no such
  785  * tasks, report it up the rcu_node hierarchy.
  786  *
  787  * Caller must hold sync_rcu_preempt_exp_mutex and must exclude
  788  * CPU hotplug operations.
  789  */
  790 static void
  791 sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
  792 {
  793         unsigned long flags;
  794         int must_wait = 0;
  795 
  796         raw_spin_lock_irqsave(&rnp->lock, flags);
  797         if (list_empty(&rnp->blkd_tasks)) {
  798                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
  799         } else {
  800                 rnp->exp_tasks = rnp->blkd_tasks.next;
  801                 rcu_initiate_boost(rnp, flags);  /* releases rnp->lock */
  802                 must_wait = 1;
  803         }
  804         if (!must_wait)
  805                 rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */
  806 }
  807 
  808 /**
  809  * synchronize_rcu_expedited - Brute-force RCU grace period
  810  *
  811  * Wait for an RCU-preempt grace period, but expedite it.  The basic
  812  * idea is to invoke synchronize_sched_expedited() to push all the tasks to
  813  * the ->blkd_tasks lists and wait for this list to drain.  This consumes
  814  * significant time on all CPUs and is unfriendly to real-time workloads,
  815  * so it is not recommended for any sort of common-case code.
  816  * In fact, if you are using synchronize_rcu_expedited() in a loop,
  817  * please restructure your code to batch your updates, and then use a
  818  * single synchronize_rcu() instead.
  819  *
  820  * Note that it is illegal to call this function while holding any lock
  821  * that is acquired by a CPU-hotplug notifier.  And yes, it is also illegal
  822  * to call this function from a CPU-hotplug notifier.  Failing to observe
  823  * these restrictions will result in deadlock.
  824  */
  825 void synchronize_rcu_expedited(void)
  826 {
  827         unsigned long flags;
  828         struct rcu_node *rnp;
  829         struct rcu_state *rsp = &rcu_preempt_state;
  830         unsigned long snap;
  831         int trycount = 0;
  832 
  833         smp_mb(); /* Caller's modifications seen first by other CPUs. */
  834         snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
  835         smp_mb(); /* Above access cannot bleed into critical section. */
  836 
  837         /*
  838          * Block CPU-hotplug operations.  This means that any CPU-hotplug
  839          * operation that finds an rcu_node structure with tasks in the
  840          * process of being boosted will know that all tasks blocking
  841          * this expedited grace period will already be in the process of
  842          * being boosted.  This simplifies the process of moving tasks
  843          * from leaf to root rcu_node structures.
  844          */
  845         get_online_cpus();
  846 
  847         /*
  848          * Acquire lock, falling back to synchronize_rcu() if too many
  849          * lock-acquisition failures.  Of course, if someone does the
  850          * expedited grace period for us, just leave.
  851          */
  852         while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
  853                 if (ULONG_CMP_LT(snap,
  854                     ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
  855                         put_online_cpus();
  856                         goto mb_ret; /* Others did our work for us. */
  857                 }
  858                 if (trycount++ < 10) {
  859                         udelay(trycount * num_online_cpus());
  860                 } else {
  861                         put_online_cpus();
  862                         wait_rcu_gp(call_rcu);
  863                         return;
  864                 }
  865         }
  866         if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
  867                 put_online_cpus();
  868                 goto unlock_mb_ret; /* Others did our work for us. */
  869         }
  870 
  871         /* force all RCU readers onto ->blkd_tasks lists. */
  872         synchronize_sched_expedited();
  873 
  874         /* Initialize ->expmask for all non-leaf rcu_node structures. */
  875         rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
  876                 raw_spin_lock_irqsave(&rnp->lock, flags);
  877                 rnp->expmask = rnp->qsmaskinit;
  878                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
  879         }
  880 
  881         /* Snapshot current state of ->blkd_tasks lists. */
  882         rcu_for_each_leaf_node(rsp, rnp)
  883                 sync_rcu_preempt_exp_init(rsp, rnp);
  884         if (NUM_RCU_NODES > 1)
  885                 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
  886 
  887         put_online_cpus();
  888 
  889         /* Wait for snapshotted ->blkd_tasks lists to drain. */
  890         rnp = rcu_get_root(rsp);
  891         wait_event(sync_rcu_preempt_exp_wq,
  892                    sync_rcu_preempt_exp_done(rnp));
  893 
  894         /* Clean up and exit. */
  895         smp_mb(); /* ensure expedited GP seen before counter increment. */
  896         ACCESS_ONCE(sync_rcu_preempt_exp_count)++;
  897 unlock_mb_ret:
  898         mutex_unlock(&sync_rcu_preempt_exp_mutex);
  899 mb_ret:
  900         smp_mb(); /* ensure subsequent action seen after grace period. */
  901 }
  902 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
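
/*
 * Usage sketch (illustrative only; "struct bar", bar_list, and bar_lock are
 * hypothetical) of the restructuring suggested above: rather than paying for
 * one expedited grace period per element, unlink everything first, wait for
 * a single grace period, then free.  A separate updater-private list_head is
 * used so that concurrent readers can keep following ->list.next after
 * list_del_rcu().
 */
struct bar {
        struct list_head list;          /* Linkage in the RCU-protected list. */
        struct list_head reclaim_node;  /* Updater-private linkage. */
};
static LIST_HEAD(bar_list);
static DEFINE_SPINLOCK(bar_lock);

static void bar_remove_all(void)
{
        struct bar *bp, *tmp;
        LIST_HEAD(reclaim);

        spin_lock(&bar_lock);
        list_for_each_entry_safe(bp, tmp, &bar_list, list) {
                list_del_rcu(&bp->list);
                list_add(&bp->reclaim_node, &reclaim);
        }
        spin_unlock(&bar_lock);

        synchronize_rcu();      /* One grace period covers every removal. */

        list_for_each_entry_safe(bp, tmp, &reclaim, reclaim_node)
                kfree(bp);
}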
  903 
  904 /**
  905  * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
  906  *
  907  * Note that this primitive does not necessarily wait for an RCU grace period
  908  * to complete.  For example, if there are no RCU callbacks queued anywhere
  909  * in the system, then rcu_barrier() is within its rights to return
  910  * immediately, without waiting for anything, much less an RCU grace period.
  911  */
  912 void rcu_barrier(void)
  913 {
  914         _rcu_barrier(&rcu_preempt_state);
  915 }
  916 EXPORT_SYMBOL_GPL(rcu_barrier);
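
/*
 * Usage sketch (illustrative only; example_module_exit() is hypothetical):
 * the usual reason to call rcu_barrier() is module unload, where callbacks
 * previously posted with call_rcu() must be invoked before the code and data
 * they reference go away.
 */
static void __exit example_module_exit(void)
{
        /*
         * 1. Stop posting new callbacks (unregister hooks, cancel timers).
         * 2. Wait for every already-queued callback to be invoked.
         * 3. Only then tear down module-private state.
         */
        rcu_barrier();
}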
  917 
  918 /*
  919  * Initialize preemptible RCU's state structures.
  920  */
  921 static void __init __rcu_init_preempt(void)
  922 {
  923         rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
  924 }
  925 
  926 #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
  927 
  928 static struct rcu_state *rcu_state = &rcu_sched_state;
  929 
  930 /*
  931  * Tell them what RCU they are running.
  932  */
  933 static void __init rcu_bootup_announce(void)
  934 {
  935         printk(KERN_INFO "Hierarchical RCU implementation.\n");
  936         rcu_bootup_announce_oddness();
  937 }
  938 
  939 /*
  940  * Return the number of RCU batches processed thus far for debug & stats.
  941  */
  942 long rcu_batches_completed(void)
  943 {
  944         return rcu_batches_completed_sched();
  945 }
  946 EXPORT_SYMBOL_GPL(rcu_batches_completed);
  947 
  948 /*
  949  * Force a quiescent state for RCU, which, because there is no preemptible
  950  * RCU, becomes the same as rcu-sched.
  951  */
  952 void rcu_force_quiescent_state(void)
  953 {
  954         rcu_sched_force_quiescent_state();
  955 }
  956 EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
  957 
  958 /*
  959  * Because preemptible RCU does not exist, we never have to check for
  960  * CPUs being in quiescent states.
  961  */
  962 static void rcu_preempt_note_context_switch(int cpu)
  963 {
  964 }
  965 
  966 /*
  967  * Because preemptible RCU does not exist, there are never any preempted
  968  * RCU readers.
  969  */
  970 static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
  971 {
  972         return 0;
  973 }
  974 
  975 #ifdef CONFIG_HOTPLUG_CPU
  976 
  977 /* Because preemptible RCU does not exist, no quieting of tasks. */
  978 static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
  979 {
  980         raw_spin_unlock_irqrestore(&rnp->lock, flags);
  981 }
  982 
  983 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
  984 
  985 /*
  986  * Because preemptible RCU does not exist, we never have to check for
  987  * tasks blocked within RCU read-side critical sections.
  988  */
  989 static void rcu_print_detail_task_stall(struct rcu_state *rsp)
  990 {
  991 }
  992 
  993 /*
  994  * Because preemptible RCU does not exist, we never have to check for
  995  * tasks blocked within RCU read-side critical sections.
  996  */
  997 static int rcu_print_task_stall(struct rcu_node *rnp)
  998 {
  999         return 0;
 1000 }
 1001 
 1002 /*
 1003  * Because there is no preemptible RCU, there can be no readers blocked,
 1004  * so there is no need to check for blocked tasks.  Check only for
 1005  * bogus qsmask values.
 1006  */
 1007 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 1008 {
 1009         WARN_ON_ONCE(rnp->qsmask);
 1010 }
 1011 
 1012 #ifdef CONFIG_HOTPLUG_CPU
 1013 
 1014 /*
 1015  * Because preemptible RCU does not exist, it never needs to migrate
 1016  * tasks that were blocked within RCU read-side critical sections, and
 1017  * such non-existent tasks cannot possibly have been blocking the current
 1018  * grace period.
 1019  */
 1020 static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
 1021                                      struct rcu_node *rnp,
 1022                                      struct rcu_data *rdp)
 1023 {
 1024         return 0;
 1025 }
 1026 
 1027 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 1028 
 1029 /*
 1030  * Because preemptible RCU does not exist, it never has any callbacks
 1031  * to check.
 1032  */
 1033 static void rcu_preempt_check_callbacks(int cpu)
 1034 {
 1035 }
 1036 
 1037 /*
 1038  * Queue an RCU callback for lazy invocation after a grace period.
 1039  * This will likely be later named something like "call_rcu_lazy()",
 1040  * but this change will require some way of tagging the lazy RCU
 1041  * callbacks in the list of pending callbacks.  Until then, this
 1042  * function may only be called from __kfree_rcu().
 1043  *
 1044  * Because there is no preemptible RCU, we use RCU-sched instead.
 1045  */
 1046 void kfree_call_rcu(struct rcu_head *head,
 1047                     void (*func)(struct rcu_head *rcu))
 1048 {
 1049         __call_rcu(head, func, &rcu_sched_state, -1, 1);
 1050 }
 1051 EXPORT_SYMBOL_GPL(kfree_call_rcu);
 1052 
 1053 /*
 1054  * Wait for an rcu-preempt grace period, but make it happen quickly.
 1055  * But because preemptible RCU does not exist, map to rcu-sched.
 1056  */
 1057 void synchronize_rcu_expedited(void)
 1058 {
 1059         synchronize_sched_expedited();
 1060 }
 1061 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
 1062 
 1063 #ifdef CONFIG_HOTPLUG_CPU
 1064 
 1065 /*
 1066  * Because preemptible RCU does not exist, there is never any need to
 1067  * report on tasks preempted in RCU read-side critical sections during
 1068  * expedited RCU grace periods.
 1069  */
 1070 static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
 1071                                bool wake)
 1072 {
 1073 }
 1074 
 1075 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 1076 
 1077 /*
 1078  * Because preemptible RCU does not exist, rcu_barrier() is just
 1079  * another name for rcu_barrier_sched().
 1080  */
 1081 void rcu_barrier(void)
 1082 {
 1083         rcu_barrier_sched();
 1084 }
 1085 EXPORT_SYMBOL_GPL(rcu_barrier);
 1086 
 1087 /*
 1088  * Because preemptible RCU does not exist, it need not be initialized.
 1089  */
 1090 static void __init __rcu_init_preempt(void)
 1091 {
 1092 }
 1093 
 1094 #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
 1095 
 1096 #ifdef CONFIG_RCU_BOOST
 1097 
 1098 #include "rtmutex_common.h"
 1099 
 1100 #ifdef CONFIG_RCU_TRACE
 1101 
 1102 static void rcu_initiate_boost_trace(struct rcu_node *rnp)
 1103 {
 1104         if (list_empty(&rnp->blkd_tasks))
 1105                 rnp->n_balk_blkd_tasks++;
 1106         else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
 1107                 rnp->n_balk_exp_gp_tasks++;
 1108         else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
 1109                 rnp->n_balk_boost_tasks++;
 1110         else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
 1111                 rnp->n_balk_notblocked++;
 1112         else if (rnp->gp_tasks != NULL &&
 1113                  ULONG_CMP_LT(jiffies, rnp->boost_time))
 1114                 rnp->n_balk_notyet++;
 1115         else
 1116                 rnp->n_balk_nos++;
 1117 }
 1118 
 1119 #else /* #ifdef CONFIG_RCU_TRACE */
 1120 
 1121 static void rcu_initiate_boost_trace(struct rcu_node *rnp)
 1122 {
 1123 }
 1124 
 1125 #endif /* #else #ifdef CONFIG_RCU_TRACE */
 1126 
 1127 static void rcu_wake_cond(struct task_struct *t, int status)
 1128 {
 1129         /*
 1130          * If the thread is yielding, only wake it when this
 1131          * is invoked from idle
 1132          */
 1133         if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
 1134                 wake_up_process(t);
 1135 }
 1136 
 1137 /*
 1138  * Carry out RCU priority boosting on the task indicated by ->exp_tasks
 1139  * or ->boost_tasks, advancing the pointer to the next task in the
 1140  * ->blkd_tasks list.
 1141  *
 1142  * Note that irqs must be enabled: boosting the task can block.
 1143  * Returns 1 if there are more tasks needing to be boosted.
 1144  */
 1145 static int rcu_boost(struct rcu_node *rnp)
 1146 {
 1147         unsigned long flags;
 1148         struct rt_mutex mtx;
 1149         struct task_struct *t;
 1150         struct list_head *tb;
 1151 
 1152         if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL)
 1153                 return 0;  /* Nothing left to boost. */
 1154 
 1155         raw_spin_lock_irqsave(&rnp->lock, flags);
 1156 
 1157         /*
 1158          * Recheck under the lock: all tasks in need of boosting
 1159          * might exit their RCU read-side critical sections on their own.
 1160          */
 1161         if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
 1162                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
 1163                 return 0;
 1164         }
 1165 
 1166         /*
 1167          * Preferentially boost tasks blocking expedited grace periods.
 1168          * This cannot starve the normal grace periods because a second
 1169          * expedited grace period must boost all blocked tasks, including
 1170          * those blocking the pre-existing normal grace period.
 1171          */
 1172         if (rnp->exp_tasks != NULL) {
 1173                 tb = rnp->exp_tasks;
 1174                 rnp->n_exp_boosts++;
 1175         } else {
 1176                 tb = rnp->boost_tasks;
 1177                 rnp->n_normal_boosts++;
 1178         }
 1179         rnp->n_tasks_boosted++;
 1180 
 1181         /*
 1182          * We boost task t by manufacturing an rt_mutex that appears to
 1183          * be held by task t.  We leave a pointer to that rt_mutex where
 1184          * task t can find it, and task t will release the mutex when it
 1185          * exits its outermost RCU read-side critical section.  Then
 1186          * simply acquiring this artificial rt_mutex will boost task
 1187          * t's priority.  (Thanks to tglx for suggesting this approach!)
 1188          *
 1189          * Note that task t must acquire rnp->lock to remove itself from
 1190          * the ->blkd_tasks list, which it will do from exit() if from
 1191          * nowhere else.  We therefore are guaranteed that task t will
 1192          * stay around at least until we drop rnp->lock.  Note that
 1193          * rnp->lock also resolves races between our priority boosting
 1194          * and task t's exiting its outermost RCU read-side critical
 1195          * section.
 1196          */
 1197         t = container_of(tb, struct task_struct, rcu_node_entry);
 1198         rt_mutex_init_proxy_locked(&mtx, t);
 1199         t->rcu_boost_mutex = &mtx;
 1200         raw_spin_unlock_irqrestore(&rnp->lock, flags);
 1201         rt_mutex_lock(&mtx);  /* Side effect: boosts task t's priority. */
 1202         rt_mutex_unlock(&mtx);  /* Keep lockdep happy. */
 1203 
 1204         return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
 1205                ACCESS_ONCE(rnp->boost_tasks) != NULL;
 1206 }
 1207 
 1208 /*
 1209  * Priority-boosting kthread.  One per leaf rcu_node and one for the
 1210  * root rcu_node.
 1211  */
 1212 static int rcu_boost_kthread(void *arg)
 1213 {
 1214         struct rcu_node *rnp = (struct rcu_node *)arg;
 1215         int spincnt = 0;
 1216         int more2boost;
 1217 
 1218         trace_rcu_utilization("Start boost kthread@init");
 1219         for (;;) {
 1220                 rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
 1221                 trace_rcu_utilization("End boost kthread@rcu_wait");
 1222                 rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
 1223                 trace_rcu_utilization("Start boost kthread@rcu_wait");
 1224                 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
 1225                 more2boost = rcu_boost(rnp);
 1226                 if (more2boost)
 1227                         spincnt++;
 1228                 else
 1229                         spincnt = 0;
 1230                 if (spincnt > 10) {
 1231                         rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
 1232                         trace_rcu_utilization("End boost kthread@rcu_yield");
 1233                         schedule_timeout_interruptible(2);
 1234                         trace_rcu_utilization("Start boost kthread@rcu_yield");
 1235                         spincnt = 0;
 1236                 }
 1237         }
 1238         /* NOTREACHED */
 1239         trace_rcu_utilization("End boost kthread@notreached");
 1240         return 0;
 1241 }
 1242 
 1243 /*
 1244  * Check to see if it is time to start boosting RCU readers that are
 1245  * blocking the current grace period, and, if so, tell the per-rcu_node
 1246  * kthread to start boosting them.  If there is an expedited grace
 1247  * period in progress, it is always time to boost.
 1248  *
 1249  * The caller must hold rnp->lock, which this function releases.
 1250  * The ->boost_kthread_task is immortal, so we don't need to worry
 1251  * about it going away.
 1252  */
 1253 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
 1254 {
 1255         struct task_struct *t;
 1256 
 1257         if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
 1258                 rnp->n_balk_exp_gp_tasks++;
 1259                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
 1260                 return;
 1261         }
 1262         if (rnp->exp_tasks != NULL ||
 1263             (rnp->gp_tasks != NULL &&
 1264              rnp->boost_tasks == NULL &&
 1265              rnp->qsmask == 0 &&
 1266              ULONG_CMP_GE(jiffies, rnp->boost_time))) {
 1267                 if (rnp->exp_tasks == NULL)
 1268                         rnp->boost_tasks = rnp->gp_tasks;
 1269                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
 1270                 t = rnp->boost_kthread_task;
 1271                 if (t)
 1272                         rcu_wake_cond(t, rnp->boost_kthread_status);
 1273         } else {
 1274                 rcu_initiate_boost_trace(rnp);
 1275                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
 1276         }
 1277 }
 1278 
 1279 /*
 1280  * Wake up the per-CPU kthread to invoke RCU callbacks.
 1281  */
 1282 static void invoke_rcu_callbacks_kthread(void)
 1283 {
 1284         unsigned long flags;
 1285 
 1286         local_irq_save(flags);
 1287         __this_cpu_write(rcu_cpu_has_work, 1);
 1288         if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
 1289             current != __this_cpu_read(rcu_cpu_kthread_task)) {
 1290                 rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
 1291                               __this_cpu_read(rcu_cpu_kthread_status));
 1292         }
 1293         local_irq_restore(flags);
 1294 }
 1295 
 1296 /*
 1297  * Is the current CPU running the RCU-callbacks kthread?
 1298  * Caller must have preemption disabled.
 1299  */
 1300 static bool rcu_is_callbacks_kthread(void)
 1301 {
 1302         return __get_cpu_var(rcu_cpu_kthread_task) == current;
 1303 }
 1304 
 1305 #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
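
/*
 * RCU_BOOST_DELAY_JIFFIES converts the millisecond CONFIG_RCU_BOOST_DELAY
 * value into jiffies, rounding up.  For example, with HZ=250 and
 * CONFIG_RCU_BOOST_DELAY=500 this is DIV_ROUND_UP(500 * 250, 1000) = 125
 * jiffies, so boosting can begin half a second after the grace period starts
 * (see rcu_initiate_boost() above and rcu_preempt_boost_start_gp() below).
 */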
 1306 
 1307 /*
 1308  * Do priority-boost accounting for the start of a new grace period.
 1309  */
 1310 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
 1311 {
 1312         rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
 1313 }
 1314 
 1315 /*
 1316  * Create an RCU-boost kthread for the specified node if one does not
 1317  * already exist.  We only create this kthread for preemptible RCU.
 1318  * Returns zero if all is well, a negated errno otherwise.
 1319  */
 1320 static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
 1321                                                  struct rcu_node *rnp)
 1322 {
 1323         int rnp_index = rnp - &rsp->node[0];
 1324         unsigned long flags;
 1325         struct sched_param sp;
 1326         struct task_struct *t;
 1327 
 1328         if (&rcu_preempt_state != rsp)
 1329                 return 0;
 1330 
 1331         if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0)
 1332                 return 0;
 1333 
 1334         rsp->boost = 1;
 1335         if (rnp->boost_kthread_task != NULL)
 1336                 return 0;
 1337         t = kthread_create(rcu_boost_kthread, (void *)rnp,
 1338                            "rcub/%d", rnp_index);
 1339         if (IS_ERR(t))
 1340                 return PTR_ERR(t);
 1341         raw_spin_lock_irqsave(&rnp->lock, flags);
 1342         rnp->boost_kthread_task = t;
 1343         raw_spin_unlock_irqrestore(&rnp->lock, flags);
 1344         sp.sched_priority = RCU_BOOST_PRIO;
 1345         sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
 1346         wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
 1347         return 0;
 1348 }
 1349 
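/*
 * A minimal sketch (hypothetical names, not part of this file) of the
 * create/set-priority/wake pattern used by rcu_spawn_one_boost_kthread()
 * above: create the thread stopped, give it an RT priority, and only then
 * wake it.  Needs <linux/kthread.h> and <linux/sched.h>.
 */
static int demo_worker_fn(void *arg)
{
	while (!kthread_should_stop()) {
		/* ... do one unit of work, then sleep until woken ... */
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();
	}
	return 0;
}

static int __init demo_spawn_worker(void)
{
	struct sched_param sp = { .sched_priority = 1 };
	struct task_struct *t;

	t = kthread_create(demo_worker_fn, NULL, "demo_worker/%d", 0);
	if (IS_ERR(t))
		return PTR_ERR(t);
	sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
	wake_up_process(t);	/* Thread starts only after its priority is set. */
	return 0;
}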
 1350 static void rcu_kthread_do_work(void)
 1351 {
 1352         rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data));
 1353         rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
 1354         rcu_preempt_do_callbacks();
 1355 }
 1356 
 1357 static void rcu_cpu_kthread_setup(unsigned int cpu)
 1358 {
 1359         struct sched_param sp;
 1360 
 1361         sp.sched_priority = RCU_KTHREAD_PRIO;
 1362         sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
 1363 }
 1364 
 1365 static void rcu_cpu_kthread_park(unsigned int cpu)
 1366 {
 1367         per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
 1368 }
 1369 
 1370 static int rcu_cpu_kthread_should_run(unsigned int cpu)
 1371 {
 1372         return __get_cpu_var(rcu_cpu_has_work);
 1373 }
 1374 
 1375 /*
 1376  * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
 1377  * RCU softirq used in flavors and configurations of RCU that do not
 1378  * support RCU priority boosting.
 1379  */
 1380 static void rcu_cpu_kthread(unsigned int cpu)
 1381 {
 1382         unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status);
 1383         char work, *workp = &__get_cpu_var(rcu_cpu_has_work);
 1384         int spincnt;
 1385 
 1386         for (spincnt = 0; spincnt < 10; spincnt++) {
 1387                 trace_rcu_utilization("Start CPU kthread@rcu_wait");
 1388                 local_bh_disable();
 1389                 *statusp = RCU_KTHREAD_RUNNING;
 1390                 this_cpu_inc(rcu_cpu_kthread_loops);
 1391                 local_irq_disable();
 1392                 work = *workp;
 1393                 *workp = 0;
 1394                 local_irq_enable();
 1395                 if (work)
 1396                         rcu_kthread_do_work();
 1397                 local_bh_enable();
 1398                 if (*workp == 0) {
 1399                         trace_rcu_utilization("End CPU kthread@rcu_wait");
 1400                         *statusp = RCU_KTHREAD_WAITING;
 1401                         return;
 1402                 }
 1403         }
 1404         *statusp = RCU_KTHREAD_YIELDING;
 1405         trace_rcu_utilization("Start CPU kthread@rcu_yield");
 1406         schedule_timeout_interruptible(2);
 1407         trace_rcu_utilization("End CPU kthread@rcu_yield");
 1408         *statusp = RCU_KTHREAD_WAITING;
 1409 }
 1410 
 1411 /*
 1412  * Set the per-rcu_node kthread's affinity to cover all CPUs that are
 1413  * served by the rcu_node in question.  The CPU hotplug lock is still
 1414  * held, so the value of rnp->qsmaskinit will be stable.
 1415  *
 1416  * We don't include outgoingcpu in the affinity set; use -1 if there is
 1417  * no outgoing CPU.  If there are no CPUs left in the affinity set,
 1418  * this function allows the kthread to execute on any CPU.
 1419  */
 1420 static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
 1421 {
 1422         struct task_struct *t = rnp->boost_kthread_task;
 1423         unsigned long mask = rnp->qsmaskinit;
 1424         cpumask_var_t cm;
 1425         int cpu;
 1426 
 1427         if (!t)
 1428                 return;
 1429         if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
 1430                 return;
 1431         for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
 1432                 if ((mask & 0x1) && cpu != outgoingcpu)
 1433                         cpumask_set_cpu(cpu, cm);
 1434         if (cpumask_weight(cm) == 0) {
 1435                 cpumask_setall(cm);
 1436                 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
 1437                         cpumask_clear_cpu(cpu, cm);
 1438                 WARN_ON_ONCE(cpumask_weight(cm) == 0);
 1439         }
 1440         set_cpus_allowed_ptr(t, cm);
 1441         free_cpumask_var(cm);
 1442 }
 1443 
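/*
 * Worked example for the bit walk above (illustrative values): if the
 * rcu_node covers CPUs 8-15 (grplo=8, grphi=15) and qsmaskinit is 0x2d
 * (binary 101101, bits 0, 2, 3, and 5 set), then CPUs 8, 10, 11, and 13
 * land in the affinity mask, minus outgoingcpu if it is one of them.
 */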
 1444 static struct smp_hotplug_thread rcu_cpu_thread_spec = {
 1445         .store                  = &rcu_cpu_kthread_task,
 1446         .thread_should_run      = rcu_cpu_kthread_should_run,
 1447         .thread_fn              = rcu_cpu_kthread,
 1448         .thread_comm            = "rcuc/%u",
 1449         .setup                  = rcu_cpu_kthread_setup,
 1450         .park                   = rcu_cpu_kthread_park,
 1451 };
 1452 
 1453 /*
 1454  * Spawn all kthreads -- called as soon as the scheduler is running.
 1455  */
 1456 static int __init rcu_spawn_kthreads(void)
 1457 {
 1458         struct rcu_node *rnp;
 1459         int cpu;
 1460 
 1461         rcu_scheduler_fully_active = 1;
 1462         for_each_possible_cpu(cpu)
 1463                 per_cpu(rcu_cpu_has_work, cpu) = 0;
 1464         BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
 1465         rnp = rcu_get_root(rcu_state);
 1466         (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
 1467         if (NUM_RCU_NODES > 1) {
 1468                 rcu_for_each_leaf_node(rcu_state, rnp)
 1469                         (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
 1470         }
 1471         return 0;
 1472 }
 1473 early_initcall(rcu_spawn_kthreads);
 1474 
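/*
 * A minimal sketch (hypothetical names, not part of this file) of
 * registering a per-CPU kthread through the smpboot framework, mirroring
 * rcu_cpu_thread_spec above: the framework creates one "demo/%u" thread
 * per CPU, calls ->thread_should_run() to decide whether to invoke
 * ->thread_fn(), and parks/unparks the threads across CPU hotplug.
 * Needs <linux/smpboot.h> and <linux/percpu.h>.
 */
static DEFINE_PER_CPU(struct task_struct *, demo_task);
static DEFINE_PER_CPU(int, demo_has_work);

static int demo_should_run(unsigned int cpu)
{
	return __this_cpu_read(demo_has_work);
}

static void demo_thread_fn(unsigned int cpu)
{
	__this_cpu_write(demo_has_work, 0);
	/* ... process this CPU's pending work ... */
}

static struct smp_hotplug_thread demo_thread_spec = {
	.store			= &demo_task,
	.thread_should_run	= demo_should_run,
	.thread_fn		= demo_thread_fn,
	.thread_comm		= "demo/%u",
};

static int __init demo_register(void)
{
	return smpboot_register_percpu_thread(&demo_thread_spec);
}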
 1475 static void __cpuinit rcu_prepare_kthreads(int cpu)
 1476 {
 1477         struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
 1478         struct rcu_node *rnp = rdp->mynode;
 1479 
 1480         /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
 1481         if (rcu_scheduler_fully_active)
 1482                 (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
 1483 }
 1484 
 1485 #else /* #ifdef CONFIG_RCU_BOOST */
 1486 
 1487 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
 1488 {
 1489         raw_spin_unlock_irqrestore(&rnp->lock, flags);
 1490 }
 1491 
 1492 static void invoke_rcu_callbacks_kthread(void)
 1493 {
 1494         WARN_ON_ONCE(1);
 1495 }
 1496 
 1497 static bool rcu_is_callbacks_kthread(void)
 1498 {
 1499         return false;
 1500 }
 1501 
 1502 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
 1503 {
 1504 }
 1505 
 1506 static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
 1507 {
 1508 }
 1509 
 1510 static int __init rcu_scheduler_really_started(void)
 1511 {
 1512         rcu_scheduler_fully_active = 1;
 1513         return 0;
 1514 }
 1515 early_initcall(rcu_scheduler_really_started);
 1516 
 1517 static void __cpuinit rcu_prepare_kthreads(int cpu)
 1518 {
 1519 }
 1520 
 1521 #endif /* #else #ifdef CONFIG_RCU_BOOST */
 1522 
 1523 #if !defined(CONFIG_RCU_FAST_NO_HZ)
 1524 
 1525 /*
 1526  * Check to see if any future RCU-related work will need to be done
 1527  * by the current CPU, even if none need be done immediately, returning
 1528  * 1 if so.  This function is part of the RCU implementation; it is -not-
 1529  * an exported member of the RCU API.
 1530  *
 1531  * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs
 1532  * any flavor of RCU.
 1533  */
 1534 int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
 1535 {
 1536         *delta_jiffies = ULONG_MAX;
 1537         return rcu_cpu_has_callbacks(cpu);
 1538 }
 1539 
 1540 /*
 1541  * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it.
 1542  */
 1543 static void rcu_prepare_for_idle_init(int cpu)
 1544 {
 1545 }
 1546 
 1547 /*
 1548  * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
 1549  * after it.
 1550  */
 1551 static void rcu_cleanup_after_idle(int cpu)
 1552 {
 1553 }
 1554 
 1555 /*
 1556  * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
 1557  * is nothing.
 1558  */
 1559 static void rcu_prepare_for_idle(int cpu)
 1560 {
 1561 }
 1562 
 1563 /*
 1564  * Don't bother keeping a running count of the number of RCU callbacks
 1565  * posted because CONFIG_RCU_FAST_NO_HZ=n.
 1566  */
 1567 static void rcu_idle_count_callbacks_posted(void)
 1568 {
 1569 }
 1570 
 1571 #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
 1572 
 1573 /*
 1574  * This code is invoked when a CPU goes idle, at which point we want
 1575  * to have the CPU do everything required for RCU so that it can enter
 1576  * the energy-efficient dyntick-idle mode.  This is handled by a
 1577  * state machine implemented by rcu_prepare_for_idle() below.
 1578  *
 1579  * The following four preprocessor symbols control this state machine:
 1580  *
 1581  * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt
 1582  *      to satisfy RCU.  Beyond this point, it is better to incur a periodic
 1583  *      scheduling-clock interrupt than to loop through the state machine
 1584  *      at full power.
 1585  * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are
 1586  *      optional if RCU does not need anything immediately from this
 1587  *      CPU, even if this CPU still has RCU callbacks queued.  The first
 1588  *      few passes through the state machine are mandatory: we need to give
 1589  *      the state machine a chance to communicate a quiescent state
 1590  *      to the RCU core.
 1591  * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
 1592  *      to sleep in dyntick-idle mode with RCU callbacks pending.  This
 1593  *      is sized to be roughly one RCU grace period.  Those energy-efficiency
 1594  *      benchmarkers who might otherwise be tempted to set this to a large
 1595  *      number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
 1596  *      system.  And if you are -that- concerned about energy efficiency,
 1597  *      just power the system down and be done with it!
 1598  * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is
 1599  *      permitted to sleep in dyntick-idle mode with only lazy RCU
 1600  *      callbacks pending.  Setting this too high can OOM your system.
 1601  *
 1602  * The values below work well in practice.  If future workloads require
 1603  * adjustment, they can be converted into kernel config parameters, though
 1604  * making the state machine smarter might be a better option.
 1605  */
 1606 #define RCU_IDLE_FLUSHES 5              /* Number of dyntick-idle tries. */
 1607 #define RCU_IDLE_OPT_FLUSHES 3          /* Optional dyntick-idle tries. */
 1608 #define RCU_IDLE_GP_DELAY 4             /* Roughly one grace period. */
 1609 #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
 1610 
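/*
 * Worked example for the delays above (illustrative values): with HZ=1000
 * and jiffies=1001, a CPU with non-lazy callbacks computes
 * round_up(1001 + RCU_IDLE_GP_DELAY, RCU_IDLE_GP_DELAY) = round_up(1005, 4)
 * = 1008, so its timer lands on a 4-jiffy boundary and CPUs going idle at
 * about the same time wake together.  With only lazy callbacks,
 * round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY) similarly aligns the
 * roughly six-second timer to a whole-second boundary so that it batches
 * with other once-per-second timer activity.
 */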
 1611 extern int tick_nohz_enabled;
 1612 
 1613 /*
 1614  * Does the specified flavor of RCU have non-lazy callbacks pending on
 1615  * the specified CPU?  Both RCU flavor and CPU are specified by the
 1616  * rcu_data structure.
 1617  */
 1618 static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp)
 1619 {
 1620         return rdp->qlen != rdp->qlen_lazy;
 1621 }
 1622 
 1623 #ifdef CONFIG_TREE_PREEMPT_RCU
 1624 
 1625 /*
 1626  * Are there non-lazy RCU-preempt callbacks?  (There cannot be if there
 1627  * is no RCU-preempt in the kernel.)
 1628  */
 1629 static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu)
 1630 {
 1631         struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
 1632 
 1633         return __rcu_cpu_has_nonlazy_callbacks(rdp);
 1634 }
 1635 
 1636 #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
 1637 
 1638 static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu)
 1639 {
 1640         return 0;
 1641 }
 1642 
 1643 #endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */
 1644 
 1645 /*
 1646  * Does any flavor of RCU have non-lazy callbacks on the specified CPU?
 1647  */
 1648 static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
 1649 {
 1650         return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) ||
 1651                __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) ||
 1652                rcu_preempt_cpu_has_nonlazy_callbacks(cpu);
 1653 }
 1654 
 1655 /*
 1656  * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
 1657  * callbacks on this CPU, (2) this CPU has not yet attempted to enter
 1658  * dyntick-idle mode, or (3) this CPU is in the process of attempting to
 1659  * enter dyntick-idle mode.  Otherwise, if we have recently tried and failed
 1660  * to enter dyntick-idle mode, we refuse to try to enter it.  After all,
 1661  * it is better to incur scheduling-clock interrupts than to spin
 1662  * continuously for the same time duration!
 1663  *
 1664  * The delta_jiffies argument is used to store the time when RCU is
 1665  * going to need the CPU again if it still has callbacks.  The reason
 1666  * for this is that rcu_prepare_for_idle() might need to post a timer,
 1667  * but if so, it will do so after tick_nohz_stop_sched_tick() has set
 1668  * the wakeup time for this CPU.  This means that RCU's timer can be
 1669  * delayed until the wakeup time, which defeats the purpose of posting
 1670  * a timer.
 1671  */
 1672 int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
 1673 {
 1674         struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
 1675 
 1676         /* Flag a new idle sojourn to the idle-entry state machine. */
 1677         rdtp->idle_first_pass = 1;
 1678         /* If no callbacks, RCU doesn't need the CPU. */
 1679         if (!rcu_cpu_has_callbacks(cpu)) {
 1680                 *delta_jiffies = ULONG_MAX;
 1681                 return 0;
 1682         }
 1683         if (rdtp->dyntick_holdoff == jiffies) {
 1684                 /* RCU recently tried and failed, so don't try again. */
 1685                 *delta_jiffies = 1;
 1686                 return 1;
 1687         }
 1688         /* Set up for the possibility that RCU will post a timer. */
 1689         if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
 1690                 *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies,
 1691                                           RCU_IDLE_GP_DELAY) - jiffies;
 1692         } else {
 1693                 *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY;
 1694                 *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies;
 1695         }
 1696         return 0;
 1697 }
 1698 
 1699 /*
 1700  * Handler for smp_call_function_single().  The only point of this
 1701  * handler is to wake the CPU up, so the handler does only tracing.
 1702  */
 1703 void rcu_idle_demigrate(void *unused)
 1704 {
 1705         trace_rcu_prep_idle("Demigrate");
 1706 }
 1707 
 1708 /*
 1709  * Timer handler used to force CPU to start pushing its remaining RCU
 1710  * callbacks in the case where it entered dyntick-idle mode with callbacks
 1711  * pending.  The handler doesn't really need to do anything because the
 1712  * real work is done upon re-entry to idle, or by the next scheduling-clock
 1713  * interrupt should idle not be re-entered.
 1714  *
 1715  * One special case: the timer gets migrated without awakening the CPU
 1716  * on which the timer was scheduled.  In this case, we must wake up
 1717  * that CPU.  We do so with smp_call_function_single().
 1718  */
 1719 static void rcu_idle_gp_timer_func(unsigned long cpu_in)
 1720 {
 1721         int cpu = (int)cpu_in;
 1722 
 1723         trace_rcu_prep_idle("Timer");
 1724         if (cpu != smp_processor_id())
 1725                 smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0);
 1726         else
 1727                 WARN_ON_ONCE(1); /* Getting here can hang the system... */
 1728 }
 1729 
 1730 /*
 1731  * Initialize the timer used to pull CPUs out of dyntick-idle mode.
 1732  */
 1733 static void rcu_prepare_for_idle_init(int cpu)
 1734 {
 1735         struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
 1736 
 1737         rdtp->dyntick_holdoff = jiffies - 1;
 1738         setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu);
 1739         rdtp->idle_gp_timer_expires = jiffies - 1;
 1740         rdtp->idle_first_pass = 1;
 1741 }
 1742 
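/*
 * A minimal sketch (hypothetical names, not part of this file) of the
 * timer lifecycle used above: setup_timer() binds the handler and its
 * unsigned-long argument once, mod_timer_pinned() (re)arms the timer on
 * the current CPU, and del_timer() cancels it, doing nothing if the timer
 * is not pending.  Needs <linux/timer.h>.
 */
static struct timer_list demo_timer;

static void demo_timer_func(unsigned long data)
{
	pr_info("demo timer fired, argument %lu\n", data);
}

static void demo_timer_usage(int cpu)
{
	setup_timer(&demo_timer, demo_timer_func, (unsigned long)cpu);
	mod_timer_pinned(&demo_timer, jiffies + 4);	/* Fire in ~4 jiffies. */
	/* ... later, for example when leaving idle ... */
	del_timer(&demo_timer);
}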
 1743 /*
 1744  * Clean up for exit from idle.  Because we are exiting from idle, there
 1745  * is no longer any point to ->idle_gp_timer, so cancel it.  This will
 1746  * do nothing if this timer is not active, so just cancel it unconditionally.
 1747  */
 1748 static void rcu_cleanup_after_idle(int cpu)
 1749 {
 1750         struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
 1751 
 1752         del_timer(&rdtp->idle_gp_timer);
 1753         trace_rcu_prep_idle("Cleanup after idle");
 1754         rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled);
 1755 }
 1756 
 1757 /*
 1758  * Check to see if any RCU-related work can be done by the current CPU,
 1759  * and if so, schedule a softirq to get it done.  This function is part
 1760  * of the RCU implementation; it is -not- an exported member of the RCU API.
 1761  *
 1762  * The idea is for the current CPU to clear out all work required by the
 1763  * RCU core for the current grace period, so that this CPU can be permitted
 1764  * to enter dyntick-idle mode.  In some cases, it will need to be awakened
 1765  * at the end of the grace period by whatever CPU ends the grace period.
 1766  * This allows CPUs to go dyntick-idle more quickly, and to reduce the
 1767  * number of wakeups by a modest integer factor.
 1768  *
 1769  * Because it is not legal to invoke rcu_process_callbacks() with irqs
 1770  * disabled, we do one pass of force_quiescent_state(), then do an
 1771  * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
 1772  * later.  The ->dyntick_drain field controls the sequencing.
 1773  *
 1774  * The caller must have disabled interrupts.
 1775  */
 1776 static void rcu_prepare_for_idle(int cpu)
 1777 {
 1778         struct timer_list *tp;
 1779         struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
 1780         int tne;
 1781 
 1782         /* Handle nohz enablement switches conservatively. */
 1783         tne = ACCESS_ONCE(tick_nohz_enabled);
 1784         if (tne != rdtp->tick_nohz_enabled_snap) {
 1785                 if (rcu_cpu_has_callbacks(cpu))
 1786                         invoke_rcu_core(); /* force nohz to see update. */
 1787                 rdtp->tick_nohz_enabled_snap = tne;
 1788                 return;
 1789         }
 1790         if (!tne)
 1791                 return;
 1792 
 1793         /* Adaptive-tick mode, where usermode execution is idle to RCU. */
 1794         if (!is_idle_task(current)) {
 1795                 rdtp->dyntick_holdoff = jiffies - 1;
 1796                 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
 1797                         trace_rcu_prep_idle("User dyntick with callbacks");
 1798                         rdtp->idle_gp_timer_expires =
 1799                                 round_up(jiffies + RCU_IDLE_GP_DELAY,
 1800                                          RCU_IDLE_GP_DELAY);
 1801                 } else if (rcu_cpu_has_callbacks(cpu)) {
 1802                         rdtp->idle_gp_timer_expires =
 1803                                 round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
 1804                         trace_rcu_prep_idle("User dyntick with lazy callbacks");
 1805                 } else {
 1806                         return;
 1807                 }
 1808                 tp = &rdtp->idle_gp_timer;
 1809                 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
 1810                 return;
 1811         }
 1812 
 1813         /*
 1814          * If this is an idle re-entry, for example, due to use of
 1815          * RCU_NONIDLE() or the new idle-loop tracing API within the idle
 1816          * loop, then don't take any state-machine actions, unless the
 1817          * momentary exit from idle queued additional non-lazy callbacks.
 1818          * Instead, repost the ->idle_gp_timer if this CPU has callbacks
 1819          * pending.
 1820          */
 1821         if (!rdtp->idle_first_pass &&
 1822             (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) {
 1823                 if (rcu_cpu_has_callbacks(cpu)) {
 1824                         tp = &rdtp->idle_gp_timer;
 1825                         mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
 1826                 }
 1827                 return;
 1828         }
 1829         rdtp->idle_first_pass = 0;
 1830         rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1;
 1831 
 1832         /*
 1833          * If there are no callbacks on this CPU, enter dyntick-idle mode.
 1834          * Also reset state to avoid prejudicing later attempts.
 1835          */
 1836         if (!rcu_cpu_has_callbacks(cpu)) {
 1837                 rdtp->dyntick_holdoff = jiffies - 1;
 1838                 rdtp->dyntick_drain = 0;
 1839                 trace_rcu_prep_idle("No callbacks");
 1840                 return;
 1841         }
 1842 
 1843         /*
 1844          * If in holdoff mode, just return.  We will presumably have
 1845          * refrained from disabling the scheduling-clock tick.
 1846          */
 1847         if (rdtp->dyntick_holdoff == jiffies) {
 1848                 trace_rcu_prep_idle("In holdoff");
 1849                 return;
 1850         }
 1851 
 1852         /* Check and update the ->dyntick_drain sequencing. */
 1853         if (rdtp->dyntick_drain <= 0) {
 1854                 /* First time through, initialize the counter. */
 1855                 rdtp->dyntick_drain = RCU_IDLE_FLUSHES;
 1856         } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES &&
 1857                    !rcu_pending(cpu) &&
 1858                    !local_softirq_pending()) {
 1859                 /* Can we go dyntick-idle despite still having callbacks? */
 1860                 rdtp->dyntick_drain = 0;
 1861                 rdtp->dyntick_holdoff = jiffies;
 1862                 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
 1863                         trace_rcu_prep_idle("Dyntick with callbacks");
 1864                         rdtp->idle_gp_timer_expires =
 1865                                 round_up(jiffies + RCU_IDLE_GP_DELAY,
 1866                                          RCU_IDLE_GP_DELAY);
 1867                 } else {
 1868                         rdtp->idle_gp_timer_expires =
 1869                                 round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
 1870                         trace_rcu_prep_idle("Dyntick with lazy callbacks");
 1871                 }
 1872                 tp = &rdtp->idle_gp_timer;
 1873                 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
 1874                 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
 1875                 return; /* Nothing more to do immediately. */
 1876         } else if (--(rdtp->dyntick_drain) <= 0) {
 1877                 /* We have hit the limit, so time to give up. */
 1878                 rdtp->dyntick_holdoff = jiffies;
 1879                 trace_rcu_prep_idle("Begin holdoff");
 1880                 invoke_rcu_core();  /* Force the CPU out of dyntick-idle. */
 1881                 return;
 1882         }
 1883 
 1884         /*
 1885          * Do one step of pushing the remaining RCU callbacks through
 1886          * the RCU core state machine.
 1887          */
 1888 #ifdef CONFIG_TREE_PREEMPT_RCU
 1889         if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
 1890                 rcu_preempt_qs(cpu);
 1891                 force_quiescent_state(&rcu_preempt_state);
 1892         }
 1893 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
 1894         if (per_cpu(rcu_sched_data, cpu).nxtlist) {
 1895                 rcu_sched_qs(cpu);
 1896                 force_quiescent_state(&rcu_sched_state);
 1897         }
 1898         if (per_cpu(rcu_bh_data, cpu).nxtlist) {
 1899                 rcu_bh_qs(cpu);
 1900                 force_quiescent_state(&rcu_bh_state);
 1901         }
 1902 
 1903         /*
 1904          * If RCU callbacks are still pending, RCU still needs this CPU.
 1905          * So try forcing the callbacks through the grace period.
 1906          */
 1907         if (rcu_cpu_has_callbacks(cpu)) {
 1908                 trace_rcu_prep_idle("More callbacks");
 1909                 invoke_rcu_core();
 1910         } else {
 1911                 trace_rcu_prep_idle("Callbacks drained");
 1912         }
 1913 }
 1914 
 1915 /*
 1916  * Keep a running count of the number of non-lazy callbacks posted
 1917  * on this CPU.  This running counter (which is never decremented) allows
 1918  * rcu_prepare_for_idle() to detect when something out of the idle loop
 1919  * posts a callback, even if an equal number of callbacks are invoked.
 1920  * Of course, callbacks should only be posted from within a trace event
 1921  * designed to be called from idle or from within RCU_NONIDLE().
 1922  */
 1923 static void rcu_idle_count_callbacks_posted(void)
 1924 {
 1925         __this_cpu_add(rcu_dynticks.nonlazy_posted, 1);
 1926 }
 1927 
 1928 /*
 1929  * Data for flushing lazy RCU callbacks at OOM time.
 1930  */
 1931 static atomic_t oom_callback_count;
 1932 static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq);
 1933 
 1934 /*
 1935  * RCU OOM callback -- decrement the outstanding count and deliver the
 1936  * wake-up if we are the last one.
 1937  */
 1938 static void rcu_oom_callback(struct rcu_head *rhp)
 1939 {
 1940         if (atomic_dec_and_test(&oom_callback_count))
 1941                 wake_up(&oom_callback_wq);
 1942 }
 1943 
 1944 /*
 1945  * Post an rcu_oom_notify callback on the current CPU if it has at
 1946  * least one lazy callback.  This will unnecessarily post callbacks
 1947  * to CPUs that already have a non-lazy callback at the end of their
 1948  * callback list, but this is an infrequent operation, so accept some
 1949  * extra overhead to keep things simple.
 1950  */
 1951 static void rcu_oom_notify_cpu(void *unused)
 1952 {
 1953         struct rcu_state *rsp;
 1954         struct rcu_data *rdp;
 1955 
 1956         for_each_rcu_flavor(rsp) {
 1957                 rdp = __this_cpu_ptr(rsp->rda);
 1958                 if (rdp->qlen_lazy != 0) {
 1959                         atomic_inc(&oom_callback_count);
 1960                         rsp->call(&rdp->oom_head, rcu_oom_callback);
 1961                 }
 1962         }
 1963 }
 1964 
 1965 /*
 1966  * If low on memory, ensure that each CPU has a non-lazy callback.
 1967  * This will wake up CPUs that have only lazy callbacks, in turn
 1968  * ensuring that they free up the corresponding memory in a timely manner.
 1969  * Because an uncertain amount of memory will be freed in some uncertain
 1970  * timeframe, we do not claim to have freed anything.
 1971  */
 1972 static int rcu_oom_notify(struct notifier_block *self,
 1973                           unsigned long notused, void *nfreed)
 1974 {
 1975         int cpu;
 1976 
 1977         /* Wait for callbacks from earlier instance to complete. */
 1978         wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0);
 1979 
 1980         /*
 1981          * Prevent premature wakeup: ensure that all increments happen
 1982          * before there is a chance of the counter reaching zero.
 1983          */
 1984         atomic_set(&oom_callback_count, 1);
 1985 
 1986         get_online_cpus();
 1987         for_each_online_cpu(cpu) {
 1988                 smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
 1989                 cond_resched();
 1990         }
 1991         put_online_cpus();
 1992 
 1993         /* Unconditionally decrement: no need to wake ourselves up. */
 1994         atomic_dec(&oom_callback_count);
 1995 
 1996         return NOTIFY_OK;
 1997 }
 1998 
 1999 static struct notifier_block rcu_oom_nb = {
 2000         .notifier_call = rcu_oom_notify
 2001 };
 2002 
 2003 static int __init rcu_register_oom_notifier(void)
 2004 {
 2005         register_oom_notifier(&rcu_oom_nb);
 2006         return 0;
 2007 }
 2008 early_initcall(rcu_register_oom_notifier);
 2009 
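/*
 * A minimal sketch (hypothetical names, not part of this file) of the
 * OOM-notifier pattern used above: a subsystem registers a notifier_block
 * whose callback runs when the system is about to go out of memory and
 * may release, or schedule the release of, any memory it can spare.
 * Needs <linux/oom.h> and <linux/notifier.h>.
 */
static int demo_oom_notify(struct notifier_block *self,
			   unsigned long unused, void *nfreed)
{
	/* Kick off reclaim of whatever memory this subsystem can give back. */
	return NOTIFY_OK;
}

static struct notifier_block demo_oom_nb = {
	.notifier_call = demo_oom_notify,
};

static int __init demo_register_oom_notifier(void)
{
	register_oom_notifier(&demo_oom_nb);
	return 0;
}
early_initcall(demo_register_oom_notifier);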
 2010 #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
 2011 
 2012 #ifdef CONFIG_RCU_CPU_STALL_INFO
 2013 
 2014 #ifdef CONFIG_RCU_FAST_NO_HZ
 2015 
 2016 static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
 2017 {
 2018         struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
 2019         struct timer_list *tltp = &rdtp->idle_gp_timer;
 2020         char c;
 2021 
 2022         c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.';
 2023         if (timer_pending(tltp))
 2024                 sprintf(cp, "drain=%d %c timer=%lu",
 2025                         rdtp->dyntick_drain, c, tltp->expires - jiffies);
 2026         else
 2027                 sprintf(cp, "drain=%d %c timer not pending",
 2028                         rdtp->dyntick_drain, c);
 2029 }
 2030 
 2031 #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
 2032 
 2033 static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
 2034 {
 2035         *cp = '\0';
 2036 }
 2037 
 2038 #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
 2039 
 2040 /* Initiate the stall-info list. */
 2041 static void print_cpu_stall_info_begin(void)
 2042 {
 2043         printk(KERN_CONT "\n");
 2044 }
 2045 
 2046 /*
 2047  * Print out diagnostic information for the specified stalled CPU.
 2048  *
 2049  * If the specified CPU is aware of the current RCU grace period
 2050  * (flavor specified by rsp), then print the number of scheduling
 2051  * clock interrupts the CPU has taken during the time that it has
 2052  * been aware.  Otherwise, print the number of RCU grace periods
 2053  * that this CPU is ignorant of, for example, "1" if the CPU was
 2054  * aware of the previous grace period.
 2055  *
 2056  * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
 2057  */
 2058 static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
 2059 {
 2060         char fast_no_hz[72];
 2061         struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 2062         struct rcu_dynticks *rdtp = rdp->dynticks;
 2063         char *ticks_title;
 2064         unsigned long ticks_value;
 2065 
 2066         if (rsp->gpnum == rdp->gpnum) {
 2067                 ticks_title = "ticks this GP";
 2068                 ticks_value = rdp->ticks_this_gp;
 2069         } else {
 2070                 ticks_title = "GPs behind";
 2071                 ticks_value = rsp->gpnum - rdp->gpnum;
 2072         }
 2073         print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
 2074         printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n",
 2075                cpu, ticks_value, ticks_title,
 2076                atomic_read(&rdtp->dynticks) & 0xfff,
 2077                rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
 2078                fast_no_hz);
 2079 }
 2080 
 2081 /* Terminate the stall-info list. */
 2082 static void print_cpu_stall_info_end(void)
 2083 {
 2084         printk(KERN_ERR "\t");
 2085 }
 2086 
 2087 /* Zero ->ticks_this_gp for all flavors of RCU. */
 2088 static void zero_cpu_stall_ticks(struct rcu_data *rdp)
 2089 {
 2090         rdp->ticks_this_gp = 0;
 2091 }
 2092 
 2093 /* Increment ->ticks_this_gp for all flavors of RCU. */
 2094 static void increment_cpu_stall_ticks(void)
 2095 {
 2096         struct rcu_state *rsp;
 2097 
 2098         for_each_rcu_flavor(rsp)
 2099                 __this_cpu_ptr(rsp->rda)->ticks_this_gp++;
 2100 }
 2101 
 2102 #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
 2103 
 2104 static void print_cpu_stall_info_begin(void)
 2105 {
 2106         printk(KERN_CONT " {");
 2107 }
 2108 
 2109 static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
 2110 {
 2111         printk(KERN_CONT " %d", cpu);
 2112 }
 2113 
 2114 static void print_cpu_stall_info_end(void)
 2115 {
 2116         printk(KERN_CONT "} ");
 2117 }
 2118 
 2119 static void zero_cpu_stall_ticks(struct rcu_data *rdp)
 2120 {
 2121 }
 2122 
 2123 static void increment_cpu_stall_ticks(void)
 2124 {
 2125 }
 2126 
 2127 #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
 2128 
 2129 #ifdef CONFIG_RCU_NOCB_CPU
 2130 
 2131 /*
 2132  * Offload callback processing from the boot-time-specified set of CPUs
 2133  * specified by rcu_nocb_mask.  For each CPU in the set, there is a
 2134  * kthread created that pulls the callbacks from the corresponding CPU,
 2135  * waits for a grace period to elapse, and invokes the callbacks.
 2136  * The no-CBs CPUs do a wake_up() on their kthread when they insert
 2137  * a callback into any empty list, unless the rcu_nocb_poll boot parameter
 2138  * has been specified, in which case each kthread actively polls its
 2139  * CPU.  (Which isn't so great for energy efficiency, but which does
 2140  * reduce RCU's overhead on that CPU.)
 2141  *
 2142  * This is intended to be used in conjunction with Frederic Weisbecker's
 2143  * adaptive-idle work, which would seriously reduce OS jitter on CPUs
 2144  * running CPU-bound user-mode computations.
 2145  *
 2146  * Offloading of callback processing could also in theory be used as
 2147  * an energy-efficiency measure because CPUs with no RCU callbacks
 2148  * queued are more aggressive about entering dyntick-idle mode.
 2149  */
 2150 
 2151 
 2152 /* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */
 2153 static int __init rcu_nocb_setup(char *str)
 2154 {
 2155         alloc_bootmem_cpumask_var(&rcu_nocb_mask);
 2156         have_rcu_nocb_mask = true;
 2157         cpulist_parse(str, rcu_nocb_mask);
 2158         return 1;
 2159 }
 2160 __setup("rcu_nocbs=", rcu_nocb_setup);
 2161 
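/*
 * Usage example (illustrative): booting with "rcu_nocbs=1-3,8" offloads
 * callback processing for CPUs 1, 2, 3, and 8; cpulist_parse() accepts the
 * usual comma-separated list of CPU numbers and ranges.  Whether the
 * corresponding "rcuo" kthreads block on their wait queues or poll is
 * controlled separately by the rcu_nocb_poll parameter (see
 * rcu_nocb_kthread() below).
 */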
 2162 /* Is the specified CPU a no-CBs CPU? */
 2163 static bool is_nocb_cpu(int cpu)
 2164 {
 2165         if (have_rcu_nocb_mask)
 2166                 return cpumask_test_cpu(cpu, rcu_nocb_mask);
 2167         return false;
 2168 }
 2169 
 2170 /*
 2171  * Enqueue the specified string of rcu_head structures onto the specified
 2172  * CPU's no-CBs lists.  The CPU is specified by rdp, the head of the
 2173  * string by rhp, and the tail of the string by rhtp.  The non-lazy/lazy
 2174  * counts are supplied by rhcount and rhcount_lazy.
 2175  *
 2176  * If warranted, also wake up the kthread servicing this CPU's queues.
 2177  */
 2178 static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
 2179                                     struct rcu_head *rhp,
 2180                                     struct rcu_head **rhtp,
 2181                                     int rhcount, int rhcount_lazy)
 2182 {
 2183         int len;
 2184         struct rcu_head **old_rhpp;
 2185         struct task_struct *t;
 2186 
 2187         /* Enqueue the callback on the nocb list and update counts. */
 2188         old_rhpp = xchg(&rdp->nocb_tail, rhtp);
 2189         ACCESS_ONCE(*old_rhpp) = rhp;
 2190         atomic_long_add(rhcount, &rdp->nocb_q_count);
 2191         atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
 2192 
 2193         /* If we are not being polled and there is a kthread, awaken it ... */
 2194         t = ACCESS_ONCE(rdp->nocb_kthread);
 2195         if (rcu_nocb_poll || !t)
 2196                 return;
 2197         len = atomic_long_read(&rdp->nocb_q_count);
 2198         if (old_rhpp == &rdp->nocb_head) {
 2199                 wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */
 2200                 rdp->qlen_last_fqs_check = 0;
 2201         } else if (len > rdp->qlen_last_fqs_check + qhimark) {
 2202                 wake_up_process(t); /* ... or if many callbacks queued. */
 2203                 rdp->qlen_last_fqs_check = LONG_MAX / 2;
 2204         }
 2205         return;
 2206 }
 2207 
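/*
 * A minimal sketch (hypothetical names, not part of this file) of the
 * lockless enqueue pattern used by __call_rcu_nocb_enqueue() above: the
 * tail pointer always points at either the head pointer (empty queue,
 * initialised with head = NULL and tail = &head) or at the last element's
 * ->next field, so a producer claims the old tail slot with an atomic
 * xchg() and then publishes its element through it.  A consumer walking
 * the list must tolerate briefly seeing a NULL ->next on the last element
 * while a producer sits between the xchg() and the store, which is what
 * the "next == NULL" wait in rcu_nocb_kthread() below handles.
 */
struct demo_item {
	struct demo_item *next;
};

struct demo_queue {
	struct demo_item *head;
	struct demo_item **tail;	/* == &head when the queue is empty. */
};

static void demo_enqueue(struct demo_queue *q, struct demo_item *item)
{
	struct demo_item **old_tail;

	item->next = NULL;
	old_tail = xchg(&q->tail, &item->next);	/* Claim the old tail slot. */
	ACCESS_ONCE(*old_tail) = item;		/* Publish the new element. */
}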
 2208 /*
 2209  * This is a helper for __call_rcu(), which invokes this when the normal
 2210  * callback queue is inoperable.  If this is not a no-CBs CPU, this
 2211  * function returns failure back to __call_rcu(), which can complain
 2212  * appropriately.
 2213  *
 2214  * Otherwise, this function queues the callback where the corresponding
 2215  * "rcuo" kthread can find it.
 2216  */
 2217 static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
 2218                             bool lazy)
 2219 {
 2220 
 2221         if (!is_nocb_cpu(rdp->cpu))
 2222                 return 0;
 2223         __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy);
 2224         return 1;
 2225 }
 2226 
 2227 /*
 2228  * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is
 2229  * not a no-CBs CPU.
 2230  */
 2231 static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
 2232                                                      struct rcu_data *rdp)
 2233 {
 2234         long ql = rsp->qlen;
 2235         long qll = rsp->qlen_lazy;
 2236 
 2237         /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
 2238         if (!is_nocb_cpu(smp_processor_id()))
 2239                 return 0;
 2240         rsp->qlen = 0;
 2241         rsp->qlen_lazy = 0;
 2242 
 2243         /* First, enqueue the donelist, if any.  This preserves CB ordering. */
 2244         if (rsp->orphan_donelist != NULL) {
 2245                 __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist,
 2246                                         rsp->orphan_donetail, ql, qll);
 2247                 ql = qll = 0;
 2248                 rsp->orphan_donelist = NULL;
 2249                 rsp->orphan_donetail = &rsp->orphan_donelist;
 2250         }
 2251         if (rsp->orphan_nxtlist != NULL) {
 2252                 __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist,
 2253                                         rsp->orphan_nxttail, ql, qll);
 2254                 ql = qll = 0;
 2255                 rsp->orphan_nxtlist = NULL;
 2256                 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
 2257         }
 2258         return 1;
 2259 }
 2260 
 2261 /*
 2262  * There must be at least one non-no-CBs CPU in operation at any given
 2263  * time, because no-CBs CPUs are not capable of initiating grace periods
 2264  * independently.  This function therefore complains if the specified
 2265  * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to
 2266  * avoid offlining the last such CPU.  (Recursion is a wonderful thing,
 2267  * but you have to have a base case!)
 2268  */
 2269 static bool nocb_cpu_expendable(int cpu)
 2270 {
 2271         cpumask_var_t non_nocb_cpus;
 2272         int ret;
 2273 
 2274         /*
 2275          * If there are no no-CB CPUs or if this CPU is not a no-CB CPU,
 2276          * then offlining this CPU is harmless.  Let it happen.
 2277          */
 2278         if (!have_rcu_nocb_mask || is_nocb_cpu(cpu))
 2279                 return 1;
 2280 
 2281         /* If no memory, play it safe and keep the CPU around. */
 2282         if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO))
 2283                 return 0;
 2284         cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask);
 2285         cpumask_clear_cpu(cpu, non_nocb_cpus);
 2286         ret = !cpumask_empty(non_nocb_cpus);
 2287         free_cpumask_var(non_nocb_cpus);
 2288         return ret;
 2289 }
 2290 
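/*
 * Worked example for the mask logic above (illustrative values): with
 * CPUs 0-3 online and rcu_nocb_mask={2,3}, non_nocb_cpus is {0,1}; asking
 * about CPU 0 clears it from the mask, leaving {1}, so CPU 0 is
 * expendable.  With rcu_nocb_mask={1,2,3} instead, non_nocb_cpus is {0},
 * clearing CPU 0 empties the mask, and the function reports that CPU 0,
 * the last CPU able to initiate grace periods, must not be offlined.
 */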
 2291 /*
 2292  * Helper structure for remote registry of RCU callbacks.
 2293  * This is needed for when a no-CBs CPU needs to start a grace period.
 2294  * If it just invokes call_rcu(), the resulting callback will be queued,
 2295  * which can result in deadlock.
 2296  */
 2297 struct rcu_head_remote {
 2298         struct rcu_head *rhp;
 2299         call_rcu_func_t *crf;
 2300         void (*func)(struct rcu_head *rhp);
 2301 };
 2302 
 2303 /*
 2304  * Register a callback as specified by the rcu_head_remote struct.
 2305  * This function is intended to be invoked via smp_call_function_single().
 2306  */
 2307 static void call_rcu_local(void *arg)
 2308 {
 2309         struct rcu_head_remote *rhrp =
 2310                 container_of(arg, struct rcu_head_remote, rhp);
 2311 
 2312         rhrp->crf(rhrp->rhp, rhrp->func);
 2313 }
 2314 
 2315 /*
 2316  * Set up an rcu_head_remote structure and then invoke call_rcu_local()
 2317  * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via
 2318  * smp_call_function_single().
 2319  */
 2320 static void invoke_crf_remote(struct rcu_head *rhp,
 2321                               void (*func)(struct rcu_head *rhp),
 2322                               call_rcu_func_t crf)
 2323 {
 2324         struct rcu_head_remote rhr;
 2325 
 2326         rhr.rhp = rhp;
 2327         rhr.crf = crf;
 2328         rhr.func = func;
 2329         smp_call_function_single(0, call_rcu_local, &rhr, 1);
 2330 }
 2331 
 2332 /*
 2333  * Helper functions to be passed to wait_rcu_gp(), each of which
 2334  * invokes invoke_crf_remote() to register a callback appropriately.
 2335  */
 2336 static void __maybe_unused
 2337 call_rcu_preempt_remote(struct rcu_head *rhp,
 2338                         void (*func)(struct rcu_head *rhp))
 2339 {
 2340         invoke_crf_remote(rhp, func, call_rcu);
 2341 }
 2342 static void call_rcu_bh_remote(struct rcu_head *rhp,
 2343                                void (*func)(struct rcu_head *rhp))
 2344 {
 2345         invoke_crf_remote(rhp, func, call_rcu_bh);
 2346 }
 2347 static void call_rcu_sched_remote(struct rcu_head *rhp,
 2348                                   void (*func)(struct rcu_head *rhp))
 2349 {
 2350         invoke_crf_remote(rhp, func, call_rcu_sched);
 2351 }
 2352 
 2353 /*
 2354  * Per-rcu_data kthread, but only for no-CBs CPUs.  Each kthread invokes
 2355  * callbacks queued by the corresponding no-CBs CPU.
 2356  */
 2357 static int rcu_nocb_kthread(void *arg)
 2358 {
 2359         int c, cl;
 2360         struct rcu_head *list;
 2361         struct rcu_head *next;
 2362         struct rcu_head **tail;
 2363         struct rcu_data *rdp = arg;
 2364 
 2365         /* Each pass through this loop invokes one batch of callbacks */
 2366         for (;;) {
 2367                 /* If not polling, wait for next batch of callbacks. */
 2368                 if (!rcu_nocb_poll)
 2369                         wait_event(rdp->nocb_wq, rdp->nocb_head);
 2370                 list = ACCESS_ONCE(rdp->nocb_head);
 2371                 if (!list) {
 2372                         schedule_timeout_interruptible(1);
 2373                         continue;
 2374                 }
 2375 
 2376                 /*
 2377                  * Extract queued callbacks, update counts, and wait
 2378                  * for a grace period to elapse.
 2379                  */
 2380                 ACCESS_ONCE(rdp->nocb_head) = NULL;
 2381                 tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
 2382                 c = atomic_long_xchg(&rdp->nocb_q_count, 0);
 2383                 cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
 2384                 ACCESS_ONCE(rdp->nocb_p_count) += c;
 2385                 ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
 2386                 wait_rcu_gp(rdp->rsp->call_remote);
 2387 
 2388                 /* Each pass through the following loop invokes a callback. */
 2389                 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
 2390                 c = cl = 0;
 2391                 while (list) {
 2392                         next = list->next;
 2393                         /* Wait for enqueuing to complete, if needed. */
 2394                         while (next == NULL && &list->next != tail) {
 2395                                 schedule_timeout_interruptible(1);
 2396                                 next = list->next;
 2397                         }
 2398                         debug_rcu_head_unqueue(list);
 2399                         local_bh_disable();
 2400                         if (__rcu_reclaim(rdp->rsp->name, list))
 2401                                 cl++;
 2402                         c++;
 2403                         local_bh_enable();
 2404                         list = next;
 2405                 }
 2406                 trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
 2407                 ACCESS_ONCE(rdp->nocb_p_count) -= c;
 2408                 ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl;
 2409                 rdp->n_nocbs_invoked += c;
 2410         }
 2411         return 0;
 2412 }
 2413 
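/*
 * A minimal sketch (hypothetical names, not part of this file) of the
 * waitqueue handshake between __call_rcu_nocb_enqueue() (the producer,
 * which does wake_up()) and rcu_nocb_kthread() (the consumer, which does
 * wait_event()): the consumer sleeps until the tested condition becomes
 * true, and the producer makes the condition true before waking it.
 * Needs <linux/wait.h>.
 */
static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static int demo_work_available;

static void demo_producer(void)
{
	ACCESS_ONCE(demo_work_available) = 1;	/* Make the condition true... */
	wake_up(&demo_wq);			/* ...then wake the consumer. */
}

static void demo_consumer(void)
{
	wait_event(demo_wq, ACCESS_ONCE(demo_work_available));
	ACCESS_ONCE(demo_work_available) = 0;
	/* ... process the work ... */
}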
 2414 /* Initialize per-rcu_data variables for no-CBs CPUs. */
 2415 static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
 2416 {
 2417         rdp->nocb_tail = &rdp->nocb_head;
 2418         init_waitqueue_head(&rdp->nocb_wq);
 2419 }
 2420 
 2421 /* Create a kthread for each RCU flavor for each no-CBs CPU. */
 2422 static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
 2423 {
 2424         int cpu;
 2425         struct rcu_data *rdp;
 2426         struct task_struct *t;
 2427 
 2428         if (rcu_nocb_mask == NULL)
 2429                 return;
 2430         for_each_cpu(cpu, rcu_nocb_mask) {
 2431                 rdp = per_cpu_ptr(rsp->rda, cpu);
 2432                 t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu);
 2433                 BUG_ON(IS_ERR(t));
 2434                 ACCESS_ONCE(rdp->nocb_kthread) = t;
 2435         }
 2436 }
 2437 
 2438 /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
 2439 static void init_nocb_callback_list(struct rcu_data *rdp)
 2440 {
 2441         if (rcu_nocb_mask == NULL ||
 2442             !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
 2443                 return;
 2444         rdp->nxttail[RCU_NEXT_TAIL] = NULL;
 2445 }
 2446 
 2447 /* Initialize the ->call_remote fields in the rcu_state structures. */
 2448 static void __init rcu_init_nocb(void)
 2449 {
 2450 #ifdef CONFIG_PREEMPT_RCU
 2451         rcu_preempt_state.call_remote = call_rcu_preempt_remote;
 2452 #endif /* #ifdef CONFIG_PREEMPT_RCU */
 2453         rcu_bh_state.call_remote = call_rcu_bh_remote;
 2454         rcu_sched_state.call_remote = call_rcu_sched_remote;
 2455 }
 2456 
 2457 #else /* #ifdef CONFIG_RCU_NOCB_CPU */
 2458 
 2459 static bool is_nocb_cpu(int cpu)
 2460 {
 2461         return false;
 2462 }
 2463 
 2464 static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
 2465                             bool lazy)
 2466 {
 2467         return 0;
 2468 }
 2469 
 2470 static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
 2471                                                      struct rcu_data *rdp)
 2472 {
 2473         return 0;
 2474 }
 2475 
 2476 static bool nocb_cpu_expendable(int cpu)
 2477 {
 2478         return 1;
 2479 }
 2480 
 2481 static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
 2482 {
 2483 }
 2484 
 2485 static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
 2486 {
 2487 }
 2488 
 2489 static void init_nocb_callback_list(struct rcu_data *rdp)
 2490 {
 2491 }
 2492 
 2493 static void __init rcu_init_nocb(void)
 2494 {
 2495 }
 2496 
 2497 #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
