LCOV - code coverage report
Current view:  top level - kernel/rcu - tree.c (source / functions)
Test:          landlock.info
Date:          2021-04-22 12:43:58
Coverage:      Lines:     1282 / 1779  (72.1 %)
               Functions:   82 /  117  (70.1 %)

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0+
       2             : /*
       3             :  * Read-Copy Update mechanism for mutual exclusion (tree-based version)
       4             :  *
       5             :  * Copyright IBM Corporation, 2008
       6             :  *
       7             :  * Authors: Dipankar Sarma <dipankar@in.ibm.com>
       8             :  *          Manfred Spraul <manfred@colorfullife.com>
       9             :  *          Paul E. McKenney <paulmck@linux.ibm.com>
      10             :  *
      11             :  * Based on the original work by Paul McKenney <paulmck@linux.ibm.com>
      12             :  * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
      13             :  *
      14             :  * For detailed explanation of Read-Copy Update mechanism see -
      15             :  *      Documentation/RCU
      16             :  */
      17             : 
      18             : #define pr_fmt(fmt) "rcu: " fmt
      19             : 
      20             : #include <linux/types.h>
      21             : #include <linux/kernel.h>
      22             : #include <linux/init.h>
      23             : #include <linux/spinlock.h>
      24             : #include <linux/smp.h>
      25             : #include <linux/rcupdate_wait.h>
      26             : #include <linux/interrupt.h>
      27             : #include <linux/sched.h>
      28             : #include <linux/sched/debug.h>
      29             : #include <linux/nmi.h>
      30             : #include <linux/atomic.h>
      31             : #include <linux/bitops.h>
      32             : #include <linux/export.h>
      33             : #include <linux/completion.h>
      34             : #include <linux/moduleparam.h>
      35             : #include <linux/percpu.h>
      36             : #include <linux/notifier.h>
      37             : #include <linux/cpu.h>
      38             : #include <linux/mutex.h>
      39             : #include <linux/time.h>
      40             : #include <linux/kernel_stat.h>
      41             : #include <linux/wait.h>
      42             : #include <linux/kthread.h>
      43             : #include <uapi/linux/sched/types.h>
      44             : #include <linux/prefetch.h>
      45             : #include <linux/delay.h>
      46             : #include <linux/random.h>
      47             : #include <linux/trace_events.h>
      48             : #include <linux/suspend.h>
      49             : #include <linux/ftrace.h>
      50             : #include <linux/tick.h>
      51             : #include <linux/sysrq.h>
      52             : #include <linux/kprobes.h>
      53             : #include <linux/gfp.h>
      54             : #include <linux/oom.h>
      55             : #include <linux/smpboot.h>
      56             : #include <linux/jiffies.h>
      57             : #include <linux/slab.h>
      58             : #include <linux/sched/isolation.h>
      59             : #include <linux/sched/clock.h>
      60             : #include <linux/vmalloc.h>
      61             : #include <linux/mm.h>
      62             : #include <linux/kasan.h>
      63             : #include "../time/tick-internal.h"
      64             : 
      65             : #include "tree.h"
      66             : #include "rcu.h"
      67             : 
      68             : #ifdef MODULE_PARAM_PREFIX
      69             : #undef MODULE_PARAM_PREFIX
      70             : #endif
      71             : #define MODULE_PARAM_PREFIX "rcutree."
      72             : 
      73             : /* Data structures. */
      74             : 
      75             : /*
      76             :  * Steal a bit from the bottom of ->dynticks for idle entry/exit
      77             :  * control.  Initially this is for TLB flushing.
      78             :  */
      79             : #define RCU_DYNTICK_CTRL_MASK 0x1
      80             : #define RCU_DYNTICK_CTRL_CTR  (RCU_DYNTICK_CTRL_MASK + 1)
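
As a concrete illustration of the bit-stealing layout above, here is a minimal userspace C sketch (not kernel code; the model_* names are hypothetical). The low bit is reserved for a pending special action, every idle entry or exit adds RCU_DYNTICK_CTRL_CTR so the stolen bit is never disturbed, and "RCU is watching" corresponds to the CTR bit being set:

/* Userspace model of the ->dynticks layout: low bit stolen, counting above it. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define CTRL_MASK 0x1              /* stolen low bit: pending special action    */
#define CTRL_CTR  (CTRL_MASK + 1)  /* EQS transitions add this, sparing the bit */

static atomic_int model_dynticks = CTRL_CTR;   /* boot state: RCU is watching */

static bool model_in_eqs(void)                 /* cf. rcu_dynticks_curr_cpu_in_eqs() */
{
        return !(atomic_load(&model_dynticks) & CTRL_CTR);
}

static void model_eqs_enter(void)              /* idle entry clears the CTR bit */
{
        atomic_fetch_add(&model_dynticks, CTRL_CTR);
}

static void model_eqs_exit(void)               /* idle exit sets the CTR bit again */
{
        atomic_fetch_add(&model_dynticks, CTRL_CTR);
}

int main(void)
{
        printf("watching=%d\n", !model_in_eqs());  /* 1 */
        model_eqs_enter();
        printf("watching=%d\n", !model_in_eqs());  /* 0: extended quiescent state */
        model_eqs_exit();
        printf("watching=%d\n", !model_in_eqs());  /* 1 */
        return 0;
}

The same parity rule is what rcu_dynticks_curr_cpu_in_eqs() checks further down in this file.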
      81             : 
      82             : static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
      83             :         .dynticks_nesting = 1,
      84             :         .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
      85             :         .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
      86             : #ifdef CONFIG_RCU_NOCB_CPU
      87             :         .cblist.flags = SEGCBLIST_SOFTIRQ_ONLY,
      88             : #endif
      89             : };
      90             : static struct rcu_state rcu_state = {
      91             :         .level = { &rcu_state.node[0] },
      92             :         .gp_state = RCU_GP_IDLE,
      93             :         .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT,
      94             :         .barrier_mutex = __MUTEX_INITIALIZER(rcu_state.barrier_mutex),
      95             :         .name = RCU_NAME,
      96             :         .abbr = RCU_ABBR,
      97             :         .exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex),
      98             :         .exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex),
      99             :         .ofl_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.ofl_lock),
     100             : };
     101             : 
     102             : /* Dump rcu_node combining tree at boot to verify correct setup. */
     103             : static bool dump_tree;
     104             : module_param(dump_tree, bool, 0444);
     105             : /* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */
     106             : static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT);
     107             : #ifndef CONFIG_PREEMPT_RT
     108             : module_param(use_softirq, bool, 0444);
     109             : #endif
     110             : /* Control rcu_node-tree auto-balancing at boot time. */
     111             : static bool rcu_fanout_exact;
     112             : module_param(rcu_fanout_exact, bool, 0444);
     113             : /* Increase (but not decrease) the RCU_FANOUT_LEAF at boot time. */
     114             : static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
     115             : module_param(rcu_fanout_leaf, int, 0444);
     116             : int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
     117             : /* Number of rcu_nodes at specified level. */
     118             : int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
     119             : int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
     120             : 
     121             : /*
     122             :  * The rcu_scheduler_active variable is initialized to the value
      123             :  * RCU_SCHEDULER_INACTIVE and transitions to RCU_SCHEDULER_INIT just before the
     124             :  * first task is spawned.  So when this variable is RCU_SCHEDULER_INACTIVE,
     125             :  * RCU can assume that there is but one task, allowing RCU to (for example)
     126             :  * optimize synchronize_rcu() to a simple barrier().  When this variable
     127             :  * is RCU_SCHEDULER_INIT, RCU must actually do all the hard work required
     128             :  * to detect real grace periods.  This variable is also used to suppress
     129             :  * boot-time false positives from lockdep-RCU error checking.  Finally, it
     130             :  * transitions from RCU_SCHEDULER_INIT to RCU_SCHEDULER_RUNNING after RCU
     131             :  * is fully initialized, including all of its kthreads having been spawned.
     132             :  */
     133             : int rcu_scheduler_active __read_mostly;
     134             : EXPORT_SYMBOL_GPL(rcu_scheduler_active);
     135             : 
     136             : /*
     137             :  * The rcu_scheduler_fully_active variable transitions from zero to one
     138             :  * during the early_initcall() processing, which is after the scheduler
     139             :  * is capable of creating new tasks.  So RCU processing (for example,
     140             :  * creating tasks for RCU priority boosting) must be delayed until after
     141             :  * rcu_scheduler_fully_active transitions from zero to one.  We also
     142             :  * currently delay invocation of any RCU callbacks until after this point.
     143             :  *
     144             :  * It might later prove better for people registering RCU callbacks during
     145             :  * early boot to take responsibility for these callbacks, but one step at
     146             :  * a time.
     147             :  */
     148             : static int rcu_scheduler_fully_active __read_mostly;
     149             : 
     150             : static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
     151             :                               unsigned long gps, unsigned long flags);
     152             : static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
     153             : static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
     154             : static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
     155             : static void invoke_rcu_core(void);
     156             : static void rcu_report_exp_rdp(struct rcu_data *rdp);
     157             : static void sync_sched_exp_online_cleanup(int cpu);
     158             : static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
     159             : 
     160             : /* rcuc/rcub kthread realtime priority */
     161             : static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
     162             : module_param(kthread_prio, int, 0444);
     163             : 
     164             : /* Delay in jiffies for grace-period initialization delays, debug only. */
     165             : 
     166             : static int gp_preinit_delay;
     167             : module_param(gp_preinit_delay, int, 0444);
     168             : static int gp_init_delay;
     169             : module_param(gp_init_delay, int, 0444);
     170             : static int gp_cleanup_delay;
     171             : module_param(gp_cleanup_delay, int, 0444);
     172             : 
     173             : // Add delay to rcu_read_unlock() for strict grace periods.
     174             : static int rcu_unlock_delay;
     175             : #ifdef CONFIG_RCU_STRICT_GRACE_PERIOD
     176             : module_param(rcu_unlock_delay, int, 0444);
     177             : #endif
     178             : 
     179             : /*
     180             :  * This rcu parameter is runtime-read-only. It reflects
     181             :  * a minimum allowed number of objects which can be cached
     182             :  * per-CPU. Object size is equal to one page. This value
     183             :  * can be changed at boot time.
     184             :  */
     185             : static int rcu_min_cached_objs = 5;
     186             : module_param(rcu_min_cached_objs, int, 0444);
     187             : 
     188             : /* Retrieve RCU kthreads priority for rcutorture */
     189           0 : int rcu_get_gp_kthreads_prio(void)
     190             : {
     191           0 :         return kthread_prio;
     192             : }
     193             : EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio);
     194             : 
     195             : /*
     196             :  * Number of grace periods between delays, normalized by the duration of
     197             :  * the delay.  The longer the delay, the more the grace periods between
      198             :  * the delay.  The longer the delay, the more grace periods between
     199             :  * for non-zero delays, the overall slowdown of grace periods is constant
     200             :  * regardless of the duration of the delay.  This arrangement balances
     201             :  * the need for long delays to increase some race probabilities with the
     202             :  * need for fast grace periods to increase other race probabilities.
     203             :  */
     204             : #define PER_RCU_NODE_PERIOD 3   /* Number of grace periods between delays. */
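
To see why this normalization keeps the slowdown constant, note that a delay of d jiffies taken once every PER_RCU_NODE_PERIOD * d grace periods costs d / (PER_RCU_NODE_PERIOD * d) = 1/PER_RCU_NODE_PERIOD jiffies per grace period, independent of d. Below is a hedged userspace sketch of that gating arithmetic, simplified from the per-node modulus used later in this file; maybe_delay() is a hypothetical name:

/* Illustrative: delay once every (PERIOD * delay) grace periods => constant overhead. */
#include <stdio.h>

#define PER_RCU_NODE_PERIOD 3

/* Returns nonzero when this grace period should be artificially delayed. */
static int maybe_delay(unsigned long gp_ctr, unsigned long delay_jiffies)
{
        if (!delay_jiffies)
                return 0;
        return gp_ctr % (PER_RCU_NODE_PERIOD * delay_jiffies) == 0;
}

int main(void)
{
        unsigned long delay = 5, hits = 0, gps = 3000;

        for (unsigned long gp = 1; gp <= gps; gp++)
                hits += maybe_delay(gp, delay);
        /* Total added delay: hits * delay ~= gps / PER_RCU_NODE_PERIOD jiffies,
         * independent of the chosen per-hit delay. */
        printf("%lu delays of %lu jiffies over %lu GPs => %lu jiffies total\n",
               hits, delay, gps, hits * delay);
        return 0;
}

Running it with delay = 5 or delay = 10 yields the same 1000-jiffy total over 3000 grace periods.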
     205             : 
     206             : /*
     207             :  * Compute the mask of online CPUs for the specified rcu_node structure.
     208             :  * This will not be stable unless the rcu_node structure's ->lock is
     209             :  * held, but the bit corresponding to the current CPU will be stable
     210             :  * in most contexts.
     211             :  */
     212    34985144 : static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
     213             : {
     214    34985144 :         return READ_ONCE(rnp->qsmaskinitnext);
     215             : }
     216             : 
     217             : /*
     218             :  * Return true if an RCU grace period is in progress.  The READ_ONCE()s
     219             :  * permit this function to be invoked without holding the root rcu_node
     220             :  * structure's ->lock, but of course results can be subject to change.
     221             :  */
     222      168502 : static int rcu_gp_in_progress(void)
     223             : {
     224       82037 :         return rcu_seq_state(rcu_seq_current(&rcu_state.gp_seq));
     225             : }
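
rcu_seq_state() and rcu_seq_current() come from kernel/rcu/rcu.h; assuming the usual layout there (RCU_SEQ_CTR_SHIFT of 2), the low two bits of gp_seq carry the grace-period phase and the remaining bits count completed grace periods, so a nonzero phase means a grace period is in flight. A minimal userspace model of that encoding, reusing the near-wraparound initializer from rcu_state above:

/* Illustrative model of the gp_seq encoding: phase in the low bits, count above. */
#include <stdio.h>

#define SEQ_CTR_SHIFT  2                            /* assumed, per kernel/rcu/rcu.h */
#define SEQ_STATE_MASK ((1UL << SEQ_CTR_SHIFT) - 1)

static unsigned long seq_state(unsigned long s) { return s & SEQ_STATE_MASK; }
static unsigned long seq_ctr(unsigned long s)   { return s >> SEQ_CTR_SHIFT; }

int main(void)
{
        /* Same initializer as rcu_state.gp_seq above: counter starts near wrap. */
        unsigned long init = (0UL - 300UL) << SEQ_CTR_SHIFT;
        unsigned long gp_seq = init;

        printf("state=%lu -> no GP in progress\n", seq_state(gp_seq));
        gp_seq++;                                   /* GP start: phase becomes nonzero */
        printf("state=%lu -> GP in progress\n", seq_state(gp_seq));
        gp_seq = (gp_seq | SEQ_STATE_MASK) + 1;     /* GP end: phase clears, counter bumps */
        printf("state=%lu, ctr advanced by %lu\n", seq_state(gp_seq),
               seq_ctr(gp_seq) - seq_ctr(init));
        return 0;
}

Starting the counter only 300 steps short of wraparound means the wrap-handling paths get exercised soon after boot.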
     226             : 
     227             : /*
     228             :  * Return the number of callbacks queued on the specified CPU.
     229             :  * Handles both the nocbs and normal cases.
     230             :  */
     231           0 : static long rcu_get_n_cbs_cpu(int cpu)
     232             : {
     233           0 :         struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
     234             : 
     235           0 :         if (rcu_segcblist_is_enabled(&rdp->cblist))
     236           0 :                 return rcu_segcblist_n_cbs(&rdp->cblist);
     237             :         return 0;
     238             : }
     239             : 
     240       33078 : void rcu_softirq_qs(void)
     241             : {
     242       33078 :         rcu_qs();
     243       33077 :         rcu_preempt_deferred_qs(current);
     244       33077 : }
     245             : 
     246             : /*
     247             :  * Record entry into an extended quiescent state.  This is only to be
     248             :  * called when not already in an extended quiescent state, that is,
     249             :  * RCU is watching prior to the call to this function and is no longer
     250             :  * watching upon return.
     251             :  */
     252       34960 : static noinstr void rcu_dynticks_eqs_enter(void)
     253             : {
     254       34960 :         struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
     255       34966 :         int seq;
     256             : 
     257             :         /*
     258             :          * CPUs seeing atomic_add_return() must see prior RCU read-side
     259             :          * critical sections, and we also must force ordering with the
     260             :          * next idle sojourn.
     261             :          */
     262       34966 :         rcu_dynticks_task_trace_enter();  // Before ->dynticks update!
     263       34966 :         seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
     264             :         // RCU is no longer watching.  Better be in extended quiescent state!
     265       35017 :         WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
     266             :                      (seq & RCU_DYNTICK_CTRL_CTR));
     267             :         /* Better not have special action (TLB flush) pending! */
     268       35017 :         WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
     269             :                      (seq & RCU_DYNTICK_CTRL_MASK));
     270       35017 : }
     271             : 
     272             : /*
     273             :  * Record exit from an extended quiescent state.  This is only to be
     274             :  * called from an extended quiescent state, that is, RCU is not watching
     275             :  * prior to the call to this function and is watching upon return.
     276             :  */
     277       34256 : static noinstr void rcu_dynticks_eqs_exit(void)
     278             : {
     279       34256 :         struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
     280       34544 :         int seq;
     281             : 
     282             :         /*
     283             :          * CPUs seeing atomic_add_return() must see prior idle sojourns,
     284             :          * and we also must force ordering with the next RCU read-side
     285             :          * critical section.
     286             :          */
     287       34544 :         seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
     288             :         // RCU is now watching.  Better not be in an extended quiescent state!
     289       34957 :         rcu_dynticks_task_trace_exit();  // After ->dynticks update!
     290       34957 :         WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
     291             :                      !(seq & RCU_DYNTICK_CTRL_CTR));
     292       34957 :         if (seq & RCU_DYNTICK_CTRL_MASK) {
     293           0 :                 arch_atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks);
     294       34957 :                 smp_mb__after_atomic(); /* _exit after clearing mask. */
     295             :         }
     296       34957 : }
     297             : 
     298             : /*
     299             :  * Reset the current CPU's ->dynticks counter to indicate that the
     300             :  * newly onlined CPU is no longer in an extended quiescent state.
     301             :  * This will either leave the counter unchanged, or increment it
     302             :  * to the next non-quiescent value.
     303             :  *
     304             :  * The non-atomic test/increment sequence works because the upper bits
     305             :  * of the ->dynticks counter are manipulated only by the corresponding CPU,
     306             :  * or when the corresponding CPU is offline.
     307             :  */
     308           4 : static void rcu_dynticks_eqs_online(void)
     309             : {
     310           4 :         struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
     311             : 
     312           4 :         if (atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR)
     313             :                 return;
     314           0 :         atomic_add(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
     315             : }
     316             : 
     317             : /*
     318             :  * Is the current CPU in an extended quiescent state?
     319             :  *
     320             :  * No ordering, as we are sampling CPU-local information.
     321             :  */
     322    40582375 : static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void)
     323             : {
     324           0 :         struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
     325             : 
     326    40624511 :         return !(arch_atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR);
     327             : }
     328             : 
     329             : /*
     330             :  * Snapshot the ->dynticks counter with full ordering so as to allow
     331             :  * stable comparison of this counter with past and future snapshots.
     332             :  */
     333        4174 : static int rcu_dynticks_snap(struct rcu_data *rdp)
     334             : {
     335        4174 :         int snap = atomic_add_return(0, &rdp->dynticks);
     336             : 
     337        4174 :         return snap & ~RCU_DYNTICK_CTRL_MASK;
     338             : }
     339             : 
     340             : /*
     341             :  * Return true if the snapshot returned from rcu_dynticks_snap()
     342             :  * indicates that RCU is in an extended quiescent state.
     343             :  */
     344        2866 : static bool rcu_dynticks_in_eqs(int snap)
     345             : {
     346         483 :         return !(snap & RCU_DYNTICK_CTRL_CTR);
     347             : }
     348             : 
     349             : /* Return true if the specified CPU is currently idle from an RCU viewpoint.  */
     350           0 : bool rcu_is_idle_cpu(int cpu)
     351             : {
     352           0 :         struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
     353             : 
     354           0 :         return rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp));
     355             : }
     356             : 
     357             : /*
     358             :  * Return true if the CPU corresponding to the specified rcu_data
     359             :  * structure has spent some time in an extended quiescent state since
     360             :  * rcu_dynticks_snap() returned the specified snapshot.
     361             :  */
     362        1308 : static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap)
     363             : {
     364         303 :         return snap != rcu_dynticks_snap(rdp);
     365             : }
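
The snapshot/compare pattern above can be modelled in a few lines of userspace C11 (illustrative only): because rcu_dynticks_snap() masks off the stolen low bit, any difference between two snapshots means the counter moved by at least one RCU_DYNTICK_CTRL_CTR step, that is, the CPU entered or left an extended quiescent state in between.

/* Illustrative: a full-ordering snapshot of the counter detects idle sojourns. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define CTRL_MASK 0x1
#define CTRL_CTR  (CTRL_MASK + 1)

static atomic_int dynticks_model = CTRL_CTR;        /* "watching" */

static int snap(void)                               /* cf. rcu_dynticks_snap() */
{
        return atomic_fetch_add(&dynticks_model, 0) & ~CTRL_MASK;
}

static bool in_eqs_since(int s)                     /* cf. rcu_dynticks_in_eqs_since() */
{
        return snap() != s;
}

int main(void)
{
        int s = snap();

        atomic_fetch_add(&dynticks_model, CTRL_CTR);    /* the CPU enters idle ... */
        atomic_fetch_add(&dynticks_model, CTRL_CTR);    /* ... and leaves it again */
        printf("idle sojourn observed: %d\n", in_eqs_since(s));  /* 1 */
        return 0;
}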
     366             : 
     367             : /*
     368             :  * Return true if the referenced integer is zero while the specified
     369             :  * CPU remains within a single extended quiescent state.
     370             :  */
     371           0 : bool rcu_dynticks_zero_in_eqs(int cpu, int *vp)
     372             : {
     373           0 :         struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
     374           0 :         int snap;
     375             : 
     376             :         // If not quiescent, force back to earlier extended quiescent state.
     377           0 :         snap = atomic_read(&rdp->dynticks) & ~(RCU_DYNTICK_CTRL_MASK |
     378             :                                                RCU_DYNTICK_CTRL_CTR);
     379             : 
     380           0 :         smp_rmb(); // Order ->dynticks and *vp reads.
     381           0 :         if (READ_ONCE(*vp))
     382             :                 return false;  // Non-zero, so report failure;
     383           0 :         smp_rmb(); // Order *vp read and ->dynticks re-read.
     384             : 
     385             :         // If still in the same extended quiescent state, we are good!
     386           0 :         return snap == (atomic_read(&rdp->dynticks) & ~RCU_DYNTICK_CTRL_MASK);
     387             : }
     388             : 
     389             : /*
     390             :  * Set the special (bottom) bit of the specified CPU so that it
     391             :  * will take special action (such as flushing its TLB) on the
     392             :  * next exit from an extended quiescent state.  Returns true if
     393             :  * the bit was successfully set, or false if the CPU was not in
     394             :  * an extended quiescent state.
     395             :  */
     396           0 : bool rcu_eqs_special_set(int cpu)
     397             : {
     398           0 :         int old;
     399           0 :         int new;
     400           0 :         int new_old;
     401           0 :         struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
     402             : 
     403           0 :         new_old = atomic_read(&rdp->dynticks);
     404           0 :         do {
     405           0 :                 old = new_old;
     406           0 :                 if (old & RCU_DYNTICK_CTRL_CTR)
     407             :                         return false;
     408           0 :                 new = old | RCU_DYNTICK_CTRL_MASK;
     409           0 :                 new_old = atomic_cmpxchg(&rdp->dynticks, old, new);
     410           0 :         } while (new_old != old);
     411             :         return true;
     412             : }
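
For readers less used to the kernel's atomic_cmpxchg() retry idiom, the same loop can be written with C11 atomics, where atomic_compare_exchange_weak() refreshes the expected value on failure and so replaces the explicit new_old/old bookkeeping. This is an illustrative userspace translation, not kernel code:

/* Illustrative userspace translation of the loop above using C11 atomics. */
#include <stdatomic.h>
#include <stdbool.h>

#define CTRL_MASK 0x1
#define CTRL_CTR  (CTRL_MASK + 1)

static atomic_int dynticks_model;                /* stands in for rdp->dynticks */

static bool eqs_special_set_model(void)
{
        int old = atomic_load(&dynticks_model);

        do {
                if (old & CTRL_CTR)
                        return false;            /* CPU is not in an EQS */
                /* On failure, compare_exchange refreshes 'old' for the retry. */
        } while (!atomic_compare_exchange_weak(&dynticks_model, &old,
                                               old | CTRL_MASK));
        return true;                             /* flag set while the CPU was idle */
}

int main(void)
{
        atomic_store(&dynticks_model, 2 * CTRL_CTR);   /* CTR bit clear: idle */
        return !eqs_special_set_model();               /* exits 0 when the flag was set */
}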
     413             : 
     414             : /*
     415             :  * Let the RCU core know that this CPU has gone through the scheduler,
     416             :  * which is a quiescent state.  This is called when the need for a
     417             :  * quiescent state is urgent, so we burn an atomic operation and full
     418             :  * memory barriers to let the RCU core know about it, regardless of what
     419             :  * this CPU might (or might not) do in the near future.
     420             :  *
     421             :  * We inform the RCU core by emulating a zero-duration dyntick-idle period.
     422             :  *
     423             :  * The caller must have disabled interrupts and must not be idle.
     424             :  */
     425         111 : notrace void rcu_momentary_dyntick_idle(void)
     426             : {
     427         111 :         int special;
     428             : 
     429         111 :         raw_cpu_write(rcu_data.rcu_need_heavy_qs, false);
     430         359 :         special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR,
     431         111 :                                     &this_cpu_ptr(&rcu_data)->dynticks);
     432             :         /* It is illegal to call this from idle state. */
     433         128 :         WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR));
     434         128 :         rcu_preempt_deferred_qs(current);
     435         128 : }
     436             : EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle);
     437             : 
     438             : /**
     439             :  * rcu_is_cpu_rrupt_from_idle - see if 'interrupted' from idle
     440             :  *
     441             :  * If the current CPU is idle and running at a first-level (not nested)
      442             :  * interrupt, or directly from idle, return true.
     443             :  *
     444             :  * The caller must have at least disabled IRQs.
     445             :  */
     446       53883 : static int rcu_is_cpu_rrupt_from_idle(void)
     447             : {
     448       53883 :         long nesting;
     449             : 
     450             :         /*
      451             :          * Usually called from the tick; but also used from smp_call_function()
     452             :          * for expedited grace periods. This latter can result in running from
     453             :          * the idle task, instead of an actual IPI.
     454             :          */
     455      108170 :         lockdep_assert_irqs_disabled();
     456             : 
     457             :         /* Check for counter underflows */
     458       54395 :         RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0,
     459             :                          "RCU dynticks_nesting counter underflow!");
     460       54638 :         RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 0,
     461             :                          "RCU dynticks_nmi_nesting counter underflow/zero!");
     462             : 
     463             :         /* Are we at first interrupt nesting level? */
     464       55133 :         nesting = __this_cpu_read(rcu_data.dynticks_nmi_nesting);
     465       55133 :         if (nesting > 1)
     466             :                 return false;
     467             : 
     468             :         /*
     469             :          * If we're not in an interrupt, we must be in the idle task!
     470             :          */
     471       23053 :         WARN_ON_ONCE(!nesting && !is_idle_task(current));
     472             : 
     473             :         /* Does CPU appear to be idle from an RCU standpoint? */
     474       23053 :         return __this_cpu_read(rcu_data.dynticks_nesting) == 0;
     475             : }
     476             : 
     477             : #define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10)
     478             :                                 // Maximum callbacks per rcu_do_batch ...
     479             : #define DEFAULT_MAX_RCU_BLIMIT 10000 // ... even during callback flood.
     480             : static long blimit = DEFAULT_RCU_BLIMIT;
     481             : #define DEFAULT_RCU_QHIMARK 10000 // If this many pending, ignore blimit.
     482             : static long qhimark = DEFAULT_RCU_QHIMARK;
     483             : #define DEFAULT_RCU_QLOMARK 100   // Once only this many pending, use blimit.
     484             : static long qlowmark = DEFAULT_RCU_QLOMARK;
     485             : #define DEFAULT_RCU_QOVLD_MULT 2
     486             : #define DEFAULT_RCU_QOVLD (DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK)
     487             : static long qovld = DEFAULT_RCU_QOVLD; // If this many pending, hammer QS.
     488             : static long qovld_calc = -1;      // No pre-initialization lock acquisitions!
     489             : 
     490             : module_param(blimit, long, 0444);
     491             : module_param(qhimark, long, 0444);
     492             : module_param(qlowmark, long, 0444);
     493             : module_param(qovld, long, 0444);
     494             : 
     495             : static ulong jiffies_till_first_fqs = IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 0 : ULONG_MAX;
     496             : static ulong jiffies_till_next_fqs = ULONG_MAX;
     497             : static bool rcu_kick_kthreads;
     498             : static int rcu_divisor = 7;
     499             : module_param(rcu_divisor, int, 0644);
     500             : 
     501             : /* Force an exit from rcu_do_batch() after 3 milliseconds. */
     502             : static long rcu_resched_ns = 3 * NSEC_PER_MSEC;
     503             : module_param(rcu_resched_ns, long, 0644);
     504             : 
     505             : /*
     506             :  * How long the grace period must be before we start recruiting
     507             :  * quiescent-state help from rcu_note_context_switch().
     508             :  */
     509             : static ulong jiffies_till_sched_qs = ULONG_MAX;
     510             : module_param(jiffies_till_sched_qs, ulong, 0444);
     511             : static ulong jiffies_to_sched_qs; /* See adjust_jiffies_till_sched_qs(). */
     512             : module_param(jiffies_to_sched_qs, ulong, 0444); /* Display only! */
     513             : 
     514             : /*
     515             :  * Make sure that we give the grace-period kthread time to detect any
     516             :  * idle CPUs before taking active measures to force quiescent states.
     517             :  * However, don't go below 100 milliseconds, adjusted upwards for really
     518             :  * large systems.
     519             :  */
     520           1 : static void adjust_jiffies_till_sched_qs(void)
     521             : {
     522           1 :         unsigned long j;
     523             : 
     524             :         /* If jiffies_till_sched_qs was specified, respect the request. */
     525           1 :         if (jiffies_till_sched_qs != ULONG_MAX) {
     526           0 :                 WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs);
     527           0 :                 return;
     528             :         }
     529             :         /* Otherwise, set to third fqs scan, but bound below on large system. */
     530           1 :         j = READ_ONCE(jiffies_till_first_fqs) +
     531           1 :                       2 * READ_ONCE(jiffies_till_next_fqs);
     532           1 :         if (j < HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV)
     533             :                 j = HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
     534           1 :         pr_info("RCU calculated value of scheduler-enlistment delay is %ld jiffies.\n", j);
     535           1 :         WRITE_ONCE(jiffies_to_sched_qs, j);
     536             : }
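
A worked example of the clamp above, with hypothetical inputs (the HZ value, CPU count, and fqs intervals are made up for illustration; RCU_JIFFIES_FQS_DIV is assumed to be the usual 256):

/* Illustrative arithmetic for adjust_jiffies_till_sched_qs()'s lower bound. */
#include <stdio.h>

#define HZ                  1000   /* hypothetical */
#define RCU_JIFFIES_FQS_DIV 256    /* assumed to match the kernel's divisor */

int main(void)
{
        unsigned long first_fqs = 3, next_fqs = 3;   /* hypothetical fqs intervals */
        unsigned long nr_cpu_ids = 64;               /* hypothetical CPU count */
        unsigned long j = first_fqs + 2 * next_fqs;  /* third fqs scan: 9 jiffies */
        unsigned long lower_bound = HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;

        if (j < lower_bound)    /* never go below ~100ms, adjusted for CPU count */
                j = lower_bound;
        printf("jiffies_to_sched_qs = %lu\n", j);
        return 0;
}

With these numbers the third-fqs-scan estimate of 9 jiffies loses to the 100-jiffy floor, so jiffies_to_sched_qs ends up at 100.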
     537             : 
     538           0 : static int param_set_first_fqs_jiffies(const char *val, const struct kernel_param *kp)
     539             : {
     540           0 :         ulong j;
     541           0 :         int ret = kstrtoul(val, 0, &j);
     542             : 
     543           0 :         if (!ret) {
     544           0 :                 WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : j);
     545           0 :                 adjust_jiffies_till_sched_qs();
     546             :         }
     547           0 :         return ret;
     548             : }
     549             : 
     550           0 : static int param_set_next_fqs_jiffies(const char *val, const struct kernel_param *kp)
     551             : {
     552           0 :         ulong j;
     553           0 :         int ret = kstrtoul(val, 0, &j);
     554             : 
     555           0 :         if (!ret) {
     556           0 :                 WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : (j ?: 1));
     557           0 :                 adjust_jiffies_till_sched_qs();
     558             :         }
     559           0 :         return ret;
     560             : }
     561             : 
     562             : static const struct kernel_param_ops first_fqs_jiffies_ops = {
     563             :         .set = param_set_first_fqs_jiffies,
     564             :         .get = param_get_ulong,
     565             : };
     566             : 
     567             : static const struct kernel_param_ops next_fqs_jiffies_ops = {
     568             :         .set = param_set_next_fqs_jiffies,
     569             :         .get = param_get_ulong,
     570             : };
     571             : 
     572             : module_param_cb(jiffies_till_first_fqs, &first_fqs_jiffies_ops, &jiffies_till_first_fqs, 0644);
     573             : module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next_fqs, 0644);
     574             : module_param(rcu_kick_kthreads, bool, 0644);
     575             : 
     576             : static void force_qs_rnp(int (*f)(struct rcu_data *rdp));
     577             : static int rcu_pending(int user);
     578             : 
     579             : /*
     580             :  * Return the number of RCU GPs completed thus far for debug & stats.
     581             :  */
     582           0 : unsigned long rcu_get_gp_seq(void)
     583             : {
     584           0 :         return READ_ONCE(rcu_state.gp_seq);
     585             : }
     586             : EXPORT_SYMBOL_GPL(rcu_get_gp_seq);
     587             : 
     588             : /*
     589             :  * Return the number of RCU expedited batches completed thus far for
     590             :  * debug & stats.  Odd numbers mean that a batch is in progress, even
     591             :  * numbers mean idle.  The value returned will thus be roughly double
     592             :  * the cumulative batches since boot.
     593             :  */
     594           0 : unsigned long rcu_exp_batches_completed(void)
     595             : {
     596           0 :         return rcu_state.expedited_sequence;
     597             : }
     598             : EXPORT_SYMBOL_GPL(rcu_exp_batches_completed);
     599             : 
     600             : /*
     601             :  * Return the root node of the rcu_state structure.
     602             :  */
     603       82132 : static struct rcu_node *rcu_get_root(void)
     604             : {
     605       54835 :         return &rcu_state.node[0];
     606             : }
     607             : 
     608             : /*
     609             :  * Send along grace-period-related data for rcutorture diagnostics.
     610             :  */
     611           0 : void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
     612             :                             unsigned long *gp_seq)
     613             : {
     614           0 :         switch (test_type) {
     615             :         case RCU_FLAVOR:
     616           0 :                 *flags = READ_ONCE(rcu_state.gp_flags);
     617           0 :                 *gp_seq = rcu_seq_current(&rcu_state.gp_seq);
     618           0 :                 break;
     619             :         default:
     620             :                 break;
     621             :         }
     622           0 : }
     623             : EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
     624             : 
     625             : /*
     626             :  * Enter an RCU extended quiescent state, which can be either the
     627             :  * idle loop or adaptive-tickless usermode execution.
     628             :  *
     629             :  * We crowbar the ->dynticks_nmi_nesting field to zero to allow for
     630             :  * the possibility of usermode upcalls having messed up our count
     631             :  * of interrupt nesting level during the prior busy period.
     632             :  */
     633       17371 : static noinstr void rcu_eqs_enter(bool user)
     634             : {
     635       17371 :         struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
     636             : 
     637       17371 :         WARN_ON_ONCE(rdp->dynticks_nmi_nesting != DYNTICK_IRQ_NONIDLE);
     638       17371 :         WRITE_ONCE(rdp->dynticks_nmi_nesting, 0);
     639       17371 :         WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
     640             :                      rdp->dynticks_nesting == 0);
     641       17371 :         if (rdp->dynticks_nesting != 1) {
     642             :                 // RCU will still be watching, so just do accounting and leave.
     643           0 :                 rdp->dynticks_nesting--;
     644           0 :                 return;
     645             :         }
     646             : 
     647       34754 :         lockdep_assert_irqs_disabled();
     648       17382 :         instrumentation_begin();
     649       17382 :         trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
     650       17374 :         WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
     651       17374 :         rdp = this_cpu_ptr(&rcu_data);
     652       17375 :         rcu_prepare_for_idle();
     653       17375 :         rcu_preempt_deferred_qs(current);
     654             : 
     655             :         // instrumentation for the noinstr rcu_dynticks_eqs_enter()
     656       17375 :         instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
     657             : 
     658       17393 :         instrumentation_end();
     659       17393 :         WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */
     660             :         // RCU is watching here ...
     661       17393 :         rcu_dynticks_eqs_enter();
     662             :         // ... but is no longer watching here.
     663       17393 :         rcu_dynticks_task_enter();
     664             : }
     665             : 
     666             : /**
     667             :  * rcu_idle_enter - inform RCU that current CPU is entering idle
     668             :  *
     669             :  * Enter idle mode, in other words, -leave- the mode in which RCU
     670             :  * read-side critical sections can occur.  (Though RCU read-side
     671             :  * critical sections can occur in irq handlers in idle, a possibility
     672             :  * handled by irq_enter() and irq_exit().)
     673             :  *
     674             :  * If you add or remove a call to rcu_idle_enter(), be sure to test with
     675             :  * CONFIG_RCU_EQS_DEBUG=y.
     676             :  */
     677       17379 : void rcu_idle_enter(void)
     678             : {
     679       34761 :         lockdep_assert_irqs_disabled();
     680       17389 :         rcu_eqs_enter(false);
     681       17398 : }
     682             : EXPORT_SYMBOL_GPL(rcu_idle_enter);
     683             : 
     684             : #ifdef CONFIG_NO_HZ_FULL
     685             : 
     686             : #if !defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)
     687             : /*
     688             :  * An empty function that will trigger a reschedule on
     689             :  * IRQ tail once IRQs get re-enabled on userspace/guest resume.
     690             :  */
     691             : static void late_wakeup_func(struct irq_work *work)
     692             : {
     693             : }
     694             : 
     695             : static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) =
     696             :         IRQ_WORK_INIT(late_wakeup_func);
     697             : 
     698             : /*
     699             :  * If either:
     700             :  *
      701             :  * 1) the task is about to enter guest mode and $ARCH doesn't support KVM generic work
      702             :  * 2) the task is about to enter user mode and $ARCH doesn't support generic entry.
     703             :  *
     704             :  * In these cases the late RCU wake ups aren't supported in the resched loops and our
     705             :  * last resort is to fire a local irq_work that will trigger a reschedule once IRQs
     706             :  * get re-enabled again.
     707             :  */
     708             : noinstr static void rcu_irq_work_resched(void)
     709             : {
     710             :         struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
     711             : 
     712             :         if (IS_ENABLED(CONFIG_GENERIC_ENTRY) && !(current->flags & PF_VCPU))
     713             :                 return;
     714             : 
     715             :         if (IS_ENABLED(CONFIG_KVM_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU))
     716             :                 return;
     717             : 
     718             :         instrumentation_begin();
     719             :         if (do_nocb_deferred_wakeup(rdp) && need_resched()) {
     720             :                 irq_work_queue(this_cpu_ptr(&late_wakeup_work));
     721             :         }
     722             :         instrumentation_end();
     723             : }
     724             : 
     725             : #else
     726             : static inline void rcu_irq_work_resched(void) { }
     727             : #endif
     728             : 
     729             : /**
     730             :  * rcu_user_enter - inform RCU that we are resuming userspace.
     731             :  *
     732             :  * Enter RCU idle mode right before resuming userspace.  No use of RCU
     733             :  * is permitted between this call and rcu_user_exit(). This way the
     734             :  * CPU doesn't need to maintain the tick for RCU maintenance purposes
     735             :  * when the CPU runs in userspace.
     736             :  *
     737             :  * If you add or remove a call to rcu_user_enter(), be sure to test with
     738             :  * CONFIG_RCU_EQS_DEBUG=y.
     739             :  */
     740             : noinstr void rcu_user_enter(void)
     741             : {
     742             :         lockdep_assert_irqs_disabled();
     743             : 
     744             :         /*
     745             :          * Other than generic entry implementation, we may be past the last
     746             :          * rescheduling opportunity in the entry code. Trigger a self IPI
     747             :          * that will fire and reschedule once we resume in user/guest mode.
     748             :          */
     749             :         rcu_irq_work_resched();
     750             :         rcu_eqs_enter(true);
     751             : }
     752             : 
     753             : #endif /* CONFIG_NO_HZ_FULL */
     754             : 
     755             : /**
     756             :  * rcu_nmi_exit - inform RCU of exit from NMI context
     757             :  *
     758             :  * If we are returning from the outermost NMI handler that interrupted an
     759             :  * RCU-idle period, update rdp->dynticks and rdp->dynticks_nmi_nesting
     760             :  * to let the RCU grace-period handling know that the CPU is back to
     761             :  * being RCU-idle.
     762             :  *
     763             :  * If you add or remove a call to rcu_nmi_exit(), be sure to test
     764             :  * with CONFIG_RCU_EQS_DEBUG=y.
     765             :  */
     766       19178 : noinstr void rcu_nmi_exit(void)
     767             : {
     768       19178 :         struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
     769             : 
     770       19199 :         instrumentation_begin();
     771             :         /*
     772             :          * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
     773             :          * (We are exiting an NMI handler, so RCU better be paying attention
     774             :          * to us!)
     775             :          */
     776       19199 :         WARN_ON_ONCE(rdp->dynticks_nmi_nesting <= 0);
     777       19199 :         WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());
     778             : 
     779             :         /*
     780             :          * If the nesting level is not 1, the CPU wasn't RCU-idle, so
     781             :          * leave it in non-RCU-idle state.
     782             :          */
     783       19199 :         if (rdp->dynticks_nmi_nesting != 1) {
     784        3240 :                 trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2,
     785        1620 :                                   atomic_read(&rdp->dynticks));
     786        1620 :                 WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
     787             :                            rdp->dynticks_nmi_nesting - 2);
     788        1620 :                 instrumentation_end();
     789        1620 :                 return;
     790             :         }
     791             : 
     792             :         /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
     793       17579 :         trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks));
     794       17569 :         WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
     795             : 
     796       17569 :         if (!in_nmi())
     797             :                 rcu_prepare_for_idle();
     798             : 
     799             :         // instrumentation for the noinstr rcu_dynticks_eqs_enter()
     800       17569 :         instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
     801       17577 :         instrumentation_end();
     802             : 
     803             :         // RCU is watching here ...
     804       17577 :         rcu_dynticks_eqs_enter();
     805             :         // ... but is no longer watching here.
     806             : 
     807       17608 :         if (!in_nmi())
     808             :                 rcu_dynticks_task_enter();
     809             : }
     810             : 
     811             : /**
     812             :  * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
     813             :  *
     814             :  * Exit from an interrupt handler, which might possibly result in entering
     815             :  * idle mode, in other words, leaving the mode in which read-side critical
     816             :  * sections can occur.  The caller must have disabled interrupts.
     817             :  *
     818             :  * This code assumes that the idle loop never does anything that might
     819             :  * result in unbalanced calls to irq_enter() and irq_exit().  If your
     820             :  * architecture's idle loop violates this assumption, RCU will give you what
     821             :  * you deserve, good and hard.  But very infrequently and irreproducibly.
     822             :  *
     823             :  * Use things like work queues to work around this limitation.
     824             :  *
     825             :  * You have been warned.
     826             :  *
     827             :  * If you add or remove a call to rcu_irq_exit(), be sure to test with
     828             :  * CONFIG_RCU_EQS_DEBUG=y.
     829             :  */
     830       19183 : void noinstr rcu_irq_exit(void)
     831             : {
     832       38388 :         lockdep_assert_irqs_disabled();
     833       19202 :         rcu_nmi_exit();
     834       19230 : }
     835             : 
     836             : /**
     837             :  * rcu_irq_exit_preempt - Inform RCU that current CPU is exiting irq
      838             :  *                        towards in-kernel preemption
     839             :  *
     840             :  * Same as rcu_irq_exit() but has a sanity check that scheduling is safe
     841             :  * from RCU point of view. Invoked from return from interrupt before kernel
     842             :  * preemption.
     843             :  */
     844           0 : void rcu_irq_exit_preempt(void)
     845             : {
     846           0 :         lockdep_assert_irqs_disabled();
     847           0 :         rcu_nmi_exit();
     848             : 
     849           0 :         RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0,
     850             :                          "RCU dynticks_nesting counter underflow/zero!");
     851           0 :         RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) !=
     852             :                          DYNTICK_IRQ_NONIDLE,
     853             :                          "Bad RCU  dynticks_nmi_nesting counter\n");
     854           0 :         RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
     855             :                          "RCU in extended quiescent state!");
     856           0 : }
     857             : 
     858             : #ifdef CONFIG_PROVE_RCU
     859             : /**
     860             :  * rcu_irq_exit_check_preempt - Validate that scheduling is possible
     861             :  */
     862           0 : void rcu_irq_exit_check_preempt(void)
     863             : {
     864           0 :         lockdep_assert_irqs_disabled();
     865             : 
     866           0 :         RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0,
     867             :                          "RCU dynticks_nesting counter underflow/zero!");
     868           0 :         RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) !=
     869             :                          DYNTICK_IRQ_NONIDLE,
     870             :                          "Bad RCU  dynticks_nmi_nesting counter\n");
     871           0 :         RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
     872             :                          "RCU in extended quiescent state!");
     873           0 : }
     874             : #endif /* #ifdef CONFIG_PROVE_RCU */
     875             : 
     876             : /*
     877             :  * Wrapper for rcu_irq_exit() where interrupts are enabled.
     878             :  *
     879             :  * If you add or remove a call to rcu_irq_exit_irqson(), be sure to test
     880             :  * with CONFIG_RCU_EQS_DEBUG=y.
     881             :  */
     882           0 : void rcu_irq_exit_irqson(void)
     883             : {
     884           0 :         unsigned long flags;
     885             : 
     886           0 :         local_irq_save(flags);
     887           0 :         rcu_irq_exit();
     888           0 :         local_irq_restore(flags);
     889           0 : }
     890             : 
     891             : /*
     892             :  * Exit an RCU extended quiescent state, which can be either the
     893             :  * idle loop or adaptive-tickless usermode execution.
     894             :  *
     895             :  * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to
     896             :  * allow for the possibility of usermode upcalls messing up our count of
     897             :  * interrupt nesting level during the busy period that is just now starting.
     898             :  */
     899       17360 : static void noinstr rcu_eqs_exit(bool user)
     900             : {
     901       17360 :         struct rcu_data *rdp;
     902       17360 :         long oldval;
     903             : 
     904       34727 :         lockdep_assert_irqs_disabled();
     905       17370 :         rdp = this_cpu_ptr(&rcu_data);
     906       17373 :         oldval = rdp->dynticks_nesting;
     907       17373 :         WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
     908       17373 :         if (oldval) {
     909             :                 // RCU was already watching, so just do accounting and leave.
     910           0 :                 rdp->dynticks_nesting++;
     911           0 :                 return;
     912             :         }
     913       17373 :         rcu_dynticks_task_exit();
     914             :         // RCU is not watching here ...
     915       17373 :         rcu_dynticks_eqs_exit();
     916             :         // ... but is watching here.
     917       17397 :         instrumentation_begin();
     918             : 
     919             :         // instrumentation for the noinstr rcu_dynticks_eqs_exit()
     920       17397 :         instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
     921             : 
     922       17349 :         rcu_cleanup_after_idle();
     923       17349 :         trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks));
     924       17355 :         WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
     925       17355 :         WRITE_ONCE(rdp->dynticks_nesting, 1);
     926       17355 :         WARN_ON_ONCE(rdp->dynticks_nmi_nesting);
     927       17355 :         WRITE_ONCE(rdp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE);
     928       17355 :         instrumentation_end();
     929             : }
     930             : 
     931             : /**
     932             :  * rcu_idle_exit - inform RCU that current CPU is leaving idle
     933             :  *
     934             :  * Exit idle mode, in other words, -enter- the mode in which RCU
     935             :  * read-side critical sections can occur.
     936             :  *
     937             :  * If you add or remove a call to rcu_idle_exit(), be sure to test with
     938             :  * CONFIG_RCU_EQS_DEBUG=y.
     939             :  */
     940       17315 : void rcu_idle_exit(void)
     941             : {
     942       17315 :         unsigned long flags;
     943             : 
     944       34671 :         local_irq_save(flags);
     945       17356 :         rcu_eqs_exit(false);
     946       17355 :         local_irq_restore(flags);
     947       17362 : }
     948             : EXPORT_SYMBOL_GPL(rcu_idle_exit);
     949             : 
     950             : #ifdef CONFIG_NO_HZ_FULL
     951             : /**
     952             :  * rcu_user_exit - inform RCU that we are exiting userspace.
     953             :  *
     954             :  * Exit RCU idle mode while entering the kernel because it can
      955             :  * run an RCU read-side critical section anytime.
     956             :  *
     957             :  * If you add or remove a call to rcu_user_exit(), be sure to test with
     958             :  * CONFIG_RCU_EQS_DEBUG=y.
     959             :  */
     960             : void noinstr rcu_user_exit(void)
     961             : {
     962             :         rcu_eqs_exit(1);
     963             : }
     964             : 
     965             : /**
     966             :  * __rcu_irq_enter_check_tick - Enable scheduler tick on CPU if RCU needs it.
     967             :  *
     968             :  * The scheduler tick is not normally enabled when CPUs enter the kernel
     969             :  * from nohz_full userspace execution.  After all, nohz_full userspace
     970             :  * execution is an RCU quiescent state and the time executing in the kernel
     971             :  * is quite short.  Except of course when it isn't.  And it is not hard to
     972             :  * cause a large system to spend tens of seconds or even minutes looping
      973             :  * in the kernel, which can cause a number of problems, including RCU CPU
     974             :  * stall warnings.
     975             :  *
     976             :  * Therefore, if a nohz_full CPU fails to report a quiescent state
     977             :  * in a timely manner, the RCU grace-period kthread sets that CPU's
     978             :  * ->rcu_urgent_qs flag with the expectation that the next interrupt or
     979             :  * exception will invoke this function, which will turn on the scheduler
     980             :  * tick, which will enable RCU to detect that CPU's quiescent states,
     981             :  * for example, due to cond_resched() calls in CONFIG_PREEMPT=n kernels.
     982             :  * The tick will be disabled once a quiescent state is reported for
     983             :  * this CPU.
     984             :  *
     985             :  * Of course, in carefully tuned systems, there might never be an
     986             :  * interrupt or exception.  In that case, the RCU grace-period kthread
     987             :  * will eventually cause one to happen.  However, in less carefully
     988             :  * controlled environments, this function allows RCU to get what it
     989             :  * needs without creating otherwise useless interruptions.
     990             :  */
     991             : void __rcu_irq_enter_check_tick(void)
     992             : {
     993             :         struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
     994             : 
     995             :         // If we're here from NMI there's nothing to do.
     996             :         if (in_nmi())
     997             :                 return;
     998             : 
     999             :         RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
    1000             :                          "Illegal rcu_irq_enter_check_tick() from extended quiescent state");
    1001             : 
    1002             :         if (!tick_nohz_full_cpu(rdp->cpu) ||
    1003             :             !READ_ONCE(rdp->rcu_urgent_qs) ||
    1004             :             READ_ONCE(rdp->rcu_forced_tick)) {
    1005             :                 // RCU doesn't need nohz_full help from this CPU, or it is
    1006             :                 // already getting that help.
    1007             :                 return;
    1008             :         }
    1009             : 
    1010             :         // We get here only when not in an extended quiescent state and
    1011             :         // from interrupts (as opposed to NMIs).  Therefore, (1) RCU is
    1012             :         // already watching and (2) The fact that we are in an interrupt
    1013             :         // handler and that the rcu_node lock is an irq-disabled lock
    1014             :         // prevents self-deadlock.  So we can safely recheck under the lock.
    1015             :         // Note that the nohz_full state currently cannot change.
    1016             :         raw_spin_lock_rcu_node(rdp->mynode);
    1017             :         if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) {
    1018             :                 // A nohz_full CPU is in the kernel and RCU needs a
    1019             :                 // quiescent state.  Turn on the tick!
    1020             :                 WRITE_ONCE(rdp->rcu_forced_tick, true);
    1021             :                 tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
    1022             :         }
    1023             :         raw_spin_unlock_rcu_node(rdp->mynode);
    1024             : }
    1025             : #endif /* CONFIG_NO_HZ_FULL */
    1026             : 
    1027             : /**
    1028             :  * rcu_nmi_enter - inform RCU of entry to NMI context
    1029             :  *
    1030             :  * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and
    1031             :  * rdp->dynticks_nmi_nesting to let the RCU grace-period handling know
    1032             :  * that the CPU is active.  This implementation permits nested NMIs, as
    1033             :  * long as the nesting level does not overflow an int.  (You will probably
    1034             :  * run out of stack space first.)
    1035             :  *
    1036             :  * If you add or remove a call to rcu_nmi_enter(), be sure to test
    1037             :  * with CONFIG_RCU_EQS_DEBUG=y.
    1038             :  */
    1039       18865 : noinstr void rcu_nmi_enter(void)
    1040             : {
    1041       18865 :         long incby = 2;
    1042       18865 :         struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
    1043             : 
    1044             :         /* Complain about underflow. */
    1045       19083 :         WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0);
    1046             : 
    1047             :         /*
    1048             :          * If idle from RCU viewpoint, atomically increment ->dynticks
    1049             :          * to mark non-idle and increment ->dynticks_nmi_nesting by one.
    1050             :          * Otherwise, increment ->dynticks_nmi_nesting by two.  This means
    1051             :          * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
    1052             :          * to be in the outermost NMI handler that interrupted an RCU-idle
    1053             :          * period (observation due to Andy Lutomirski).
    1054             :          */
    1055       19083 :         if (rcu_dynticks_curr_cpu_in_eqs()) {
    1056             : 
    1057       17572 :                 if (!in_nmi())
    1058             :                         rcu_dynticks_task_exit();
    1059             : 
    1060             :                 // RCU is not watching here ...
    1061       17572 :                 rcu_dynticks_eqs_exit();
    1062             :                 // ... but is watching here.
    1063             : 
    1064       17516 :                 if (!in_nmi()) {
    1065             :                         instrumentation_begin();
    1066             :                         rcu_cleanup_after_idle();
    1067       17516 :                         instrumentation_end();
    1068             :                 }
    1069             : 
    1070       17516 :                 instrumentation_begin();
    1071             :                 // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
    1072       17516 :                 instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks));
    1073             :                 // instrumentation for the noinstr rcu_dynticks_eqs_exit()
    1074       17193 :                 instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
    1075             : 
    1076       17193 :                 incby = 1;
    1077        1620 :         } else if (!in_nmi()) {
    1078             :                 instrumentation_begin();
    1079             :                 rcu_irq_enter_check_tick();
    1080             :                 instrumentation_end();
    1081             :         } else  {
    1082       19122 :                 instrumentation_begin();
    1083             :         }
    1084             : 
    1085       38130 :         trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
    1086             :                           rdp->dynticks_nmi_nesting,
    1087       19122 :                           rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks));
    1088       19008 :         instrumentation_end();
    1089       19008 :         WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */
    1090             :                    rdp->dynticks_nmi_nesting + incby);
    1091       19008 :         barrier();
    1092       19030 : }
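The increment-by-one-versus-two bookkeeping above is easy to model in isolation.  The standalone program below (plain userspace C, not kernel code) mimics ->dynticks_nmi_nesting: entry from an RCU-idle context adds 1, nested entries add 2, so a nesting value of exactly 1 always identifies the outermost handler that interrupted an RCU-idle period, and its exit path is the one that must return the CPU to the extended quiescent state.

#include <stdbool.h>
#include <stdio.h>

static long model_nesting;      /* models rdp->dynticks_nmi_nesting */
static bool model_idle = true;  /* models rcu_dynticks_curr_cpu_in_eqs() */

static void model_nmi_enter(void)
{
        long incby = 2;

        if (model_idle) {       /* CPU was idle from RCU's viewpoint. */
                model_idle = false;
                incby = 1;      /* Outermost entry from RCU-idle adds 1. */
        }
        model_nesting += incby;
}

static void model_nmi_exit(void)
{
        if (model_nesting == 1) {       /* Outermost handler that found RCU idle. */
                model_nesting = 0;
                model_idle = true;      /* Return the CPU to its EQS. */
                return;
        }
        model_nesting -= 2;
}

int main(void)
{
        model_nmi_enter();      /* from idle: nesting becomes 1 */
        model_nmi_enter();      /* nested entry: nesting becomes 3 */
        model_nmi_exit();       /* back to 1 */
        model_nmi_exit();       /* back to 0, RCU idle again */
        printf("nesting=%ld idle=%d\n", model_nesting, model_idle);
        return 0;
}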
    1093             : 
    1094             : /**
    1095             :  * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
    1096             :  *
    1097             :  * Enter an interrupt handler, which might possibly result in exiting
    1098             :  * idle mode, in other words, entering the mode in which read-side critical
    1099             :  * sections can occur.  The caller must have disabled interrupts.
    1100             :  *
    1101             :  * Note that the Linux kernel is fully capable of entering an interrupt
    1102             :  * handler that it never exits, for example when doing upcalls to user mode!
    1103             :  * This code assumes that the idle loop never does upcalls to user mode.
    1104             :  * If your architecture's idle loop does do upcalls to user mode (or does
    1105             :  * anything else that results in unbalanced calls to the irq_enter() and
    1106             :  * irq_exit() functions), RCU will give you what you deserve, good and hard.
    1107             :  * But very infrequently and irreproducibly.
    1108             :  *
    1109             :  * Use things like work queues to work around this limitation.
    1110             :  *
    1111             :  * You have been warned.
    1112             :  *
    1113             :  * If you add or remove a call to rcu_irq_enter(), be sure to test with
    1114             :  * CONFIG_RCU_EQS_DEBUG=y.
    1115             :  */
    1116       18860 : noinstr void rcu_irq_enter(void)
    1117             : {
    1118       37736 :         lockdep_assert_irqs_disabled();
    1119       18867 :         rcu_nmi_enter();
    1120       19031 : }
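The header comment above recommends work queues for deferring activity that would otherwise unbalance irq_enter()/irq_exit().  The sketch below (not part of tree.c) shows the usual shape of that deferral; example_upcall_fn() and example_defer_upcall() are hypothetical names, while DECLARE_WORK() and schedule_work() are the ordinary workqueue interfaces.

#include <linux/workqueue.h>

static void example_upcall_fn(struct work_struct *work)
{
        /* Runs later in process context, where RCU is always watching. */
}

static DECLARE_WORK(example_upcall_work, example_upcall_fn);

static void example_defer_upcall(void)
{
        /* Safe to call from irq context; the heavy lifting happens in a kworker. */
        schedule_work(&example_upcall_work);
}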
    1121             : 
    1122             : /*
    1123             :  * Wrapper for rcu_irq_enter() where interrupts are enabled.
    1124             :  *
    1125             :  * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test
    1126             :  * with CONFIG_RCU_EQS_DEBUG=y.
    1127             :  */
    1128           0 : void rcu_irq_enter_irqson(void)
    1129             : {
    1130           0 :         unsigned long flags;
    1131             : 
    1132           0 :         local_irq_save(flags);
    1133           0 :         rcu_irq_enter();
    1134           0 :         local_irq_restore(flags);
    1135           0 : }
    1136             : 
    1137             : /*
    1138             :  * If any sort of urgency was applied to the current CPU (for example,
    1139             :  * the scheduler-clock interrupt was enabled on a nohz_full CPU) in order
    1140             :  * to get to a quiescent state, disable it.
    1141             :  */
    1142        8072 : static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
    1143             : {
    1144       16144 :         raw_lockdep_assert_held_rcu_node(rdp->mynode);
    1145        8072 :         WRITE_ONCE(rdp->rcu_urgent_qs, false);
    1146        8072 :         WRITE_ONCE(rdp->rcu_need_heavy_qs, false);
    1147        8072 :         if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) {
    1148             :                 tick_dep_clear_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
    1149        8072 :                 WRITE_ONCE(rdp->rcu_forced_tick, false);
    1150             :         }
    1151        8072 : }
    1152             : 
    1153             : /**
    1154             :  * rcu_is_watching - see if RCU thinks that the current CPU is not idle
    1155             :  *
    1156             :  * Return true if RCU is watching the running CPU, which means that this
    1157             :  * CPU can safely enter RCU read-side critical sections.  In other words,
    1158             :  * if the current CPU is not in its idle loop or is in an interrupt or
    1159             :  * NMI handler, return true.
    1160             :  *
    1161             :  * Make notrace because it can be called by the internal functions of
     1162             :  * ftrace, and marking it notrace avoids unnecessary recursive calls.
    1163             :  */
    1164    40529675 : notrace bool rcu_is_watching(void)
    1165             : {
    1166    40529675 :         bool ret;
    1167             : 
    1168    39902398 :         preempt_disable_notrace();
    1169    40544093 :         ret = !rcu_dynticks_curr_cpu_in_eqs();
    1170    40586120 :         preempt_enable_notrace();
    1171           0 :         return ret;
    1172             : }
    1173             : EXPORT_SYMBOL_GPL(rcu_is_watching);
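A common use of rcu_is_watching() is as a guard in code that can be reached from the idle loop or from early entry paths, where an RCU read-side critical section would be illegal.  The sketch below (not part of tree.c) shows that pattern; example_emit_event() and example_lookup() are hypothetical.

#include <linux/bug.h>
#include <linux/rcupdate.h>

static void example_lookup(void);       /* Hypothetical RCU-protected lookup. */

static void example_emit_event(void)
{
        /* Bail (loudly, once) if RCU is not watching this CPU. */
        if (WARN_ON_ONCE(!rcu_is_watching()))
                return;

        rcu_read_lock();
        example_lookup();
        rcu_read_unlock();
}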
    1174             : 
    1175             : /*
    1176             :  * If a holdout task is actually running, request an urgent quiescent
    1177             :  * state from its CPU.  This is unsynchronized, so migrations can cause
    1178             :  * the request to go to the wrong CPU.  Which is OK, all that will happen
    1179             :  * is that the CPU's next context switch will be a bit slower and next
    1180             :  * time around this task will generate another request.
    1181             :  */
    1182           0 : void rcu_request_urgent_qs_task(struct task_struct *t)
    1183             : {
    1184           0 :         int cpu;
    1185             : 
    1186           0 :         barrier();
    1187           0 :         cpu = task_cpu(t);
    1188           0 :         if (!task_curr(t))
    1189             :                 return; /* This task is not running on that CPU. */
    1190           0 :         smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true);
    1191             : }
    1192             : 
    1193             : #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
    1194             : 
    1195             : /*
    1196             :  * Is the current CPU online as far as RCU is concerned?
    1197             :  *
    1198             :  * Disable preemption to avoid false positives that could otherwise
    1199             :  * happen due to the current CPU number being sampled, this task being
    1200             :  * preempted, its old CPU being taken offline, resuming on some other CPU,
    1201             :  * then determining that its old CPU is now offline.
    1202             :  *
    1203             :  * Disable checking if in an NMI handler because we cannot safely
    1204             :  * report errors from NMI handlers anyway.  In addition, it is OK to use
    1205             :  * RCU on an offline processor during initial boot, hence the check for
    1206             :  * rcu_scheduler_fully_active.
    1207             :  */
    1208    34980094 : bool rcu_lockdep_current_cpu_online(void)
    1209             : {
    1210    34980094 :         struct rcu_data *rdp;
    1211    34980094 :         struct rcu_node *rnp;
    1212    34980094 :         bool ret = false;
    1213             : 
    1214    34980094 :         if (in_nmi() || !rcu_scheduler_fully_active)
    1215             :                 return true;
    1216    34975720 :         preempt_disable_notrace();
    1217    34924124 :         rdp = this_cpu_ptr(&rcu_data);
    1218    34984186 :         rnp = rdp->mynode;
    1219    34984186 :         if (rdp->grpmask & rcu_rnp_online_cpus(rnp) || READ_ONCE(rnp->ofl_seq) & 0x1)
    1220    34984186 :                 ret = true;
    1221    34984186 :         preempt_enable_notrace();
    1222    34988061 :         return ret;
    1223             : }
    1224             : EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
    1225             : 
    1226             : #endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
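rcu_lockdep_current_cpu_online() is meant for debugging checks of exactly this kind.  The sketch below (not part of tree.c) wires it into RCU_LOCKDEP_WARN(), the same helper used earlier in this file, to complain if RCU is used from a CPU that RCU believes to be offline; example_rcu_user() is a hypothetical caller.

#include <linux/rcupdate.h>

static void example_rcu_user(void)
{
        RCU_LOCKDEP_WARN(!rcu_lockdep_current_cpu_online(),
                         "example_rcu_user() called from an RCU-offline CPU");
        rcu_read_lock();
        /* ... access RCU-protected data here ... */
        rcu_read_unlock();
}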
    1227             : 
    1228             : /*
    1229             :  * We are reporting a quiescent state on behalf of some other CPU, so
    1230             :  * it is our responsibility to check for and handle potential overflow
    1231             :  * of the rcu_node ->gp_seq counter with respect to the rcu_data counters.
    1232             :  * After all, the CPU might be in deep idle state, and thus executing no
    1233             :  * code whatsoever.
    1234             :  */
    1235       10453 : static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
    1236             : {
    1237       20906 :         raw_lockdep_assert_held_rcu_node(rnp);
    1238       10453 :         if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + ULONG_MAX / 4,
    1239             :                          rnp->gp_seq))
    1240           0 :                 WRITE_ONCE(rdp->gpwrap, true);
    1241       10453 :         if (ULONG_CMP_LT(rdp->rcu_iw_gp_seq + ULONG_MAX / 4, rnp->gp_seq))
    1242           0 :                 rdp->rcu_iw_gp_seq = rnp->gp_seq + ULONG_MAX / 4;
    1243       10453 : }
    1244             : 
    1245             : /*
    1246             :  * Snapshot the specified CPU's dynticks counter so that we can later
     1247             :  * credit it with an implicit quiescent state.  Return 1 if this CPU
    1248             :  * is in dynticks idle mode, which is an extended quiescent state.
    1249             :  */
    1250        2379 : static int dyntick_save_progress_counter(struct rcu_data *rdp)
    1251             : {
    1252        2379 :         rdp->dynticks_snap = rcu_dynticks_snap(rdp);
    1253        2379 :         if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
    1254         710 :                 trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
    1255         710 :                 rcu_gpnum_ovf(rdp->mynode, rdp);
    1256         710 :                 return 1;
    1257             :         }
    1258             :         return 0;
    1259             : }
    1260             : 
    1261             : /*
    1262             :  * Return true if the specified CPU has passed through a quiescent
     1263             :  * state by virtue of being in or having passed through a dynticks
    1264             :  * idle state since the last call to dyntick_save_progress_counter()
    1265             :  * for this same CPU, or by virtue of having been offline.
    1266             :  */
    1267        1005 : static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
    1268             : {
    1269        1005 :         unsigned long jtsq;
    1270        1005 :         bool *rnhqp;
    1271        1005 :         bool *ruqp;
    1272        1005 :         struct rcu_node *rnp = rdp->mynode;
    1273             : 
    1274             :         /*
    1275             :          * If the CPU passed through or entered a dynticks idle phase with
    1276             :          * no active irq/NMI handlers, then we can safely pretend that the CPU
    1277             :          * already acknowledged the request to pass through a quiescent
    1278             :          * state.  Either way, that CPU cannot possibly be in an RCU
    1279             :          * read-side critical section that started before the beginning
    1280             :          * of the current RCU grace period.
    1281             :          */
    1282        1005 :         if (rcu_dynticks_in_eqs_since(rdp, rdp->dynticks_snap)) {
    1283          47 :                 trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
    1284          47 :                 rcu_gpnum_ovf(rnp, rdp);
    1285          47 :                 return 1;
    1286             :         }
    1287             : 
    1288             :         /*
    1289             :          * Complain if a CPU that is considered to be offline from RCU's
    1290             :          * perspective has not yet reported a quiescent state.  After all,
    1291             :          * the offline CPU should have reported a quiescent state during
    1292             :          * the CPU-offline process, or, failing that, by rcu_gp_init()
    1293             :          * if it ran concurrently with either the CPU going offline or the
    1294             :          * last task on a leaf rcu_node structure exiting its RCU read-side
    1295             :          * critical section while all CPUs corresponding to that structure
    1296             :          * are offline.  This added warning detects bugs in any of these
    1297             :          * code paths.
    1298             :          *
    1299             :          * The rcu_node structure's ->lock is held here, which excludes
     1300             :          * the relevant portions of the CPU-hotplug code, the grace-period
    1301             :          * initialization code, and the rcu_read_unlock() code paths.
    1302             :          *
    1303             :          * For more detail, please refer to the "Hotplug CPU" section
    1304             :          * of RCU's Requirements documentation.
    1305             :          */
    1306         958 :         if (WARN_ON_ONCE(!(rdp->grpmask & rcu_rnp_online_cpus(rnp)))) {
    1307           0 :                 bool onl;
    1308           0 :                 struct rcu_node *rnp1;
    1309             : 
    1310           0 :                 pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n",
    1311             :                         __func__, rnp->grplo, rnp->grphi, rnp->level,
    1312             :                         (long)rnp->gp_seq, (long)rnp->completedqs);
    1313           0 :                 for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent)
    1314           0 :                         pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx ->rcu_gp_init_mask %#lx\n",
    1315             :                                 __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask);
    1316           0 :                 onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp));
    1317           0 :                 pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n",
    1318             :                         __func__, rdp->cpu, ".o"[onl],
    1319             :                         (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
    1320             :                         (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
    1321           0 :                 return 1; /* Break things loose after complaining. */
    1322             :         }
    1323             : 
    1324             :         /*
    1325             :          * A CPU running for an extended time within the kernel can
    1326             :          * delay RCU grace periods: (1) At age jiffies_to_sched_qs,
    1327             :          * set .rcu_urgent_qs, (2) At age 2*jiffies_to_sched_qs, set
    1328             :          * both .rcu_need_heavy_qs and .rcu_urgent_qs.  Note that the
    1329             :          * unsynchronized assignments to the per-CPU rcu_need_heavy_qs
    1330             :          * variable are safe because the assignments are repeated if this
    1331             :          * CPU failed to pass through a quiescent state.  This code
    1332             :          * also checks .jiffies_resched in case jiffies_to_sched_qs
    1333             :          * is set way high.
    1334             :          */
    1335         958 :         jtsq = READ_ONCE(jiffies_to_sched_qs);
    1336         958 :         ruqp = per_cpu_ptr(&rcu_data.rcu_urgent_qs, rdp->cpu);
    1337         958 :         rnhqp = &per_cpu(rcu_data.rcu_need_heavy_qs, rdp->cpu);
    1338         958 :         if (!READ_ONCE(*rnhqp) &&
    1339         958 :             (time_after(jiffies, rcu_state.gp_start + jtsq * 2) ||
    1340         958 :              time_after(jiffies, rcu_state.jiffies_resched) ||
    1341         958 :              rcu_state.cbovld)) {
    1342           0 :                 WRITE_ONCE(*rnhqp, true);
    1343             :                 /* Store rcu_need_heavy_qs before rcu_urgent_qs. */
    1344           0 :                 smp_store_release(ruqp, true);
    1345         958 :         } else if (time_after(jiffies, rcu_state.gp_start + jtsq)) {
    1346          22 :                 WRITE_ONCE(*ruqp, true);
    1347             :         }
    1348             : 
    1349             :         /*
    1350             :          * NO_HZ_FULL CPUs can run in-kernel without rcu_sched_clock_irq!
    1351             :          * The above code handles this, but only for straight cond_resched().
    1352             :          * And some in-kernel loops check need_resched() before calling
    1353             :          * cond_resched(), which defeats the above code for CPUs that are
    1354             :          * running in-kernel with scheduling-clock interrupts disabled.
    1355             :          * So hit them over the head with the resched_cpu() hammer!
    1356             :          */
    1357         958 :         if (tick_nohz_full_cpu(rdp->cpu) &&
    1358             :             (time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) ||
    1359             :              rcu_state.cbovld)) {
    1360             :                 WRITE_ONCE(*ruqp, true);
    1361             :                 resched_cpu(rdp->cpu);
    1362         958 :                 WRITE_ONCE(rdp->last_fqs_resched, jiffies);
    1363             :         }
    1364             : 
    1365             :         /*
    1366             :          * If more than halfway to RCU CPU stall-warning time, invoke
    1367             :          * resched_cpu() more frequently to try to loosen things up a bit.
    1368             :          * Also check to see if the CPU is getting hammered with interrupts,
    1369             :          * but only once per grace period, just to keep the IPIs down to
    1370             :          * a dull roar.
    1371             :          */
    1372         958 :         if (time_after(jiffies, rcu_state.jiffies_resched)) {
    1373           0 :                 if (time_after(jiffies,
    1374             :                                READ_ONCE(rdp->last_fqs_resched) + jtsq)) {
    1375           0 :                         resched_cpu(rdp->cpu);
    1376           0 :                         WRITE_ONCE(rdp->last_fqs_resched, jiffies);
    1377             :                 }
    1378           0 :                 if (IS_ENABLED(CONFIG_IRQ_WORK) &&
    1379           0 :                     !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq &&
    1380           0 :                     (rnp->ffmask & rdp->grpmask)) {
    1381           0 :                         rdp->rcu_iw_pending = true;
    1382           0 :                         rdp->rcu_iw_gp_seq = rnp->gp_seq;
    1383           0 :                         irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
    1384             :                 }
    1385             :         }
    1386             : 
    1387             :         return 0;
    1388             : }
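The two functions above implement a snapshot-and-recheck scheme: record the CPU's dynticks counter at the start of the scan, and later treat the CPU as having passed through a quiescent state if it either was idle at snapshot time or its counter has changed since.  The standalone model below (plain userspace C, and deliberately not RCU's exact counter encoding: here the counter is bumped on every transition, so an even value means "in an extended quiescent state") captures that logic.

#include <stdio.h>

static unsigned long model_dynticks = 1;        /* odd: CPU busy in the kernel */
static unsigned long model_snap;

/* Analogous to dyntick_save_progress_counter(): snapshot, report idle now. */
static int model_save_progress_counter(void)
{
        model_snap = model_dynticks;
        return !(model_snap & 1);               /* even => in an EQS right now */
}

/* Analogous to rcu_implicit_dynticks_qs(): any EQS transition since the snapshot? */
static int model_implicit_qs(void)
{
        return model_dynticks != model_snap;
}

int main(void)
{
        printf("idle at snapshot? %d\n", model_save_progress_counter());
        model_dynticks++;                       /* CPU enters idle ...        */
        model_dynticks++;                       /* ... and becomes busy again */
        printf("quiescent state since snapshot? %d\n", model_implicit_qs());
        return 0;
}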
    1389             : 
    1390             : /* Trace-event wrapper function for trace_rcu_future_grace_period.  */
    1391       26648 : static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
    1392             :                               unsigned long gp_seq_req, const char *s)
    1393             : {
    1394       26648 :         trace_rcu_future_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
    1395       26648 :                                       gp_seq_req, rnp->level,
    1396             :                                       rnp->grplo, rnp->grphi, s);
    1397             : }
    1398             : 
    1399             : /*
    1400             :  * rcu_start_this_gp - Request the start of a particular grace period
    1401             :  * @rnp_start: The leaf node of the CPU from which to start.
    1402             :  * @rdp: The rcu_data corresponding to the CPU from which to start.
    1403             :  * @gp_seq_req: The gp_seq of the grace period to start.
    1404             :  *
    1405             :  * Start the specified grace period, as needed to handle newly arrived
    1406             :  * callbacks.  The required future grace periods are recorded in each
    1407             :  * rcu_node structure's ->gp_seq_needed field.  Returns true if there
    1408             :  * is reason to awaken the grace-period kthread.
    1409             :  *
    1410             :  * The caller must hold the specified rcu_node structure's ->lock, which
    1411             :  * is why the caller is responsible for waking the grace-period kthread.
    1412             :  *
     1413             :  * Returns true if the GP kthread needs to be awakened, else false.
    1414             :  */
    1415       12314 : static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp,
    1416             :                               unsigned long gp_seq_req)
    1417             : {
    1418       12314 :         bool ret = false;
    1419       12314 :         struct rcu_node *rnp;
    1420             : 
    1421             :         /*
    1422             :          * Use funnel locking to either acquire the root rcu_node
    1423             :          * structure's lock or bail out if the need for this grace period
    1424             :          * has already been recorded -- or if that grace period has in
    1425             :          * fact already started.  If there is already a grace period in
    1426             :          * progress in a non-leaf node, no recording is needed because the
    1427             :          * end of the grace period will scan the leaf rcu_node structures.
    1428             :          * Note that rnp_start->lock must not be released.
    1429             :          */
    1430       24628 :         raw_lockdep_assert_held_rcu_node(rnp_start);
    1431       12314 :         trace_rcu_this_gp(rnp_start, rdp, gp_seq_req, TPS("Startleaf"));
    1432       12314 :         for (rnp = rnp_start; 1; rnp = rnp->parent) {
    1433       12314 :                 if (rnp != rnp_start)
    1434           0 :                         raw_spin_lock_rcu_node(rnp);
    1435       12314 :                 if (ULONG_CMP_GE(rnp->gp_seq_needed, gp_seq_req) ||
    1436        2021 :                     rcu_seq_started(&rnp->gp_seq, gp_seq_req) ||
    1437           0 :                     (rnp != rnp_start &&
    1438           0 :                      rcu_seq_state(rcu_seq_current(&rnp->gp_seq)))) {
    1439       10293 :                         trace_rcu_this_gp(rnp, rdp, gp_seq_req,
    1440       10293 :                                           TPS("Prestarted"));
    1441       10293 :                         goto unlock_out;
    1442             :                 }
    1443        2021 :                 WRITE_ONCE(rnp->gp_seq_needed, gp_seq_req);
    1444        2021 :                 if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) {
    1445             :                         /*
    1446             :                          * We just marked the leaf or internal node, and a
    1447             :                          * grace period is in progress, which means that
    1448             :                          * rcu_gp_cleanup() will see the marking.  Bail to
    1449             :                          * reduce contention.
    1450             :                          */
    1451        1993 :                         trace_rcu_this_gp(rnp_start, rdp, gp_seq_req,
    1452        1993 :                                           TPS("Startedleaf"));
    1453        1993 :                         goto unlock_out;
    1454             :                 }
    1455          28 :                 if (rnp != rnp_start && rnp->parent != NULL)
    1456           0 :                         raw_spin_unlock_rcu_node(rnp);
    1457          28 :                 if (!rnp->parent)
    1458             :                         break;  /* At root, and perhaps also leaf. */
    1459             :         }
    1460             : 
    1461             :         /* If GP already in progress, just leave, otherwise start one. */
    1462          28 :         if (rcu_gp_in_progress()) {
    1463          12 :                 trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedleafroot"));
    1464          12 :                 goto unlock_out;
    1465             :         }
    1466          16 :         trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedroot"));
    1467          16 :         WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_INIT);
    1468          16 :         WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
    1469          16 :         if (!READ_ONCE(rcu_state.gp_kthread)) {
    1470           1 :                 trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread"));
    1471           1 :                 goto unlock_out;
    1472             :         }
    1473       12314 :         trace_rcu_grace_period(rcu_state.name, data_race(rcu_state.gp_seq), TPS("newreq"));
    1474       12314 :         ret = true;  /* Caller must wake GP kthread. */
    1475       12314 : unlock_out:
    1476             :         /* Push furthest requested GP to leaf node and rcu_data structure. */
    1477       12314 :         if (ULONG_CMP_LT(gp_seq_req, rnp->gp_seq_needed)) {
    1478           0 :                 WRITE_ONCE(rnp_start->gp_seq_needed, rnp->gp_seq_needed);
    1479           0 :                 WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
    1480             :         }
    1481       12314 :         if (rnp != rnp_start)
    1482           0 :                 raw_spin_unlock_rcu_node(rnp);
    1483       12314 :         return ret;
    1484             : }
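Funnel locking is worth seeing outside of the rcu_node specifics.  The standalone sketch below (plain userspace C with pthreads, not kernel code; all names invented for the illustration) keeps the leaf lock held for the whole walk, takes each ancestor's lock hand over hand, and bails out as soon as some node already records a request at least as new, so that under contention most requests never reach the root.  Unlike the kernel version, where the caller already holds the leaf's lock, this sketch acquires it itself to stay self-contained.

#include <pthread.h>
#include <stdbool.h>

struct funnel_node {
        pthread_mutex_t lock;
        unsigned long seq_needed;       /* newest request recorded here */
        struct funnel_node *parent;     /* NULL at the root */
};

/* Returns true if the caller must wake the worker that services the root. */
static bool funnel_request(struct funnel_node *leaf, unsigned long seq_req)
{
        struct funnel_node *node = leaf;
        bool need_wake = false;

        pthread_mutex_lock(&leaf->lock);        /* held for the whole walk */
        for (;;) {
                if (node->seq_needed >= seq_req)
                        break;                  /* already requested: bail early */
                node->seq_needed = seq_req;
                if (!node->parent) {
                        need_wake = true;       /* first request to reach the root */
                        break;
                }
                /* Hand over hand toward the root: lock parent, drop this level. */
                pthread_mutex_lock(&node->parent->lock);
                if (node != leaf)
                        pthread_mutex_unlock(&node->lock);
                node = node->parent;
        }
        if (node != leaf)
                pthread_mutex_unlock(&node->lock);
        pthread_mutex_unlock(&leaf->lock);
        return need_wake;
}

int main(void)
{
        struct funnel_node root = { .lock = PTHREAD_MUTEX_INITIALIZER, .parent = NULL };
        struct funnel_node leaf = { .lock = PTHREAD_MUTEX_INITIALIZER, .parent = &root };

        funnel_request(&leaf, 4);       /* propagates to the root: wake needed */
        funnel_request(&leaf, 4);       /* satisfied at the leaf: no wake */
        return 0;
}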
    1485             : 
    1486             : /*
    1487             :  * Clean up any old requests for the just-ended grace period.  Also return
    1488             :  * whether any additional grace periods have been requested.
    1489             :  */
    1490        2019 : static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
    1491             : {
    1492        2019 :         bool needmore;
    1493        4038 :         struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
    1494             : 
    1495        2019 :         needmore = ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed);
    1496        2019 :         if (!needmore)
    1497          15 :                 rnp->gp_seq_needed = rnp->gp_seq; /* Avoid counter wrap. */
    1498        2019 :         trace_rcu_this_gp(rnp, rdp, rnp->gp_seq,
    1499        2019 :                           needmore ? TPS("CleanupMore") : TPS("Cleanup"));
    1500        2019 :         return needmore;
    1501             : }
    1502             : 
    1503             : /*
    1504             :  * Awaken the grace-period kthread.  Don't do a self-awaken (unless in an
    1505             :  * interrupt or softirq handler, in which case we just might immediately
    1506             :  * sleep upon return, resulting in a grace-period hang), and don't bother
    1507             :  * awakening when there is nothing for the grace-period kthread to do
    1508             :  * (as in several CPUs raced to awaken, we lost), and finally don't try
    1509             :  * to awaken a kthread that has not yet been created.  If all those checks
    1510             :  * are passed, track some debug information and awaken.
    1511             :  *
    1512             :  * So why do the self-wakeup when in an interrupt or softirq handler
    1513             :  * in the grace-period kthread's context?  Because the kthread might have
    1514             :  * been interrupted just as it was going to sleep, and just after the final
    1515             :  * pre-sleep check of the awaken condition.  In this case, a wakeup really
    1516             :  * is required, and is therefore supplied.
    1517             :  */
    1518        2034 : static void rcu_gp_kthread_wake(void)
    1519             : {
    1520        2034 :         struct task_struct *t = READ_ONCE(rcu_state.gp_kthread);
    1521             : 
    1522        2034 :         if ((current == t && !in_irq() && !in_serving_softirq()) ||
    1523        1683 :             !READ_ONCE(rcu_state.gp_flags) || !t)
    1524             :                 return;
    1525        1683 :         WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
    1526        1683 :         WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq));
    1527        1683 :         swake_up_one(&rcu_state.gp_wq);
    1528             : }
    1529             : 
    1530             : /*
    1531             :  * If there is room, assign a ->gp_seq number to any callbacks on this
    1532             :  * CPU that have not already been assigned.  Also accelerate any callbacks
    1533             :  * that were previously assigned a ->gp_seq number that has since proven
    1534             :  * to be too conservative, which can happen if callbacks get assigned a
    1535             :  * ->gp_seq number while RCU is idle, but with reference to a non-root
    1536             :  * rcu_node structure.  This function is idempotent, so it does not hurt
     1537             :  * to call it repeatedly.  Returns a flag saying whether we should awaken
    1538             :  * the RCU grace-period kthread.
    1539             :  *
    1540             :  * The caller must hold rnp->lock with interrupts disabled.
    1541             :  */
    1542       18684 : static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
    1543             : {
    1544       18684 :         unsigned long gp_seq_req;
    1545       18684 :         bool ret = false;
    1546             : 
    1547       18684 :         rcu_lockdep_assert_cblist_protected(rdp);
    1548       37368 :         raw_lockdep_assert_held_rcu_node(rnp);
    1549             : 
    1550             :         /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
    1551       18684 :         if (!rcu_segcblist_pend_cbs(&rdp->cblist))
    1552             :                 return false;
    1553             : 
    1554       16532 :         trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPreAcc"));
    1555             : 
    1556             :         /*
    1557             :          * Callbacks are often registered with incomplete grace-period
    1558             :          * information.  Something about the fact that getting exact
    1559             :          * information requires acquiring a global lock...  RCU therefore
    1560             :          * makes a conservative estimate of the grace period number at which
     1561             :  * a given callback will become ready to invoke.  The following
    1562             :          * code checks this estimate and improves it when possible, thus
    1563             :          * accelerating callback invocation to an earlier grace-period
    1564             :          * number.
    1565             :          */
    1566       16532 :         gp_seq_req = rcu_seq_snap(&rcu_state.gp_seq);
    1567       16532 :         if (rcu_segcblist_accelerate(&rdp->cblist, gp_seq_req))
    1568       12314 :                 ret = rcu_start_this_gp(rnp, rdp, gp_seq_req);
    1569             : 
    1570             :         /* Trace depending on how much we were able to accelerate. */
    1571       16532 :         if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL))
    1572       18684 :                 trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccWaitCB"));
    1573             :         else
    1574       16532 :                 trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB"));
    1575             : 
    1576       18684 :         trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPostAcc"));
    1577             : 
    1578             :         return ret;
    1579             : }
    1580             : 
    1581             : /*
    1582             :  * Similar to rcu_accelerate_cbs(), but does not require that the leaf
    1583             :  * rcu_node structure's ->lock be held.  It consults the cached value
    1584             :  * of ->gp_seq_needed in the rcu_data structure, and if that indicates
    1585             :  * that a new grace-period request be made, invokes rcu_accelerate_cbs()
    1586             :  * while holding the leaf rcu_node structure's ->lock.
    1587             :  */
    1588          14 : static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
    1589             :                                         struct rcu_data *rdp)
    1590             : {
    1591          14 :         unsigned long c;
    1592          14 :         bool needwake;
    1593             : 
    1594          14 :         rcu_lockdep_assert_cblist_protected(rdp);
    1595          14 :         c = rcu_seq_snap(&rcu_state.gp_seq);
    1596          14 :         if (!READ_ONCE(rdp->gpwrap) && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
    1597             :                 /* Old request still live, so mark recent callbacks. */
    1598           1 :                 (void)rcu_segcblist_accelerate(&rdp->cblist, c);
    1599           1 :                 return;
    1600             :         }
    1601          13 :         raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
    1602          13 :         needwake = rcu_accelerate_cbs(rnp, rdp);
    1603          26 :         raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
    1604          13 :         if (needwake)
    1605           9 :                 rcu_gp_kthread_wake();
    1606             : }
    1607             : 
    1608             : /*
    1609             :  * Move any callbacks whose grace period has completed to the
    1610             :  * RCU_DONE_TAIL sublist, then compact the remaining sublists and
    1611             :  * assign ->gp_seq numbers to any callbacks in the RCU_NEXT_TAIL
    1612             :  * sublist.  This function is idempotent, so it does not hurt to
    1613             :  * invoke it repeatedly.  As long as it is not invoked -too- often...
    1614             :  * Returns true if the RCU grace-period kthread needs to be awakened.
    1615             :  *
    1616             :  * The caller must hold rnp->lock with interrupts disabled.
    1617             :  */
    1618        7622 : static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
    1619             : {
    1620        7622 :         rcu_lockdep_assert_cblist_protected(rdp);
    1621       15244 :         raw_lockdep_assert_held_rcu_node(rnp);
    1622             : 
    1623             :         /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
    1624        7622 :         if (!rcu_segcblist_pend_cbs(&rdp->cblist))
    1625             :                 return false;
    1626             : 
    1627             :         /*
    1628             :          * Find all callbacks whose ->gp_seq numbers indicate that they
    1629             :          * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
    1630             :          */
    1631        7267 :         rcu_segcblist_advance(&rdp->cblist, rnp->gp_seq);
    1632             : 
    1633             :         /* Classify any remaining callbacks. */
    1634        7267 :         return rcu_accelerate_cbs(rnp, rdp);
    1635             : }
    1636             : 
    1637             : /*
    1638             :  * Move and classify callbacks, but only if doing so won't require
    1639             :  * that the RCU grace-period kthread be awakened.
    1640             :  */
    1641             : static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp,
    1642             :                                                   struct rcu_data *rdp)
    1643             : {
    1644             :         rcu_lockdep_assert_cblist_protected(rdp);
    1645             :         if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) ||
    1646             :             !raw_spin_trylock_rcu_node(rnp))
    1647             :                 return;
    1648             :         WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
    1649             :         raw_spin_unlock_rcu_node(rnp);
    1650             : }
    1651             : 
    1652             : /*
    1653             :  * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, attempt to generate a
    1654             :  * quiescent state.  This is intended to be invoked when the CPU notices
    1655             :  * a new grace period.
    1656             :  */
    1657        5653 : static void rcu_strict_gp_check_qs(void)
    1658             : {
    1659        5653 :         if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) {
    1660             :                 rcu_read_lock();
    1661             :                 rcu_read_unlock();
    1662             :         }
    1663             : }
    1664             : 
    1665             : /*
    1666             :  * Update CPU-local rcu_data state to record the beginnings and ends of
    1667             :  * grace periods.  The caller must hold the ->lock of the leaf rcu_node
    1668             :  * structure corresponding to the current CPU, and must have irqs disabled.
    1669             :  * Returns true if the grace-period kthread needs to be awakened.
    1670             :  */
    1671        9692 : static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
    1672             : {
    1673        9692 :         bool ret = false;
    1674        9692 :         bool need_qs;
    1675        9692 :         const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
    1676             : 
    1677       19384 :         raw_lockdep_assert_held_rcu_node(rnp);
    1678             : 
    1679        9692 :         if (rdp->gp_seq == rnp->gp_seq)
    1680             :                 return false; /* Nothing to do. */
    1681             : 
    1682             :         /* Handle the ends of any preceding grace periods first. */
    1683        9692 :         if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) ||
    1684        2070 :             unlikely(READ_ONCE(rdp->gpwrap))) {
    1685        7622 :                 if (!offloaded)
    1686        7622 :                         ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */
    1687        7622 :                 rdp->core_needs_qs = false;
    1688        7622 :                 trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend"));
    1689             :         } else {
    1690        2070 :                 if (!offloaded)
    1691        2070 :                         ret = rcu_accelerate_cbs(rnp, rdp); /* Recent CBs. */
    1692        2070 :                 if (rdp->core_needs_qs)
    1693           0 :                         rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask);
    1694             :         }
    1695             : 
    1696             :         /* Now handle the beginnings of any new-to-this-CPU grace periods. */
    1697        9692 :         if (rcu_seq_new_gp(rdp->gp_seq, rnp->gp_seq) ||
    1698        2064 :             unlikely(READ_ONCE(rdp->gpwrap))) {
    1699             :                 /*
    1700             :                  * If the current grace period is waiting for this CPU,
    1701             :                  * set up to detect a quiescent state, otherwise don't
    1702             :                  * go looking for one.
    1703             :                  */
    1704        7628 :                 trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, TPS("cpustart"));
    1705        7628 :                 need_qs = !!(rnp->qsmask & rdp->grpmask);
    1706        7628 :                 rdp->cpu_no_qs.b.norm = need_qs;
    1707        7628 :                 rdp->core_needs_qs = need_qs;
    1708        7628 :                 zero_cpu_stall_ticks(rdp);
    1709             :         }
    1710        9692 :         rdp->gp_seq = rnp->gp_seq;  /* Remember new grace-period state. */
    1711        9692 :         if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap)
    1712        6955 :                 WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
    1713        9692 :         WRITE_ONCE(rdp->gpwrap, false);
    1714        9692 :         rcu_gpnum_ovf(rnp, rdp);
    1715        9692 :         return ret;
    1716             : }
    1717             : 
    1718       54025 : static void note_gp_changes(struct rcu_data *rdp)
    1719             : {
    1720       54025 :         unsigned long flags;
    1721       54025 :         bool needwake;
    1722       54025 :         struct rcu_node *rnp;
    1723             : 
    1724      108130 :         local_irq_save(flags);
    1725       54109 :         rnp = rdp->mynode;
    1726       54109 :         if ((rdp->gp_seq == rcu_seq_current(&rnp->gp_seq) &&
    1727       47761 :              !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */
    1728        6348 :             !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */
    1729       48535 :                 local_irq_restore(flags);
    1730       48562 :                 return;
    1731             :         }
    1732        5653 :         needwake = __note_gp_changes(rnp, rdp);
    1733       11306 :         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
    1734        5653 :         rcu_strict_gp_check_qs();
    1735        5653 :         if (needwake)
    1736           6 :                 rcu_gp_kthread_wake();
    1737             : }
    1738             : 
    1739        6059 : static void rcu_gp_slow(int delay)
    1740             : {
    1741        6059 :         if (delay > 0 &&
    1742           0 :             !(rcu_seq_ctr(rcu_state.gp_seq) %
    1743           0 :               (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
    1744           0 :                 schedule_timeout_idle(delay);
    1745        6059 : }
    1746             : 
    1747             : static unsigned long sleep_duration;
    1748             : 
    1749             : /* Allow rcutorture to stall the grace-period kthread. */
    1750           0 : void rcu_gp_set_torture_wait(int duration)
    1751             : {
    1752           0 :         if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST) && duration > 0)
    1753           0 :                 WRITE_ONCE(sleep_duration, duration);
    1754           0 : }
    1755             : EXPORT_SYMBOL_GPL(rcu_gp_set_torture_wait);
    1756             : 
    1757             : /* Actually implement the aforementioned wait. */
    1758        6495 : static void rcu_gp_torture_wait(void)
    1759             : {
    1760        6495 :         unsigned long duration;
    1761             : 
    1762        6495 :         if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST))
    1763        6495 :                 return;
    1764             :         duration = xchg(&sleep_duration, 0UL);
    1765             :         if (duration > 0) {
    1766             :                 pr_alert("%s: Waiting %lu jiffies\n", __func__, duration);
    1767             :                 schedule_timeout_idle(duration);
    1768             :                 pr_alert("%s: Wait complete\n", __func__);
    1769             :         }
    1770             : }
    1771             : 
    1772             : /*
    1773             :  * Handler for on_each_cpu() to invoke the target CPU's RCU core
    1774             :  * processing.
    1775             :  */
    1776             : static void rcu_strict_gp_boundary(void *unused)
    1777             : {
    1778             :         invoke_rcu_core();
    1779             : }
    1780             : 
    1781             : /*
    1782             :  * Initialize a new grace period.  Return false if no grace period required.
    1783             :  */
    1784        2020 : static bool rcu_gp_init(void)
    1785             : {
    1786        2020 :         unsigned long firstseq;
    1787        2020 :         unsigned long flags;
    1788        2020 :         unsigned long oldmask;
    1789        2020 :         unsigned long mask;
    1790        2020 :         struct rcu_data *rdp;
    1791        2020 :         struct rcu_node *rnp = rcu_get_root();
    1792             : 
    1793        2020 :         WRITE_ONCE(rcu_state.gp_activity, jiffies);
    1794        2020 :         raw_spin_lock_irq_rcu_node(rnp);
    1795        2020 :         if (!READ_ONCE(rcu_state.gp_flags)) {
    1796             :                 /* Spurious wakeup, tell caller to go back to sleep.  */
    1797           0 :                 raw_spin_unlock_irq_rcu_node(rnp);
    1798           0 :                 return false;
    1799             :         }
    1800        2020 :         WRITE_ONCE(rcu_state.gp_flags, 0); /* Clear all flags: New GP. */
    1801             : 
    1802        2020 :         if (WARN_ON_ONCE(rcu_gp_in_progress())) {
    1803             :                 /*
    1804             :                  * Grace period already in progress, don't start another.
    1805             :                  * Not supposed to be able to happen.
    1806             :                  */
    1807           0 :                 raw_spin_unlock_irq_rcu_node(rnp);
    1808           0 :                 return false;
    1809             :         }
    1810             : 
    1811             :         /* Advance to a new grace period and initialize state. */
    1812        2020 :         record_gp_stall_check_time();
    1813             :         /* Record GP times before starting GP, hence rcu_seq_start(). */
    1814        2020 :         rcu_seq_start(&rcu_state.gp_seq);
    1815        2020 :         ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
    1816        2020 :         trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
    1817        4040 :         raw_spin_unlock_irq_rcu_node(rnp);
    1818             : 
    1819             :         /*
    1820             :          * Apply per-leaf buffered online and offline operations to
    1821             :          * the rcu_node tree. Note that this new grace period need not
    1822             :          * wait for subsequent online CPUs, and that RCU hooks in the CPU
    1823             :          * offlining path, when combined with checks in this function,
    1824             :          * will handle CPUs that are currently going offline or that will
    1825             :          * go offline later.  Please also refer to "Hotplug CPU" section
    1826             :          * of RCU's Requirements documentation.
    1827             :          */
    1828        2020 :         WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF);
    1829        4040 :         rcu_for_each_leaf_node(rnp) {
    1830        2020 :                 smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values.
    1831        2020 :                 firstseq = READ_ONCE(rnp->ofl_seq);
    1832        2020 :                 if (firstseq & 0x1)
    1833           0 :                         while (firstseq == READ_ONCE(rnp->ofl_seq))
    1834           0 :                                 schedule_timeout_idle(1);  // Can't wake unless RCU is watching.
    1835        2020 :                 smp_mb(); // Pair with barriers used when updating ->ofl_seq to even values.
    1836        2020 :                 raw_spin_lock(&rcu_state.ofl_lock);
    1837        2020 :                 raw_spin_lock_irq_rcu_node(rnp);
    1838        2020 :                 if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
    1839        2017 :                     !rnp->wait_blkd_tasks) {
    1840             :                         /* Nothing to do on this leaf rcu_node structure. */
    1841        4034 :                         raw_spin_unlock_irq_rcu_node(rnp);
    1842        2017 :                         raw_spin_unlock(&rcu_state.ofl_lock);
    1843        2017 :                         continue;
    1844             :                 }
    1845             : 
    1846             :                 /* Record old state, apply changes to ->qsmaskinit field. */
    1847           3 :                 oldmask = rnp->qsmaskinit;
    1848           3 :                 rnp->qsmaskinit = rnp->qsmaskinitnext;
    1849             : 
    1850             :                 /* If zero-ness of ->qsmaskinit changed, propagate up tree. */
    1851           3 :                 if (!oldmask != !rnp->qsmaskinit) {
    1852           1 :                         if (!oldmask) { /* First online CPU for rcu_node. */
    1853           1 :                                 if (!rnp->wait_blkd_tasks) /* Ever offline? */
    1854           1 :                                         rcu_init_new_rnp(rnp);
    1855           0 :                         } else if (rcu_preempt_has_tasks(rnp)) {
    1856             :                                 rnp->wait_blkd_tasks = true; /* blocked tasks */
    1857             :                         } else { /* Last offline CPU and can propagate. */
    1858           0 :                                 rcu_cleanup_dead_rnp(rnp);
    1859             :                         }
    1860             :                 }
    1861             : 
    1862             :                 /*
    1863             :                  * If all waited-on tasks from prior grace period are
    1864             :                  * done, and if all this rcu_node structure's CPUs are
    1865             :                  * still offline, propagate up the rcu_node tree and
    1866             :                  * clear ->wait_blkd_tasks.  Otherwise, if one of this
    1867             :                  * rcu_node structure's CPUs has since come back online,
    1868             :                  * simply clear ->wait_blkd_tasks.
    1869             :                  */
    1870           3 :                 if (rnp->wait_blkd_tasks &&
    1871           0 :                     (!rcu_preempt_has_tasks(rnp) || rnp->qsmaskinit)) {
    1872           0 :                         rnp->wait_blkd_tasks = false;
    1873           0 :                         if (!rnp->qsmaskinit)
    1874           0 :                                 rcu_cleanup_dead_rnp(rnp);
    1875             :                 }
    1876             : 
    1877           6 :                 raw_spin_unlock_irq_rcu_node(rnp);
    1878           3 :                 raw_spin_unlock(&rcu_state.ofl_lock);
    1879             :         }
    1880        2020 :         rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */
    1881             : 
    1882             :         /*
    1883             :          * Set the quiescent-state-needed bits in all the rcu_node
    1884             :          * structures for all currently online CPUs in breadth-first
    1885             :          * order, starting from the root rcu_node structure, relying on the
    1886             :          * layout of the tree within the rcu_state.node[] array.  Note that
    1887             :          * other CPUs will access only the leaves of the hierarchy, thus
    1888             :          * seeing that no grace period is in progress, at least until the
    1889             :          * corresponding leaf node has been initialized.
    1890             :          *
    1891             :          * The grace period cannot complete until the initialization
    1892             :          * process finishes, because this kthread handles both.
    1893             :          */
    1894        2020 :         WRITE_ONCE(rcu_state.gp_state, RCU_GP_INIT);
    1895        4040 :         rcu_for_each_node_breadth_first(rnp) {
    1896        2020 :                 rcu_gp_slow(gp_init_delay);
    1897        2020 :                 raw_spin_lock_irqsave_rcu_node(rnp, flags);
    1898        2020 :                 rdp = this_cpu_ptr(&rcu_data);
    1899        2020 :                 rcu_preempt_check_blocked_tasks(rnp);
    1900        2020 :                 rnp->qsmask = rnp->qsmaskinit;
    1901        2020 :                 WRITE_ONCE(rnp->gp_seq, rcu_state.gp_seq);
    1902        2020 :                 if (rnp == rdp->mynode)
    1903        2020 :                         (void)__note_gp_changes(rnp, rdp);
    1904        2020 :                 rcu_preempt_boost_start_gp(rnp);
    1905        2020 :                 trace_rcu_grace_period_init(rcu_state.name, rnp->gp_seq,
    1906        2020 :                                             rnp->level, rnp->grplo,
    1907             :                                             rnp->grphi, rnp->qsmask);
    1908             :                 /* Quiescent states for tasks on any now-offline CPUs. */
    1909        2020 :                 mask = rnp->qsmask & ~rnp->qsmaskinitnext;
    1910        2020 :                 rnp->rcu_gp_init_mask = mask;
    1911        2020 :                 if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp))
    1912           0 :                         rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
    1913             :                 else
    1914        4040 :                         raw_spin_unlock_irq_rcu_node(rnp);
    1915        2020 :                 cond_resched_tasks_rcu_qs();
    1916        2020 :                 WRITE_ONCE(rcu_state.gp_activity, jiffies);
    1917             :         }
    1918             : 
    1919             :         // If strict, make all CPUs aware of new grace period.
    1920             :         if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
    1921             :                 on_each_cpu(rcu_strict_gp_boundary, NULL, 0);
    1922             : 
    1923             :         return true;
    1924             : }
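/*
 * The "firstseq & 0x1" wait in rcu_gp_init() above follows a classic
 * sequence-count protocol: the CPU-hotplug path moves ->ofl_seq to an
 * odd value while an online/offline update is in flight and back to an
 * even value when it is done, and the grace-period kthread waits out
 * any odd value it samples before scanning the leaf.  What follows is
 * a minimal user-space sketch of that idea using C11 atomics and a
 * hypothetical hotplug_seq counter; it is an illustration only, not
 * the kernel's implementation.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_ulong hotplug_seq;	/* even: idle, odd: update in flight */

static void hotplug_update_begin(void)
{
	atomic_fetch_add_explicit(&hotplug_seq, 1, memory_order_release); /* now odd */
}

static void hotplug_update_end(void)
{
	atomic_fetch_add_explicit(&hotplug_seq, 1, memory_order_release); /* even again */
}

/* Grace-period side: do not proceed while an update is in flight. */
static void wait_for_quiet_hotplug(void)
{
	unsigned long seq = atomic_load_explicit(&hotplug_seq, memory_order_acquire);

	while (seq & 0x1)	/* odd => updater active; poll until it finishes */
		seq = atomic_load_explicit(&hotplug_seq, memory_order_acquire);
}

int main(void)
{
	hotplug_update_begin();
	hotplug_update_end();
	wait_for_quiet_hotplug();	/* returns immediately: seq is even */
	printf("hotplug_seq=%lu (even, safe to scan)\n", atomic_load(&hotplug_seq));
	return 0;
}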
    1925             : 
    1926             : /*
    1927             :  * Helper function for swait_event_idle_exclusive() wakeup at force-quiescent-state
    1928             :  * time.
    1929             :  */
    1930       12720 : static bool rcu_gp_fqs_check_wake(int *gfp)
    1931             : {
    1932       12720 :         struct rcu_node *rnp = rcu_get_root();
    1933             : 
    1934             :         // If under overload conditions, force an immediate FQS scan.
    1935       12720 :         if (*gfp & RCU_GP_FLAG_OVLD)
    1936             :                 return true;
    1937             : 
    1938             :         // Someone like call_rcu() requested a force-quiescent-state scan.
    1939       12720 :         *gfp = READ_ONCE(rcu_state.gp_flags);
    1940       12720 :         if (*gfp & RCU_GP_FLAG_FQS)
    1941             :                 return true;
    1942             : 
    1943             :         // The current grace period has completed.
    1944       11055 :         if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp))
    1945         353 :                 return true;
    1946             : 
    1947             :         return false;
    1948             : }
    1949             : 
    1950             : /*
    1951             :  * Do one round of quiescent-state forcing.
    1952             :  */
    1953        2456 : static void rcu_gp_fqs(bool first_time)
    1954             : {
    1955        2456 :         struct rcu_node *rnp = rcu_get_root();
    1956             : 
    1957        2456 :         WRITE_ONCE(rcu_state.gp_activity, jiffies);
    1958        2456 :         rcu_state.n_force_qs++;
    1959        2456 :         if (first_time) {
    1960             :                 /* Collect dyntick-idle snapshots. */
    1961        1510 :                 force_qs_rnp(dyntick_save_progress_counter);
    1962             :         } else {
    1963             :                 /* Handle dyntick-idle and offline CPUs. */
    1964         946 :                 force_qs_rnp(rcu_implicit_dynticks_qs);
    1965             :         }
    1966             :         /* Clear flag to prevent immediate re-entry. */
    1967        2456 :         if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) {
    1968         352 :                 raw_spin_lock_irq_rcu_node(rnp);
    1969         352 :                 WRITE_ONCE(rcu_state.gp_flags,
    1970             :                            READ_ONCE(rcu_state.gp_flags) & ~RCU_GP_FLAG_FQS);
    1971         704 :                 raw_spin_unlock_irq_rcu_node(rnp);
    1972             :         }
    1973        2456 : }
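/*
 * rcu_gp_fqs() above forces quiescent states in two passes: the first
 * pass snapshots a per-CPU "dynticks" counter, and later passes check
 * whether that counter shows the CPU idle now or shows that it passed
 * through idle since the snapshot.  Below is a minimal user-space
 * sketch of the snapshot/compare idea.  The even-while-idle convention
 * and the names (cpu_dynticks, dynticks_snap, cpu_in_eqs, ...) are
 * assumptions made for this illustration, not the kernel's encoding.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

static atomic_ulong cpu_dynticks[NR_CPUS];	/* bumped at idle entry and exit */
static unsigned long dynticks_snap[NR_CPUS];

static void idle_enter(int cpu) { atomic_fetch_add(&cpu_dynticks[cpu], 1); }
static void idle_exit(int cpu)  { atomic_fetch_add(&cpu_dynticks[cpu], 1); }

/* Sketch convention: counters start at 0 and are even while idle. */
static bool cpu_in_eqs(unsigned long snap) { return !(snap & 0x1); }

static void fqs_first_pass(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		dynticks_snap[cpu] = atomic_load(&cpu_dynticks[cpu]);
}

static bool fqs_later_pass_reports_qs(int cpu)
{
	unsigned long cur = atomic_load(&cpu_dynticks[cpu]);

	/* Idle now, or went idle at least once since the snapshot. */
	return cpu_in_eqs(cur) || cur != dynticks_snap[cpu];
}

int main(void)
{
	idle_exit(1);		/* CPU 1 becomes non-idle (odd) */
	fqs_first_pass();
	idle_enter(1);		/* CPU 1 goes idle again */
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%d: %s\n", cpu,
		       fqs_later_pass_reports_qs(cpu) ? "QS" : "still holding out");
	return 0;
}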
    1974             : 
    1975             : /*
    1976             :  * Loop doing repeated quiescent-state forcing until the grace period ends.
    1977             :  */
    1978        2020 : static void rcu_gp_fqs_loop(void)
    1979             : {
    1980        2020 :         bool first_gp_fqs;
    1981        2020 :         int gf = 0;
    1982        2020 :         unsigned long j;
    1983        2020 :         int ret;
    1984        2020 :         struct rcu_node *rnp = rcu_get_root();
    1985             : 
    1986        2020 :         first_gp_fqs = true;
    1987        2020 :         j = READ_ONCE(jiffies_till_first_fqs);
    1988        2020 :         if (rcu_state.cbovld)
    1989           0 :                 gf = RCU_GP_FLAG_OVLD;
    1990             :         ret = 0;
    1991        4476 :         for (;;) {
    1992        4476 :                 if (!ret) {
    1993        4476 :                         WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j);
    1994             :                         /*
    1995             :                          * jiffies_force_qs before RCU_GP_WAIT_FQS state
    1996             :                          * update; required for stall checks.
    1997             :                          */
    1998        4476 :                         smp_wmb();
    1999        4476 :                         WRITE_ONCE(rcu_state.jiffies_kick_kthreads,
    2000             :                                    jiffies + (j ? 3 * j : 2));
    2001             :                 }
    2002        4476 :                 trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
    2003        4476 :                                        TPS("fqswait"));
    2004        4476 :                 WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_FQS);
    2005        8596 :                 ret = swait_event_idle_timeout_exclusive(
    2006             :                                 rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j);
    2007        4475 :                 rcu_gp_torture_wait();
    2008        4475 :                 WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS);
    2009             :                 /* Locking provides needed memory barriers. */
    2010             :                 /* If grace period done, leave loop. */
    2011        4475 :                 if (!READ_ONCE(rnp->qsmask) &&
    2012        2019 :                     !rcu_preempt_blocked_readers_cgp(rnp))
    2013             :                         break;
    2014             :                 /* If time for quiescent-state forcing, do it. */
    2015        2456 :                 if (!time_after(rcu_state.jiffies_force_qs, jiffies) ||
    2016           0 :                     (gf & (RCU_GP_FLAG_FQS | RCU_GP_FLAG_OVLD))) {
    2017        2456 :                         trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
    2018        2456 :                                                TPS("fqsstart"));
    2019        2456 :                         rcu_gp_fqs(first_gp_fqs);
    2020        2456 :                         gf = 0;
    2021        2456 :                         if (first_gp_fqs) {
    2022        1510 :                                 first_gp_fqs = false;
    2023        3020 :                                 gf = rcu_state.cbovld ? RCU_GP_FLAG_OVLD : 0;
    2024             :                         }
    2025        2456 :                         trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
    2026        2456 :                                                TPS("fqsend"));
    2027        2456 :                         cond_resched_tasks_rcu_qs();
    2028        2456 :                         WRITE_ONCE(rcu_state.gp_activity, jiffies);
    2029        2456 :                         ret = 0; /* Force full wait till next FQS. */
    2030        2456 :                         j = READ_ONCE(jiffies_till_next_fqs);
    2031             :                 } else {
    2032             :                         /* Deal with stray signal. */
    2033           0 :                         cond_resched_tasks_rcu_qs();
    2034           0 :                         WRITE_ONCE(rcu_state.gp_activity, jiffies);
    2035           0 :                         WARN_ON(signal_pending(current));
    2036           0 :                         trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
    2037           0 :                                                TPS("fqswaitsig"));
    2038           0 :                         ret = 1; /* Keep old FQS timing. */
    2039           0 :                         j = jiffies;
    2040           0 :                         if (time_after(jiffies, rcu_state.jiffies_force_qs))
    2041             :                                 j = 1;
    2042             :                         else
    2043           0 :                                 j = rcu_state.jiffies_force_qs - j;
    2044           0 :                         gf = 0;
    2045             :                 }
    2046             :         }
    2047        2019 : }
    2048             : 
    2049             : /*
    2050             :  * Clean up after the old grace period.
    2051             :  */
    2052        2019 : static void rcu_gp_cleanup(void)
    2053             : {
    2054        2019 :         int cpu;
    2055        2019 :         bool needgp = false;
    2056        2019 :         unsigned long gp_duration;
    2057        2019 :         unsigned long new_gp_seq;
    2058        2019 :         bool offloaded;
    2059        2019 :         struct rcu_data *rdp;
    2060        2019 :         struct rcu_node *rnp = rcu_get_root();
    2061        2019 :         struct swait_queue_head *sq;
    2062             : 
    2063        2019 :         WRITE_ONCE(rcu_state.gp_activity, jiffies);
    2064        2019 :         raw_spin_lock_irq_rcu_node(rnp);
    2065        2019 :         rcu_state.gp_end = jiffies;
    2066        2019 :         gp_duration = rcu_state.gp_end - rcu_state.gp_start;
    2067        2019 :         if (gp_duration > rcu_state.gp_max)
    2068           6 :                 rcu_state.gp_max = gp_duration;
    2069             : 
    2070             :         /*
    2071             :          * We know the grace period is complete, but to everyone else
    2072             :          * it appears to still be ongoing.  But it is also the case
    2073             :          * that to everyone else it looks like there is nothing that
    2074             :          * they can do to advance the grace period.  It is therefore
    2075             :          * safe for us to drop the lock in order to mark the grace
    2076             :          * period as completed in all of the rcu_node structures.
    2077             :          */
    2078        4038 :         raw_spin_unlock_irq_rcu_node(rnp);
    2079             : 
    2080             :         /*
    2081             :          * Propagate new ->gp_seq value to rcu_node structures so that
    2082             :          * other CPUs don't have to wait until the start of the next grace
    2083             :          * period to process their callbacks.  This also avoids some nasty
    2084             :          * RCU grace-period initialization races by forcing the end of
    2085             :          * the current grace period to be completely recorded in all of
    2086             :          * the rcu_node structures before the beginning of the next grace
    2087             :          * period is recorded in any of the rcu_node structures.
    2088             :          */
    2089        2019 :         new_gp_seq = rcu_state.gp_seq;
    2090        2019 :         rcu_seq_end(&new_gp_seq);
    2091        6057 :         rcu_for_each_node_breadth_first(rnp) {
    2092        2019 :                 raw_spin_lock_irq_rcu_node(rnp);
    2093        2019 :                 if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
    2094        2019 :                         dump_blkd_tasks(rnp, 10);
    2095        2019 :                 WARN_ON_ONCE(rnp->qsmask);
    2096        2019 :                 WRITE_ONCE(rnp->gp_seq, new_gp_seq);
    2097        2019 :                 rdp = this_cpu_ptr(&rcu_data);
    2098        2019 :                 if (rnp == rdp->mynode)
    2099        4038 :                         needgp = __note_gp_changes(rnp, rdp) || needgp;
    2100             :                 /* smp_mb() provided by prior unlock-lock pair. */
    2101        4038 :                 needgp = rcu_future_gp_cleanup(rnp) || needgp;
    2102             :                 // Reset overload indication for CPUs no longer overloaded
    2103        2019 :                 if (rcu_is_leaf_node(rnp))
    2104        2019 :                         for_each_leaf_node_cpu_mask(rnp, cpu, rnp->cbovldmask) {
    2105           0 :                                 rdp = per_cpu_ptr(&rcu_data, cpu);
    2106           0 :                                 check_cb_ovld_locked(rdp, rnp);
    2107             :                         }
    2108        2019 :                 sq = rcu_nocb_gp_get(rnp);
    2109        4038 :                 raw_spin_unlock_irq_rcu_node(rnp);
    2110        2019 :                 rcu_nocb_gp_cleanup(sq);
    2111        2019 :                 cond_resched_tasks_rcu_qs();
    2112        2019 :                 WRITE_ONCE(rcu_state.gp_activity, jiffies);
    2113        2019 :                 rcu_gp_slow(gp_cleanup_delay);
    2114             :         }
    2115        2019 :         rnp = rcu_get_root();
    2116        2019 :         raw_spin_lock_irq_rcu_node(rnp); /* GP before ->gp_seq update. */
    2117             : 
    2118             :         /* Declare grace period done, trace first to use old GP number. */
    2119        2019 :         trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end"));
    2120        2019 :         rcu_seq_end(&rcu_state.gp_seq);
    2121        2019 :         ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
    2122        2019 :         WRITE_ONCE(rcu_state.gp_state, RCU_GP_IDLE);
    2123             :         /* Check for GP requests since above loop. */
    2124        2019 :         rdp = this_cpu_ptr(&rcu_data);
    2125        2019 :         if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) {
    2126           0 :                 trace_rcu_this_gp(rnp, rdp, rnp->gp_seq_needed,
    2127           0 :                                   TPS("CleanupMore"));
    2128           0 :                 needgp = true;
    2129             :         }
    2130             :         /* Advance CBs to reduce false positives below. */
    2131        2019 :         offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
    2132        2019 :         if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) {
    2133        2004 :                 WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT);
    2134        2004 :                 WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
    2135        4023 :                 trace_rcu_grace_period(rcu_state.name,
    2136             :                                        rcu_state.gp_seq,
    2137        2004 :                                        TPS("newreq"));
    2138             :         } else {
    2139          15 :                 WRITE_ONCE(rcu_state.gp_flags,
    2140             :                            rcu_state.gp_flags & RCU_GP_FLAG_INIT);
    2141             :         }
    2142        4038 :         raw_spin_unlock_irq_rcu_node(rnp);
    2143             : 
    2144             :         // If strict, make all CPUs aware of the end of the old grace period.
    2145        2019 :         if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
    2146             :                 on_each_cpu(rcu_strict_gp_boundary, NULL, 0);
    2147        2019 : }
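/*
 * rcu_gp_init() and rcu_gp_cleanup() bracket each grace period with
 * rcu_seq_start() and rcu_seq_end() on rcu_state.gp_seq.  The sketch
 * below shows the general idea of such a packed sequence number: the
 * low bits carry an "in progress" state and the upper bits count
 * completed grace periods, so one snapshot comparison answers both
 * "has a GP started since?" and "has it finished?".  The two-bit state
 * field mirrors my reading of the rcu_seq helpers, but take this as an
 * illustrative sketch rather than the authoritative definition.
 */
#include <stdbool.h>
#include <stdio.h>

#define SEQ_CTR_SHIFT	2
#define SEQ_STATE_MASK	((1UL << SEQ_CTR_SHIFT) - 1)

static unsigned long seq_state(unsigned long s) { return s & SEQ_STATE_MASK; }

static void seq_start(unsigned long *sp) { *sp += 1; }	/* now "in progress" */

static void seq_end(unsigned long *sp)
{
	/* Clear the state bits and advance the counter by one full GP. */
	*sp = (*sp & ~SEQ_STATE_MASK) + (1UL << SEQ_CTR_SHIFT);
}

/* Has a full grace period elapsed since the snapshot "old"? */
static bool seq_completed(unsigned long old, unsigned long cur)
{
	return cur >= ((old + SEQ_STATE_MASK) & ~SEQ_STATE_MASK) + (1UL << SEQ_CTR_SHIFT);
}

int main(void)
{
	unsigned long gp_seq = 0, snap;

	snap = gp_seq;		/* caller snapshots before waiting */
	seq_start(&gp_seq);	/* state becomes 1: GP in progress */
	printf("in progress: state=%lu\n", seq_state(gp_seq));
	seq_end(&gp_seq);	/* counter advances, state back to 0 */
	printf("completed for snapshot? %s\n", seq_completed(snap, gp_seq) ? "yes" : "no");
	return 0;
}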
    2148             : 
    2149             : /*
    2150             :  * Body of kthread that handles grace periods.
    2151             :  */
    2152           1 : static int __noreturn rcu_gp_kthread(void *unused)
    2153             : {
    2154           1 :         rcu_bind_gp_kthread();
    2155        4039 :         for (;;) {
    2156             : 
    2157             :                 /* Handle grace-period start. */
    2158        2020 :                 for (;;) {
    2159        2020 :                         trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
    2160        2020 :                                                TPS("reqwait"));
    2161        2020 :                         WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_GPS);
    2162        2035 :                         swait_event_idle_exclusive(rcu_state.gp_wq,
    2163             :                                          READ_ONCE(rcu_state.gp_flags) &
    2164             :                                          RCU_GP_FLAG_INIT);
    2165        2020 :                         rcu_gp_torture_wait();
    2166        2020 :                         WRITE_ONCE(rcu_state.gp_state, RCU_GP_DONE_GPS);
    2167             :                         /* Locking provides needed memory barrier. */
    2168        2020 :                         if (rcu_gp_init())
    2169             :                                 break;
    2170           0 :                         cond_resched_tasks_rcu_qs();
    2171           0 :                         WRITE_ONCE(rcu_state.gp_activity, jiffies);
    2172           0 :                         WARN_ON(signal_pending(current));
    2173        2020 :                         trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
    2174        2019 :                                                TPS("reqwaitsig"));
    2175             :                 }
    2176             : 
    2177             :                 /* Handle quiescent-state forcing. */
    2178        2020 :                 rcu_gp_fqs_loop();
    2179             : 
    2180             :                 /* Handle grace-period end. */
    2181        2019 :                 WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANUP);
    2182        2019 :                 rcu_gp_cleanup();
    2183        2019 :                 WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANED);
    2184             :         }
    2185             : }
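/*
 * rcu_gp_kthread() is structurally a small state machine: sleep until
 * RCU_GP_FLAG_INIT is set, initialize a grace period, loop forcing
 * quiescent states until it completes, then clean up, leaving a
 * gp_state breadcrumb at every step for the stall-warning code.  The
 * schematic below just walks those phases for a few fake grace
 * periods; it performs no real waiting and is not kernel code.
 */
#include <stdio.h>

enum gp_state { GP_IDLE, GP_WAIT_GPS, GP_INIT, GP_WAIT_FQS, GP_CLEANUP, GP_CLEANED };

static enum gp_state gp_state;
static unsigned long gp_seq;

static void record(enum gp_state s) { gp_state = s; }	/* like WRITE_ONCE(rcu_state.gp_state, ...) */

static void one_grace_period(void)
{
	record(GP_WAIT_GPS);	/* would sleep until a GP is requested */
	record(GP_INIT);	/* rcu_gp_init(): mark GP started, set qsmask bits */
	record(GP_WAIT_FQS);	/* rcu_gp_fqs_loop(): force QSes until all bits clear */
	record(GP_CLEANUP);	/* rcu_gp_cleanup(): propagate the end, maybe request another */
	gp_seq++;
	record(GP_CLEANED);
}

int main(void)
{
	for (int i = 0; i < 3; i++)
		one_grace_period();
	printf("completed %lu grace periods, final state %d\n", gp_seq, (int)gp_state);
	return 0;
}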
    2186             : 
    2187             : /*
    2188             :  * Report a full set of quiescent states to the rcu_state data structure.
    2189             :  * Invoke rcu_gp_kthread_wake() to awaken the grace-period kthread if
    2190             :  * another grace period is required.  Whether we wake the grace-period
    2191             :  * kthread or it awakens itself for the next round of quiescent-state
    2192             :  * forcing, that kthread will clean up after the just-completed grace
    2193             :  * period.  Note that the caller must hold rnp->lock, which is released
    2194             :  * before return.
    2195             :  */
    2196        2019 : static void rcu_report_qs_rsp(unsigned long flags)
    2197             :         __releases(rcu_get_root()->lock)
    2198             : {
    2199        4038 :         raw_lockdep_assert_held_rcu_node(rcu_get_root());
    2200        2019 :         WARN_ON_ONCE(!rcu_gp_in_progress());
    2201        2019 :         WRITE_ONCE(rcu_state.gp_flags,
    2202             :                    READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS);
    2203        4038 :         raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(), flags);
    2204        2019 :         rcu_gp_kthread_wake();
    2205        2019 : }
    2206             : 
    2207             : /*
    2208             :  * Similar to rcu_report_qs_rdp(), for which it is a helper function.
    2209             :  * Allows quiescent states for a group of CPUs to be reported at one go
    2210             :  * to the specified rcu_node structure, though all the CPUs in the group
    2211             :  * must be represented by the same rcu_node structure (which need not be a
    2212             :  * leaf rcu_node structure, though it often will be).  The gps parameter
    2213             :  * is the grace-period snapshot, which means that the quiescent states
    2214             :  * are valid only if rnp->gp_seq is equal to gps.  That structure's lock
    2215             :  * must be held upon entry, and it is released before return.
    2216             :  *
    2217             :  * As a special case, if mask is zero, the bit-already-cleared check is
    2218             :  * disabled.  This allows propagating quiescent state due to resumed tasks
    2219             :  * during grace-period initialization.
    2220             :  */
    2221        7937 : static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
    2222             :                               unsigned long gps, unsigned long flags)
    2223             :         __releases(rnp->lock)
    2224             : {
    2225        7937 :         unsigned long oldmask = 0;
    2226        7937 :         struct rcu_node *rnp_c;
    2227             : 
    2228       15874 :         raw_lockdep_assert_held_rcu_node(rnp);
    2229             : 
    2230             :         /* Walk up the rcu_node hierarchy. */
    2231        7937 :         for (;;) {
    2232        7937 :                 if ((!(rnp->qsmask & mask) && mask) || rnp->gp_seq != gps) {
    2233             : 
    2234             :                         /*
    2235             :                          * Our bit has already been cleared, or the
    2236             :                          * relevant grace period is already over, so done.
    2237             :                          */
    2238           0 :                         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
    2239           0 :                         return;
    2240             :                 }
    2241        7937 :                 WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
    2242        7937 :                 WARN_ON_ONCE(!rcu_is_leaf_node(rnp) &&
    2243             :                              rcu_preempt_blocked_readers_cgp(rnp));
    2244        7937 :                 WRITE_ONCE(rnp->qsmask, rnp->qsmask & ~mask);
    2245        7937 :                 trace_rcu_quiescent_state_report(rcu_state.name, rnp->gp_seq,
    2246             :                                                  mask, rnp->qsmask, rnp->level,
    2247             :                                                  rnp->grplo, rnp->grphi,
    2248        7937 :                                                  !!rnp->gp_tasks);
    2249        7937 :                 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
    2250             : 
    2251             :                         /* Other bits still set at this level, so done. */
    2252       11836 :                         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
    2253        5918 :                         return;
    2254             :                 }
    2255        2019 :                 rnp->completedqs = rnp->gp_seq;
    2256        2019 :                 mask = rnp->grpmask;
    2257        2019 :                 if (rnp->parent == NULL) {
    2258             : 
    2259             :                         /* No more levels.  Exit loop holding root lock. */
    2260             : 
    2261             :                         break;
    2262             :                 }
    2263           0 :                 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
    2264           0 :                 rnp_c = rnp;
    2265           0 :                 rnp = rnp->parent;
    2266           0 :                 raw_spin_lock_irqsave_rcu_node(rnp, flags);
    2267           0 :                 oldmask = READ_ONCE(rnp_c->qsmask);
    2268             :         }
    2269             : 
    2270             :         /*
    2271             :          * Get here if we are the last CPU to pass through a quiescent
    2272             :          * state for this grace period.  Invoke rcu_report_qs_rsp()
    2273             :          * to clean up and start the next grace period if one is needed.
    2274             :          */
    2275        2019 :         rcu_report_qs_rsp(flags); /* releases rnp->lock. */
    2276             : }
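/*
 * rcu_report_qs_rnp() clears the reported CPUs' bits in a leaf's
 * ->qsmask and, each time a node's mask reaches zero, clears that
 * node's own bit in its parent, walking upward until some bit is still
 * set or the root itself empties (which means the grace period can
 * end).  A toy two-level version of that walk, with made-up "node"
 * structures and no locking, is sketched below.
 */
#include <stdbool.h>
#include <stdio.h>

struct node {
	unsigned long qsmask;	/* which children still owe a quiescent state */
	unsigned long grpmask;	/* this node's bit in its parent */
	struct node *parent;
};

/* Returns true if this report completed the (toy) grace period. */
static bool report_qs(struct node *np, unsigned long mask)
{
	for (;;) {
		np->qsmask &= ~mask;
		if (np->qsmask)
			return false;		/* others still holding out */
		if (!np->parent)
			return true;		/* root empty: all QSes reported */
		mask = np->grpmask;		/* propagate one level up */
		np = np->parent;
	}
}

int main(void)
{
	struct node root  = { .qsmask = 0x3 };				/* two leaves */
	struct node leaf0 = { .qsmask = 0x3, .grpmask = 0x1, .parent = &root };
	struct node leaf1 = { .qsmask = 0x1, .grpmask = 0x2, .parent = &root };

	printf("%d\n", report_qs(&leaf0, 0x1));	/* 0: leaf0 still has CPU 1 */
	printf("%d\n", report_qs(&leaf1, 0x1));	/* 0: leaf1 empties, root still waits on leaf0 */
	printf("%d\n", report_qs(&leaf0, 0x2));	/* 1: leaf0 empties, root empties, GP done */
	return 0;
}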
    2277             : 
    2278             : /*
    2279             :  * Record a quiescent state for all tasks that were previously queued
    2280             :  * on the specified rcu_node structure and that were blocking the current
    2281             :  * RCU grace period.  The caller must hold the corresponding rnp->lock with
    2282             :  * irqs disabled, and this lock is released upon return, but irqs remain
    2283             :  * disabled.
    2284             :  */
    2285             : static void __maybe_unused
    2286             : rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
    2287             :         __releases(rnp->lock)
    2288             : {
    2289             :         unsigned long gps;
    2290             :         unsigned long mask;
    2291             :         struct rcu_node *rnp_p;
    2292             : 
    2293             :         raw_lockdep_assert_held_rcu_node(rnp);
    2294             :         if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_RCU)) ||
    2295             :             WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) ||
    2296             :             rnp->qsmask != 0) {
    2297             :                 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
    2298             :                 return;  /* Still need more quiescent states! */
    2299             :         }
    2300             : 
    2301             :         rnp->completedqs = rnp->gp_seq;
    2302             :         rnp_p = rnp->parent;
    2303             :         if (rnp_p == NULL) {
    2304             :                 /*
    2305             :                  * Only one rcu_node structure in the tree, so don't
    2306             :                  * try to report up to its nonexistent parent!
    2307             :                  */
    2308             :                 rcu_report_qs_rsp(flags);
    2309             :                 return;
    2310             :         }
    2311             : 
    2312             :         /* Report up the rest of the hierarchy, tracking current ->gp_seq. */
    2313             :         gps = rnp->gp_seq;
    2314             :         mask = rnp->grpmask;
    2315             :         raw_spin_unlock_rcu_node(rnp);  /* irqs remain disabled. */
    2316             :         raw_spin_lock_rcu_node(rnp_p);  /* irqs already disabled. */
    2317             :         rcu_report_qs_rnp(mask, rnp_p, gps, flags);
    2318             : }
    2319             : 
    2320             : /*
    2321             :  * Record a quiescent state for the specified CPU to that CPU's rcu_data
    2322             :  * structure.  This must be called from the specified CPU.
    2323             :  */
    2324             : static void
    2325        7395 : rcu_report_qs_rdp(struct rcu_data *rdp)
    2326             : {
    2327        7395 :         unsigned long flags;
    2328        7395 :         unsigned long mask;
    2329        7395 :         bool needwake = false;
    2330        7395 :         const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
    2331        7395 :         struct rcu_node *rnp;
    2332             : 
    2333        7395 :         WARN_ON_ONCE(rdp->cpu != smp_processor_id());
    2334        7395 :         rnp = rdp->mynode;
    2335        7395 :         raw_spin_lock_irqsave_rcu_node(rnp, flags);
    2336        7411 :         if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq ||
    2337        7393 :             rdp->gpwrap) {
    2338             : 
    2339             :                 /*
    2340             :                  * The grace period in which this quiescent state was
    2341             :                  * recorded has ended, so don't report it upwards.
    2342             :                  * We will instead need a new quiescent state that lies
    2343             :                  * within the current grace period.
    2344             :                  */
    2345          18 :                 rdp->cpu_no_qs.b.norm = true;        /* need qs for new gp. */
    2346          36 :                 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
    2347          18 :                 return;
    2348             :         }
    2349        7393 :         mask = rdp->grpmask;
    2350        7393 :         rdp->core_needs_qs = false;
    2351        7393 :         if ((rnp->qsmask & mask) == 0) {
    2352         156 :                 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
    2353             :         } else {
    2354             :                 /*
    2355             :                  * This GP can't end until this CPU checks in, so all of our
    2356             :                  * callbacks can be processed during the next GP.
    2357             :                  */
    2358        7315 :                 if (!offloaded)
    2359        7315 :                         needwake = rcu_accelerate_cbs(rnp, rdp);
    2360             : 
    2361        7315 :                 rcu_disable_urgency_upon_qs(rdp);
    2362        7315 :                 rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
    2363             :                 /* ^^^ Released rnp->lock */
    2364        7315 :                 if (needwake)
    2365           0 :                         rcu_gp_kthread_wake();
    2366             :         }
    2367             : }
    2368             : 
    2369             : /*
    2370             :  * Check to see if there is a new grace period of which this CPU
    2371             :  * is not yet aware, and if so, set up local rcu_data state for it.
    2372             :  * Otherwise, see if this CPU has just passed through its first
    2373             :  * quiescent state for this grace period, and record that fact if so.
    2374             :  */
    2375             : static void
    2376       54066 : rcu_check_quiescent_state(struct rcu_data *rdp)
    2377             : {
    2378             :         /* Check for grace-period ends and beginnings. */
    2379       54066 :         note_gp_changes(rdp);
    2380             : 
    2381             :         /*
    2382             :          * Does this CPU still need to do its part for current grace period?
    2383             :          * If no, return and let the other CPUs do their part as well.
    2384             :          */
    2385       54215 :         if (!rdp->core_needs_qs)
    2386             :                 return;
    2387             : 
    2388             :         /*
    2389             :          * Was there a quiescent state since the beginning of the grace
    2390             :          * period? If no, then exit and wait for the next call.
    2391             :          */
    2392       21228 :         if (rdp->cpu_no_qs.b.norm)
    2393             :                 return;
    2394             : 
    2395             :         /*
    2396             :          * Tell RCU we are done (but rcu_report_qs_rdp() will be the
    2397             :          * judge of that).
    2398             :          */
    2399        7408 :         rcu_report_qs_rdp(rdp);
    2400             : }
    2401             : 
    2402             : /*
    2403             :  * Near the end of the offline process.  Trace the fact that this CPU
    2404             :  * is going offline.
    2405             :  */
    2406           0 : int rcutree_dying_cpu(unsigned int cpu)
    2407             : {
    2408           0 :         bool blkd;
    2409           0 :         struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
    2410           0 :         struct rcu_node *rnp = rdp->mynode;
    2411             : 
    2412           0 :         if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
    2413             :                 return 0;
    2414             : 
    2415           0 :         blkd = !!(rnp->qsmask & rdp->grpmask);
    2416           0 :         trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
    2417           0 :                                blkd ? TPS("cpuofl") : TPS("cpuofl-bgp"));
    2418           0 :         return 0;
    2419             : }
    2420             : 
    2421             : /*
    2422             :  * All CPUs for the specified rcu_node structure have gone offline,
    2423             :  * and all tasks that were preempted within an RCU read-side critical
    2424             :  * section while running on one of those CPUs have since exited their RCU
    2425             :  * read-side critical section.  Some other CPU is reporting this fact with
    2426             :  * the specified rcu_node structure's ->lock held and interrupts disabled.
    2427             :  * This function therefore goes up the tree of rcu_node structures,
    2428             :  * clearing the corresponding bits in the ->qsmaskinit fields.  Note that
    2429             :  * the leaf rcu_node structure's ->qsmaskinit field has already been
    2430             :  * updated.
    2431             :  *
    2432             :  * This function does check that the specified rcu_node structure has
    2433             :  * all CPUs offline and no blocked tasks, so it is OK to invoke it
    2434             :  * prematurely.  That said, invoking it after the fact will cost you
    2435             :  * a needless lock acquisition.  So once it has done its work, don't
    2436             :  * invoke it again.
    2437             :  */
    2438           0 : static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
    2439             : {
    2440           0 :         long mask;
    2441           0 :         struct rcu_node *rnp = rnp_leaf;
    2442             : 
    2443           0 :         raw_lockdep_assert_held_rcu_node(rnp_leaf);
    2444           0 :         if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
    2445           0 :             WARN_ON_ONCE(rnp_leaf->qsmaskinit) ||
    2446           0 :             WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf)))
    2447             :                 return;
    2448           0 :         for (;;) {
    2449           0 :                 mask = rnp->grpmask;
    2450           0 :                 rnp = rnp->parent;
    2451           0 :                 if (!rnp)
    2452             :                         break;
    2453           0 :                 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
    2454           0 :                 rnp->qsmaskinit &= ~mask;
    2455             :                 /* Between grace periods, so better already be zero! */
    2456           0 :                 WARN_ON_ONCE(rnp->qsmask);
    2457           0 :                 if (rnp->qsmaskinit) {
    2458           0 :                         raw_spin_unlock_rcu_node(rnp);
    2459             :                         /* irqs remain disabled. */
    2460           0 :                         return;
    2461             :                 }
    2462           0 :                 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
    2463             :         }
    2464             : }
    2465             : 
    2466             : /*
    2467             :  * The CPU has been completely removed, and some other CPU is reporting
    2468             :  * this fact from process context.  Do the remainder of the cleanup.
    2469             :  * There can only be one CPU hotplug operation at a time, so no need for
    2470             :  * explicit locking.
    2471             :  */
    2472           0 : int rcutree_dead_cpu(unsigned int cpu)
    2473             : {
    2474           0 :         struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
    2475           0 :         struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
    2476             : 
    2477           0 :         if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
    2478             :                 return 0;
    2479             : 
    2480           0 :         WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
    2481             :         /* Adjust any no-longer-needed kthreads. */
    2482           0 :         rcu_boost_kthread_setaffinity(rnp, -1);
    2483             :         /* Do any needed no-CB deferred wakeups from this CPU. */
    2484           0 :         do_nocb_deferred_wakeup(per_cpu_ptr(&rcu_data, cpu));
    2485             : 
    2486             :         // Stop-machine done, so allow nohz_full to disable tick.
    2487           0 :         tick_dep_clear(TICK_DEP_BIT_RCU);
    2488           0 :         return 0;
    2489             : }
    2490             : 
    2491             : /*
    2492             :  * Invoke any RCU callbacks that have made it to the end of their grace
    2493             :  * period.  Throttle as specified by rdp->blimit.
    2494             :  */
    2495       48775 : static void rcu_do_batch(struct rcu_data *rdp)
    2496             : {
    2497       48775 :         int div;
    2498       48775 :         bool __maybe_unused empty;
    2499       48775 :         unsigned long flags;
    2500       48775 :         const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
    2501       48775 :         struct rcu_head *rhp;
    2502       48775 :         struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
    2503       48775 :         long bl, count = 0;
    2504       48775 :         long pending, tlimit = 0;
    2505             : 
    2506             :         /* If no callbacks are ready, just return. */
    2507       48775 :         if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
    2508           0 :                 trace_rcu_batch_start(rcu_state.name,
    2509             :                                       rcu_segcblist_n_cbs(&rdp->cblist), 0);
    2510           0 :                 trace_rcu_batch_end(rcu_state.name, 0,
    2511           0 :                                     !rcu_segcblist_empty(&rdp->cblist),
    2512           0 :                                     need_resched(), is_idle_task(current),
    2513             :                                     rcu_is_callbacks_kthread());
    2514           0 :                 return;
    2515             :         }
    2516             : 
    2517             :         /*
    2518             :          * Extract the list of ready callbacks, disabling to prevent
    2519             :          * races with call_rcu() from interrupt handlers.  Leave the
    2520             :          * callback counts, as rcu_barrier() needs to be conservative.
    2521             :          */
    2522       97553 :         local_irq_save(flags);
    2523       48767 :         rcu_nocb_lock(rdp);
    2524       48767 :         WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
    2525       48776 :         pending = rcu_segcblist_n_cbs(&rdp->cblist);
    2526       48776 :         div = READ_ONCE(rcu_divisor);
    2527       48776 :         div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div;
    2528       48776 :         bl = max(rdp->blimit, pending >> div);
    2529       48776 :         if (unlikely(bl > 100)) {
    2530           0 :                 long rrn = READ_ONCE(rcu_resched_ns);
    2531             : 
    2532           0 :                 rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn;
    2533           0 :                 tlimit = local_clock() + rrn;
    2534             :         }
    2535       48776 :         trace_rcu_batch_start(rcu_state.name,
    2536             :                               rcu_segcblist_n_cbs(&rdp->cblist), bl);
    2537       48776 :         rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
    2538       48781 :         if (offloaded)
    2539             :                 rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
    2540             : 
    2541       48781 :         trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbDequeued"));
    2542       48781 :         rcu_nocb_unlock_irqrestore(rdp, flags);
    2543             : 
    2544             :         /* Invoke callbacks. */
    2545       48779 :         tick_dep_set_task(current, TICK_DEP_BIT_RCU);
    2546       48779 :         rhp = rcu_cblist_dequeue(&rcl);
    2547             : 
    2548      678351 :         for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
    2549      626420 :                 rcu_callback_t f;
    2550             : 
    2551      626420 :                 count++;
    2552      626420 :                 debug_rcu_head_unqueue(rhp);
    2553             : 
    2554      624602 :                 rcu_lock_acquire(&rcu_callback_map);
    2555      624982 :                 trace_rcu_invoke_callback(rcu_state.name, rhp);
    2556             : 
    2557      624982 :                 f = rhp->func;
    2558      624982 :                 WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
    2559      624982 :                 f(rhp);
    2560             : 
    2561      624982 :                 rcu_lock_release(&rcu_callback_map);
    2562             : 
    2563             :                 /*
    2564             :                  * Stop only if limit reached and CPU has something to do.
    2565             :                  */
    2566      624639 :                 if (count >= bl && !offloaded &&
    2567      188408 :                     (need_resched() ||
    2568      187137 :                      (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
    2569             :                         break;
    2570      580788 :                 if (unlikely(tlimit)) {
    2571             :                         /* only call local_clock() every 32 callbacks */
    2572           0 :                         if (likely((count & 31) || local_clock() < tlimit))
    2573           0 :                                 continue;
    2574             :                         /* Exceeded the time limit, so leave. */
    2575             :                         break;
    2576             :                 }
    2577      580788 :                 if (!in_serving_softirq()) {
    2578           0 :                         local_bh_enable();
    2579           0 :                         lockdep_assert_irqs_enabled();
    2580           0 :                         cond_resched_tasks_rcu_qs();
    2581           0 :                         lockdep_assert_irqs_enabled();
    2582           0 :                         local_bh_disable();
    2583             :                 }
    2584             :         }
    2585             : 
    2586       97521 :         local_irq_save(flags);
    2587       48764 :         rcu_nocb_lock(rdp);
    2588       48764 :         rdp->n_cbs_invoked += count;
    2589       48764 :         trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
    2590       48764 :                             is_idle_task(current), rcu_is_callbacks_kthread());
    2591             : 
    2592             :         /* Update counts and requeue any remaining callbacks. */
    2593       48772 :         rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
    2594       48767 :         rcu_segcblist_add_len(&rdp->cblist, -count);
    2595             : 
    2596             :         /* Reinstate batch limit if we have worked down the excess. */
    2597       48777 :         count = rcu_segcblist_n_cbs(&rdp->cblist);
    2598       48777 :         if (rdp->blimit >= DEFAULT_MAX_RCU_BLIMIT && count <= qlowmark)
    2599           0 :                 rdp->blimit = blimit;
    2600             : 
    2601             :         /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
    2602       48777 :         if (count == 0 && rdp->qlen_last_fqs_check != 0) {
    2603           0 :                 rdp->qlen_last_fqs_check = 0;
    2604           0 :                 rdp->n_force_qs_snap = rcu_state.n_force_qs;
    2605       48777 :         } else if (count < rdp->qlen_last_fqs_check - qhimark)
    2606           0 :                 rdp->qlen_last_fqs_check = count;
    2607             : 
    2608             :         /*
    2609             :          * The following usually indicates a double call_rcu().  To track
    2610             :          * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.
    2611             :          */
    2612       48777 :         empty = rcu_segcblist_empty(&rdp->cblist);
    2613       48777 :         WARN_ON_ONCE(count == 0 && !empty);
    2614       48777 :         WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
    2615             :                      count != 0 && empty);
    2616       49035 :         WARN_ON_ONCE(count == 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) != 0);
    2617       97289 :         WARN_ON_ONCE(!empty && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
    2618             : 
    2619       48775 :         rcu_nocb_unlock_irqrestore(rdp, flags);
    2620             : 
    2621             :         /* Re-invoke RCU core processing if there are callbacks remaining. */
    2622       48777 :         if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist))
    2623       43580 :                 invoke_rcu_core();
    2624       48775 :         tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
    2625             : }
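/*
 * rcu_do_batch() caps each round of callback invocation in two ways: a
 * count limit bl = max(rdp->blimit, pending >> rcu_divisor), so large
 * backlogs drain in proportionally larger chunks, and, when bl exceeds
 * 100, a time limit that is consulted only every 32 callbacks to keep
 * the clock read off the fast path.  The arithmetic is sketched below
 * with stand-in values for blimit, rcu_divisor and rcu_resched_ns; it
 * omits the need_resched()/offload conditions and is not kernel code.
 */
#include <stdio.h>
#include <time.h>

static long compute_batch_limit(long blimit, long pending, int div)
{
	/* Clamp the divisor the same general way the kernel does. */
	if (div < 0)
		div = 7;
	if (div > (int)(sizeof(long) * 8 - 2))
		div = sizeof(long) * 8 - 2;
	return pending >> div > blimit ? pending >> div : blimit;
}

static long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

int main(void)
{
	long pending = 100000, count;
	long bl = compute_batch_limit(10, pending, 4);		/* 100000 >> 4 = 6250 */
	long long tlimit = bl > 100 ? now_ns() + 1000000LL : 0;	/* ~1ms budget */

	for (count = 0; count < pending; count++) {
		/* ... invoke one callback here ... */
		if (count >= bl)
			break;				/* count limit reached */
		if (tlimit && !(count & 31) && now_ns() > tlimit)
			break;				/* time limit, checked every 32 CBs */
	}
	printf("bl=%ld, invoked %ld of %ld\n", bl, count, pending);
	return 0;
}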
    2626             : 
    2627             : /*
    2628             :  * This function is invoked from each scheduling-clock interrupt,
    2629             :  * and checks to see if this CPU is in a non-context-switch quiescent
    2630             :  * state, for example, user mode or idle loop.  It also schedules RCU
    2631             :  * core processing.  If the current grace period has gone on too long,
    2632             :  * it will ask the scheduler to manufacture a context switch for the sole
    2633             :  * purpose of providing the needed quiescent state.
    2634             :  */
    2635       27707 : void rcu_sched_clock_irq(int user)
    2636             : {
    2637       27707 :         trace_rcu_utilization(TPS("Start scheduler-tick"));
    2638       54994 :         lockdep_assert_irqs_disabled();
    2639       27508 :         raw_cpu_inc(rcu_data.ticks_this_gp);
    2640             :         /* The load-acquire pairs with the store-release setting to true. */
    2641       27508 :         if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
    2642             :                 /* Idle and userspace execution already are quiescent states. */
    2643          24 :                 if (!rcu_is_cpu_rrupt_from_idle() && !user) {
    2644          25 :                         set_tsk_need_resched(current);
    2645          25 :                         set_preempt_need_resched();
    2646             :                 }
    2647       27522 :                 __this_cpu_write(rcu_data.rcu_urgent_qs, false);
    2648             :         }
    2649       27522 :         rcu_flavor_sched_clock_irq(user);
    2650       28002 :         if (rcu_pending(user))
    2651       11374 :                 invoke_rcu_core();
    2652       55673 :         lockdep_assert_irqs_disabled();
    2653             : 
    2654       27945 :         trace_rcu_utilization(TPS("End scheduler-tick"));
    2655       27902 : }
    2656             : 
    2657             : /*
    2658             :  * Scan the leaf rcu_node structures.  For each structure on which all
    2659             :  * CPUs have reported a quiescent state and on which there are tasks
    2660             :  * blocking the current grace period, initiate RCU priority boosting.
    2661             :  * Otherwise, invoke the specified function to check dyntick state for
    2662             :  * each CPU that has not yet reported a quiescent state.
    2663             :  */
    2664        2456 : static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
    2665             : {
    2666        2456 :         int cpu;
    2667        2456 :         unsigned long flags;
    2668        2456 :         unsigned long mask;
    2669        2456 :         struct rcu_data *rdp;
    2670        2456 :         struct rcu_node *rnp;
    2671             : 
    2672        2456 :         rcu_state.cbovld = rcu_state.cbovldnext;
    2673        2456 :         rcu_state.cbovldnext = false;
    2674        4912 :         rcu_for_each_leaf_node(rnp) {
    2675        2456 :                 cond_resched_tasks_rcu_qs();
    2676        2456 :                 mask = 0;
    2677        2456 :                 raw_spin_lock_irqsave_rcu_node(rnp, flags);
    2678        2456 :                 rcu_state.cbovldnext |= !!rnp->cbovldmask;
    2679        2456 :                 if (rnp->qsmask == 0) {
    2680           1 :                         if (rcu_preempt_blocked_readers_cgp(rnp)) {
    2681             :                                 /*
    2682             :                                  * No point in scanning bits because they
    2683             :                                  * are all zero.  But we might need to
    2684             :                                  * priority-boost blocked readers.
    2685             :                                  */
    2686             :                                 rcu_initiate_boost(rnp, flags);
    2687             :                                 /* rcu_initiate_boost() releases rnp->lock */
    2688             :                                 continue;
    2689             :                         }
    2690           2 :                         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
    2691           1 :                         continue;
    2692             :                 }
    2693        5839 :                 for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) {
    2694        3384 :                         rdp = per_cpu_ptr(&rcu_data, cpu);
    2695        3384 :                         if (f(rdp)) {
    2696         757 :                                 mask |= rdp->grpmask;
    2697         757 :                                 rcu_disable_urgency_upon_qs(rdp);
    2698             :                         }
    2699             :                 }
    2700        2455 :                 if (mask != 0) {
    2701             :                         /* Idle/offline CPUs, report (releases rnp->lock). */
    2702         622 :                         rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
    2703             :                 } else {
    2704             :                         /* Nothing to do here, so just drop the lock. */
    2705        4289 :                         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
    2706             :                 }
    2707             :         }
    2708        2456 : }
    2709             : 
    2710             : /*
    2711             :  * Force quiescent states on reluctant CPUs, and also detect which
    2712             :  * CPUs are in dyntick-idle mode.
    2713             :  */
    2714           0 : void rcu_force_quiescent_state(void)
    2715             : {
    2716           0 :         unsigned long flags;
    2717           0 :         bool ret;
    2718           0 :         struct rcu_node *rnp;
    2719           0 :         struct rcu_node *rnp_old = NULL;
    2720             : 
    2721             :         /* Funnel through hierarchy to reduce memory contention. */
    2722           0 :         rnp = __this_cpu_read(rcu_data.mynode);
    2723           0 :         for (; rnp != NULL; rnp = rnp->parent) {
    2724           0 :                 ret = (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) ||
    2725           0 :                        !raw_spin_trylock(&rnp->fqslock);
    2726           0 :                 if (rnp_old != NULL)
    2727           0 :                         raw_spin_unlock(&rnp_old->fqslock);
    2728           0 :                 if (ret)
    2729             :                         return;
    2730           0 :                 rnp_old = rnp;
    2731             :         }
    2732             :         /* rnp_old == rcu_get_root(), rnp == NULL. */
    2733             : 
    2734             :         /* Reached the root of the rcu_node tree, acquire lock. */
    2735           0 :         raw_spin_lock_irqsave_rcu_node(rnp_old, flags);
    2736           0 :         raw_spin_unlock(&rnp_old->fqslock);
    2737           0 :         if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) {
    2738           0 :                 raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
    2739           0 :                 return;  /* Someone beat us to it. */
    2740             :         }
    2741           0 :         WRITE_ONCE(rcu_state.gp_flags,
    2742             :                    READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS);
    2743           0 :         raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
    2744           0 :         rcu_gp_kthread_wake();
    2745             : }
    2746             : EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
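
The funnel-locking idiom used by rcu_force_quiescent_state() above is not RCU-specific: each caller try-locks the lock for its level, releases the lock it held one level down, and gives up as soon as either a trylock fails or the flag shows the work has already been requested, so contention on the root lock stays bounded. Below is a minimal stand-alone sketch of that idiom in plain pthread C; NLEVELS, levels[], already_requested and funnel_to_root() are illustrative names, not kernel symbols, and the sketch simplifies the final flag handling.

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    #define NLEVELS 3                       /* leaf -> intermediate -> root */

    static pthread_mutex_t levels[NLEVELS] = {
            PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
            PTHREAD_MUTEX_INITIALIZER,
    };
    static atomic_bool already_requested;   /* plays the role of RCU_GP_FLAG_FQS */

    /* Climb leaf-to-root; return true iff this caller should do the work. */
    static bool funnel_to_root(void)
    {
            int held = -1;                  /* index of the lock we currently hold */

            for (int i = 0; i < NLEVELS; i++) {
                    bool fail = atomic_load(&already_requested) ||
                                pthread_mutex_trylock(&levels[i]) != 0;
                    if (held >= 0)
                            pthread_mutex_unlock(&levels[held]);
                    if (fail)
                            return false;   /* someone else is ahead of us */
                    held = i;
            }
            /* Root lock held: record the request, then drop the lock. */
            atomic_store(&already_requested, true);
            pthread_mutex_unlock(&levels[held]);
            return true;
    }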
    2747             : 
    2748             : // Workqueue handler for an RCU reader for kernels enforcing strict RCU
    2749             : // grace periods.
    2750           0 : static void strict_work_handler(struct work_struct *work)
    2751             : {
    2752           0 :         rcu_read_lock();
    2753           0 :         rcu_read_unlock();
    2754           0 : }
    2755             : 
    2756             : /* Perform RCU core processing work for the current CPU.  */
    2757       54020 : static __latent_entropy void rcu_core(void)
    2758             : {
    2759       54020 :         unsigned long flags;
    2760       54020 :         struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
    2761       54051 :         struct rcu_node *rnp = rdp->mynode;
    2762       54051 :         const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist);
    2763             : 
    2764       54051 :         if (cpu_is_offline(smp_processor_id()))
    2765             :                 return;
    2766       54060 :         trace_rcu_utilization(TPS("Start RCU core"));
    2767       54153 :         WARN_ON_ONCE(!rdp->beenonline);
    2768             : 
    2769             :         /* Report any deferred quiescent states if preemption enabled. */
    2770       54153 :         if (!(preempt_count() & PREEMPT_MASK)) {
    2771       38831 :                 rcu_preempt_deferred_qs(current);
    2772       15322 :         } else if (rcu_preempt_need_deferred_qs(current)) {
    2773             :                 set_tsk_need_resched(current);
    2774             :                 set_preempt_need_resched();
    2775             :         }
    2776             : 
    2777             :         /* Update RCU state based on any recent quiescent states. */
    2778       54153 :         rcu_check_quiescent_state(rdp);
    2779             : 
    2780             :         /* No grace period and unregistered callbacks? */
    2781       54215 :         if (!rcu_gp_in_progress() &&
    2782          75 :             rcu_segcblist_is_enabled(&rdp->cblist) && do_batch) {
    2783         150 :                 rcu_nocb_lock_irqsave(rdp, flags);
    2784          75 :                 if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
    2785          14 :                         rcu_accelerate_cbs_unlocked(rnp, rdp);
    2786          75 :                 rcu_nocb_unlock_irqrestore(rdp, flags);
    2787             :         }
    2788             : 
    2789       54215 :         rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
    2790             : 
    2791             :         /* If there are callbacks ready, invoke them. */
    2792       54194 :         if (do_batch && rcu_segcblist_ready_cbs(&rdp->cblist) &&
    2793       48774 :             likely(READ_ONCE(rcu_scheduler_fully_active)))
    2794       48774 :                 rcu_do_batch(rdp);
    2795             : 
    2796             :         /* Do any needed deferred wakeups of rcuo kthreads. */
    2797       54197 :         do_nocb_deferred_wakeup(rdp);
    2798       54197 :         trace_rcu_utilization(TPS("End RCU core"));
    2799             : 
    2800             :         // If strict GPs, schedule an RCU reader in a clean environment.
    2801       54197 :         if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
    2802             :                 queue_work_on(rdp->cpu, rcu_gp_wq, &rdp->strict_work);
    2803             : }
    2804             : 
    2805       54063 : static void rcu_core_si(struct softirq_action *h)
    2806             : {
    2807       54063 :         rcu_core();
    2808       54215 : }
    2809             : 
    2810           0 : static void rcu_wake_cond(struct task_struct *t, int status)
    2811             : {
    2812             :         /*
    2813             :          * If the thread is yielding, only wake it when this
    2814             :          * is invoked from the idle task.
    2815             :          */
    2816           0 :         if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
    2817           0 :                 wake_up_process(t);
    2818           0 : }
    2819             : 
    2820           0 : static void invoke_rcu_core_kthread(void)
    2821             : {
    2822           0 :         struct task_struct *t;
    2823           0 :         unsigned long flags;
    2824             : 
    2825           0 :         local_irq_save(flags);
    2826           0 :         __this_cpu_write(rcu_data.rcu_cpu_has_work, 1);
    2827           0 :         t = __this_cpu_read(rcu_data.rcu_cpu_kthread_task);
    2828           0 :         if (t != NULL && t != current)
    2829           0 :                 rcu_wake_cond(t, __this_cpu_read(rcu_data.rcu_cpu_kthread_status));
    2830           0 :         local_irq_restore(flags);
    2831           0 : }
    2832             : 
    2833             : /*
    2834             :  * Wake up this CPU's rcuc kthread to do RCU core processing.
    2835             :  */
    2836       54848 : static void invoke_rcu_core(void)
    2837             : {
    2838       54848 :         if (!cpu_online(smp_processor_id()))
    2839             :                 return;
    2840       54855 :         if (use_softirq)
    2841       54855 :                 raise_softirq(RCU_SOFTIRQ);
    2842             :         else
    2843           0 :                 invoke_rcu_core_kthread();
    2844             : }
    2845             : 
    2846           0 : static void rcu_cpu_kthread_park(unsigned int cpu)
    2847             : {
    2848           0 :         per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
    2849           0 : }
    2850             : 
    2851           0 : static int rcu_cpu_kthread_should_run(unsigned int cpu)
    2852             : {
    2853           0 :         return __this_cpu_read(rcu_data.rcu_cpu_has_work);
    2854             : }
    2855             : 
    2856             : /*
    2857             :  * Per-CPU kernel thread that invokes RCU callbacks.  This replaces
    2858             :  * the RCU softirq used in configurations of RCU that do not support RCU
    2859             :  * priority boosting.
    2860             :  */
    2861           0 : static void rcu_cpu_kthread(unsigned int cpu)
    2862             : {
    2863           0 :         unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
    2864           0 :         char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
    2865           0 :         int spincnt;
    2866             : 
    2867           0 :         trace_rcu_utilization(TPS("Start CPU kthread@rcu_run"));
    2868           0 :         for (spincnt = 0; spincnt < 10; spincnt++) {
    2869           0 :                 local_bh_disable();
    2870           0 :                 *statusp = RCU_KTHREAD_RUNNING;
    2871           0 :                 local_irq_disable();
    2872           0 :                 work = *workp;
    2873           0 :                 *workp = 0;
    2874           0 :                 local_irq_enable();
    2875           0 :                 if (work)
    2876           0 :                         rcu_core();
    2877           0 :                 local_bh_enable();
    2878           0 :                 if (*workp == 0) {
    2879           0 :                         trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
    2880           0 :                         *statusp = RCU_KTHREAD_WAITING;
    2881           0 :                         return;
    2882             :                 }
    2883             :         }
    2884           0 :         *statusp = RCU_KTHREAD_YIELDING;
    2885           0 :         trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
    2886           0 :         schedule_timeout_idle(2);
    2887           0 :         trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
    2888           0 :         *statusp = RCU_KTHREAD_WAITING;
    2889             : }
    2890             : 
    2891             : static struct smp_hotplug_thread rcu_cpu_thread_spec = {
    2892             :         .store                  = &rcu_data.rcu_cpu_kthread_task,
    2893             :         .thread_should_run      = rcu_cpu_kthread_should_run,
    2894             :         .thread_fn              = rcu_cpu_kthread,
    2895             :         .thread_comm            = "rcuc/%u",
    2896             :         .setup                  = rcu_cpu_kthread_setup,
    2897             :         .park                   = rcu_cpu_kthread_park,
    2898             : };
    2899             : 
    2900             : /*
    2901             :  * Spawn per-CPU RCU core processing kthreads.
    2902             :  */
    2903           1 : static int __init rcu_spawn_core_kthreads(void)
    2904             : {
    2905           1 :         int cpu;
    2906             : 
    2907           5 :         for_each_possible_cpu(cpu)
    2908           4 :                 per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0;
    2909           1 :         if (!IS_ENABLED(CONFIG_RCU_BOOST) && use_softirq)
    2910             :                 return 0;
    2911           0 :         WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec),
    2912             :                   "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__);
    2913             :         return 0;
    2914             : }
    2915             : early_initcall(rcu_spawn_core_kthreads);
    2916             : 
    2917             : /*
    2918             :  * Handle any core-RCU processing required by a call_rcu() invocation.
    2919             :  */
    2920      627276 : static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
    2921             :                             unsigned long flags)
    2922             : {
    2923             :         /*
    2924             :          * If called from an extended quiescent state, invoke the RCU
    2925             :          * core in order to force a re-evaluation of RCU's idleness.
    2926             :          */
    2927     1254632 :         if (!rcu_is_watching())
    2928           0 :                 invoke_rcu_core();
    2929             : 
    2930             :         /* If interrupts were disabled or CPU offline, don't invoke RCU core. */
    2931      627356 :         if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id()))
    2932       20388 :                 return;
    2933             : 
    2934             :         /*
    2935             :          * Force the grace period if too many callbacks or too long waiting.
    2936             :          * Enforce hysteresis, and don't invoke rcu_force_quiescent_state()
    2937             :          * if some other CPU has recently done so.  Also, don't bother
    2938             :          * invoking rcu_force_quiescent_state() if the newly enqueued callback
    2939             :          * is the only one waiting for a grace period to complete.
    2940             :          */
    2941      606826 :         if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) >
    2942             :                      rdp->qlen_last_fqs_check + qhimark)) {
    2943             : 
    2944             :                 /* Are we ignoring a completed grace period? */
    2945           0 :                 note_gp_changes(rdp);
    2946             : 
    2947             :                 /* Start a new grace period if one not already started. */
    2948           0 :                 if (!rcu_gp_in_progress()) {
    2949           0 :                         rcu_accelerate_cbs_unlocked(rdp->mynode, rdp);
    2950             :                 } else {
    2951             :                         /* Give the grace period a kick. */
    2952           0 :                         rdp->blimit = DEFAULT_MAX_RCU_BLIMIT;
    2953           0 :                         if (rcu_state.n_force_qs == rdp->n_force_qs_snap &&
    2954           0 :                             rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
    2955           0 :                                 rcu_force_quiescent_state();
    2956           0 :                         rdp->n_force_qs_snap = rcu_state.n_force_qs;
    2957           0 :                         rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
    2958             :                 }
    2959             :         }
    2960             : }
    2961             : 
    2962             : /*
    2963             :  * RCU callback function to leak a callback.
    2964             :  */
    2965           0 : static void rcu_leak_callback(struct rcu_head *rhp)
    2966             : {
    2967           0 : }
    2968             : 
    2969             : /*
    2970             :  * Check and if necessary update the leaf rcu_node structure's
    2971             :  * ->cbovldmask bit corresponding to the current CPU based on that CPU's
    2972             :  * number of queued RCU callbacks.  The caller must hold the leaf rcu_node
    2973             :  * structure's ->lock.
    2974             :  */
    2975           0 : static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp)
    2976             : {
    2977           0 :         raw_lockdep_assert_held_rcu_node(rnp);
    2978           0 :         if (qovld_calc <= 0)
    2979             :                 return; // Early boot and wildcard value set.
    2980           0 :         if (rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc)
    2981           0 :                 WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask | rdp->grpmask);
    2982             :         else
    2983           0 :                 WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask & ~rdp->grpmask);
    2984             : }
    2985             : 
    2986             : /*
    2987             :  * Check and if necessary update the leaf rcu_node structure's
    2988             :  * ->cbovldmask bit corresponding to the current CPU based on that CPU's
    2989             :  * number of queued RCU callbacks.  No locks need be held, but the
    2990             :  * caller must have disabled interrupts.
    2991             :  *
    2992             :  * Note that this function ignores the possibility that there are a lot
    2993             :  * of callbacks all of which have already seen the end of their respective
    2994             :  * grace periods.  This omission is due to the need for no-CBs CPUs to
    2995             :  * be holding ->nocb_lock to do this check, which is too heavy for a
    2996             :  * common-case operation.
    2997             :  */
    2998      627341 : static void check_cb_ovld(struct rcu_data *rdp)
    2999             : {
    3000      627341 :         struct rcu_node *const rnp = rdp->mynode;
    3001             : 
    3002      627341 :         if (qovld_calc <= 0 ||
    3003      627338 :             ((rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc) ==
    3004      627338 :              !!(READ_ONCE(rnp->cbovldmask) & rdp->grpmask)))
    3005             :                 return; // Early boot wildcard value or already set correctly.
    3006           0 :         raw_spin_lock_rcu_node(rnp);
    3007           0 :         check_cb_ovld_locked(rdp, rnp);
    3008           0 :         raw_spin_unlock_rcu_node(rnp);
    3009             : }
    3010             : 
    3011             : /* Helper function for call_rcu() and friends.  */
    3012             : static void
    3013      627265 : __call_rcu(struct rcu_head *head, rcu_callback_t func)
    3014             : {
    3015      627265 :         static atomic_t doublefrees;
    3016      627265 :         unsigned long flags;
    3017      627265 :         struct rcu_data *rdp;
    3018      627265 :         bool was_alldone;
    3019             : 
    3020             :         /* Misaligned rcu_head! */
    3021      627265 :         WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
    3022             : 
    3023      627265 :         if (debug_rcu_head_queue(head)) {
    3024             :                 /*
    3025             :                  * Probable double call_rcu(), so leak the callback.
    3026             :                  * Use rcu:rcu_callback trace event to find the previous
    3027             :                  * time callback was passed to __call_rcu().
    3028             :                  */
    3029           0 :                 if (atomic_inc_return(&doublefrees) < 4) {
    3030           0 :                         pr_err("%s(): Double-freed CB %p->%pS()!!!  ", __func__, head, head->func);
    3031           0 :                         mem_dump_obj(head);
    3032             :                 }
    3033           0 :                 WRITE_ONCE(head->func, rcu_leak_callback);
    3034           0 :                 return;
    3035             :         }
    3036      627002 :         head->func = func;
    3037      627002 :         head->next = NULL;
    3038     1254057 :         local_irq_save(flags);
    3039      626953 :         kasan_record_aux_stack(head);
    3040      627350 :         rdp = this_cpu_ptr(&rcu_data);
    3041             : 
    3042             :         /* Add the callback to our list. */
    3043      627356 :         if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) {
    3044             :                 // This can trigger due to call_rcu() from offline CPU:
    3045           1 :                 WARN_ON_ONCE(rcu_scheduler_active != RCU_SCHEDULER_INACTIVE);
    3046           2 :                 WARN_ON_ONCE(!rcu_is_watching());
    3047             :                 // Very early boot, before rcu_init().  Initialize if needed
    3048             :                 // and then drop through to queue the callback.
    3049           1 :                 if (rcu_segcblist_empty(&rdp->cblist))
    3050           1 :                         rcu_segcblist_init(&rdp->cblist);
    3051             :         }
    3052             : 
    3053      627356 :         check_cb_ovld(rdp);
    3054      627326 :         if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
    3055             :                 return; // Enqueued onto ->nocb_bypass, so just leave.
    3056             :         // If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
    3057      627326 :         rcu_segcblist_enqueue(&rdp->cblist, head);
    3058      627279 :         if (__is_kvfree_rcu_offset((unsigned long)func))
    3059           0 :                 trace_rcu_kvfree_callback(rcu_state.name, head,
    3060             :                                          (unsigned long)func,
    3061             :                                          rcu_segcblist_n_cbs(&rdp->cblist));
    3062             :         else
    3063      627279 :                 trace_rcu_callback(rcu_state.name, head,
    3064             :                                    rcu_segcblist_n_cbs(&rdp->cblist));
    3065             : 
    3066      627279 :         trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued"));
    3067             : 
    3068             :         /* Go handle any RCU core processing required. */
    3069      627279 :         if (unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) {
    3070      627041 :                 __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
    3071             :         } else {
    3072      627279 :                 __call_rcu_core(rdp, head, flags);
    3073      627209 :                 local_irq_restore(flags);
    3074             :         }
    3075             : }
    3076             : 
    3077             : /**
    3078             :  * call_rcu() - Queue an RCU callback for invocation after a grace period.
    3079             :  * @head: structure to be used for queueing the RCU updates.
    3080             :  * @func: actual callback function to be invoked after the grace period
    3081             :  *
    3082             :  * The callback function will be invoked some time after a full grace
    3083             :  * period elapses, in other words after all pre-existing RCU read-side
    3084             :  * critical sections have completed.  However, the callback function
    3085             :  * might well execute concurrently with RCU read-side critical sections
    3086             :  * that started after call_rcu() was invoked.  RCU read-side critical
    3087             :  * sections are delimited by rcu_read_lock() and rcu_read_unlock(), and
    3088             :  * may be nested.  In addition, regions of code across which interrupts,
    3089             :  * preemption, or softirqs have been disabled also serve as RCU read-side
    3090             :  * critical sections.  This includes hardware interrupt handlers, softirq
    3091             :  * handlers, and NMI handlers.
    3092             :  *
    3093             :  * Note that all CPUs must agree that the grace period extended beyond
    3094             :  * all pre-existing RCU read-side critical sections.  On systems with more
    3095             :  * than one CPU, this means that when "func()" is invoked, each CPU is
    3096             :  * guaranteed to have executed a full memory barrier since the end of its
    3097             :  * last RCU read-side critical section whose beginning preceded the call
    3098             :  * to call_rcu().  It also means that each CPU executing an RCU read-side
    3099             :  * critical section that continues beyond the start of "func()" must have
    3100             :  * executed a memory barrier after the call_rcu() but before the beginning
    3101             :  * of that RCU read-side critical section.  Note that these guarantees
    3102             :  * include CPUs that are offline, idle, or executing in user mode, as
    3103             :  * well as CPUs that are executing in the kernel.
    3104             :  *
    3105             :  * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
    3106             :  * resulting RCU callback function "func()", then both CPU A and CPU B are
    3107             :  * guaranteed to execute a full memory barrier during the time interval
    3108             :  * between the call to call_rcu() and the invocation of "func()" -- even
    3109             :  * if CPU A and CPU B are the same CPU (but again only if the system has
    3110             :  * more than one CPU).
    3111             :  */
    3112      627267 : void call_rcu(struct rcu_head *head, rcu_callback_t func)
    3113             : {
    3114      627267 :         __call_rcu(head, func);
    3115      627014 : }
    3116             : EXPORT_SYMBOL_GPL(call_rcu);
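
A minimal usage sketch for the call_rcu() API documented above (struct foo, foo_free_rcu() and foo_release() are hypothetical names, not part of tree.c): the structure embeds an rcu_head, and the callback recovers the enclosing object with container_of() and frees it once a grace period has elapsed.

    #include <linux/kernel.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct foo {
            int data;
            struct rcu_head rcu;            /* embedded for call_rcu() */
    };

    static void foo_free_rcu(struct rcu_head *head)
    {
            struct foo *fp = container_of(head, struct foo, rcu);

            kfree(fp);      /* all pre-existing readers have finished */
    }

    static void foo_release(struct foo *fp)
    {
            /* Readers found via rcu_dereference() may still be running. */
            call_rcu(&fp->rcu, foo_free_rcu);
    }

Because the callback may be invoked in softirq context, it must not sleep; cleanup that needs to block should instead use synchronize_rcu() or defer to a workqueue.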
    3117             : 
    3118             : 
    3119             : /* Maximum number of jiffies to wait before draining a batch. */
    3120             : #define KFREE_DRAIN_JIFFIES (HZ / 50)
    3121             : #define KFREE_N_BATCHES 2
    3122             : #define FREE_N_CHANNELS 2
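
For orientation (illustrative arithmetic, not stated in this file): KFREE_DRAIN_JIFFIES is HZ / 50, i.e. 20 ms worth of jiffies regardless of the configured HZ, so with HZ=250 a batch is drained after 5 jiffies and with HZ=1000 after 20 jiffies.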
    3123             : 
    3124             : /**
    3125             :  * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
    3126             :  * @nr_records: Number of active pointers in the array
    3127             :  * @next: Next bulk object in the block chain
    3128             :  * @records: Array of the kvfree_rcu() pointers
    3129             :  */
    3130             : struct kvfree_rcu_bulk_data {
    3131             :         unsigned long nr_records;
    3132             :         struct kvfree_rcu_bulk_data *next;
    3133             :         void *records[];
    3134             : };
    3135             : 
    3136             : /*
    3137             :  * This macro defines how many entries the "records" array
    3138             :  * can hold. It is sized so that a kvfree_rcu_bulk_data
    3139             :  * structure occupies exactly one page.
    3140             :  */
    3141             : #define KVFREE_BULK_MAX_ENTR \
    3142             :         ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
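
As an illustrative calculation (assuming a common 64-bit configuration with 4 KiB pages, not anything stated in this file): sizeof(struct kvfree_rcu_bulk_data) is 16 bytes (one unsigned long plus one pointer), so KVFREE_BULK_MAX_ENTR = (4096 - 16) / 8 = 510 pointer records per page-sized block.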
    3143             : 
    3144             : /**
    3145             :  * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
    3146             :  * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
    3147             :  * @head_free: List of kfree_rcu() objects waiting for a grace period
    3148             :  * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
    3149             :  * @krcp: Pointer to @kfree_rcu_cpu structure
    3150             :  */
    3151             : 
    3152             : struct kfree_rcu_cpu_work {
    3153             :         struct rcu_work rcu_work;
    3154             :         struct rcu_head *head_free;
    3155             :         struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS];
    3156             :         struct kfree_rcu_cpu *krcp;
    3157             : };
    3158             : 
    3159             : /**
    3160             :  * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
    3161             :  * @head: List of kfree_rcu() objects not yet waiting for a grace period
    3162             :  * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
    3163             :  * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
    3164             :  * @lock: Synchronize access to this structure
    3165             :  * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
    3166             :  * @monitor_todo: Tracks whether a @monitor_work delayed work is pending
    3167             :  * @initialized: The @rcu_work fields have been initialized
    3168             :  * @count: Number of queued objects for which a grace period has not yet started
    3169             :  * @bkvcache:
    3170             :  *      A simple cache list that holds objects kept for reuse.
    3171             :  *      To save some per-CPU space the list is singly linked.
    3172             :  *      Even though the llist itself is lockless, access to it has to be
    3173             :  *      protected by the per-CPU lock.
    3174             :  * @page_cache_work: A work to refill the cache when it is empty
    3175             :  * @work_in_progress: Indicates that page_cache_work is running
    3176             :  * @hrtimer: A hrtimer for scheduling a page_cache_work
    3177             :  * @nr_bkv_objs: number of allocated objects in @bkvcache.
    3178             :  *
    3179             :  * This is a per-CPU structure.  The reason that it is not included in
    3180             :  * the rcu_data structure is to permit this code to be extracted from
    3181             :  * the RCU files.  Such extraction could allow further optimization of
    3182             :  * the interactions with the slab allocators.
    3183             :  */
    3184             : struct kfree_rcu_cpu {
    3185             :         struct rcu_head *head;
    3186             :         struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS];
    3187             :         struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
    3188             :         raw_spinlock_t lock;
    3189             :         struct delayed_work monitor_work;
    3190             :         bool monitor_todo;
    3191             :         bool initialized;
    3192             :         int count;
    3193             : 
    3194             :         struct work_struct page_cache_work;
    3195             :         atomic_t work_in_progress;
    3196             :         struct hrtimer hrtimer;
    3197             : 
    3198             :         struct llist_head bkvcache;
    3199             :         int nr_bkv_objs;
    3200             : };
    3201             : 
    3202             : static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
    3203             :         .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
    3204             : };
    3205             : 
    3206             : static __always_inline void
    3207         141 : debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
    3208             : {
    3209             : #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
    3210         141 :         int i;
    3211             : 
    3212         389 :         for (i = 0; i < bhead->nr_records; i++)
    3213         248 :                 debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i]));
    3214             : #endif
    3215             : }
    3216             : 
    3217             : static inline struct kfree_rcu_cpu *
    3218         252 : krc_this_cpu_lock(unsigned long *flags)
    3219             : {
    3220         252 :         struct kfree_rcu_cpu *krcp;
    3221             : 
    3222         504 :         local_irq_save(*flags); // For safely calling this_cpu_ptr().
    3223         252 :         krcp = this_cpu_ptr(&krc);
    3224         252 :         raw_spin_lock(&krcp->lock);
    3225             : 
    3226         252 :         return krcp;
    3227             : }
    3228             : 
    3229             : static inline void
    3230         252 : krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
    3231             : {
    3232         252 :         raw_spin_unlock(&krcp->lock);
    3233         252 :         local_irq_restore(flags);
    3234         252 : }
    3235             : 
    3236             : static inline struct kvfree_rcu_bulk_data *
    3237         145 : get_cached_bnode(struct kfree_rcu_cpu *krcp)
    3238             : {
    3239         145 :         if (!krcp->nr_bkv_objs)
    3240             :                 return NULL;
    3241             : 
    3242         141 :         krcp->nr_bkv_objs--;
    3243         141 :         return (struct kvfree_rcu_bulk_data *)
    3244         141 :                 llist_del_first(&krcp->bkvcache);
    3245             : }
    3246             : 
    3247             : static inline bool
    3248         161 : put_cached_bnode(struct kfree_rcu_cpu *krcp,
    3249             :         struct kvfree_rcu_bulk_data *bnode)
    3250             : {
    3251             :         // Check the limit.
    3252         161 :         if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
    3253             :                 return false;
    3254             : 
    3255         161 :         llist_add((struct llist_node *) bnode, &krcp->bkvcache);
    3256         161 :         krcp->nr_bkv_objs++;
    3257         161 :         return true;
    3258             : 
    3259             : }
    3260             : 
    3261             : /*
    3262             :  * This function is invoked in workqueue context after a grace period.
    3263             :  * It frees all the objects queued on ->bkvhead_free or ->head_free.
    3264             :  */
    3265         144 : static void kfree_rcu_work(struct work_struct *work)
    3266             : {
    3267         144 :         unsigned long flags;
    3268         144 :         struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext;
    3269         144 :         struct rcu_head *head, *next;
    3270         144 :         struct kfree_rcu_cpu *krcp;
    3271         144 :         struct kfree_rcu_cpu_work *krwp;
    3272         144 :         int i, j;
    3273             : 
    3274         144 :         krwp = container_of(to_rcu_work(work),
    3275             :                             struct kfree_rcu_cpu_work, rcu_work);
    3276         144 :         krcp = krwp->krcp;
    3277             : 
    3278         144 :         raw_spin_lock_irqsave(&krcp->lock, flags);
    3279             :         // Channels 1 and 2.
    3280         576 :         for (i = 0; i < FREE_N_CHANNELS; i++) {
    3281         288 :                 bkvhead[i] = krwp->bkvhead_free[i];
    3282         288 :                 krwp->bkvhead_free[i] = NULL;
    3283             :         }
    3284             : 
    3285             :         // Channel 3.
    3286         144 :         head = krwp->head_free;
    3287         144 :         krwp->head_free = NULL;
    3288         144 :         raw_spin_unlock_irqrestore(&krcp->lock, flags);
    3289             : 
    3290             :         // Handle the first two channels.
    3291         576 :         for (i = 0; i < FREE_N_CHANNELS; i++) {
    3292         429 :                 for (; bkvhead[i]; bkvhead[i] = bnext) {
    3293         141 :                         bnext = bkvhead[i]->next;
    3294         141 :                         debug_rcu_bhead_unqueue(bkvhead[i]);
    3295             : 
    3296         141 :                         rcu_lock_acquire(&rcu_callback_map);
    3297         141 :                         if (i == 0) { // kmalloc() / kfree().
    3298         141 :                                 trace_rcu_invoke_kfree_bulk_callback(
    3299             :                                         rcu_state.name, bkvhead[i]->nr_records,
    3300         141 :                                         bkvhead[i]->records);
    3301             : 
    3302         141 :                                 kfree_bulk(bkvhead[i]->nr_records,
    3303             :                                         bkvhead[i]->records);
    3304             :                         } else { // vmalloc() / vfree().
    3305           0 :                                 for (j = 0; j < bkvhead[i]->nr_records; j++) {
    3306           0 :                                         trace_rcu_invoke_kvfree_callback(
    3307             :                                                 rcu_state.name,
    3308           0 :                                                 bkvhead[i]->records[j], 0);
    3309             : 
    3310           0 :                                         vfree(bkvhead[i]->records[j]);
    3311             :                                 }
    3312             :                         }
    3313         141 :                         rcu_lock_release(&rcu_callback_map);
    3314             : 
    3315         141 :                         raw_spin_lock_irqsave(&krcp->lock, flags);
    3316         141 :                         if (put_cached_bnode(krcp, bkvhead[i]))
    3317         141 :                                 bkvhead[i] = NULL;
    3318         141 :                         raw_spin_unlock_irqrestore(&krcp->lock, flags);
    3319             : 
    3320         141 :                         if (bkvhead[i])
    3321           0 :                                 free_page((unsigned long) bkvhead[i]);
    3322             : 
    3323         141 :                         cond_resched_tasks_rcu_qs();
    3324             :                 }
    3325             :         }
    3326             : 
    3327             :         /*
    3328             :          * Emergency case only. It can happen under low-memory
    3329             :          * conditions when an allocation fails, so the "bulk"
    3330             :          * path temporarily cannot be used.
    3331             :          */
    3332         148 :         for (; head; head = next) {
    3333           4 :                 unsigned long offset = (unsigned long)head->func;
    3334           4 :                 void *ptr = (void *)head - offset;
    3335             : 
    3336           4 :                 next = head->next;
    3337           4 :                 debug_rcu_head_unqueue((struct rcu_head *)ptr);
    3338           4 :                 rcu_lock_acquire(&rcu_callback_map);
    3339           4 :                 trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset);
    3340             : 
    3341           4 :                 if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
    3342           4 :                         kvfree(ptr);
    3343             : 
    3344           4 :                 rcu_lock_release(&rcu_callback_map);
    3345           4 :                 cond_resched_tasks_rcu_qs();
    3346             :         }
    3347         144 : }
    3348             : 
    3349             : /*
    3350             :  * Schedule the kfree batch RCU work to run in workqueue context after a GP.
    3351             :  *
    3352             :  * This function is invoked by kfree_rcu_monitor() when the KFREE_DRAIN_JIFFIES
    3353             :  * timeout has been reached.
    3354             :  */
    3355         165 : static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
    3356             : {
    3357         165 :         struct kfree_rcu_cpu_work *krwp;
    3358         165 :         bool repeat = false;
    3359         165 :         int i, j;
    3360             : 
    3361         330 :         lockdep_assert_held(&krcp->lock);
    3362             : 
    3363         495 :         for (i = 0; i < KFREE_N_BATCHES; i++) {
    3364         330 :                 krwp = &(krcp->krw_arr[i]);
    3365             : 
    3366             :                 /*
    3367             :                  * Try to detach bkvhead or head and attach it to any
    3368             :                  * available corresponding free channel. A previous RCU
    3369             :                  * batch may still be in progress, in which case another
    3370             :                  * one cannot be queued immediately, so return false to
    3371             :                  * tell the caller to retry.
    3372             :                  */
    3373         330 :                 if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
    3374         189 :                         (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
    3375         189 :                                 (krcp->head && !krwp->head_free)) {
    3376             :                         // Channel 1 corresponds to SLAB ptrs.
    3377             :                         // Channel 2 corresponds to vmalloc ptrs.
    3378         432 :                         for (j = 0; j < FREE_N_CHANNELS; j++) {
    3379         288 :                                 if (!krwp->bkvhead_free[j]) {
    3380         288 :                                         krwp->bkvhead_free[j] = krcp->bkvhead[j];
    3381         288 :                                         krcp->bkvhead[j] = NULL;
    3382             :                                 }
    3383             :                         }
    3384             : 
    3385             :                         // Channel 3 corresponds to emergency path.
    3386         144 :                         if (!krwp->head_free) {
    3387         144 :                                 krwp->head_free = krcp->head;
    3388         144 :                                 krcp->head = NULL;
    3389             :                         }
    3390             : 
    3391         144 :                         WRITE_ONCE(krcp->count, 0);
    3392             : 
    3393             :                         /*
    3394             :                          * There is one work item per batch, and each
    3395             :                          * batch can handle three "free channels". The
    3396             :                          * work may still be in the pending state when
    3397             :                          * the channels have been detached one after
    3398             :                          * another.
    3399             :                          */
    3400         144 :                         queue_rcu_work(system_wq, &krwp->rcu_work);
    3401             :                 }
    3402             : 
    3403             :                 // Repeat if any "free" corresponding channel is still busy.
    3404         330 :                 if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head)
    3405          35 :                         repeat = true;
    3406             :         }
    3407             : 
    3408         165 :         return !repeat;
    3409             : }
    3410             : 
    3411         165 : static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
    3412             :                                           unsigned long flags)
    3413             : {
    3414             :         // Attempt to start a new batch.
    3415         165 :         krcp->monitor_todo = false;
    3416         165 :         if (queue_kfree_rcu_work(krcp)) {
    3417             :                 // Success! Our job is done here.
    3418         139 :                 raw_spin_unlock_irqrestore(&krcp->lock, flags);
    3419         139 :                 return;
    3420             :         }
    3421             : 
    3422             :         // Previous RCU batch still in progress, try again later.
    3423          26 :         krcp->monitor_todo = true;
    3424          26 :         schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
    3425          26 :         raw_spin_unlock_irqrestore(&krcp->lock, flags);
    3426             : }
    3427             : 
    3428             : /*
    3429             :  * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
    3430             :  * It invokes kfree_rcu_drain_unlock() to attempt to start another batch.
    3431             :  */
    3432         165 : static void kfree_rcu_monitor(struct work_struct *work)
    3433             : {
    3434         165 :         unsigned long flags;
    3435         165 :         struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu,
    3436             :                                                  monitor_work.work);
    3437             : 
    3438         165 :         raw_spin_lock_irqsave(&krcp->lock, flags);
    3439         165 :         if (krcp->monitor_todo)
    3440         165 :                 kfree_rcu_drain_unlock(krcp, flags);
    3441             :         else
    3442           0 :                 raw_spin_unlock_irqrestore(&krcp->lock, flags);
    3443         165 : }
    3444             : 
    3445             : static enum hrtimer_restart
    3446           4 : schedule_page_work_fn(struct hrtimer *t)
    3447             : {
    3448           4 :         struct kfree_rcu_cpu *krcp =
    3449           4 :                 container_of(t, struct kfree_rcu_cpu, hrtimer);
    3450             : 
    3451           4 :         queue_work(system_highpri_wq, &krcp->page_cache_work);
    3452           4 :         return HRTIMER_NORESTART;
    3453             : }
    3454             : 
    3455           4 : static void fill_page_cache_func(struct work_struct *work)
    3456             : {
    3457           4 :         struct kvfree_rcu_bulk_data *bnode;
    3458           4 :         struct kfree_rcu_cpu *krcp =
    3459           4 :                 container_of(work, struct kfree_rcu_cpu,
    3460             :                         page_cache_work);
    3461           4 :         unsigned long flags;
    3462           4 :         bool pushed;
    3463           4 :         int i;
    3464             : 
    3465          24 :         for (i = 0; i < rcu_min_cached_objs; i++) {
    3466          40 :                 bnode = (struct kvfree_rcu_bulk_data *)
    3467          20 :                         __get_free_page(GFP_KERNEL | __GFP_NOWARN);
    3468             : 
    3469          20 :                 if (bnode) {
    3470          20 :                         raw_spin_lock_irqsave(&krcp->lock, flags);
    3471          20 :                         pushed = put_cached_bnode(krcp, bnode);
    3472          20 :                         raw_spin_unlock_irqrestore(&krcp->lock, flags);
    3473             : 
    3474          20 :                         if (!pushed) {
    3475           0 :                                 free_page((unsigned long) bnode);
    3476           0 :                                 break;
    3477             :                         }
    3478             :                 }
    3479             :         }
    3480             : 
    3481           4 :         atomic_set(&krcp->work_in_progress, 0);
    3482           4 : }
    3483             : 
    3484             : static void
    3485           4 : run_page_cache_worker(struct kfree_rcu_cpu *krcp)
    3486             : {
    3487           4 :         if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
    3488           8 :                         !atomic_xchg(&krcp->work_in_progress, 1)) {
    3489           4 :                 hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
    3490             :                         HRTIMER_MODE_REL);
    3491           4 :                 krcp->hrtimer.function = schedule_page_work_fn;
    3492           4 :                 hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
    3493             :         }
    3494           4 : }
    3495             : 
    3496             : static inline bool
    3497         252 : kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
    3498             : {
    3499         252 :         struct kvfree_rcu_bulk_data *bnode;
    3500         252 :         int idx;
    3501             : 
    3502         252 :         if (unlikely(!krcp->initialized))
    3503             :                 return false;
    3504             : 
    3505         504 :         lockdep_assert_held(&krcp->lock);
    3506         252 :         idx = !!is_vmalloc_addr(ptr);
    3507             : 
    3508             :         /* Check if a new block is required. */
    3509         252 :         if (!krcp->bkvhead[idx] ||
    3510         107 :                         krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
    3511         145 :                 bnode = get_cached_bnode(krcp);
    3512             :                 /* Switch to emergency path. */
    3513         145 :                 if (!bnode)
    3514             :                         return false;
    3515             : 
    3516             :                 /* Initialize the new block. */
    3517         141 :                 bnode->nr_records = 0;
    3518         141 :                 bnode->next = krcp->bkvhead[idx];
    3519             : 
    3520             :                 /* Attach it to the head. */
    3521         141 :                 krcp->bkvhead[idx] = bnode;
    3522             :         }
    3523             : 
    3524             :         /* Finally insert. */
    3525         248 :         krcp->bkvhead[idx]->records
    3526         248 :                 [krcp->bkvhead[idx]->nr_records++] = ptr;
    3527             : 
    3528         248 :         return true;
    3529             : }
    3530             : 
    3531             : /*
    3532             :  * Queue a request for lazy invocation of the appropriate free routine after a
    3533             :  * grace period. Note that three paths are maintained: the two main ones use
    3534             :  * the array-of-pointers interface, and the third is an emergency path that is
    3535             :  * used only when the main path temporarily cannot be maintained due to
    3536             :  * memory pressure.
    3537             :  *
    3538             :  * Each kvfree_call_rcu() request is added to a batch. The batch is drained
    3539             :  * every KFREE_DRAIN_JIFFIES jiffies, and all the objects in the batch are
    3540             :  * freed in workqueue context. Batching requests together reduces the number
    3541             :  * of grace periods needed during heavy kfree_rcu()/kvfree_rcu() load.
    3542             :  */
    3543         252 : void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
    3544             : {
    3545         252 :         unsigned long flags;
    3546         252 :         struct kfree_rcu_cpu *krcp;
    3547         252 :         bool success;
    3548         252 :         void *ptr;
    3549             : 
    3550         252 :         if (head) {
    3551         252 :                 ptr = (void *) head - (unsigned long) func;
    3552             :         } else {
    3553             :                 /*
    3554             :                  * Please note that the head-less variant has a
    3555             :                  * limitation, which is why there is a clear rule for
    3556             :                  * such objects: they may be used from might_sleep()
    3557             :                  * context only. Elsewhere, please embed an rcu_head
    3558             :                  * in your data.
    3559             :                  */
    3560           0 :                 might_sleep();
    3561           0 :                 ptr = (unsigned long *) func;
    3562             :         }
    3563             : 
    3564         252 :         krcp = krc_this_cpu_lock(&flags);
    3565             : 
    3566             :         // Queue the object but don't yet schedule the batch.
    3567         252 :         if (debug_rcu_head_queue(ptr)) {
    3568             :                 // Probable double kfree_rcu(), just leak.
    3569           0 :                 WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
    3570             :                           __func__, head);
    3571             : 
    3572             :                 // Mark as success and leave.
    3573           0 :                 success = true;
    3574           0 :                 goto unlock_return;
    3575             :         }
    3576             : 
    3577         252 :         kasan_record_aux_stack(ptr);
    3578         252 :         success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
    3579         252 :         if (!success) {
    3580           4 :                 run_page_cache_worker(krcp);
    3581             : 
    3582           4 :                 if (head == NULL)
    3583             :                         // Inline if kvfree_rcu(one_arg) call.
    3584           0 :                         goto unlock_return;
    3585             : 
    3586           4 :                 head->func = func;
    3587           4 :                 head->next = krcp->head;
    3588           4 :                 krcp->head = head;
    3589           4 :                 success = true;
    3590             :         }
    3591             : 
    3592         252 :         WRITE_ONCE(krcp->count, krcp->count + 1);
    3593             : 
    3594             :         // Set timer to drain after KFREE_DRAIN_JIFFIES.
    3595         252 :         if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
    3596         252 :             !krcp->monitor_todo) {
    3597         139 :                 krcp->monitor_todo = true;
    3598         139 :                 schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
    3599             :         }
    3600             : 
    3601         113 : unlock_return:
    3602         252 :         krc_this_cpu_unlock(krcp, flags);
    3603             : 
    3604             :         /*
    3605             :          * Fall back to an inline kvfree() after synchronize_rcu().
    3606             :          * This is possible only from might_sleep() context, so the
    3607             :          * current CPU can pass through a quiescent state.
    3608             :          */
    3609         252 :         if (!success) {
    3610           0 :                 debug_rcu_head_unqueue((struct rcu_head *) ptr);
    3611           0 :                 synchronize_rcu();
    3612           0 :                 kvfree(ptr);
    3613             :         }
    3614         252 : }
    3615             : EXPORT_SYMBOL_GPL(kvfree_call_rcu);
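
Callers normally reach kvfree_call_rcu() through the kfree_rcu()/kvfree_rcu() wrappers rather than invoking it directly. A minimal usage sketch (struct bar and bar_release() are hypothetical names): the two-argument form uses an embedded rcu_head and never sleeps, while the single-argument, head-less kvfree_rcu(ptr) form corresponds to the might_sleep() path above and therefore must not be used with locks held.

    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct bar {
            int data;
            struct rcu_head rcu;    /* needed only for the two-argument form */
    };

    static void bar_release(struct bar *bp)
    {
            /* Queue bp for kfree() after a grace period; never sleeps. */
            kfree_rcu(bp, rcu);
    }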
    3616             : 
    3617             : static unsigned long
    3618           0 : kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
    3619             : {
    3620           0 :         int cpu;
    3621           0 :         unsigned long count = 0;
    3622             : 
    3623             :         /* Snapshot count of all CPUs */
    3624           0 :         for_each_possible_cpu(cpu) {
    3625           0 :                 struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
    3626             : 
    3627           0 :                 count += READ_ONCE(krcp->count);
    3628             :         }
    3629             : 
    3630           0 :         return count;
    3631             : }
    3632             : 
    3633             : static unsigned long
    3634           0 : kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
    3635             : {
    3636           0 :         int cpu, freed = 0;
    3637           0 :         unsigned long flags;
    3638             : 
    3639           0 :         for_each_possible_cpu(cpu) {
    3640           0 :                 int count;
    3641           0 :                 struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
    3642             : 
    3643           0 :                 count = krcp->count;
    3644           0 :                 raw_spin_lock_irqsave(&krcp->lock, flags);
    3645           0 :                 if (krcp->monitor_todo)
    3646           0 :                         kfree_rcu_drain_unlock(krcp, flags);
    3647             :                 else
    3648           0 :                         raw_spin_unlock_irqrestore(&krcp->lock, flags);
    3649             : 
    3650           0 :                 sc->nr_to_scan -= count;
    3651           0 :                 freed += count;
    3652             : 
    3653           0 :                 if (sc->nr_to_scan <= 0)
    3654             :                         break;
    3655             :         }
    3656             : 
    3657           0 :         return freed == 0 ? SHRINK_STOP : freed;
    3658             : }
    3659             : 
    3660             : static struct shrinker kfree_rcu_shrinker = {
    3661             :         .count_objects = kfree_rcu_shrink_count,
    3662             :         .scan_objects = kfree_rcu_shrink_scan,
    3663             :         .batch = 0,
    3664             :         .seeks = DEFAULT_SEEKS,
    3665             : };
    3666             : 
    3667           1 : void __init kfree_rcu_scheduler_running(void)
    3668             : {
    3669           1 :         int cpu;
    3670           1 :         unsigned long flags;
    3671             : 
    3672           6 :         for_each_possible_cpu(cpu) {
    3673           4 :                 struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
    3674             : 
    3675           4 :                 raw_spin_lock_irqsave(&krcp->lock, flags);
    3676           4 :                 if (!krcp->head || krcp->monitor_todo) {
    3677           4 :                         raw_spin_unlock_irqrestore(&krcp->lock, flags);
    3678           4 :                         continue;
    3679             :                 }
    3680           0 :                 krcp->monitor_todo = true;
    3681           0 :                 schedule_delayed_work_on(cpu, &krcp->monitor_work,
    3682             :                                          KFREE_DRAIN_JIFFIES);
    3683           5 :                 raw_spin_unlock_irqrestore(&krcp->lock, flags);
    3684             :         }
    3685           1 : }
    3686             : 
    3687             : /*
    3688             :  * During early boot, any blocking grace-period wait automatically
    3689             :  * implies a grace period.  Later on, this is never the case for PREEMPTION.
    3690             :  *
    3691             :  * However, because a context switch is a grace period for !PREEMPTION, any
    3692             :  * blocking grace-period wait automatically implies a grace period if
    3693             :  * there is only one CPU online at any point in time during execution of
    3694             :  * either synchronize_rcu() or synchronize_rcu_expedited().  It is OK to
    3695             :  * occasionally incorrectly indicate that there are multiple CPUs online
    3696             :  * when there was in fact only one the whole time, as this just adds some
    3697             :  * overhead: RCU still operates correctly.
    3698             :  */
    3699         174 : static int rcu_blocking_is_gp(void)
    3700             : {
    3701         174 :         int ret;
    3702             : 
    3703         174 :         if (IS_ENABLED(CONFIG_PREEMPTION))
    3704             :                 return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
    3705         174 :         might_sleep();  /* Check for RCU read-side critical section. */
    3706         174 :         preempt_disable();
    3707             :         /*
    3708             :          * If the rcu_state.n_online_cpus counter is equal to one,
    3709             :          * there is only one CPU, and that CPU sees all prior accesses
    3710             :          * made by any CPU that was online at the time of its access.
    3711             :          * Furthermore, if this counter is equal to one, its value cannot
    3712             :          * change until after the preempt_enable() below.
    3713             :          *
    3714             :          * Furthermore, if rcu_state.n_online_cpus is equal to one here,
    3715             :          * all later CPUs (both this one and any that come online later
    3716             :          * on) are guaranteed to see all accesses prior to this point
    3717             :          * in the code, without the need for additional memory barriers.
    3718             :          * Those memory barriers are provided by CPU-hotplug code.
    3719             :          */
    3720         174 :         ret = READ_ONCE(rcu_state.n_online_cpus) <= 1;
    3721         174 :         preempt_enable();
    3722         174 :         return ret;
    3723             : }
    3724             : 
    3725             : /**
    3726             :  * synchronize_rcu - wait until a grace period has elapsed.
    3727             :  *
    3728             :  * Control will return to the caller some time after a full grace
    3729             :  * period has elapsed, in other words after all currently executing RCU
    3730             :  * read-side critical sections have completed.  Note, however, that
    3731             :  * upon return from synchronize_rcu(), the caller might well be executing
    3732             :  * concurrently with new RCU read-side critical sections that began while
    3733             :  * synchronize_rcu() was waiting.  RCU read-side critical sections are
    3734             :  * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
    3735             :  * In addition, regions of code across which interrupts, preemption, or
    3736             :  * softirqs have been disabled also serve as RCU read-side critical
    3737             :  * sections.  This includes hardware interrupt handlers, softirq handlers,
    3738             :  * and NMI handlers.
    3739             :  *
    3740             :  * Note that this guarantee implies further memory-ordering guarantees.
    3741             :  * On systems with more than one CPU, when synchronize_rcu() returns,
    3742             :  * each CPU is guaranteed to have executed a full memory barrier since
    3743             :  * the end of its last RCU read-side critical section whose beginning
    3744             :  * preceded the call to synchronize_rcu().  In addition, each CPU having
    3745             :  * an RCU read-side critical section that extends beyond the return from
    3746             :  * synchronize_rcu() is guaranteed to have executed a full memory barrier
    3747             :  * after the beginning of synchronize_rcu() and before the beginning of
    3748             :  * that RCU read-side critical section.  Note that these guarantees include
    3749             :  * CPUs that are offline, idle, or executing in user mode, as well as CPUs
    3750             :  * that are executing in the kernel.
    3751             :  *
    3752             :  * Furthermore, if CPU A invoked synchronize_rcu(), which returned
    3753             :  * to its caller on CPU B, then both CPU A and CPU B are guaranteed
    3754             :  * to have executed a full memory barrier during the execution of
    3755             :  * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but
    3756             :  * again only if the system has more than one CPU).
    3757             :  */
    3758          10 : void synchronize_rcu(void)
    3759             : {
    3760          31 :         RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
    3761             :                          lock_is_held(&rcu_lock_map) ||
    3762             :                          lock_is_held(&rcu_sched_lock_map),
    3763             :                          "Illegal synchronize_rcu() in RCU read-side critical section");
    3764          10 :         if (rcu_blocking_is_gp())
    3765             :                 return;  // Context allows vacuous grace periods.
    3766           5 :         if (rcu_gp_is_expedited())
    3767           4 :                 synchronize_rcu_expedited();
    3768             :         else
    3769           1 :                 wait_rcu_gp(call_rcu);
    3770             : }
    3771             : EXPORT_SYMBOL_GPL(synchronize_rcu);
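
The guarantee documented above is what makes the classic publish-then-free updater pattern work. The sketch below is a minimal, hedged illustration of that pattern; struct foo, gbl_foo, foo_lock, and the helper functions are hypothetical and are not part of this file.

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct foo {
        int a;
};

static struct foo __rcu *gbl_foo;
static DEFINE_SPINLOCK(foo_lock);

/* Reader: runs concurrently with updates and never blocks them. */
static int foo_get_a(void)
{
        struct foo *fp;
        int ret = -1;

        rcu_read_lock();
        fp = rcu_dereference(gbl_foo);
        if (fp)
                ret = fp->a;
        rcu_read_unlock();
        return ret;
}

/* Updater: publish a new version, then wait before freeing the old one. */
static void foo_update_a(int new_a)
{
        struct foo *new_fp, *old_fp;

        new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL);
        if (!new_fp)
                return;
        new_fp->a = new_a;

        spin_lock(&foo_lock);
        old_fp = rcu_dereference_protected(gbl_foo,
                                           lockdep_is_held(&foo_lock));
        rcu_assign_pointer(gbl_foo, new_fp);
        spin_unlock(&foo_lock);

        synchronize_rcu();      /* wait for pre-existing readers to finish */
        kfree(old_fp);          /* no reader can still reference old_fp */
}

Where the updater cannot sleep, call_rcu() or kfree_rcu() takes the place of the synchronize_rcu()/kfree() pair.
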
    3772             : 
    3773             : /**
    3774             :  * get_state_synchronize_rcu - Snapshot current RCU state
    3775             :  *
    3776             :  * Returns a cookie that is used by a later call to cond_synchronize_rcu()
    3777             :  * to determine whether or not a full grace period has elapsed in the
    3778             :  * meantime.
    3779             :  */
    3780           0 : unsigned long get_state_synchronize_rcu(void)
    3781             : {
    3782             :         /*
    3783             :          * Any prior manipulation of RCU-protected data must happen
    3784             :          * before the load from ->gp_seq.
    3785             :          */
    3786           0 :         smp_mb();  /* ^^^ */
    3787           0 :         return rcu_seq_snap(&rcu_state.gp_seq);
    3788             : }
    3789             : EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
    3790             : 
    3791             : /**
    3792             :  * cond_synchronize_rcu - Conditionally wait for an RCU grace period
    3793             :  *
    3794             :  * @oldstate: return value from earlier call to get_state_synchronize_rcu()
    3795             :  *
    3796             :  * If a full RCU grace period has elapsed since the earlier call to
    3797             :  * get_state_synchronize_rcu(), just return.  Otherwise, invoke
    3798             :  * synchronize_rcu() to wait for a full grace period.
    3799             :  *
    3800             :  * Yes, this function does not take counter wrap into account.  But
    3801             :  * counter wrap is harmless.  If the counter wraps, we have waited for
    3802             :  * more than 2 billion grace periods (and way more on a 64-bit system!),
    3803             :  * so waiting for one additional grace period should be just fine.
    3804             :  */
    3805           0 : void cond_synchronize_rcu(unsigned long oldstate)
    3806             : {
    3807           0 :         if (!rcu_seq_done(&rcu_state.gp_seq, oldstate))
    3808           0 :                 synchronize_rcu();
    3809             :         else
    3810           0 :                 smp_mb(); /* Ensure GP ends before subsequent accesses. */
    3811           0 : }
    3812             : EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
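
Taken together, the two functions above support a cookie-based pattern: snapshot the grace-period state before starting other work, then wait only if a full grace period has not already elapsed. A minimal, hedged sketch follows; the surrounding function is hypothetical.

static void example_poll_then_wait(void)
{
        unsigned long cookie;

        /* Snapshot current RCU state before doing anything else. */
        cookie = get_state_synchronize_rcu();

        /* ... do other, possibly lengthy, work here ... */

        /*
         * If a full grace period already elapsed since the snapshot,
         * this returns immediately; otherwise it blocks in
         * synchronize_rcu().
         */
        cond_synchronize_rcu(cookie);
}
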
    3813             : 
    3814             : /*
    3815             :  * Check to see if there is any immediate RCU-related work to be done by
    3816             :  * the current CPU, returning 1 if so and zero otherwise.  The checks are
    3817             :  * in order of increasing expense: checks that can be carried out against
    3818             :  * CPU-local state are performed first.  However, we must check for CPU
    3819             :  * stalls first, else we might not get a chance.
    3820             :  */
    3821       27560 : static int rcu_pending(int user)
    3822             : {
    3823       27560 :         bool gp_in_progress;
    3824       27560 :         struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
    3825       28034 :         struct rcu_node *rnp = rdp->mynode;
    3826             : 
    3827       56088 :         lockdep_assert_irqs_disabled();
    3828             : 
    3829             :         /* Check for CPU stalls, if enabled. */
    3830       27984 :         check_cpu_stall(rdp);
    3831             : 
    3832             :         /* Does this CPU need a deferred NOCB wakeup? */
    3833       27740 :         if (rcu_nocb_need_deferred_wakeup(rdp))
    3834             :                 return 1;
    3835             : 
    3836             :         /* Is this a nohz_full CPU in userspace or idle?  (Ignore RCU if so.) */
    3837       27740 :         if ((user || rcu_is_cpu_rrupt_from_idle()) && rcu_nohz_full_cpu())
    3838             :                 return 0;
    3839             : 
    3840             :         /* Is the RCU core waiting for a quiescent state from this CPU? */
    3841       28183 :         gp_in_progress = rcu_gp_in_progress();
    3842       28183 :         if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm && gp_in_progress)
    3843             :                 return 1;
    3844             : 
    3845             :         /* Does this CPU have callbacks ready to invoke? */
    3846       22765 :         if (!rcu_segcblist_is_offloaded(&rdp->cblist) &&
    3847       22765 :             rcu_segcblist_ready_cbs(&rdp->cblist))
    3848             :                 return 1;
    3849             : 
    3850             :         /* Has RCU gone idle with this CPU needing another grace period? */
    3851       21350 :         if (!gp_in_progress && rcu_segcblist_is_enabled(&rdp->cblist) &&
    3852          83 :             !rcu_segcblist_is_offloaded(&rdp->cblist) &&
    3853          83 :             !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
    3854             :                 return 1;
    3855             : 
    3856             :         /* Has an RCU grace period completed or started?  */
    3857       21330 :         if (rcu_seq_current(&rnp->gp_seq) != rdp->gp_seq ||
    3858       16144 :             unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */
    3859        5186 :                 return 1;
    3860             : 
    3861             :         /* nothing to do */
    3862             :         return 0;
    3863             : }
    3864             : 
    3865             : /*
    3866             :  * Helper function for rcu_barrier() tracing.  If tracing is disabled,
    3867             :  * the compiler is expected to optimize this away.
    3868             :  */
    3869          11 : static void rcu_barrier_trace(const char *s, int cpu, unsigned long done)
    3870             : {
    3871          11 :         trace_rcu_barrier(rcu_state.name, s, cpu,
    3872             :                           atomic_read(&rcu_state.barrier_cpu_count), done);
    3873           3 : }
    3874             : 
    3875             : /*
    3876             :  * RCU callback function for rcu_barrier().  If we are last, wake
    3877             :  * up the task executing rcu_barrier().
    3878             :  *
    3879             :  * Note that the value of rcu_state.barrier_sequence must be captured
    3880             :  * before the atomic_dec_and_test().  Otherwise, if this CPU is not last,
    3881             :  * other CPUs might count the value down to zero before this CPU gets
    3882             :  * around to invoking rcu_barrier_trace(), which might result in bogus
    3883             :  * data from the next instance of rcu_barrier().
    3884             :  */
    3885           2 : static void rcu_barrier_callback(struct rcu_head *rhp)
    3886             : {
    3887           2 :         unsigned long __maybe_unused s = rcu_state.barrier_sequence;
    3888             : 
    3889           4 :         if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) {
    3890           1 :                 rcu_barrier_trace(TPS("LastCB"), -1, s);
    3891           1 :                 complete(&rcu_state.barrier_completion);
    3892             :         } else {
    3893           1 :                 rcu_barrier_trace(TPS("CB"), -1, s);
    3894             :         }
    3895           2 : }
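
The "last one out signals the waiter" structure used above is a common kernel pattern in its own right. A stripped-down, hedged sketch with hypothetical names is shown below; it also illustrates why any shared value needed after the decrement must be captured first, exactly as the comment above warns.

#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/completion.h>
#include <linux/printk.h>

static atomic_t nr_pending;             /* set by the waiter beforehand */
static unsigned long current_round;     /* may be advanced by the waiter */
static DECLARE_COMPLETION(all_done);

static void one_participant_done(void)
{
        /*
         * Capture shared state *before* the decrement: once the count
         * hits zero, the waiter may start a new round and overwrite
         * current_round.
         */
        unsigned long round = READ_ONCE(current_round);

        if (atomic_dec_and_test(&nr_pending))
                complete(&all_done);

        pr_debug("participant done, round %lu\n", round);
}
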
    3896             : 
    3897             : /*
    3898             :  * Called with preemption disabled, and from cross-cpu IRQ context.
    3899             :  */
    3900           2 : static void rcu_barrier_func(void *cpu_in)
    3901             : {
    3902           2 :         uintptr_t cpu = (uintptr_t)cpu_in;
    3903           2 :         struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
    3904             : 
    3905           2 :         rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence);
    3906           2 :         rdp->barrier_head.func = rcu_barrier_callback;
    3907           2 :         debug_rcu_head_queue(&rdp->barrier_head);
    3908           2 :         rcu_nocb_lock(rdp);
    3909           2 :         WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
    3910           2 :         if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
    3911           2 :                 atomic_inc(&rcu_state.barrier_cpu_count);
    3912             :         } else {
    3913           0 :                 debug_rcu_head_unqueue(&rdp->barrier_head);
    3914           0 :                 rcu_barrier_trace(TPS("IRQNQ"), -1,
    3915             :                                   rcu_state.barrier_sequence);
    3916             :         }
    3917           2 :         rcu_nocb_unlock(rdp);
    3918           2 : }
    3919             : 
    3920             : /**
    3921             :  * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
    3922             :  *
    3923             :  * Note that this primitive does not necessarily wait for an RCU grace period
    3924             :  * to complete.  For example, if there are no RCU callbacks queued anywhere
    3925             :  * in the system, then rcu_barrier() is within its rights to return
    3926             :  * immediately, without waiting for anything, much less an RCU grace period.
    3927             :  */
    3928           1 : void rcu_barrier(void)
    3929             : {
    3930           1 :         uintptr_t cpu;
    3931           1 :         struct rcu_data *rdp;
    3932           1 :         unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence);
    3933             : 
    3934           1 :         rcu_barrier_trace(TPS("Begin"), -1, s);
    3935             : 
    3936             :         /* Take mutex to serialize concurrent rcu_barrier() requests. */
    3937           1 :         mutex_lock(&rcu_state.barrier_mutex);
    3938             : 
    3939             :         /* Did someone else do our work for us? */
    3940           1 :         if (rcu_seq_done(&rcu_state.barrier_sequence, s)) {
    3941           0 :                 rcu_barrier_trace(TPS("EarlyExit"), -1,
    3942             :                                   rcu_state.barrier_sequence);
    3943           0 :                 smp_mb(); /* caller's subsequent code after above check. */
    3944           0 :                 mutex_unlock(&rcu_state.barrier_mutex);
    3945           0 :                 return;
    3946             :         }
    3947             : 
    3948             :         /* Mark the start of the barrier operation. */
    3949           1 :         rcu_seq_start(&rcu_state.barrier_sequence);
    3950           1 :         rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence);
    3951             : 
    3952             :         /*
    3953             :          * Initialize the count to two rather than to zero in order
    3954             :          * to avoid a too-soon return to zero in case of an immediate
    3955             :          * invocation of the just-enqueued callback (or preemption of
    3956             :          * this task).  Exclude CPU-hotplug operations to ensure that no
    3957             :          * offline non-offloaded CPU has callbacks queued.
    3958             :          */
    3959           1 :         init_completion(&rcu_state.barrier_completion);
    3960           1 :         atomic_set(&rcu_state.barrier_cpu_count, 2);
    3961           1 :         get_online_cpus();
    3962             : 
    3963             :         /*
    3964             :          * Force each CPU with callbacks to register a new callback.
    3965             :          * When that callback is invoked, we will know that all of the
    3966             :          * corresponding CPU's preceding callbacks have been invoked.
    3967             :          */
    3968           6 :         for_each_possible_cpu(cpu) {
    3969           4 :                 rdp = per_cpu_ptr(&rcu_data, cpu);
    3970           4 :                 if (cpu_is_offline(cpu) &&
    3971           0 :                     !rcu_segcblist_is_offloaded(&rdp->cblist))
    3972           0 :                         continue;
    3973           4 :                 if (rcu_segcblist_n_cbs(&rdp->cblist) && cpu_online(cpu)) {
    3974           2 :                         rcu_barrier_trace(TPS("OnlineQ"), cpu,
    3975             :                                           rcu_state.barrier_sequence);
    3976           2 :                         smp_call_function_single(cpu, rcu_barrier_func, (void *)cpu, 1);
    3977           2 :                 } else if (rcu_segcblist_n_cbs(&rdp->cblist) &&
    3978           0 :                            cpu_is_offline(cpu)) {
    3979           0 :                         rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu,
    3980             :                                           rcu_state.barrier_sequence);
    3981           0 :                         local_irq_disable();
    3982           0 :                         rcu_barrier_func((void *)cpu);
    3983           0 :                         local_irq_enable();
    3984           2 :                 } else if (cpu_is_offline(cpu)) {
    3985           0 :                         rcu_barrier_trace(TPS("OfflineNoCBNoQ"), cpu,
    3986             :                                           rcu_state.barrier_sequence);
    3987             :                 } else {
    3988           7 :                         rcu_barrier_trace(TPS("OnlineNQ"), cpu,
    3989             :                                           rcu_state.barrier_sequence);
    3990             :                 }
    3991             :         }
    3992           1 :         put_online_cpus();
    3993             : 
    3994             :         /*
    3995             :          * Now that we have an rcu_barrier_callback() callback on each
    3996             :          * CPU, and thus each counted, remove the initial count.
    3997             :          */
    3998           2 :         if (atomic_sub_and_test(2, &rcu_state.barrier_cpu_count))
    3999           0 :                 complete(&rcu_state.barrier_completion);
    4000             : 
    4001             :         /* Wait for all rcu_barrier_callback() callbacks to be invoked. */
    4002           1 :         wait_for_completion(&rcu_state.barrier_completion);
    4003             : 
    4004             :         /* Mark the end of the barrier operation. */
    4005           1 :         rcu_barrier_trace(TPS("Inc2"), -1, rcu_state.barrier_sequence);
    4006           1 :         rcu_seq_end(&rcu_state.barrier_sequence);
    4007             : 
    4008             :         /* Other rcu_barrier() invocations can now safely proceed. */
    4009           1 :         mutex_unlock(&rcu_state.barrier_mutex);
    4010             : }
    4011             : EXPORT_SYMBOL_GPL(rcu_barrier);
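
The most common caller of rcu_barrier() is module-unload code: once a module has posted call_rcu() callbacks whose functions live in that module, it must wait for all of them to run before its text and data disappear. A minimal, hedged sketch with hypothetical names:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
        int a;
        struct rcu_head rcu;
};

static void foo_reclaim(struct rcu_head *rhp)
{
        kfree(container_of(rhp, struct foo, rcu));
}

/* Deferred free used throughout the hypothetical module. */
static void foo_release(struct foo *fp)
{
        call_rcu(&fp->rcu, foo_reclaim);
}

static void __exit foo_module_exit(void)
{
        /* ... unpublish and foo_release() all remaining objects ... */

        /*
         * Wait for every queued foo_reclaim() invocation to finish;
         * otherwise a callback could run after this module's code has
         * been unloaded.
         */
        rcu_barrier();
}
module_exit(foo_module_exit);
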
    4012             : 
    4013             : /*
    4014             :  * Propagate ->qsmaskinit bits up the rcu_node tree to account for the
    4015             :  * first CPU in a given leaf rcu_node structure coming online.  The caller
    4016             :  * must hold the corresponding leaf rcu_node ->lock with interrupts
    4017             :  * disabled.
    4018             :  */
    4019           1 : static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
    4020             : {
    4021           1 :         long mask;
    4022           1 :         long oldmask;
    4023           1 :         struct rcu_node *rnp = rnp_leaf;
    4024             : 
    4025           2 :         raw_lockdep_assert_held_rcu_node(rnp_leaf);
    4026           1 :         WARN_ON_ONCE(rnp->wait_blkd_tasks);
    4027           1 :         for (;;) {
    4028           1 :                 mask = rnp->grpmask;
    4029           1 :                 rnp = rnp->parent;
    4030           1 :                 if (rnp == NULL)
    4031             :                         return;
    4032           0 :                 raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */
    4033           0 :                 oldmask = rnp->qsmaskinit;
    4034           0 :                 rnp->qsmaskinit |= mask;
    4035           0 :                 raw_spin_unlock_rcu_node(rnp); /* Interrupts remain disabled. */
    4036           0 :                 if (oldmask)
    4037             :                         return;
    4038             :         }
    4039             : }
    4040             : 
    4041             : /*
    4042             :  * Do boot-time initialization of a CPU's per-CPU RCU data.
    4043             :  */
    4044             : static void __init
    4045           4 : rcu_boot_init_percpu_data(int cpu)
    4046             : {
    4047           4 :         struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
    4048             : 
    4049             :         /* Set up local state, ensuring consistent view of global state. */
    4050           4 :         rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
    4051           4 :         INIT_WORK(&rdp->strict_work, strict_work_handler);
    4052           4 :         WARN_ON_ONCE(rdp->dynticks_nesting != 1);
    4053           4 :         WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)));
    4054           4 :         rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
    4055           4 :         rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED;
    4056           4 :         rdp->rcu_onl_gp_seq = rcu_state.gp_seq;
    4057           4 :         rdp->rcu_onl_gp_flags = RCU_GP_CLEANED;
    4058           4 :         rdp->cpu = cpu;
    4059           4 :         rcu_boot_init_nocb_percpu_data(rdp);
    4060           4 : }
    4061             : 
    4062             : /*
    4063             :  * Invoked early in the CPU-online process, when pretty much all services
    4064             :  * are available.  The incoming CPU is not present.
    4065             :  *
    4066             :  * Initializes a CPU's per-CPU RCU data.  Note that only one online or
    4067             :  * offline event can be happening at a given time.  Note also that we can
    4068             :  * accept some slop in the rcu_state.gp_seq access due to the fact that this
    4069             :  * CPU cannot possibly have any non-offloaded RCU callbacks in flight yet.
    4070             :  * And any offloaded callbacks are being numbered elsewhere.
    4071             :  */
    4072           4 : int rcutree_prepare_cpu(unsigned int cpu)
    4073             : {
    4074           4 :         unsigned long flags;
    4075           4 :         struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
    4076           4 :         struct rcu_node *rnp = rcu_get_root();
    4077             : 
    4078             :         /* Set up local state, ensuring consistent view of global state. */
    4079           4 :         raw_spin_lock_irqsave_rcu_node(rnp, flags);
    4080           4 :         rdp->qlen_last_fqs_check = 0;
    4081           4 :         rdp->n_force_qs_snap = rcu_state.n_force_qs;
    4082           4 :         rdp->blimit = blimit;
    4083           4 :         rdp->dynticks_nesting = 1;   /* CPU not up, no tearing. */
    4084           4 :         rcu_dynticks_eqs_online();
    4085           8 :         raw_spin_unlock_rcu_node(rnp);          /* irqs remain disabled. */
    4086             :         /*
    4087             :          * Lock in case the CB/GP kthreads are still around handling
    4088             :          * old callbacks (longer term we should flush all callbacks
    4089             :          * before completing CPU offline)
    4090             :          */
    4091           4 :         rcu_nocb_lock(rdp);
    4092           4 :         if (rcu_segcblist_empty(&rdp->cblist)) /* No early-boot CBs? */
    4093           3 :                 rcu_segcblist_init(&rdp->cblist);  /* Re-enable callbacks. */
    4094           4 :         rcu_nocb_unlock(rdp);
    4095             : 
    4096             :         /*
    4097             :          * Add CPU to leaf rcu_node pending-online bitmask.  Any needed
    4098             :          * propagation up the rcu_node tree will happen at the beginning
    4099             :          * of the next grace period.
    4100             :          */
    4101           4 :         rnp = rdp->mynode;
    4102           4 :         raw_spin_lock_rcu_node(rnp);            /* irqs already disabled. */
    4103           4 :         rdp->beenonline = true;       /* We have now been online. */
    4104           4 :         rdp->gp_seq = READ_ONCE(rnp->gp_seq);
    4105           4 :         rdp->gp_seq_needed = rdp->gp_seq;
    4106           4 :         rdp->cpu_no_qs.b.norm = true;
    4107           4 :         rdp->core_needs_qs = false;
    4108           4 :         rdp->rcu_iw_pending = false;
    4109           4 :         rdp->rcu_iw = IRQ_WORK_INIT_HARD(rcu_iw_handler);
    4110           4 :         rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
    4111           4 :         trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
    4112           8 :         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
    4113           4 :         rcu_prepare_kthreads(cpu);
    4114           4 :         rcu_spawn_cpu_nocb_kthread(cpu);
    4115           4 :         WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1);
    4116             : 
    4117           4 :         return 0;
    4118             : }
    4119             : 
    4120             : /*
    4121             :  * Update RCU priority boost kthread affinity for CPU-hotplug changes.
    4122             :  */
    4123           3 : static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
    4124             : {
    4125           3 :         struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
    4126             : 
    4127           3 :         rcu_boost_kthread_setaffinity(rdp->mynode, outgoing);
    4128             : }
    4129             : 
    4130             : /*
    4131             :  * Near the end of the CPU-online process.  Pretty much all services
    4132             :  * enabled, and the CPU is now very much alive.
    4133             :  */
    4134           4 : int rcutree_online_cpu(unsigned int cpu)
    4135             : {
    4136           4 :         unsigned long flags;
    4137           4 :         struct rcu_data *rdp;
    4138           4 :         struct rcu_node *rnp;
    4139             : 
    4140           4 :         rdp = per_cpu_ptr(&rcu_data, cpu);
    4141           4 :         rnp = rdp->mynode;
    4142           4 :         raw_spin_lock_irqsave_rcu_node(rnp, flags);
    4143           4 :         rnp->ffmask |= rdp->grpmask;
    4144           8 :         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
    4145           4 :         if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
    4146             :                 return 0; /* Too early in boot for scheduler work. */
    4147           3 :         sync_sched_exp_online_cleanup(cpu);
    4148           3 :         rcutree_affinity_setting(cpu, -1);
    4149             : 
    4150             :         // Stop-machine done, so allow nohz_full to disable tick.
    4151           3 :         tick_dep_clear(TICK_DEP_BIT_RCU);
    4152           3 :         return 0;
    4153             : }
    4154             : 
    4155             : /*
    4156             :  * Near the beginning of the CPU-offline process.  The CPU is still very much alive
    4157             :  * with pretty much all services enabled.
    4158             :  */
    4159           0 : int rcutree_offline_cpu(unsigned int cpu)
    4160             : {
    4161           0 :         unsigned long flags;
    4162           0 :         struct rcu_data *rdp;
    4163           0 :         struct rcu_node *rnp;
    4164             : 
    4165           0 :         rdp = per_cpu_ptr(&rcu_data, cpu);
    4166           0 :         rnp = rdp->mynode;
    4167           0 :         raw_spin_lock_irqsave_rcu_node(rnp, flags);
    4168           0 :         rnp->ffmask &= ~rdp->grpmask;
    4169           0 :         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
    4170             : 
    4171           0 :         rcutree_affinity_setting(cpu, cpu);
    4172             : 
    4173             :         // nohz_full CPUs need the tick for stop-machine to work quickly
    4174           0 :         tick_dep_set(TICK_DEP_BIT_RCU);
    4175           0 :         return 0;
    4176             : }
    4177             : 
    4178             : /*
    4179             :  * Mark the specified CPU as being online so that subsequent grace periods
    4180             :  * (both expedited and normal) will wait on it.  Note that this means that
    4181             :  * incoming CPUs are not allowed to use RCU read-side critical sections
    4182             :  * until this function is called.  Failing to observe this restriction
    4183             :  * will result in lockdep splats.
    4184             :  *
    4185             :  * Note that this function is special in that it is invoked directly
    4186             :  * from the incoming CPU rather than from the cpuhp_step mechanism.
    4187             :  * This is because this function must be invoked at a precise location.
    4188             :  */
    4189           7 : void rcu_cpu_starting(unsigned int cpu)
    4190             : {
    4191           7 :         unsigned long flags;
    4192           7 :         unsigned long mask;
    4193           7 :         struct rcu_data *rdp;
    4194           7 :         struct rcu_node *rnp;
    4195           7 :         bool newcpu;
    4196             : 
    4197           7 :         rdp = per_cpu_ptr(&rcu_data, cpu);
    4198           7 :         if (rdp->cpu_started)
    4199             :                 return;
    4200           4 :         rdp->cpu_started = true;
    4201             : 
    4202           4 :         rnp = rdp->mynode;
    4203           4 :         mask = rdp->grpmask;
    4204           4 :         WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
    4205           4 :         WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
    4206           4 :         smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
    4207           4 :         raw_spin_lock_irqsave_rcu_node(rnp, flags);
    4208           4 :         WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
    4209           4 :         newcpu = !(rnp->expmaskinitnext & mask);
    4210           4 :         rnp->expmaskinitnext |= mask;
    4211             :         /* Allow lockless access for expedited grace periods. */
    4212           4 :         smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + newcpu); /* ^^^ */
    4213           4 :         ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus);
    4214           4 :         rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */
    4215           4 :         rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq);
    4216           4 :         rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags);
    4217             : 
    4218             :         /* An incoming CPU should never be blocking a grace period. */
    4219           4 :         if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */
    4220           0 :                 rcu_disable_urgency_upon_qs(rdp);
    4221             :                 /* Report QS -after- changing ->qsmaskinitnext! */
    4222           0 :                 rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
    4223             :         } else {
    4224           8 :                 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
    4225             :         }
    4226           4 :         smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
    4227           4 :         WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
    4228           4 :         WARN_ON_ONCE(rnp->ofl_seq & 0x1);
    4229           4 :         smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
    4230             : }
    4231             : 
    4232             : /*
    4233             :  * The outgoing CPU has no further need of RCU, so remove it from
    4234             :  * the rcu_node tree's ->qsmaskinitnext bit masks.
    4235             :  *
    4236             :  * Note that this function is special in that it is invoked directly
    4237             :  * from the outgoing CPU rather than from the cpuhp_step mechanism.
    4238             :  * This is because this function must be invoked at a precise location.
    4239             :  */
    4240           0 : void rcu_report_dead(unsigned int cpu)
    4241             : {
    4242           0 :         unsigned long flags;
    4243           0 :         unsigned long mask;
    4244           0 :         struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
    4245           0 :         struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
    4246             : 
    4247             :         // Do any dangling deferred wakeups.
    4248           0 :         do_nocb_deferred_wakeup(rdp);
    4249             : 
    4250             :         /* QS for any half-done expedited grace period. */
    4251           0 :         preempt_disable();
    4252           0 :         rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
    4253           0 :         preempt_enable();
    4254           0 :         rcu_preempt_deferred_qs(current);
    4255             : 
    4256             :         /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
    4257           0 :         mask = rdp->grpmask;
    4258           0 :         WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
    4259           0 :         WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
    4260           0 :         smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
    4261           0 :         raw_spin_lock(&rcu_state.ofl_lock);
    4262           0 :         raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
    4263           0 :         rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
    4264           0 :         rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags);
    4265           0 :         if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */
    4266             :                 /* Report quiescent state -before- changing ->qsmaskinitnext! */
    4267           0 :                 rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
    4268           0 :                 raw_spin_lock_irqsave_rcu_node(rnp, flags);
    4269             :         }
    4270           0 :         WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
    4271           0 :         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
    4272           0 :         raw_spin_unlock(&rcu_state.ofl_lock);
    4273           0 :         smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
    4274           0 :         WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
    4275           0 :         WARN_ON_ONCE(rnp->ofl_seq & 0x1);
    4276             : 
    4277           0 :         rdp->cpu_started = false;
    4278           0 : }
    4279             : 
    4280             : #ifdef CONFIG_HOTPLUG_CPU
    4281             : /*
    4282             :  * The outgoing CPU has just passed through the dying-idle state, and we
    4283             :  * are being invoked from the CPU that was IPIed to continue the offline
    4284             :  * operation.  Migrate the outgoing CPU's callbacks to the current CPU.
    4285             :  */
    4286           0 : void rcutree_migrate_callbacks(int cpu)
    4287             : {
    4288           0 :         unsigned long flags;
    4289           0 :         struct rcu_data *my_rdp;
    4290           0 :         struct rcu_node *my_rnp;
    4291           0 :         struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
    4292           0 :         bool needwake;
    4293             : 
    4294           0 :         if (rcu_segcblist_is_offloaded(&rdp->cblist) ||
    4295           0 :             rcu_segcblist_empty(&rdp->cblist))
    4296             :                 return;  /* No callbacks to migrate. */
    4297             : 
    4298           0 :         local_irq_save(flags);
    4299           0 :         my_rdp = this_cpu_ptr(&rcu_data);
    4300           0 :         my_rnp = my_rdp->mynode;
    4301           0 :         rcu_nocb_lock(my_rdp); /* irqs already disabled. */
    4302           0 :         WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
    4303           0 :         raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
    4304             :         /* Leverage recent GPs and set GP for new callbacks. */
    4305           0 :         needwake = rcu_advance_cbs(my_rnp, rdp) ||
    4306           0 :                    rcu_advance_cbs(my_rnp, my_rdp);
    4307           0 :         rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
    4308           0 :         needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp);
    4309           0 :         rcu_segcblist_disable(&rdp->cblist);
    4310           0 :         WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
    4311             :                      !rcu_segcblist_n_cbs(&my_rdp->cblist));
    4312           0 :         if (rcu_segcblist_is_offloaded(&my_rdp->cblist)) {
    4313             :                 raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
    4314           0 :                 __call_rcu_nocb_wake(my_rdp, true, flags);
    4315             :         } else {
    4316           0 :                 rcu_nocb_unlock(my_rdp); /* irqs remain disabled. */
    4317           0 :                 raw_spin_unlock_irqrestore_rcu_node(my_rnp, flags);
    4318             :         }
    4319           0 :         if (needwake)
    4320           0 :                 rcu_gp_kthread_wake();
    4321           0 :         lockdep_assert_irqs_enabled();
    4322           0 :         WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
    4323             :                   !rcu_segcblist_empty(&rdp->cblist),
    4324             :                   "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
    4325             :                   cpu, rcu_segcblist_n_cbs(&rdp->cblist),
    4326             :                   rcu_segcblist_first_cb(&rdp->cblist));
    4327             : }
    4328             : #endif
    4329             : 
    4330             : /*
    4331             :  * On non-huge systems, use expedited RCU grace periods to make suspend
    4332             :  * and hibernation run faster.
    4333             :  */
    4334             : static int rcu_pm_notify(struct notifier_block *self,
    4335             :                          unsigned long action, void *hcpu)
    4336             : {
    4337             :         switch (action) {
    4338             :         case PM_HIBERNATION_PREPARE:
    4339             :         case PM_SUSPEND_PREPARE:
    4340             :                 rcu_expedite_gp();
    4341             :                 break;
    4342             :         case PM_POST_HIBERNATION:
    4343             :         case PM_POST_SUSPEND:
    4344             :                 rcu_unexpedite_gp();
    4345             :                 break;
    4346             :         default:
    4347             :                 break;
    4348             :         }
    4349             :         return NOTIFY_OK;
    4350             : }
    4351             : 
    4352             : /*
    4353             :  * Spawn the kthreads that handle RCU's grace periods.
    4354             :  */
    4355           1 : static int __init rcu_spawn_gp_kthread(void)
    4356             : {
    4357           1 :         unsigned long flags;
    4358           1 :         int kthread_prio_in = kthread_prio;
    4359           1 :         struct rcu_node *rnp;
    4360           1 :         struct sched_param sp;
    4361           1 :         struct task_struct *t;
    4362             : 
    4363             :         /* Force priority into range. */
    4364           1 :         if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2
    4365             :             && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST))
    4366             :                 kthread_prio = 2;
    4367           1 :         else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
    4368             :                 kthread_prio = 1;
    4369           1 :         else if (kthread_prio < 0)
    4370           0 :                 kthread_prio = 0;
    4371           1 :         else if (kthread_prio > 99)
    4372           0 :                 kthread_prio = 99;
    4373             : 
    4374           1 :         if (kthread_prio != kthread_prio_in)
    4375           0 :                 pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n",
    4376             :                          kthread_prio, kthread_prio_in);
    4377             : 
    4378           1 :         rcu_scheduler_fully_active = 1;
    4379           1 :         t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
    4380           1 :         if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__))
    4381             :                 return 0;
    4382           1 :         if (kthread_prio) {
    4383           0 :                 sp.sched_priority = kthread_prio;
    4384           0 :                 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
    4385             :         }
    4386           1 :         rnp = rcu_get_root();
    4387           1 :         raw_spin_lock_irqsave_rcu_node(rnp, flags);
    4388           1 :         WRITE_ONCE(rcu_state.gp_activity, jiffies);
    4389           1 :         WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
    4390             :         // Reset .gp_activity and .gp_req_activity before setting .gp_kthread.
    4391           1 :         smp_store_release(&rcu_state.gp_kthread, t);  /* ^^^ */
    4392           2 :         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
    4393           1 :         wake_up_process(t);
    4394           1 :         rcu_spawn_nocb_kthreads();
    4395           1 :         rcu_spawn_boost_kthreads();
    4396           1 :         return 0;
    4397             : }
    4398             : early_initcall(rcu_spawn_gp_kthread);
    4399             : 
    4400             : /*
    4401             :  * This function is invoked towards the end of the scheduler's
    4402             :  * initialization process.  Before this is called, the idle task might
    4403             :  * contain synchronous grace-period primitives (during which time, this idle
    4404             :  * task is booting the system, and such primitives are no-ops).  After this
    4405             :  * function is called, any synchronous grace-period primitives are run as
    4406             :  * expedited, with the requesting task driving the grace period forward.
    4407             :  * A later core_initcall() rcu_set_runtime_mode() will switch to full
    4408             :  * runtime RCU functionality.
    4409             :  */
    4410           1 : void rcu_scheduler_starting(void)
    4411             : {
    4412           1 :         WARN_ON(num_online_cpus() != 1);
    4413           1 :         WARN_ON(nr_context_switches() > 0);
    4414           1 :         rcu_test_sync_prims();
    4415           1 :         rcu_scheduler_active = RCU_SCHEDULER_INIT;
    4416           1 :         rcu_test_sync_prims();
    4417           1 : }
    4418             : 
    4419             : /*
    4420             :  * Helper function for rcu_init() that initializes the rcu_state structure.
    4421             :  */
    4422           1 : static void __init rcu_init_one(void)
    4423             : {
    4424           1 :         static const char * const buf[] = RCU_NODE_NAME_INIT;
    4425           1 :         static const char * const fqs[] = RCU_FQS_NAME_INIT;
    4426           1 :         static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
    4427           1 :         static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
    4428             : 
    4429           1 :         int levelspread[RCU_NUM_LVLS];          /* kids/node in each level. */
    4430           1 :         int cpustride = 1;
    4431           1 :         int i;
    4432           1 :         int j;
    4433           1 :         struct rcu_node *rnp;
    4434             : 
    4435           1 :         BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf));  /* Fix buf[] init! */
    4436             : 
    4437             :         /* Silence gcc 4.8 false positive about array index out of range. */
    4438           1 :         if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS)
    4439           0 :                 panic("rcu_init_one: rcu_num_lvls out of range");
    4440             : 
    4441             :         /* Initialize the level-tracking arrays. */
    4442             : 
    4443           1 :         for (i = 1; i < rcu_num_lvls; i++)
    4444             :                 rcu_state.level[i] =
    4445             :                         rcu_state.level[i - 1] + num_rcu_lvl[i - 1];
    4446           1 :         rcu_init_levelspread(levelspread, num_rcu_lvl);
    4447             : 
    4448             :         /* Initialize the elements themselves, starting from the leaves. */
    4449             : 
    4450           2 :         for (i = rcu_num_lvls - 1; i >= 0; i--) {
    4451           1 :                 cpustride *= levelspread[i];
    4452           1 :                 rnp = rcu_state.level[i];
    4453           2 :                 for (j = 0; j < num_rcu_lvl[i]; j++, rnp++) {
    4454           1 :                         raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock));
    4455           1 :                         lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock),
    4456             :                                                    &rcu_node_class[i], buf[i]);
    4457           1 :                         raw_spin_lock_init(&rnp->fqslock);
    4458           1 :                         lockdep_set_class_and_name(&rnp->fqslock,
    4459             :                                                    &rcu_fqs_class[i], fqs[i]);
    4460           1 :                         rnp->gp_seq = rcu_state.gp_seq;
    4461           1 :                         rnp->gp_seq_needed = rcu_state.gp_seq;
    4462           1 :                         rnp->completedqs = rcu_state.gp_seq;
    4463           1 :                         rnp->qsmask = 0;
    4464           1 :                         rnp->qsmaskinit = 0;
    4465           1 :                         rnp->grplo = j * cpustride;
    4466           1 :                         rnp->grphi = (j + 1) * cpustride - 1;
    4467           1 :                         if (rnp->grphi >= nr_cpu_ids)
    4468           0 :                                 rnp->grphi = nr_cpu_ids - 1;
    4469           1 :                         if (i == 0) {
    4470           1 :                                 rnp->grpnum = 0;
    4471           1 :                                 rnp->grpmask = 0;
    4472           1 :                                 rnp->parent = NULL;
    4473             :                         } else {
    4474           0 :                                 rnp->grpnum = j % levelspread[i - 1];
    4475           0 :                                 rnp->grpmask = BIT(rnp->grpnum);
    4476           0 :                                 rnp->parent = rcu_state.level[i - 1] +
    4477           0 :                                               j / levelspread[i - 1];
    4478             :                         }
    4479           1 :                         rnp->level = i;
    4480           1 :                         INIT_LIST_HEAD(&rnp->blkd_tasks);
    4481           1 :                         rcu_init_one_nocb(rnp);
    4482           1 :                         init_waitqueue_head(&rnp->exp_wq[0]);
    4483           1 :                         init_waitqueue_head(&rnp->exp_wq[1]);
    4484           1 :                         init_waitqueue_head(&rnp->exp_wq[2]);
    4485           1 :                         init_waitqueue_head(&rnp->exp_wq[3]);
    4486           1 :                         spin_lock_init(&rnp->exp_lock);
    4487             :                 }
    4488             :         }
    4489             : 
    4490           1 :         init_swait_queue_head(&rcu_state.gp_wq);
    4491           1 :         init_swait_queue_head(&rcu_state.expedited_wq);
    4492           1 :         rnp = rcu_first_leaf_node();
    4493           5 :         for_each_possible_cpu(i) {
    4494           4 :                 while (i > rnp->grphi)
    4495           0 :                         rnp++;
    4496           4 :                 per_cpu_ptr(&rcu_data, i)->mynode = rnp;
    4497           4 :                 rcu_boot_init_percpu_data(i);
    4498             :         }
    4499           1 : }
    4500             : 
    4501             : /*
    4502             :  * Compute the rcu_node tree geometry from kernel parameters.  This cannot
    4503             :  * replace the definitions in tree.h because those are needed to size
    4504             :  * the ->node array in the rcu_state structure.
    4505             :  */
    4506           1 : static void __init rcu_init_geometry(void)
    4507             : {
    4508           1 :         ulong d;
    4509           1 :         int i;
    4510           1 :         int rcu_capacity[RCU_NUM_LVLS];
    4511             : 
    4512             :         /*
    4513             :          * Initialize any unspecified boot parameters.
    4514             :          * The default values of jiffies_till_first_fqs and
    4515             :          * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS
    4516             :          * value, which is a function of HZ, plus one for each
    4517             :          * RCU_JIFFIES_FQS_DIV CPUs that might be on the system.
    4518             :          */
    4519           1 :         d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
    4520           1 :         if (jiffies_till_first_fqs == ULONG_MAX)
    4521           1 :                 jiffies_till_first_fqs = d;
    4522           1 :         if (jiffies_till_next_fqs == ULONG_MAX)
    4523           1 :                 jiffies_till_next_fqs = d;
    4524           1 :         adjust_jiffies_till_sched_qs();
    4525             : 
    4526             :         /* If the compile-time values are accurate, just leave. */
    4527           1 :         if (rcu_fanout_leaf == RCU_FANOUT_LEAF &&
    4528           1 :             nr_cpu_ids == NR_CPUS)
    4529             :                 return;
    4530           1 :         pr_info("Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n",
    4531             :                 rcu_fanout_leaf, nr_cpu_ids);
    4532             : 
    4533             :         /*
    4534             :          * The boot-time rcu_fanout_leaf parameter must be at least two
    4535             :          * and cannot exceed the number of bits in the rcu_node masks.
    4536             :          * Complain and fall back to the compile-time values if this
    4537             :          * limit is exceeded.
    4538             :          */
    4539           1 :         if (rcu_fanout_leaf < 2 ||
    4540             :             rcu_fanout_leaf > sizeof(unsigned long) * 8) {
    4541           0 :                 rcu_fanout_leaf = RCU_FANOUT_LEAF;
    4542           0 :                 WARN_ON(1);
    4543           0 :                 return;
    4544             :         }
    4545             : 
    4546             :         /*
    4547             :  * Compute the number of nodes that can be handled by an rcu_node tree
    4548             :          * with the given number of levels.
    4549             :          */
    4550           1 :         rcu_capacity[0] = rcu_fanout_leaf;
    4551           1 :         for (i = 1; i < RCU_NUM_LVLS; i++)
    4552             :                 rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT;
    4553             : 
    4554             :         /*
    4555             :          * The tree must be able to accommodate the configured number of CPUs.
    4556             :          * If this limit is exceeded, fall back to the compile-time values.
    4557             :          */
    4558           1 :         if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) {
    4559           0 :                 rcu_fanout_leaf = RCU_FANOUT_LEAF;
    4560           0 :                 WARN_ON(1);
    4561           0 :                 return;
    4562             :         }
    4563             : 
    4564             :         /* Calculate the number of levels in the tree. */
    4565           1 :         for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) {
    4566             :         }
    4567           1 :         rcu_num_lvls = i + 1;
    4568             : 
    4569             :         /* Calculate the number of rcu_nodes at each level of the tree. */
    4570           2 :         for (i = 0; i < rcu_num_lvls; i++) {
    4571           1 :                 int cap = rcu_capacity[(rcu_num_lvls - 1) - i];
    4572           1 :                 num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap);
    4573             :         }
    4574             : 
    4575             :         /* Calculate the total number of rcu_node structures. */
    4576           1 :         rcu_num_nodes = 0;
    4577           2 :         for (i = 0; i < rcu_num_lvls; i++)
    4578           1 :                 rcu_num_nodes += num_rcu_lvl[i];
    4579             : }
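
As a concrete illustration of the computation above, assuming the common defaults rcu_fanout_leaf = 16 and RCU_FANOUT = 64 on a 4-CPU machine such as the one used for this coverage run (nr_cpu_ids = 4), the geometry works out as follows:

/*
 *   rcu_capacity[0] = rcu_fanout_leaf              = 16
 *   rcu_capacity[1] = rcu_capacity[0] * RCU_FANOUT = 1024
 *   smallest i with nr_cpu_ids <= rcu_capacity[i]  -> i = 0
 *       rcu_num_lvls  = i + 1                      = 1
 *   num_rcu_lvl[0]    = DIV_ROUND_UP(4, 16)        = 1
 *       rcu_num_nodes = 1
 *
 * That single rcu_node is both root and leaf, which matches the
 * one-iteration loops visible in rcu_init_one() above.
 */
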
    4580             : 
    4581             : /*
    4582             :  * Dump out the structure of the rcu_node combining tree associated
    4583             :  * with the rcu_state structure.
    4584             :  */
    4585           0 : static void __init rcu_dump_rcu_node_tree(void)
    4586             : {
    4587           0 :         int level = 0;
    4588           0 :         struct rcu_node *rnp;
    4589             : 
    4590           0 :         pr_info("rcu_node tree layout dump\n");
    4591           0 :         pr_info(" ");
    4592           0 :         rcu_for_each_node_breadth_first(rnp) {
    4593           0 :                 if (rnp->level != level) {
    4594           0 :                         pr_cont("\n");
    4595           0 :                         pr_info(" ");
    4596           0 :                         level = rnp->level;
    4597             :                 }
    4598           0 :                 pr_cont("%d:%d ^%d  ", rnp->grplo, rnp->grphi, rnp->grpnum);
    4599             :         }
    4600           0 :         pr_cont("\n");
    4601           0 : }
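
/*
 * Illustrative output only (this path was not exercised in this run, as
 * the zero hit counts above show): for a hypothetical two-level tree
 * covering 96 CPUs with fanout 16, the dump would look roughly like the
 * following (log prefixes omitted):
 *
 *   rcu_node tree layout dump
 *    0:95 ^0
 *    0:15 ^0  16:31 ^1  32:47 ^2  48:63 ^3  64:79 ^4  80:95 ^5
 *
 * Each entry is grplo:grphi ^grpnum: the CPU range covered by an
 * rcu_node structure and that node's index within its parent.
 */
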
    4602             : 
    4603             : struct workqueue_struct *rcu_gp_wq;
    4604             : struct workqueue_struct *rcu_par_gp_wq;
    4605             : 
    4606           1 : static void __init kfree_rcu_batch_init(void)
    4607             : {
    4608           1 :         int cpu;
    4609           1 :         int i;
    4610             : 
    4611           5 :         for_each_possible_cpu(cpu) {
    4612           4 :                 struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
    4613             : 
    4614          12 :                 for (i = 0; i < KFREE_N_BATCHES; i++) {
    4615           8 :                         INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
    4616           8 :                         krcp->krw_arr[i].krcp = krcp;
    4617             :                 }
    4618             : 
    4619           4 :                 INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
    4620           4 :                 INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
    4621           4 :                 krcp->initialized = true;
    4622             :         }
    4623           1 :         if (register_shrinker(&kfree_rcu_shrinker))
    4624           0 :                 pr_err("Failed to register kfree_rcu() shrinker!\n");
    4625           1 : }
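
/*
 * Caller-side sketch, not part of tree.c: the per-CPU batching state set
 * up in kfree_rcu_batch_init() above is what ultimately services calls
 * like the one below.  "struct foo" and foo_release() are hypothetical
 * names used only for illustration.
 */
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int data;
	struct rcu_head rcu;	/* required by the two-argument kfree_rcu() */
};

static void foo_release(struct foo *fp)
{
	/*
	 * Rather than kfree(fp), queue the object with the kfree_rcu()
	 * batching layer; it is freed only after an RCU grace period,
	 * so readers that obtained the pointer under rcu_read_lock()
	 * can keep using it until they leave their read-side critical
	 * section.
	 */
	kfree_rcu(fp, rcu);
}
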
    4626             : 
    4627           1 : void __init rcu_init(void)
    4628             : {
    4629           1 :         int cpu;
    4630             : 
    4631           1 :         rcu_early_boot_tests();
    4632             : 
    4633           1 :         kfree_rcu_batch_init();
    4634           1 :         rcu_bootup_announce();
    4635           1 :         rcu_init_geometry();
    4636           1 :         rcu_init_one();
    4637           1 :         if (dump_tree)
    4638           0 :                 rcu_dump_rcu_node_tree();
    4639           1 :         if (use_softirq)
    4640           1 :                 open_softirq(RCU_SOFTIRQ, rcu_core_si);
    4641             : 
    4642             :         /*
    4643             :          * We don't need protection against CPU-hotplug here because
    4644             :          * this is called early in boot, before either interrupts
    4645             :          * or the scheduler are operational.
    4646             :          */
    4647             :         pm_notifier(rcu_pm_notify, 0);
    4648           2 :         for_each_online_cpu(cpu) {
    4649           1 :                 rcutree_prepare_cpu(cpu);
    4650           1 :                 rcu_cpu_starting(cpu);
    4651           1 :                 rcutree_online_cpu(cpu);
    4652             :         }
    4653             : 
    4654             :         /* Create workqueue for expedited GPs and for Tree SRCU. */
    4655           1 :         rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
    4656           1 :         WARN_ON(!rcu_gp_wq);
    4657           1 :         rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
    4658           1 :         WARN_ON(!rcu_par_gp_wq);
    4659           1 :         srcu_init();
    4660             : 
    4661             :         /* Fill in default value for rcutree.qovld boot parameter. */
    4662             :         /* -After- the rcu_node ->lock fields are initialized! */
    4663           1 :         if (qovld < 0)
    4664           0 :                 qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark;
    4665             :         else
    4666           1 :                 qovld_calc = qovld;
    4667           1 : }
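
/*
 * Illustrative sketch, not kernel code: the rcutree.qovld fallback at the
 * end of rcu_init() above, rewritten as a tiny userspace program.  The
 * multiplier and the qhimark value used here (2 and 10000) are assumptions
 * matching the usual kernel defaults, not values taken from this report.
 */
#include <stdio.h>

#define QOVLD_MULT	2	/* assumed stand-in for DEFAULT_RCU_QOVLD_MULT */

int main(void)
{
	long qhimark = 10000;	/* assumed default callback high-water mark */
	long qovld = -1;	/* boot parameter left unset */
	long qovld_calc;

	if (qovld < 0)
		qovld_calc = QOVLD_MULT * qhimark;	/* negative means "use the default": 20000 */
	else
		qovld_calc = qovld;

	printf("qovld_calc = %ld\n", qovld_calc);
	return 0;
}
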
    4668             : 
    4669             : #include "tree_stall.h"
    4670             : #include "tree_exp.h"
    4671             : #include "tree_plugin.h"

Generated by: LCOV version 1.14