Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0+
2 : /*
3 : * Read-Copy Update mechanism for mutual exclusion (tree-based version)
4 : *
5 : * Copyright IBM Corporation, 2008
6 : *
7 : * Authors: Dipankar Sarma <dipankar@in.ibm.com>
8 : * Manfred Spraul <manfred@colorfullife.com>
9 : * Paul E. McKenney <paulmck@linux.ibm.com>
10 : *
11 : * Based on the original work by Paul McKenney <paulmck@linux.ibm.com>
12 : * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
13 : *
14 : * For detailed explanation of Read-Copy Update mechanism see -
15 : * Documentation/RCU
16 : */
17 :
18 : #define pr_fmt(fmt) "rcu: " fmt
19 :
20 : #include <linux/types.h>
21 : #include <linux/kernel.h>
22 : #include <linux/init.h>
23 : #include <linux/spinlock.h>
24 : #include <linux/smp.h>
25 : #include <linux/rcupdate_wait.h>
26 : #include <linux/interrupt.h>
27 : #include <linux/sched.h>
28 : #include <linux/sched/debug.h>
29 : #include <linux/nmi.h>
30 : #include <linux/atomic.h>
31 : #include <linux/bitops.h>
32 : #include <linux/export.h>
33 : #include <linux/completion.h>
34 : #include <linux/moduleparam.h>
35 : #include <linux/percpu.h>
36 : #include <linux/notifier.h>
37 : #include <linux/cpu.h>
38 : #include <linux/mutex.h>
39 : #include <linux/time.h>
40 : #include <linux/kernel_stat.h>
41 : #include <linux/wait.h>
42 : #include <linux/kthread.h>
43 : #include <uapi/linux/sched/types.h>
44 : #include <linux/prefetch.h>
45 : #include <linux/delay.h>
46 : #include <linux/random.h>
47 : #include <linux/trace_events.h>
48 : #include <linux/suspend.h>
49 : #include <linux/ftrace.h>
50 : #include <linux/tick.h>
51 : #include <linux/sysrq.h>
52 : #include <linux/kprobes.h>
53 : #include <linux/gfp.h>
54 : #include <linux/oom.h>
55 : #include <linux/smpboot.h>
56 : #include <linux/jiffies.h>
57 : #include <linux/slab.h>
58 : #include <linux/sched/isolation.h>
59 : #include <linux/sched/clock.h>
60 : #include <linux/vmalloc.h>
61 : #include <linux/mm.h>
62 : #include <linux/kasan.h>
63 : #include "../time/tick-internal.h"
64 :
65 : #include "tree.h"
66 : #include "rcu.h"
67 :
68 : #ifdef MODULE_PARAM_PREFIX
69 : #undef MODULE_PARAM_PREFIX
70 : #endif
71 : #define MODULE_PARAM_PREFIX "rcutree."
72 :
73 : /* Data structures. */
74 :
75 : /*
76 : * Steal a bit from the bottom of ->dynticks for idle entry/exit
77 : * control. Initially this is for TLB flushing.
78 : */
79 : #define RCU_DYNTICK_CTRL_MASK 0x1
80 : #define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1)
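Editorial note: the two macros above steal bit 0 of ->dynticks as a special-action flag and advance the counter proper in units of RCU_DYNTICK_CTRL_CTR, so bit 1 effectively toggles on each entry to or exit from an extended quiescent state (EQS). Below is a minimal stand-alone model of that encoding (a sketch, not part of tree.c) mirroring the checks used by rcu_dynticks_eqs_enter()/_exit() and rcu_dynticks_curr_cpu_in_eqs() further down.

#include <stdatomic.h>
#include <stdio.h>

#define CTRL_MASK 0x1                 /* special-action flag (e.g. TLB flush) */
#define CTRL_CTR  (CTRL_MASK + 1)     /* EQS entry/exit increment             */

static atomic_int dynticks = CTRL_CTR;  /* boot value: RCU is watching */

static int in_eqs(void)       /* mirrors rcu_dynticks_curr_cpu_in_eqs() */
{
	return !(atomic_load(&dynticks) & CTRL_CTR);
}

int main(void)
{
	printf("boot     dynticks=%d eqs=%d\n", atomic_load(&dynticks), in_eqs());
	atomic_fetch_add(&dynticks, CTRL_CTR);   /* like rcu_dynticks_eqs_enter() */
	printf("idle     dynticks=%d eqs=%d\n", atomic_load(&dynticks), in_eqs());
	atomic_fetch_add(&dynticks, CTRL_CTR);   /* like rcu_dynticks_eqs_exit()  */
	printf("non-idle dynticks=%d eqs=%d\n", atomic_load(&dynticks), in_eqs());
	return 0;
}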
81 :
82 : static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
83 : .dynticks_nesting = 1,
84 : .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
85 : .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
86 : #ifdef CONFIG_RCU_NOCB_CPU
87 : .cblist.flags = SEGCBLIST_SOFTIRQ_ONLY,
88 : #endif
89 : };
90 : static struct rcu_state rcu_state = {
91 : .level = { &rcu_state.node[0] },
92 : .gp_state = RCU_GP_IDLE,
93 : .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT,
94 : .barrier_mutex = __MUTEX_INITIALIZER(rcu_state.barrier_mutex),
95 : .name = RCU_NAME,
96 : .abbr = RCU_ABBR,
97 : .exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex),
98 : .exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex),
99 : .ofl_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.ofl_lock),
100 : };
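Editorial note: one detail worth calling out in the initializer above is that .gp_seq starts just short of unsigned-long wraparound, presumably so that sequence-wrap handling is exercised within the first few hundred grace periods rather than only after very long uptimes. A quick stand-alone check of the arithmetic, assuming RCU_SEQ_CTR_SHIFT is 2 as in kernel/rcu/rcu.h:

#include <stdio.h>
#include <limits.h>

int main(void)
{
	unsigned long gp_seq = (0UL - 300UL) << 2;   /* same expression as above */

	/* Each completed grace period advances gp_seq by 1 << 2 = 4, so the
	 * counter wraps after roughly 300 grace periods. */
	printf("initial gp_seq = ULONG_MAX - %lu\n", ULONG_MAX - gp_seq);
	return 0;
}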
101 :
102 : /* Dump rcu_node combining tree at boot to verify correct setup. */
103 : static bool dump_tree;
104 : module_param(dump_tree, bool, 0444);
105 : /* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */
106 : static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT);
107 : #ifndef CONFIG_PREEMPT_RT
108 : module_param(use_softirq, bool, 0444);
109 : #endif
110 : /* Control rcu_node-tree auto-balancing at boot time. */
111 : static bool rcu_fanout_exact;
112 : module_param(rcu_fanout_exact, bool, 0444);
113 : /* Increase (but not decrease) the RCU_FANOUT_LEAF at boot time. */
114 : static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
115 : module_param(rcu_fanout_leaf, int, 0444);
116 : int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
117 : /* Number of rcu_nodes at specified level. */
118 : int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
119 : int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
120 :
121 : /*
122 : * The rcu_scheduler_active variable is initialized to the value
123 : * RCU_SCHEDULER_INACTIVE and transitions to RCU_SCHEDULER_INIT just before the
124 : * first task is spawned. So when this variable is RCU_SCHEDULER_INACTIVE,
125 : * RCU can assume that there is but one task, allowing RCU to (for example)
126 : * optimize synchronize_rcu() to a simple barrier(). When this variable
127 : * is RCU_SCHEDULER_INIT, RCU must actually do all the hard work required
128 : * to detect real grace periods. This variable is also used to suppress
129 : * boot-time false positives from lockdep-RCU error checking. Finally, it
130 : * transitions from RCU_SCHEDULER_INIT to RCU_SCHEDULER_RUNNING after RCU
131 : * is fully initialized, including all of its kthreads having been spawned.
132 : */
133 : int rcu_scheduler_active __read_mostly;
134 : EXPORT_SYMBOL_GPL(rcu_scheduler_active);
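Editorial note: as the comment says, while rcu_scheduler_active is RCU_SCHEDULER_INACTIVE there is exactly one task and no preemption, so waiting for pre-existing readers is trivially satisfied. The stand-alone demo below is a sketch of that shortcut only; the kernel's real early-boot check elsewhere in tree.c is more involved.

#include <stdio.h>

enum { SCHEDULER_INACTIVE, SCHEDULER_INIT, SCHEDULER_RUNNING };

static int scheduler_state = SCHEDULER_INACTIVE;

static void synchronize_rcu_demo(void)
{
	if (scheduler_state == SCHEDULER_INACTIVE) {
		/* Single task, no preemption: every reader that could
		 * matter has already finished, so return immediately. */
		return;
	}
	puts("would have to wait for a full grace period here");
}

int main(void)
{
	synchronize_rcu_demo();            /* early boot: returns at once */
	scheduler_state = SCHEDULER_RUNNING;
	synchronize_rcu_demo();            /* later: must really wait     */
	return 0;
}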
135 :
136 : /*
137 : * The rcu_scheduler_fully_active variable transitions from zero to one
138 : * during the early_initcall() processing, which is after the scheduler
139 : * is capable of creating new tasks. So RCU processing (for example,
140 : * creating tasks for RCU priority boosting) must be delayed until after
141 : * rcu_scheduler_fully_active transitions from zero to one. We also
142 : * currently delay invocation of any RCU callbacks until after this point.
143 : *
144 : * It might later prove better for people registering RCU callbacks during
145 : * early boot to take responsibility for these callbacks, but one step at
146 : * a time.
147 : */
148 : static int rcu_scheduler_fully_active __read_mostly;
149 :
150 : static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
151 : unsigned long gps, unsigned long flags);
152 : static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
153 : static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
154 : static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
155 : static void invoke_rcu_core(void);
156 : static void rcu_report_exp_rdp(struct rcu_data *rdp);
157 : static void sync_sched_exp_online_cleanup(int cpu);
158 : static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
159 :
160 : /* rcuc/rcub kthread realtime priority */
161 : static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
162 : module_param(kthread_prio, int, 0444);
163 :
164 : /* Delay in jiffies for grace-period initialization delays, debug only. */
165 :
166 : static int gp_preinit_delay;
167 : module_param(gp_preinit_delay, int, 0444);
168 : static int gp_init_delay;
169 : module_param(gp_init_delay, int, 0444);
170 : static int gp_cleanup_delay;
171 : module_param(gp_cleanup_delay, int, 0444);
172 :
173 : // Add delay to rcu_read_unlock() for strict grace periods.
174 : static int rcu_unlock_delay;
175 : #ifdef CONFIG_RCU_STRICT_GRACE_PERIOD
176 : module_param(rcu_unlock_delay, int, 0444);
177 : #endif
178 :
179 : /*
180 : * This rcu parameter is read-only at run time; it can be
181 : * set only at boot time. It specifies the minimum number
182 : * of objects that may be cached per CPU. Each object is
183 : * one page in size.
184 : */
185 : static int rcu_min_cached_objs = 5;
186 : module_param(rcu_min_cached_objs, int, 0444);
187 :
188 : /* Retrieve RCU kthreads priority for rcutorture */
189 0 : int rcu_get_gp_kthreads_prio(void)
190 : {
191 0 : return kthread_prio;
192 : }
193 : EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio);
194 :
195 : /*
196 : * Number of grace periods between delays, normalized by the duration of
197 : * the delay. The longer the delay, the more the grace periods between
198 : * each delay. The reason for this normalization is that it means that,
199 : * for non-zero delays, the overall slowdown of grace periods is constant
200 : * regardless of the duration of the delay. This arrangement balances
201 : * the need for long delays to increase some race probabilities with the
202 : * need for fast grace periods to increase other race probabilities.
203 : */
204 : #define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays. */
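Editorial note: a quick sanity check of the normalization claim above: if the delay is applied only once every PER_RCU_NODE_PERIOD * delay grace periods, the average extra time per grace period works out to 1/PER_RCU_NODE_PERIOD of a delay unit, independent of how long each individual delay is. Stand-alone arithmetic sketch (not kernel code):

#include <stdio.h>

int main(void)
{
	for (int delay = 1; delay <= 100; delay *= 10) {
		double gps_between = 3.0 * delay;   /* PER_RCU_NODE_PERIOD * delay    */
		double avg = delay / gps_between;   /* jiffies added per grace period */
		printf("delay=%3d jiffies -> average slowdown %.3f jiffies/GP\n",
		       delay, avg);
	}
	return 0;
}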
205 :
206 : /*
207 : * Compute the mask of online CPUs for the specified rcu_node structure.
208 : * This will not be stable unless the rcu_node structure's ->lock is
209 : * held, but the bit corresponding to the current CPU will be stable
210 : * in most contexts.
211 : */
212 34985144 : static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
213 : {
214 34985144 : return READ_ONCE(rnp->qsmaskinitnext);
215 : }
216 :
217 : /*
218 : * Return true if an RCU grace period is in progress. The READ_ONCE()s
219 : * permit this function to be invoked without holding the root rcu_node
220 : * structure's ->lock, but of course results can be subject to change.
221 : */
222 168502 : static int rcu_gp_in_progress(void)
223 : {
224 82037 : return rcu_seq_state(rcu_seq_current(&rcu_state.gp_seq));
225 : }
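Editorial note: rcu_gp_in_progress() relies on the rcu_seq encoding from kernel/rcu/rcu.h, where the low RCU_SEQ_CTR_SHIFT bits of ->gp_seq carry the grace-period phase and the upper bits count completed grace periods, so a nonzero phase means a grace period is in flight. A minimal stand-alone model of that encoding, assuming the shift is 2 as in rcu.h:

#include <stdio.h>

#define SEQ_CTR_SHIFT  2
#define SEQ_STATE_MASK ((1UL << SEQ_CTR_SHIFT) - 1)

static unsigned long seq_state(unsigned long s) { return s & SEQ_STATE_MASK; }
static unsigned long seq_ctr(unsigned long s)   { return s >> SEQ_CTR_SHIFT; }

int main(void)
{
	unsigned long gp_seq = 8;	/* two grace periods done, none running */

	printf("ctr=%lu state=%lu in_progress=%d\n",
	       seq_ctr(gp_seq), seq_state(gp_seq), seq_state(gp_seq) != 0);
	gp_seq++;			/* like rcu_seq_start(): GP now active */
	printf("ctr=%lu state=%lu in_progress=%d\n",
	       seq_ctr(gp_seq), seq_state(gp_seq), seq_state(gp_seq) != 0);
	return 0;
}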
226 :
227 : /*
228 : * Return the number of callbacks queued on the specified CPU.
229 : * Handles both the nocbs and normal cases.
230 : */
231 0 : static long rcu_get_n_cbs_cpu(int cpu)
232 : {
233 0 : struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
234 :
235 0 : if (rcu_segcblist_is_enabled(&rdp->cblist))
236 0 : return rcu_segcblist_n_cbs(&rdp->cblist);
237 : return 0;
238 : }
239 :
240 33078 : void rcu_softirq_qs(void)
241 : {
242 33078 : rcu_qs();
243 33077 : rcu_preempt_deferred_qs(current);
244 33077 : }
245 :
246 : /*
247 : * Record entry into an extended quiescent state. This is only to be
248 : * called when not already in an extended quiescent state, that is,
249 : * RCU is watching prior to the call to this function and is no longer
250 : * watching upon return.
251 : */
252 34960 : static noinstr void rcu_dynticks_eqs_enter(void)
253 : {
254 34960 : struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
255 34966 : int seq;
256 :
257 : /*
258 : * CPUs seeing atomic_add_return() must see prior RCU read-side
259 : * critical sections, and we also must force ordering with the
260 : * next idle sojourn.
261 : */
262 34966 : rcu_dynticks_task_trace_enter(); // Before ->dynticks update!
263 34966 : seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
264 : // RCU is no longer watching. Better be in extended quiescent state!
265 35017 : WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
266 : (seq & RCU_DYNTICK_CTRL_CTR));
267 : /* Better not have special action (TLB flush) pending! */
268 35017 : WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
269 : (seq & RCU_DYNTICK_CTRL_MASK));
270 35017 : }
271 :
272 : /*
273 : * Record exit from an extended quiescent state. This is only to be
274 : * called from an extended quiescent state, that is, RCU is not watching
275 : * prior to the call to this function and is watching upon return.
276 : */
277 34256 : static noinstr void rcu_dynticks_eqs_exit(void)
278 : {
279 34256 : struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
280 34544 : int seq;
281 :
282 : /*
283 : * CPUs seeing atomic_add_return() must see prior idle sojourns,
284 : * and we also must force ordering with the next RCU read-side
285 : * critical section.
286 : */
287 34544 : seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
288 : // RCU is now watching. Better not be in an extended quiescent state!
289 34957 : rcu_dynticks_task_trace_exit(); // After ->dynticks update!
290 34957 : WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
291 : !(seq & RCU_DYNTICK_CTRL_CTR));
292 34957 : if (seq & RCU_DYNTICK_CTRL_MASK) {
293 0 : arch_atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks);
294 34957 : smp_mb__after_atomic(); /* _exit after clearing mask. */
295 : }
296 34957 : }
297 :
298 : /*
299 : * Reset the current CPU's ->dynticks counter to indicate that the
300 : * newly onlined CPU is no longer in an extended quiescent state.
301 : * This will either leave the counter unchanged, or increment it
302 : * to the next non-quiescent value.
303 : *
304 : * The non-atomic test/increment sequence works because the upper bits
305 : * of the ->dynticks counter are manipulated only by the corresponding CPU,
306 : * or when the corresponding CPU is offline.
307 : */
308 4 : static void rcu_dynticks_eqs_online(void)
309 : {
310 4 : struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
311 :
312 4 : if (atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR)
313 : return;
314 0 : atomic_add(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
315 : }
316 :
317 : /*
318 : * Is the current CPU in an extended quiescent state?
319 : *
320 : * No ordering, as we are sampling CPU-local information.
321 : */
322 40582375 : static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void)
323 : {
324 0 : struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
325 :
326 40624511 : return !(arch_atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR);
327 : }
328 :
329 : /*
330 : * Snapshot the ->dynticks counter with full ordering so as to allow
331 : * stable comparison of this counter with past and future snapshots.
332 : */
333 4174 : static int rcu_dynticks_snap(struct rcu_data *rdp)
334 : {
335 4174 : int snap = atomic_add_return(0, &rdp->dynticks);
336 :
337 4174 : return snap & ~RCU_DYNTICK_CTRL_MASK;
338 : }
339 :
340 : /*
341 : * Return true if the snapshot returned from rcu_dynticks_snap()
342 : * indicates that RCU is in an extended quiescent state.
343 : */
344 2866 : static bool rcu_dynticks_in_eqs(int snap)
345 : {
346 483 : return !(snap & RCU_DYNTICK_CTRL_CTR);
347 : }
348 :
349 : /* Return true if the specified CPU is currently idle from an RCU viewpoint. */
350 0 : bool rcu_is_idle_cpu(int cpu)
351 : {
352 0 : struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
353 :
354 0 : return rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp));
355 : }
356 :
357 : /*
358 : * Return true if the CPU corresponding to the specified rcu_data
359 : * structure has spent some time in an extended quiescent state since
360 : * rcu_dynticks_snap() returned the specified snapshot.
361 : */
362 1308 : static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap)
363 : {
364 303 : return snap != rcu_dynticks_snap(rdp);
365 : }
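Editorial note: together, rcu_dynticks_snap(), rcu_dynticks_in_eqs(), and rcu_dynticks_in_eqs_since() let the grace-period kthread credit a CPU with a quiescent state without disturbing it: sample ->dynticks once, then later check whether the CPU either was idle at the sample or has passed through idle since (this is how dyntick_save_progress_counter() and rcu_implicit_dynticks_qs() below use the snapshot). A stand-alone model of the idiom, reusing the two-bit encoding sketched near the RCU_DYNTICK_CTRL_* definitions above:

#include <stdio.h>

#define CTRL_MASK 0x1
#define CTRL_CTR  (CTRL_MASK + 1)

static int snap(int dynticks)                { return dynticks & ~CTRL_MASK; }
static int in_eqs(int s)                     { return !(s & CTRL_CTR); }
static int in_eqs_since(int dynticks, int s) { return snap(dynticks) != s; }

int main(void)
{
	int dynticks = CTRL_CTR;	/* CPU busy when the GP starts */
	int s = snap(dynticks);

	printf("idle at snapshot?    %d\n", in_eqs(s));                  /* 0 */
	dynticks += CTRL_CTR;		/* CPU goes idle...             */
	dynticks += CTRL_CTR;		/* ...and becomes busy again    */
	printf("idle since snapshot? %d\n", in_eqs_since(dynticks, s));  /* 1 */
	return 0;
}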
366 :
367 : /*
368 : * Return true if the referenced integer is zero while the specified
369 : * CPU remains within a single extended quiescent state.
370 : */
371 0 : bool rcu_dynticks_zero_in_eqs(int cpu, int *vp)
372 : {
373 0 : struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
374 0 : int snap;
375 :
376 : // If not quiescent, force back to earlier extended quiescent state.
377 0 : snap = atomic_read(&rdp->dynticks) & ~(RCU_DYNTICK_CTRL_MASK |
378 : RCU_DYNTICK_CTRL_CTR);
379 :
380 0 : smp_rmb(); // Order ->dynticks and *vp reads.
381 0 : if (READ_ONCE(*vp))
382 : return false; // Non-zero, so report failure;
383 0 : smp_rmb(); // Order *vp read and ->dynticks re-read.
384 :
385 : // If still in the same extended quiescent state, we are good!
386 0 : return snap == (atomic_read(&rdp->dynticks) & ~RCU_DYNTICK_CTRL_MASK);
387 : }
388 :
389 : /*
390 : * Set the special (bottom) bit of the specified CPU so that it
391 : * will take special action (such as flushing its TLB) on the
392 : * next exit from an extended quiescent state. Returns true if
393 : * the bit was successfully set, or false if the CPU was not in
394 : * an extended quiescent state.
395 : */
396 0 : bool rcu_eqs_special_set(int cpu)
397 : {
398 0 : int old;
399 0 : int new;
400 0 : int new_old;
401 0 : struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
402 :
403 0 : new_old = atomic_read(&rdp->dynticks);
404 0 : do {
405 0 : old = new_old;
406 0 : if (old & RCU_DYNTICK_CTRL_CTR)
407 : return false;
408 0 : new = old | RCU_DYNTICK_CTRL_MASK;
409 0 : new_old = atomic_cmpxchg(&rdp->dynticks, old, new);
410 0 : } while (new_old != old);
411 : return true;
412 : }
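Editorial note: the cmpxchg loop above latches the special bit only while the counter shows an extended quiescent state, so a caller wanting work done on a remote CPU can try rcu_eqs_special_set() first and fall back to an IPI if the CPU turns out to be busy. The stand-alone model below captures that contract (a sketch using the same two-bit encoding as above; the IPI fallback is only described in the comments):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define CTRL_MASK 0x1
#define CTRL_CTR  (CTRL_MASK + 1)

static bool eqs_special_set(atomic_int *dynticks)
{
	int old = atomic_load(dynticks);

	do {
		if (old & CTRL_CTR)	/* CPU not idle: caller must IPI it. */
			return false;
	} while (!atomic_compare_exchange_weak(dynticks, &old, old | CTRL_MASK));
	return true;			/* Bit latched for the next EQS exit. */
}

int main(void)
{
	atomic_int busy = CTRL_CTR;		/* bit 1 set: CPU is not idle */
	atomic_int idle = 2 * CTRL_CTR;		/* bit 1 clear: CPU is in EQS */

	printf("busy CPU: %d   idle CPU: %d\n",
	       eqs_special_set(&busy), eqs_special_set(&idle));
	return 0;
}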
413 :
414 : /*
415 : * Let the RCU core know that this CPU has gone through the scheduler,
416 : * which is a quiescent state. This is called when the need for a
417 : * quiescent state is urgent, so we burn an atomic operation and full
418 : * memory barriers to let the RCU core know about it, regardless of what
419 : * this CPU might (or might not) do in the near future.
420 : *
421 : * We inform the RCU core by emulating a zero-duration dyntick-idle period.
422 : *
423 : * The caller must have disabled interrupts and must not be idle.
424 : */
425 111 : notrace void rcu_momentary_dyntick_idle(void)
426 : {
427 111 : int special;
428 :
429 111 : raw_cpu_write(rcu_data.rcu_need_heavy_qs, false);
430 359 : special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR,
431 111 : &this_cpu_ptr(&rcu_data)->dynticks);
432 : /* It is illegal to call this from idle state. */
433 128 : WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR));
434 128 : rcu_preempt_deferred_qs(current);
435 128 : }
436 : EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle);
437 :
438 : /**
439 : * rcu_is_cpu_rrupt_from_idle - see if 'interrupted' from idle
440 : *
441 : * If the current CPU is idle and running at a first-level (not nested)
442 : * interrupt, or directly, from idle, return true.
443 : *
444 : * The caller must have at least disabled IRQs.
445 : */
446 53883 : static int rcu_is_cpu_rrupt_from_idle(void)
447 : {
448 53883 : long nesting;
449 :
450 : /*
451 : * Usually called from the tick; but also used from smp_call_function()
452 : * for expedited grace periods. This latter can result in running from
453 : * the idle task, instead of an actual IPI.
454 : */
455 108170 : lockdep_assert_irqs_disabled();
456 :
457 : /* Check for counter underflows */
458 54395 : RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0,
459 : "RCU dynticks_nesting counter underflow!");
460 54638 : RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 0,
461 : "RCU dynticks_nmi_nesting counter underflow/zero!");
462 :
463 : /* Are we at first interrupt nesting level? */
464 55133 : nesting = __this_cpu_read(rcu_data.dynticks_nmi_nesting);
465 55133 : if (nesting > 1)
466 : return false;
467 :
468 : /*
469 : * If we're not in an interrupt, we must be in the idle task!
470 : */
471 23053 : WARN_ON_ONCE(!nesting && !is_idle_task(current));
472 :
473 : /* Does CPU appear to be idle from an RCU standpoint? */
474 23053 : return __this_cpu_read(rcu_data.dynticks_nesting) == 0;
475 : }
476 :
477 : #define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10)
478 : // Maximum callbacks per rcu_do_batch ...
479 : #define DEFAULT_MAX_RCU_BLIMIT 10000 // ... even during callback flood.
480 : static long blimit = DEFAULT_RCU_BLIMIT;
481 : #define DEFAULT_RCU_QHIMARK 10000 // If this many pending, ignore blimit.
482 : static long qhimark = DEFAULT_RCU_QHIMARK;
483 : #define DEFAULT_RCU_QLOMARK 100 // Once only this many pending, use blimit.
484 : static long qlowmark = DEFAULT_RCU_QLOMARK;
485 : #define DEFAULT_RCU_QOVLD_MULT 2
486 : #define DEFAULT_RCU_QOVLD (DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK)
487 : static long qovld = DEFAULT_RCU_QOVLD; // If this many pending, hammer QS.
488 : static long qovld_calc = -1; // No pre-initialization lock acquisitions!
489 :
490 : module_param(blimit, long, 0444);
491 : module_param(qhimark, long, 0444);
492 : module_param(qlowmark, long, 0444);
493 : module_param(qovld, long, 0444);
494 :
495 : static ulong jiffies_till_first_fqs = IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 0 : ULONG_MAX;
496 : static ulong jiffies_till_next_fqs = ULONG_MAX;
497 : static bool rcu_kick_kthreads;
498 : static int rcu_divisor = 7;
499 : module_param(rcu_divisor, int, 0644);
500 :
501 : /* Force an exit from rcu_do_batch() after 3 milliseconds. */
502 : static long rcu_resched_ns = 3 * NSEC_PER_MSEC;
503 : module_param(rcu_resched_ns, long, 0644);
504 :
505 : /*
506 : * How long the grace period must be before we start recruiting
507 : * quiescent-state help from rcu_note_context_switch().
508 : */
509 : static ulong jiffies_till_sched_qs = ULONG_MAX;
510 : module_param(jiffies_till_sched_qs, ulong, 0444);
511 : static ulong jiffies_to_sched_qs; /* See adjust_jiffies_till_sched_qs(). */
512 : module_param(jiffies_to_sched_qs, ulong, 0444); /* Display only! */
513 :
514 : /*
515 : * Make sure that we give the grace-period kthread time to detect any
516 : * idle CPUs before taking active measures to force quiescent states.
517 : * However, don't go below 100 milliseconds, adjusted upwards for really
518 : * large systems.
519 : */
520 1 : static void adjust_jiffies_till_sched_qs(void)
521 : {
522 1 : unsigned long j;
523 :
524 : /* If jiffies_till_sched_qs was specified, respect the request. */
525 1 : if (jiffies_till_sched_qs != ULONG_MAX) {
526 0 : WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs);
527 0 : return;
528 : }
529 : /* Otherwise, set to third fqs scan, but bound below on large system. */
530 1 : j = READ_ONCE(jiffies_till_first_fqs) +
531 1 : 2 * READ_ONCE(jiffies_till_next_fqs);
532 1 : if (j < HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV)
533 : j = HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
534 1 : pr_info("RCU calculated value of scheduler-enlistment delay is %ld jiffies.\n", j);
535 1 : WRITE_ONCE(jiffies_to_sched_qs, j);
536 : }
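Editorial note: a worked example of the bound computed above, with illustrative values only (HZ = 250, nr_cpu_ids = 8, both fqs intervals at 1 jiffy, and RCU_JIFFIES_FQS_DIV assumed to be 256 as in tree.h): j = 1 + 2*1 = 3, the floor is 250/10 + 8/256 = 25, so jiffies_to_sched_qs ends up at 25. The same arithmetic as a stand-alone snippet:

#include <stdio.h>

int main(void)
{
	unsigned long hz = 250, nr_cpu_ids = 8;
	unsigned long first = 1, next = 1;		/* illustrative fqs intervals */
	unsigned long j = first + 2 * next;		/* 3  */
	unsigned long floor = hz / 10 + nr_cpu_ids / 256;	/* 25 */

	if (j < floor)
		j = floor;
	printf("jiffies_to_sched_qs = %lu\n", j);	/* 25 */
	return 0;
}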
537 :
538 0 : static int param_set_first_fqs_jiffies(const char *val, const struct kernel_param *kp)
539 : {
540 0 : ulong j;
541 0 : int ret = kstrtoul(val, 0, &j);
542 :
543 0 : if (!ret) {
544 0 : WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : j);
545 0 : adjust_jiffies_till_sched_qs();
546 : }
547 0 : return ret;
548 : }
549 :
550 0 : static int param_set_next_fqs_jiffies(const char *val, const struct kernel_param *kp)
551 : {
552 0 : ulong j;
553 0 : int ret = kstrtoul(val, 0, &j);
554 :
555 0 : if (!ret) {
556 0 : WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : (j ?: 1));
557 0 : adjust_jiffies_till_sched_qs();
558 : }
559 0 : return ret;
560 : }
561 :
562 : static const struct kernel_param_ops first_fqs_jiffies_ops = {
563 : .set = param_set_first_fqs_jiffies,
564 : .get = param_get_ulong,
565 : };
566 :
567 : static const struct kernel_param_ops next_fqs_jiffies_ops = {
568 : .set = param_set_next_fqs_jiffies,
569 : .get = param_get_ulong,
570 : };
571 :
572 : module_param_cb(jiffies_till_first_fqs, &first_fqs_jiffies_ops, &jiffies_till_first_fqs, 0644);
573 : module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next_fqs, 0644);
574 : module_param(rcu_kick_kthreads, bool, 0644);
575 :
576 : static void force_qs_rnp(int (*f)(struct rcu_data *rdp));
577 : static int rcu_pending(int user);
578 :
579 : /*
580 : * Return the number of RCU GPs completed thus far for debug & stats.
581 : */
582 0 : unsigned long rcu_get_gp_seq(void)
583 : {
584 0 : return READ_ONCE(rcu_state.gp_seq);
585 : }
586 : EXPORT_SYMBOL_GPL(rcu_get_gp_seq);
587 :
588 : /*
589 : * Return the number of RCU expedited batches completed thus far for
590 : * debug & stats. Odd numbers mean that a batch is in progress, even
591 : * numbers mean idle. The value returned will thus be roughly double
592 : * the cumulative batches since boot.
593 : */
594 0 : unsigned long rcu_exp_batches_completed(void)
595 : {
596 0 : return rcu_state.expedited_sequence;
597 : }
598 : EXPORT_SYMBOL_GPL(rcu_exp_batches_completed);
599 :
600 : /*
601 : * Return the root node of the rcu_state structure.
602 : */
603 82132 : static struct rcu_node *rcu_get_root(void)
604 : {
605 54835 : return &rcu_state.node[0];
606 : }
607 :
608 : /*
609 : * Send along grace-period-related data for rcutorture diagnostics.
610 : */
611 0 : void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
612 : unsigned long *gp_seq)
613 : {
614 0 : switch (test_type) {
615 : case RCU_FLAVOR:
616 0 : *flags = READ_ONCE(rcu_state.gp_flags);
617 0 : *gp_seq = rcu_seq_current(&rcu_state.gp_seq);
618 0 : break;
619 : default:
620 : break;
621 : }
622 0 : }
623 : EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
624 :
625 : /*
626 : * Enter an RCU extended quiescent state, which can be either the
627 : * idle loop or adaptive-tickless usermode execution.
628 : *
629 : * We crowbar the ->dynticks_nmi_nesting field to zero to allow for
630 : * the possibility of usermode upcalls having messed up our count
631 : * of interrupt nesting level during the prior busy period.
632 : */
633 17371 : static noinstr void rcu_eqs_enter(bool user)
634 : {
635 17371 : struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
636 :
637 17371 : WARN_ON_ONCE(rdp->dynticks_nmi_nesting != DYNTICK_IRQ_NONIDLE);
638 17371 : WRITE_ONCE(rdp->dynticks_nmi_nesting, 0);
639 17371 : WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
640 : rdp->dynticks_nesting == 0);
641 17371 : if (rdp->dynticks_nesting != 1) {
642 : // RCU will still be watching, so just do accounting and leave.
643 0 : rdp->dynticks_nesting--;
644 0 : return;
645 : }
646 :
647 34754 : lockdep_assert_irqs_disabled();
648 17382 : instrumentation_begin();
649 17382 : trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
650 17374 : WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
651 17374 : rdp = this_cpu_ptr(&rcu_data);
652 17375 : rcu_prepare_for_idle();
653 17375 : rcu_preempt_deferred_qs(current);
654 :
655 : // instrumentation for the noinstr rcu_dynticks_eqs_enter()
656 17375 : instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
657 :
658 17393 : instrumentation_end();
659 17393 : WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */
660 : // RCU is watching here ...
661 17393 : rcu_dynticks_eqs_enter();
662 : // ... but is no longer watching here.
663 17393 : rcu_dynticks_task_enter();
664 : }
665 :
666 : /**
667 : * rcu_idle_enter - inform RCU that current CPU is entering idle
668 : *
669 : * Enter idle mode, in other words, -leave- the mode in which RCU
670 : * read-side critical sections can occur. (Though RCU read-side
671 : * critical sections can occur in irq handlers in idle, a possibility
672 : * handled by irq_enter() and irq_exit().)
673 : *
674 : * If you add or remove a call to rcu_idle_enter(), be sure to test with
675 : * CONFIG_RCU_EQS_DEBUG=y.
676 : */
677 17379 : void rcu_idle_enter(void)
678 : {
679 34761 : lockdep_assert_irqs_disabled();
680 17389 : rcu_eqs_enter(false);
681 17398 : }
682 : EXPORT_SYMBOL_GPL(rcu_idle_enter);
683 :
684 : #ifdef CONFIG_NO_HZ_FULL
685 :
686 : #if !defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)
687 : /*
688 : * An empty function that will trigger a reschedule on
689 : * IRQ tail once IRQs get re-enabled on userspace/guest resume.
690 : */
691 : static void late_wakeup_func(struct irq_work *work)
692 : {
693 : }
694 :
695 : static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) =
696 : IRQ_WORK_INIT(late_wakeup_func);
697 :
698 : /*
699 : * If either:
700 : *
701 : * 1) the task is about to enter guest mode and $ARCH doesn't support KVM generic work
702 : * 2) the task is about to enter user mode and $ARCH doesn't support generic entry.
703 : *
704 : * In these cases late RCU wakeups aren't supported in the resched loops, and our
705 : * last resort is to fire a local irq_work that will trigger a reschedule once IRQs
706 : * get re-enabled.
707 : */
708 : noinstr static void rcu_irq_work_resched(void)
709 : {
710 : struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
711 :
712 : if (IS_ENABLED(CONFIG_GENERIC_ENTRY) && !(current->flags & PF_VCPU))
713 : return;
714 :
715 : if (IS_ENABLED(CONFIG_KVM_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU))
716 : return;
717 :
718 : instrumentation_begin();
719 : if (do_nocb_deferred_wakeup(rdp) && need_resched()) {
720 : irq_work_queue(this_cpu_ptr(&late_wakeup_work));
721 : }
722 : instrumentation_end();
723 : }
724 :
725 : #else
726 : static inline void rcu_irq_work_resched(void) { }
727 : #endif
728 :
729 : /**
730 : * rcu_user_enter - inform RCU that we are resuming userspace.
731 : *
732 : * Enter RCU idle mode right before resuming userspace. No use of RCU
733 : * is permitted between this call and rcu_user_exit(). This way the
734 : * CPU doesn't need to maintain the tick for RCU maintenance purposes
735 : * when the CPU runs in userspace.
736 : *
737 : * If you add or remove a call to rcu_user_enter(), be sure to test with
738 : * CONFIG_RCU_EQS_DEBUG=y.
739 : */
740 : noinstr void rcu_user_enter(void)
741 : {
742 : lockdep_assert_irqs_disabled();
743 :
744 : /*
745 : * Other than generic entry implementation, we may be past the last
746 : * rescheduling opportunity in the entry code. Trigger a self IPI
747 : * that will fire and reschedule once we resume in user/guest mode.
748 : */
749 : rcu_irq_work_resched();
750 : rcu_eqs_enter(true);
751 : }
752 :
753 : #endif /* CONFIG_NO_HZ_FULL */
754 :
755 : /**
756 : * rcu_nmi_exit - inform RCU of exit from NMI context
757 : *
758 : * If we are returning from the outermost NMI handler that interrupted an
759 : * RCU-idle period, update rdp->dynticks and rdp->dynticks_nmi_nesting
760 : * to let the RCU grace-period handling know that the CPU is back to
761 : * being RCU-idle.
762 : *
763 : * If you add or remove a call to rcu_nmi_exit(), be sure to test
764 : * with CONFIG_RCU_EQS_DEBUG=y.
765 : */
766 19178 : noinstr void rcu_nmi_exit(void)
767 : {
768 19178 : struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
769 :
770 19199 : instrumentation_begin();
771 : /*
772 : * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
773 : * (We are exiting an NMI handler, so RCU better be paying attention
774 : * to us!)
775 : */
776 19199 : WARN_ON_ONCE(rdp->dynticks_nmi_nesting <= 0);
777 19199 : WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());
778 :
779 : /*
780 : * If the nesting level is not 1, the CPU wasn't RCU-idle, so
781 : * leave it in non-RCU-idle state.
782 : */
783 19199 : if (rdp->dynticks_nmi_nesting != 1) {
784 3240 : trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2,
785 1620 : atomic_read(&rdp->dynticks));
786 1620 : WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
787 : rdp->dynticks_nmi_nesting - 2);
788 1620 : instrumentation_end();
789 1620 : return;
790 : }
791 :
792 : /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
793 17579 : trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks));
794 17569 : WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
795 :
796 17569 : if (!in_nmi())
797 : rcu_prepare_for_idle();
798 :
799 : // instrumentation for the noinstr rcu_dynticks_eqs_enter()
800 17569 : instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
801 17577 : instrumentation_end();
802 :
803 : // RCU is watching here ...
804 17577 : rcu_dynticks_eqs_enter();
805 : // ... but is no longer watching here.
806 :
807 17608 : if (!in_nmi())
808 : rcu_dynticks_task_enter();
809 : }
810 :
811 : /**
812 : * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
813 : *
814 : * Exit from an interrupt handler, which might possibly result in entering
815 : * idle mode, in other words, leaving the mode in which read-side critical
816 : * sections can occur. The caller must have disabled interrupts.
817 : *
818 : * This code assumes that the idle loop never does anything that might
819 : * result in unbalanced calls to irq_enter() and irq_exit(). If your
820 : * architecture's idle loop violates this assumption, RCU will give you what
821 : * you deserve, good and hard. But very infrequently and irreproducibly.
822 : *
823 : * Use things like work queues to work around this limitation.
824 : *
825 : * You have been warned.
826 : *
827 : * If you add or remove a call to rcu_irq_exit(), be sure to test with
828 : * CONFIG_RCU_EQS_DEBUG=y.
829 : */
830 19183 : void noinstr rcu_irq_exit(void)
831 : {
832 38388 : lockdep_assert_irqs_disabled();
833 19202 : rcu_nmi_exit();
834 19230 : }
835 :
836 : /**
837 : * rcu_irq_exit_preempt - Inform RCU that current CPU is exiting irq
838 : * towards in-kernel preemption
839 : *
840 : * Same as rcu_irq_exit() but has a sanity check that scheduling is safe
841 : * from RCU point of view. Invoked from return from interrupt before kernel
842 : * preemption.
843 : */
844 0 : void rcu_irq_exit_preempt(void)
845 : {
846 0 : lockdep_assert_irqs_disabled();
847 0 : rcu_nmi_exit();
848 :
849 0 : RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0,
850 : "RCU dynticks_nesting counter underflow/zero!");
851 0 : RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) !=
852 : DYNTICK_IRQ_NONIDLE,
853 : "Bad RCU dynticks_nmi_nesting counter\n");
854 0 : RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
855 : "RCU in extended quiescent state!");
856 0 : }
857 :
858 : #ifdef CONFIG_PROVE_RCU
859 : /**
860 : * rcu_irq_exit_check_preempt - Validate that scheduling is possible
861 : */
862 0 : void rcu_irq_exit_check_preempt(void)
863 : {
864 0 : lockdep_assert_irqs_disabled();
865 :
866 0 : RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0,
867 : "RCU dynticks_nesting counter underflow/zero!");
868 0 : RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) !=
869 : DYNTICK_IRQ_NONIDLE,
870 : "Bad RCU dynticks_nmi_nesting counter\n");
871 0 : RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
872 : "RCU in extended quiescent state!");
873 0 : }
874 : #endif /* #ifdef CONFIG_PROVE_RCU */
875 :
876 : /*
877 : * Wrapper for rcu_irq_exit() where interrupts are enabled.
878 : *
879 : * If you add or remove a call to rcu_irq_exit_irqson(), be sure to test
880 : * with CONFIG_RCU_EQS_DEBUG=y.
881 : */
882 0 : void rcu_irq_exit_irqson(void)
883 : {
884 0 : unsigned long flags;
885 :
886 0 : local_irq_save(flags);
887 0 : rcu_irq_exit();
888 0 : local_irq_restore(flags);
889 0 : }
890 :
891 : /*
892 : * Exit an RCU extended quiescent state, which can be either the
893 : * idle loop or adaptive-tickless usermode execution.
894 : *
895 : * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to
896 : * allow for the possibility of usermode upcalls messing up our count of
897 : * interrupt nesting level during the busy period that is just now starting.
898 : */
899 17360 : static void noinstr rcu_eqs_exit(bool user)
900 : {
901 17360 : struct rcu_data *rdp;
902 17360 : long oldval;
903 :
904 34727 : lockdep_assert_irqs_disabled();
905 17370 : rdp = this_cpu_ptr(&rcu_data);
906 17373 : oldval = rdp->dynticks_nesting;
907 17373 : WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
908 17373 : if (oldval) {
909 : // RCU was already watching, so just do accounting and leave.
910 0 : rdp->dynticks_nesting++;
911 0 : return;
912 : }
913 17373 : rcu_dynticks_task_exit();
914 : // RCU is not watching here ...
915 17373 : rcu_dynticks_eqs_exit();
916 : // ... but is watching here.
917 17397 : instrumentation_begin();
918 :
919 : // instrumentation for the noinstr rcu_dynticks_eqs_exit()
920 17397 : instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
921 :
922 17349 : rcu_cleanup_after_idle();
923 17349 : trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks));
924 17355 : WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
925 17355 : WRITE_ONCE(rdp->dynticks_nesting, 1);
926 17355 : WARN_ON_ONCE(rdp->dynticks_nmi_nesting);
927 17355 : WRITE_ONCE(rdp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE);
928 17355 : instrumentation_end();
929 : }
930 :
931 : /**
932 : * rcu_idle_exit - inform RCU that current CPU is leaving idle
933 : *
934 : * Exit idle mode, in other words, -enter- the mode in which RCU
935 : * read-side critical sections can occur.
936 : *
937 : * If you add or remove a call to rcu_idle_exit(), be sure to test with
938 : * CONFIG_RCU_EQS_DEBUG=y.
939 : */
940 17315 : void rcu_idle_exit(void)
941 : {
942 17315 : unsigned long flags;
943 :
944 34671 : local_irq_save(flags);
945 17356 : rcu_eqs_exit(false);
946 17355 : local_irq_restore(flags);
947 17362 : }
948 : EXPORT_SYMBOL_GPL(rcu_idle_exit);
949 :
950 : #ifdef CONFIG_NO_HZ_FULL
951 : /**
952 : * rcu_user_exit - inform RCU that we are exiting userspace.
953 : *
954 : * Exit RCU idle mode while entering the kernel because it can
955 : * run a RCU read side critical section anytime.
956 : *
957 : * If you add or remove a call to rcu_user_exit(), be sure to test with
958 : * CONFIG_RCU_EQS_DEBUG=y.
959 : */
960 : void noinstr rcu_user_exit(void)
961 : {
962 : rcu_eqs_exit(1);
963 : }
964 :
965 : /**
966 : * __rcu_irq_enter_check_tick - Enable scheduler tick on CPU if RCU needs it.
967 : *
968 : * The scheduler tick is not normally enabled when CPUs enter the kernel
969 : * from nohz_full userspace execution. After all, nohz_full userspace
970 : * execution is an RCU quiescent state and the time executing in the kernel
971 : * is quite short. Except of course when it isn't. And it is not hard to
972 : * cause a large system to spend tens of seconds or even minutes looping
973 : * in the kernel, which can cause a number of problems, including RCU CPU
974 : * stall warnings.
975 : *
976 : * Therefore, if a nohz_full CPU fails to report a quiescent state
977 : * in a timely manner, the RCU grace-period kthread sets that CPU's
978 : * ->rcu_urgent_qs flag with the expectation that the next interrupt or
979 : * exception will invoke this function, which will turn on the scheduler
980 : * tick, which will enable RCU to detect that CPU's quiescent states,
981 : * for example, due to cond_resched() calls in CONFIG_PREEMPT=n kernels.
982 : * The tick will be disabled once a quiescent state is reported for
983 : * this CPU.
984 : *
985 : * Of course, in carefully tuned systems, there might never be an
986 : * interrupt or exception. In that case, the RCU grace-period kthread
987 : * will eventually cause one to happen. However, in less carefully
988 : * controlled environments, this function allows RCU to get what it
989 : * needs without creating otherwise useless interruptions.
990 : */
991 : void __rcu_irq_enter_check_tick(void)
992 : {
993 : struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
994 :
995 : // If we're here from NMI there's nothing to do.
996 : if (in_nmi())
997 : return;
998 :
999 : RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
1000 : "Illegal rcu_irq_enter_check_tick() from extended quiescent state");
1001 :
1002 : if (!tick_nohz_full_cpu(rdp->cpu) ||
1003 : !READ_ONCE(rdp->rcu_urgent_qs) ||
1004 : READ_ONCE(rdp->rcu_forced_tick)) {
1005 : // RCU doesn't need nohz_full help from this CPU, or it is
1006 : // already getting that help.
1007 : return;
1008 : }
1009 :
1010 : // We get here only when not in an extended quiescent state and
1011 : // from interrupts (as opposed to NMIs). Therefore, (1) RCU is
1012 : // already watching and (2) The fact that we are in an interrupt
1013 : // handler and that the rcu_node lock is an irq-disabled lock
1014 : // prevents self-deadlock. So we can safely recheck under the lock.
1015 : // Note that the nohz_full state currently cannot change.
1016 : raw_spin_lock_rcu_node(rdp->mynode);
1017 : if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) {
1018 : // A nohz_full CPU is in the kernel and RCU needs a
1019 : // quiescent state. Turn on the tick!
1020 : WRITE_ONCE(rdp->rcu_forced_tick, true);
1021 : tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
1022 : }
1023 : raw_spin_unlock_rcu_node(rdp->mynode);
1024 : }
1025 : #endif /* CONFIG_NO_HZ_FULL */
1026 :
1027 : /**
1028 : * rcu_nmi_enter - inform RCU of entry to NMI context
1029 : *
1030 : * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and
1031 : * rdp->dynticks_nmi_nesting to let the RCU grace-period handling know
1032 : * that the CPU is active. This implementation permits nested NMIs, as
1033 : * long as the nesting level does not overflow an int. (You will probably
1034 : * run out of stack space first.)
1035 : *
1036 : * If you add or remove a call to rcu_nmi_enter(), be sure to test
1037 : * with CONFIG_RCU_EQS_DEBUG=y.
1038 : */
1039 18865 : noinstr void rcu_nmi_enter(void)
1040 : {
1041 18865 : long incby = 2;
1042 18865 : struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
1043 :
1044 : /* Complain about underflow. */
1045 19083 : WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0);
1046 :
1047 : /*
1048 : * If idle from RCU viewpoint, atomically increment ->dynticks
1049 : * to mark non-idle and increment ->dynticks_nmi_nesting by one.
1050 : * Otherwise, increment ->dynticks_nmi_nesting by two. This means
1051 : * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
1052 : * to be in the outermost NMI handler that interrupted an RCU-idle
1053 : * period (observation due to Andy Lutomirski).
1054 : */
1055 19083 : if (rcu_dynticks_curr_cpu_in_eqs()) {
1056 :
1057 17572 : if (!in_nmi())
1058 : rcu_dynticks_task_exit();
1059 :
1060 : // RCU is not watching here ...
1061 17572 : rcu_dynticks_eqs_exit();
1062 : // ... but is watching here.
1063 :
1064 17516 : if (!in_nmi()) {
1065 : instrumentation_begin();
1066 : rcu_cleanup_after_idle();
1067 17516 : instrumentation_end();
1068 : }
1069 :
1070 17516 : instrumentation_begin();
1071 : // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
1072 17516 : instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks));
1073 : // instrumentation for the noinstr rcu_dynticks_eqs_exit()
1074 17193 : instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
1075 :
1076 17193 : incby = 1;
1077 1620 : } else if (!in_nmi()) {
1078 : instrumentation_begin();
1079 : rcu_irq_enter_check_tick();
1080 : instrumentation_end();
1081 : } else {
1082 19122 : instrumentation_begin();
1083 : }
1084 :
1085 38130 : trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
1086 : rdp->dynticks_nmi_nesting,
1087 19122 : rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks));
1088 19008 : instrumentation_end();
1089 19008 : WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */
1090 : rdp->dynticks_nmi_nesting + incby);
1091 19008 : barrier();
1092 19030 : }
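Editorial note: the incby = 1 / incby = 2 convention above, together with rcu_nmi_exit()'s "subtract 2 unless the value is exactly 1" rule, means ->dynticks_nmi_nesting equals 1 precisely in the outermost handler that interrupted an RCU-idle CPU. A stand-alone trace of that bookkeeping for an interrupt taken from idle with a nested NMI (a simplification that ignores ->dynticks itself and the DYNTICK_IRQ_NONIDLE process-context value):

#include <stdio.h>

static long nesting;	/* 0: CPU is idle from RCU's point of view */

static void nmi_enter(int from_eqs) { nesting += from_eqs ? 1 : 2; }
static void nmi_exit(void)          { nesting = (nesting == 1) ? 0 : nesting - 2; }

int main(void)
{
	printf("idle          : %ld\n", nesting);	/* 0 */
	nmi_enter(1);					/* irq from idle */
	printf("irq from idle : %ld\n", nesting);	/* 1 */
	nmi_enter(0);					/* nested NMI    */
	printf("nested NMI    : %ld\n", nesting);	/* 3 */
	nmi_exit();
	printf("NMI done      : %ld\n", nesting);	/* 1 */
	nmi_exit();
	printf("back to idle  : %ld\n", nesting);	/* 0 */
	return 0;
}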
1093 :
1094 : /**
1095 : * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
1096 : *
1097 : * Enter an interrupt handler, which might possibly result in exiting
1098 : * idle mode, in other words, entering the mode in which read-side critical
1099 : * sections can occur. The caller must have disabled interrupts.
1100 : *
1101 : * Note that the Linux kernel is fully capable of entering an interrupt
1102 : * handler that it never exits, for example when doing upcalls to user mode!
1103 : * This code assumes that the idle loop never does upcalls to user mode.
1104 : * If your architecture's idle loop does do upcalls to user mode (or does
1105 : * anything else that results in unbalanced calls to the irq_enter() and
1106 : * irq_exit() functions), RCU will give you what you deserve, good and hard.
1107 : * But very infrequently and irreproducibly.
1108 : *
1109 : * Use things like work queues to work around this limitation.
1110 : *
1111 : * You have been warned.
1112 : *
1113 : * If you add or remove a call to rcu_irq_enter(), be sure to test with
1114 : * CONFIG_RCU_EQS_DEBUG=y.
1115 : */
1116 18860 : noinstr void rcu_irq_enter(void)
1117 : {
1118 37736 : lockdep_assert_irqs_disabled();
1119 18867 : rcu_nmi_enter();
1120 19031 : }
1121 :
1122 : /*
1123 : * Wrapper for rcu_irq_enter() where interrupts are enabled.
1124 : *
1125 : * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test
1126 : * with CONFIG_RCU_EQS_DEBUG=y.
1127 : */
1128 0 : void rcu_irq_enter_irqson(void)
1129 : {
1130 0 : unsigned long flags;
1131 :
1132 0 : local_irq_save(flags);
1133 0 : rcu_irq_enter();
1134 0 : local_irq_restore(flags);
1135 0 : }
1136 :
1137 : /*
1138 : * If any sort of urgency was applied to the current CPU (for example,
1139 : * the scheduler-clock interrupt was enabled on a nohz_full CPU) in order
1140 : * to get to a quiescent state, disable it.
1141 : */
1142 8072 : static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
1143 : {
1144 16144 : raw_lockdep_assert_held_rcu_node(rdp->mynode);
1145 8072 : WRITE_ONCE(rdp->rcu_urgent_qs, false);
1146 8072 : WRITE_ONCE(rdp->rcu_need_heavy_qs, false);
1147 8072 : if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) {
1148 : tick_dep_clear_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
1149 8072 : WRITE_ONCE(rdp->rcu_forced_tick, false);
1150 : }
1151 8072 : }
1152 :
1153 : /**
1154 : * rcu_is_watching - see if RCU thinks that the current CPU is not idle
1155 : *
1156 : * Return true if RCU is watching the running CPU, which means that this
1157 : * CPU can safely enter RCU read-side critical sections. In other words,
1158 : * if the current CPU is not in its idle loop or is in an interrupt or
1159 : * NMI handler, return true.
1160 : *
1161 : * Make notrace because it can be called by the internal functions of
1162 : * ftrace, and making this notrace removes unnecessary recursion calls.
1163 : */
1164 40529675 : notrace bool rcu_is_watching(void)
1165 : {
1166 40529675 : bool ret;
1167 :
1168 39902398 : preempt_disable_notrace();
1169 40544093 : ret = !rcu_dynticks_curr_cpu_in_eqs();
1170 40586120 : preempt_enable_notrace();
1171 0 : return ret;
1172 : }
1173 : EXPORT_SYMBOL_GPL(rcu_is_watching);
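Editorial note: typical use of rcu_is_watching() is in code that may be reached from the idle loop or early entry paths (tracers, debug hooks), which checks it before opening an RCU read-side critical section, since rcu_read_lock() is illegal while the CPU is in an extended quiescent state. A hypothetical caller sketch (the function name is made up; it assumes the usual declarations from linux/rcupdate.h):

static void trace_hook_sketch(void)
{
	if (!rcu_is_watching())
		return;		/* Called from an EQS: no RCU use allowed. */
	rcu_read_lock();
	/* ... safely dereference RCU-protected data here ... */
	rcu_read_unlock();
}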
1174 :
1175 : /*
1176 : * If a holdout task is actually running, request an urgent quiescent
1177 : * state from its CPU. This is unsynchronized, so migrations can cause
1178 : * the request to go to the wrong CPU. Which is OK, all that will happen
1179 : * is that the CPU's next context switch will be a bit slower and next
1180 : * time around this task will generate another request.
1181 : */
1182 0 : void rcu_request_urgent_qs_task(struct task_struct *t)
1183 : {
1184 0 : int cpu;
1185 :
1186 0 : barrier();
1187 0 : cpu = task_cpu(t);
1188 0 : if (!task_curr(t))
1189 : return; /* This task is not running on that CPU. */
1190 0 : smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true);
1191 : }
1192 :
1193 : #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
1194 :
1195 : /*
1196 : * Is the current CPU online as far as RCU is concerned?
1197 : *
1198 : * Disable preemption to avoid false positives that could otherwise
1199 : * happen due to the current CPU number being sampled, this task being
1200 : * preempted, its old CPU being taken offline, resuming on some other CPU,
1201 : * then determining that its old CPU is now offline.
1202 : *
1203 : * Disable checking if in an NMI handler because we cannot safely
1204 : * report errors from NMI handlers anyway. In addition, it is OK to use
1205 : * RCU on an offline processor during initial boot, hence the check for
1206 : * rcu_scheduler_fully_active.
1207 : */
1208 34980094 : bool rcu_lockdep_current_cpu_online(void)
1209 : {
1210 34980094 : struct rcu_data *rdp;
1211 34980094 : struct rcu_node *rnp;
1212 34980094 : bool ret = false;
1213 :
1214 34980094 : if (in_nmi() || !rcu_scheduler_fully_active)
1215 : return true;
1216 34975720 : preempt_disable_notrace();
1217 34924124 : rdp = this_cpu_ptr(&rcu_data);
1218 34984186 : rnp = rdp->mynode;
1219 34984186 : if (rdp->grpmask & rcu_rnp_online_cpus(rnp) || READ_ONCE(rnp->ofl_seq) & 0x1)
1220 34984186 : ret = true;
1221 34984186 : preempt_enable_notrace();
1222 34988061 : return ret;
1223 : }
1224 : EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
1225 :
1226 : #endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
1227 :
1228 : /*
1229 : * We are reporting a quiescent state on behalf of some other CPU, so
1230 : * it is our responsibility to check for and handle potential overflow
1231 : * of the rcu_node ->gp_seq counter with respect to the rcu_data counters.
1232 : * After all, the CPU might be in deep idle state, and thus executing no
1233 : * code whatsoever.
1234 : */
1235 10453 : static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
1236 : {
1237 20906 : raw_lockdep_assert_held_rcu_node(rnp);
1238 10453 : if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + ULONG_MAX / 4,
1239 : rnp->gp_seq))
1240 0 : WRITE_ONCE(rdp->gpwrap, true);
1241 10453 : if (ULONG_CMP_LT(rdp->rcu_iw_gp_seq + ULONG_MAX / 4, rnp->gp_seq))
1242 0 : rdp->rcu_iw_gp_seq = rnp->gp_seq + ULONG_MAX / 4;
1243 10453 : }
1244 :
1245 : /*
1246 : * Snapshot the specified CPU's dynticks counter so that we can later
1247 : * credit them with an implicit quiescent state. Return 1 if this CPU
1248 : * is in dynticks idle mode, which is an extended quiescent state.
1249 : */
1250 2379 : static int dyntick_save_progress_counter(struct rcu_data *rdp)
1251 : {
1252 2379 : rdp->dynticks_snap = rcu_dynticks_snap(rdp);
1253 2379 : if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
1254 710 : trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
1255 710 : rcu_gpnum_ovf(rdp->mynode, rdp);
1256 710 : return 1;
1257 : }
1258 : return 0;
1259 : }
1260 :
1261 : /*
1262 : * Return true if the specified CPU has passed through a quiescent
1263 : * state by virtue of being in or having passed through a dynticks
1264 : * idle state since the last call to dyntick_save_progress_counter()
1265 : * for this same CPU, or by virtue of having been offline.
1266 : */
1267 1005 : static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
1268 : {
1269 1005 : unsigned long jtsq;
1270 1005 : bool *rnhqp;
1271 1005 : bool *ruqp;
1272 1005 : struct rcu_node *rnp = rdp->mynode;
1273 :
1274 : /*
1275 : * If the CPU passed through or entered a dynticks idle phase with
1276 : * no active irq/NMI handlers, then we can safely pretend that the CPU
1277 : * already acknowledged the request to pass through a quiescent
1278 : * state. Either way, that CPU cannot possibly be in an RCU
1279 : * read-side critical section that started before the beginning
1280 : * of the current RCU grace period.
1281 : */
1282 1005 : if (rcu_dynticks_in_eqs_since(rdp, rdp->dynticks_snap)) {
1283 47 : trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
1284 47 : rcu_gpnum_ovf(rnp, rdp);
1285 47 : return 1;
1286 : }
1287 :
1288 : /*
1289 : * Complain if a CPU that is considered to be offline from RCU's
1290 : * perspective has not yet reported a quiescent state. After all,
1291 : * the offline CPU should have reported a quiescent state during
1292 : * the CPU-offline process, or, failing that, by rcu_gp_init()
1293 : * if it ran concurrently with either the CPU going offline or the
1294 : * last task on a leaf rcu_node structure exiting its RCU read-side
1295 : * critical section while all CPUs corresponding to that structure
1296 : * are offline. This added warning detects bugs in any of these
1297 : * code paths.
1298 : *
1299 : * The rcu_node structure's ->lock is held here, which excludes
1300 : * the relevant portions of the CPU-hotplug code, the grace-period
1301 : * initialization code, and the rcu_read_unlock() code paths.
1302 : *
1303 : * For more detail, please refer to the "Hotplug CPU" section
1304 : * of RCU's Requirements documentation.
1305 : */
1306 958 : if (WARN_ON_ONCE(!(rdp->grpmask & rcu_rnp_online_cpus(rnp)))) {
1307 0 : bool onl;
1308 0 : struct rcu_node *rnp1;
1309 :
1310 0 : pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n",
1311 : __func__, rnp->grplo, rnp->grphi, rnp->level,
1312 : (long)rnp->gp_seq, (long)rnp->completedqs);
1313 0 : for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent)
1314 0 : pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx ->rcu_gp_init_mask %#lx\n",
1315 : __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask);
1316 0 : onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp));
1317 0 : pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n",
1318 : __func__, rdp->cpu, ".o"[onl],
1319 : (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
1320 : (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
1321 0 : return 1; /* Break things loose after complaining. */
1322 : }
1323 :
1324 : /*
1325 : * A CPU running for an extended time within the kernel can
1326 : * delay RCU grace periods: (1) At age jiffies_to_sched_qs,
1327 : * set .rcu_urgent_qs, (2) At age 2*jiffies_to_sched_qs, set
1328 : * both .rcu_need_heavy_qs and .rcu_urgent_qs. Note that the
1329 : * unsynchronized assignments to the per-CPU rcu_need_heavy_qs
1330 : * variable are safe because the assignments are repeated if this
1331 : * CPU failed to pass through a quiescent state. This code
1332 : * also checks .jiffies_resched in case jiffies_to_sched_qs
1333 : * is set way high.
1334 : */
1335 958 : jtsq = READ_ONCE(jiffies_to_sched_qs);
1336 958 : ruqp = per_cpu_ptr(&rcu_data.rcu_urgent_qs, rdp->cpu);
1337 958 : rnhqp = &per_cpu(rcu_data.rcu_need_heavy_qs, rdp->cpu);
1338 958 : if (!READ_ONCE(*rnhqp) &&
1339 958 : (time_after(jiffies, rcu_state.gp_start + jtsq * 2) ||
1340 958 : time_after(jiffies, rcu_state.jiffies_resched) ||
1341 958 : rcu_state.cbovld)) {
1342 0 : WRITE_ONCE(*rnhqp, true);
1343 : /* Store rcu_need_heavy_qs before rcu_urgent_qs. */
1344 0 : smp_store_release(ruqp, true);
1345 958 : } else if (time_after(jiffies, rcu_state.gp_start + jtsq)) {
1346 22 : WRITE_ONCE(*ruqp, true);
1347 : }
1348 :
1349 : /*
1350 : * NO_HZ_FULL CPUs can run in-kernel without rcu_sched_clock_irq!
1351 : * The above code handles this, but only for straight cond_resched().
1352 : * And some in-kernel loops check need_resched() before calling
1353 : * cond_resched(), which defeats the above code for CPUs that are
1354 : * running in-kernel with scheduling-clock interrupts disabled.
1355 : * So hit them over the head with the resched_cpu() hammer!
1356 : */
1357 958 : if (tick_nohz_full_cpu(rdp->cpu) &&
1358 : (time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) ||
1359 : rcu_state.cbovld)) {
1360 : WRITE_ONCE(*ruqp, true);
1361 : resched_cpu(rdp->cpu);
1362 958 : WRITE_ONCE(rdp->last_fqs_resched, jiffies);
1363 : }
1364 :
1365 : /*
1366 : * If more than halfway to RCU CPU stall-warning time, invoke
1367 : * resched_cpu() more frequently to try to loosen things up a bit.
1368 : * Also check to see if the CPU is getting hammered with interrupts,
1369 : * but only once per grace period, just to keep the IPIs down to
1370 : * a dull roar.
1371 : */
1372 958 : if (time_after(jiffies, rcu_state.jiffies_resched)) {
1373 0 : if (time_after(jiffies,
1374 : READ_ONCE(rdp->last_fqs_resched) + jtsq)) {
1375 0 : resched_cpu(rdp->cpu);
1376 0 : WRITE_ONCE(rdp->last_fqs_resched, jiffies);
1377 : }
1378 0 : if (IS_ENABLED(CONFIG_IRQ_WORK) &&
1379 0 : !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq &&
1380 0 : (rnp->ffmask & rdp->grpmask)) {
1381 0 : rdp->rcu_iw_pending = true;
1382 0 : rdp->rcu_iw_gp_seq = rnp->gp_seq;
1383 0 : irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
1384 : }
1385 : }
1386 :
1387 : return 0;
1388 : }
1389 :
1390 : /* Trace-event wrapper function for trace_rcu_future_grace_period. */
1391 26648 : static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1392 : unsigned long gp_seq_req, const char *s)
1393 : {
1394 26648 : trace_rcu_future_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
1395 26648 : gp_seq_req, rnp->level,
1396 : rnp->grplo, rnp->grphi, s);
1397 : }
1398 :
1399 : /*
1400 : * rcu_start_this_gp - Request the start of a particular grace period
1401 : * @rnp_start: The leaf node of the CPU from which to start.
1402 : * @rdp: The rcu_data corresponding to the CPU from which to start.
1403 : * @gp_seq_req: The gp_seq of the grace period to start.
1404 : *
1405 : * Start the specified grace period, as needed to handle newly arrived
1406 : * callbacks. The required future grace periods are recorded in each
1407 : * rcu_node structure's ->gp_seq_needed field. Returns true if there
1408 : * is reason to awaken the grace-period kthread.
1409 : *
1410 : * The caller must hold the specified rcu_node structure's ->lock, which
1411 : * is why the caller is responsible for waking the grace-period kthread.
1412 : *
1413 : * Returns true if the GP kthread needs to be awakened, false otherwise.
1414 : */
1415 12314 : static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp,
1416 : unsigned long gp_seq_req)
1417 : {
1418 12314 : bool ret = false;
1419 12314 : struct rcu_node *rnp;
1420 :
1421 : /*
1422 : * Use funnel locking to either acquire the root rcu_node
1423 : * structure's lock or bail out if the need for this grace period
1424 : * has already been recorded -- or if that grace period has in
1425 : * fact already started. If there is already a grace period in
1426 : * progress in a non-leaf node, no recording is needed because the
1427 : * end of the grace period will scan the leaf rcu_node structures.
1428 : * Note that rnp_start->lock must not be released.
1429 : */
1430 24628 : raw_lockdep_assert_held_rcu_node(rnp_start);
1431 12314 : trace_rcu_this_gp(rnp_start, rdp, gp_seq_req, TPS("Startleaf"));
1432 12314 : for (rnp = rnp_start; 1; rnp = rnp->parent) {
1433 12314 : if (rnp != rnp_start)
1434 0 : raw_spin_lock_rcu_node(rnp);
1435 12314 : if (ULONG_CMP_GE(rnp->gp_seq_needed, gp_seq_req) ||
1436 2021 : rcu_seq_started(&rnp->gp_seq, gp_seq_req) ||
1437 0 : (rnp != rnp_start &&
1438 0 : rcu_seq_state(rcu_seq_current(&rnp->gp_seq)))) {
1439 10293 : trace_rcu_this_gp(rnp, rdp, gp_seq_req,
1440 10293 : TPS("Prestarted"));
1441 10293 : goto unlock_out;
1442 : }
1443 2021 : WRITE_ONCE(rnp->gp_seq_needed, gp_seq_req);
1444 2021 : if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) {
1445 : /*
1446 : * We just marked the leaf or internal node, and a
1447 : * grace period is in progress, which means that
1448 : * rcu_gp_cleanup() will see the marking. Bail to
1449 : * reduce contention.
1450 : */
1451 1993 : trace_rcu_this_gp(rnp_start, rdp, gp_seq_req,
1452 1993 : TPS("Startedleaf"));
1453 1993 : goto unlock_out;
1454 : }
1455 28 : if (rnp != rnp_start && rnp->parent != NULL)
1456 0 : raw_spin_unlock_rcu_node(rnp);
1457 28 : if (!rnp->parent)
1458 : break; /* At root, and perhaps also leaf. */
1459 : }
1460 :
1461 : /* If GP already in progress, just leave, otherwise start one. */
1462 28 : if (rcu_gp_in_progress()) {
1463 12 : trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedleafroot"));
1464 12 : goto unlock_out;
1465 : }
1466 16 : trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedroot"));
1467 16 : WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_INIT);
1468 16 : WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
1469 16 : if (!READ_ONCE(rcu_state.gp_kthread)) {
1470 1 : trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread"));
1471 1 : goto unlock_out;
1472 : }
1473 12314 : trace_rcu_grace_period(rcu_state.name, data_race(rcu_state.gp_seq), TPS("newreq"));
1474 12314 : ret = true; /* Caller must wake GP kthread. */
1475 12314 : unlock_out:
1476 : /* Push furthest requested GP to leaf node and rcu_data structure. */
1477 12314 : if (ULONG_CMP_LT(gp_seq_req, rnp->gp_seq_needed)) {
1478 0 : WRITE_ONCE(rnp_start->gp_seq_needed, rnp->gp_seq_needed);
1479 0 : WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
1480 : }
1481 12314 : if (rnp != rnp_start)
1482 0 : raw_spin_unlock_rcu_node(rnp);
1483 12314 : return ret;
1484 : }
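The funnel-locking walk above is easier to see with the RCU bookkeeping stripped away. The following is a minimal user-space sketch, not kernel code: the node type, the pthread mutexes, and the gp_seq_needed field are stand-ins for the real rcu_node tree, and the wraparound-safe ULONG_CMP_GE() comparison and the grace-period-in-progress checks are deliberately omitted.

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node {
    pthread_mutex_t lock;
    unsigned long gp_seq_needed;    /* Furthest grace period already requested. */
    struct node *parent;            /* NULL at the root. */
};

/*
 * Record a request for grace period "gp", starting at leaf "start", whose
 * lock the caller already holds.  Walk toward the root, bailing out as soon
 * as some earlier request already covers "gp".  Returns true if the caller
 * should wake the grace-period kthread.
 */
static bool request_gp(struct node *start, unsigned long gp)
{
    struct node *np;
    bool ret = false;

    for (np = start; np; np = np->parent) {
        if (np != start)
            pthread_mutex_lock(&np->lock);
        if (np->gp_seq_needed >= gp) {          /* Already requested: bail. */
            if (np != start)
                pthread_mutex_unlock(&np->lock);
            return false;
        }
        np->gp_seq_needed = gp;                 /* Record the request. */
        if (!np->parent) {                      /* Reached the root. */
            ret = true;
            if (np != start)
                pthread_mutex_unlock(&np->lock);
            break;
        }
        if (np != start)                        /* Drop intermediate lock. */
            pthread_mutex_unlock(&np->lock);
    }
    return ret;
}

int main(void)
{
    struct node root = { PTHREAD_MUTEX_INITIALIZER, 0, NULL };
    struct node leaf = { PTHREAD_MUTEX_INITIALIZER, 0, &root };

    pthread_mutex_lock(&leaf.lock);             /* Caller holds the leaf lock. */
    printf("wake kthread: %d\n", request_gp(&leaf, 12));   /* 1 */
    printf("wake kthread: %d\n", request_gp(&leaf, 12));   /* 0: already recorded */
    pthread_mutex_unlock(&leaf.lock);
    return 0;
}

The point of the pattern is that contended requests usually bail out at the leaf, so the root lock is rarely touched.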
1485 :
1486 : /*
1487 : * Clean up any old requests for the just-ended grace period. Also return
1488 : * whether any additional grace periods have been requested.
1489 : */
1490 2019 : static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
1491 : {
1492 2019 : bool needmore;
1493 4038 : struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
1494 :
1495 2019 : needmore = ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed);
1496 2019 : if (!needmore)
1497 15 : rnp->gp_seq_needed = rnp->gp_seq; /* Avoid counter wrap. */
1498 2019 : trace_rcu_this_gp(rnp, rdp, rnp->gp_seq,
1499 2019 : needmore ? TPS("CleanupMore") : TPS("Cleanup"));
1500 2019 : return needmore;
1501 : }
1502 :
1503 : /*
1504 : * Awaken the grace-period kthread. Don't do a self-awaken (unless in an
1505 : * interrupt or softirq handler, in which case we just might immediately
1506 : * sleep upon return, resulting in a grace-period hang), and don't bother
1507 : * awakening when there is nothing for the grace-period kthread to do
1508 : * (as in several CPUs raced to awaken, we lost), and finally don't try
1509 : * (as in several CPUs raced to awaken and we lost), and finally don't try
1510 : * are passed, track some debug information and awaken.
1511 : *
1512 : * So why do the self-wakeup when in an interrupt or softirq handler
1513 : * in the grace-period kthread's context? Because the kthread might have
1514 : * been interrupted just as it was going to sleep, and just after the final
1515 : * pre-sleep check of the awaken condition. In this case, a wakeup really
1516 : * is required, and is therefore supplied.
1517 : */
1518 2034 : static void rcu_gp_kthread_wake(void)
1519 : {
1520 2034 : struct task_struct *t = READ_ONCE(rcu_state.gp_kthread);
1521 :
1522 2034 : if ((current == t && !in_irq() && !in_serving_softirq()) ||
1523 1683 : !READ_ONCE(rcu_state.gp_flags) || !t)
1524 : return;
1525 1683 : WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
1526 1683 : WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq));
1527 1683 : swake_up_one(&rcu_state.gp_wq);
1528 : }
1529 :
1530 : /*
1531 : * If there is room, assign a ->gp_seq number to any callbacks on this
1532 : * CPU that have not already been assigned. Also accelerate any callbacks
1533 : * that were previously assigned a ->gp_seq number that has since proven
1534 : * to be too conservative, which can happen if callbacks get assigned a
1535 : * ->gp_seq number while RCU is idle, but with reference to a non-root
1536 : * rcu_node structure. This function is idempotent, so it does not hurt
1537 : * to call it repeatedly. Returns a flag saying whether we should awaken
1538 : * the RCU grace-period kthread.
1539 : *
1540 : * The caller must hold rnp->lock with interrupts disabled.
1541 : */
1542 18684 : static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
1543 : {
1544 18684 : unsigned long gp_seq_req;
1545 18684 : bool ret = false;
1546 :
1547 18684 : rcu_lockdep_assert_cblist_protected(rdp);
1548 37368 : raw_lockdep_assert_held_rcu_node(rnp);
1549 :
1550 : /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
1551 18684 : if (!rcu_segcblist_pend_cbs(&rdp->cblist))
1552 : return false;
1553 :
1554 16532 : trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPreAcc"));
1555 :
1556 : /*
1557 : * Callbacks are often registered with incomplete grace-period
1558 : * information. Something about the fact that getting exact
1559 : * information requires acquiring a global lock... RCU therefore
1560 : * makes a conservative estimate of the grace period number at which
1561 : * a given callback will become ready to invoke. The following
1562 : * code checks this estimate and improves it when possible, thus
1563 : * accelerating callback invocation to an earlier grace-period
1564 : * number.
1565 : */
1566 16532 : gp_seq_req = rcu_seq_snap(&rcu_state.gp_seq);
1567 16532 : if (rcu_segcblist_accelerate(&rdp->cblist, gp_seq_req))
1568 12314 : ret = rcu_start_this_gp(rnp, rdp, gp_seq_req);
1569 :
1570 : /* Trace depending on how much we were able to accelerate. */
1571 16532 : if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL))
1572 18684 : trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccWaitCB"));
1573 : else
1574 16532 : trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB"));
1575 :
1576 18684 : trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPostAcc"));
1577 :
1578 : return ret;
1579 : }
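Central to the acceleration above is rcu_seq_snap(), which answers "if a callback arrives right now, at which ->gp_seq value is it guaranteed that a full grace period has elapsed?". Below is a stand-alone model of that arithmetic written for illustration; it assumes the usual rcu_seq convention that the two low-order bits hold grace-period state (zero meaning idle) while the remaining bits count grace periods, and it is not copied from the kernel's rcu_seq_*() helpers.

#include <stdio.h>

#define SEQ_STATE_MASK 0x3UL    /* Low two bits: 0 == no GP in progress. */

/* Begin a grace period: the sequence must currently be idle (state == 0). */
static unsigned long seq_start(unsigned long s)
{
    return s + 1;
}

/* End the current grace period: round up to the next idle value. */
static unsigned long seq_end(unsigned long s)
{
    return (s | SEQ_STATE_MASK) + 1;
}

/*
 * Snapshot: the earliest idle sequence value separated from "s" by at
 * least one full grace period.  If a GP is already running, the result
 * also skips past the one after it.
 */
static unsigned long seq_snap(unsigned long s)
{
    return (s + 2 * SEQ_STATE_MASK + 1) & ~SEQ_STATE_MASK;
}

int main(void)
{
    unsigned long gp = 8;                               /* Idle. */

    printf("idle %lu: ready at %lu\n", gp, seq_snap(gp));   /* 12 */
    gp = seq_start(gp);                                 /* 9: GP running. */
    printf("busy %lu: ready at %lu\n", gp, seq_snap(gp));   /* 16 */
    gp = seq_end(gp);                                   /* 12: GP done. */
    printf("done %lu\n", gp);
    return 0;
}

The busy case waits an extra grace period because a callback queued while a grace period is already running may have missed that grace period's start.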
1580 :
1581 : /*
1582 : * Similar to rcu_accelerate_cbs(), but does not require that the leaf
1583 : * rcu_node structure's ->lock be held. It consults the cached value
1584 : * of ->gp_seq_needed in the rcu_data structure, and if that indicates
1585 : * that a new grace-period request must be made, invokes rcu_accelerate_cbs()
1586 : * while holding the leaf rcu_node structure's ->lock.
1587 : */
1588 14 : static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
1589 : struct rcu_data *rdp)
1590 : {
1591 14 : unsigned long c;
1592 14 : bool needwake;
1593 :
1594 14 : rcu_lockdep_assert_cblist_protected(rdp);
1595 14 : c = rcu_seq_snap(&rcu_state.gp_seq);
1596 14 : if (!READ_ONCE(rdp->gpwrap) && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
1597 : /* Old request still live, so mark recent callbacks. */
1598 1 : (void)rcu_segcblist_accelerate(&rdp->cblist, c);
1599 1 : return;
1600 : }
1601 13 : raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
1602 13 : needwake = rcu_accelerate_cbs(rnp, rdp);
1603 26 : raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
1604 13 : if (needwake)
1605 9 : rcu_gp_kthread_wake();
1606 : }
1607 :
1608 : /*
1609 : * Move any callbacks whose grace period has completed to the
1610 : * RCU_DONE_TAIL sublist, then compact the remaining sublists and
1611 : * assign ->gp_seq numbers to any callbacks in the RCU_NEXT_TAIL
1612 : * sublist. This function is idempotent, so it does not hurt to
1613 : * invoke it repeatedly. As long as it is not invoked -too- often...
1614 : * Returns true if the RCU grace-period kthread needs to be awakened.
1615 : *
1616 : * The caller must hold rnp->lock with interrupts disabled.
1617 : */
1618 7622 : static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
1619 : {
1620 7622 : rcu_lockdep_assert_cblist_protected(rdp);
1621 15244 : raw_lockdep_assert_held_rcu_node(rnp);
1622 :
1623 : /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
1624 7622 : if (!rcu_segcblist_pend_cbs(&rdp->cblist))
1625 : return false;
1626 :
1627 : /*
1628 : * Find all callbacks whose ->gp_seq numbers indicate that they
1629 : * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
1630 : */
1631 7267 : rcu_segcblist_advance(&rdp->cblist, rnp->gp_seq);
1632 :
1633 : /* Classify any remaining callbacks. */
1634 7267 : return rcu_accelerate_cbs(rnp, rdp);
1635 : }
1636 :
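The structure that rcu_advance_cbs() and rcu_accelerate_cbs() manipulate is a singly linked callback list cut into sublists (RCU_DONE_TAIL, RCU_WAIT_TAIL, RCU_NEXT_READY_TAIL, RCU_NEXT_TAIL) by tail pointers. The sketch below models only the two bookkeeping operations on a flat array instead of the real rcu_segcblist, with a plain <= comparison in place of the wraparound-safe ULONG_CMP_*() helpers.

#include <stdbool.h>
#include <stdio.h>

#define NCBS 4

struct cb {
    unsigned long gp_seq;   /* GP after which this callback is ready; 0 = unassigned. */
    bool ready;             /* Moved to the "done" segment. */
};

/* Advance: anything whose assigned GP has completed becomes invokable. */
static void advance_cbs(struct cb *cbs, int n, unsigned long completed)
{
    for (int i = 0; i < n; i++)
        if (cbs[i].gp_seq && cbs[i].gp_seq <= completed)
            cbs[i].ready = true;
}

/* Accelerate: give unassigned callbacks the earliest safe future GP number. */
static void accelerate_cbs(struct cb *cbs, int n, unsigned long gp_seq_req)
{
    for (int i = 0; i < n; i++)
        if (!cbs[i].gp_seq)
            cbs[i].gp_seq = gp_seq_req;
}

int main(void)
{
    struct cb cbs[NCBS] = { { 8, false }, { 12, false }, { 0, false }, { 0, false } };

    accelerate_cbs(cbs, NCBS, 12);  /* Newly queued callbacks wait for GP 12. */
    advance_cbs(cbs, NCBS, 8);      /* GP 8 just completed. */
    for (int i = 0; i < NCBS; i++)
        printf("cb%d: gp_seq=%lu ready=%d\n", i, cbs[i].gp_seq, cbs[i].ready);
    return 0;
}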
1637 : /*
1638 : * Move and classify callbacks, but only if doing so won't require
1639 : * that the RCU grace-period kthread be awakened.
1640 : */
1641 : static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp,
1642 : struct rcu_data *rdp)
1643 : {
1644 : rcu_lockdep_assert_cblist_protected(rdp);
1645 : if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) ||
1646 : !raw_spin_trylock_rcu_node(rnp))
1647 : return;
1648 : WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
1649 : raw_spin_unlock_rcu_node(rnp);
1650 : }
1651 :
1652 : /*
1653 : * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, attempt to generate a
1654 : * quiescent state. This is intended to be invoked when the CPU notices
1655 : * a new grace period.
1656 : */
1657 5653 : static void rcu_strict_gp_check_qs(void)
1658 : {
1659 5653 : if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) {
1660 : rcu_read_lock();
1661 : rcu_read_unlock();
1662 : }
1663 : }
1664 :
1665 : /*
1666 : * Update CPU-local rcu_data state to record the beginnings and ends of
1667 : * grace periods. The caller must hold the ->lock of the leaf rcu_node
1668 : * structure corresponding to the current CPU, and must have irqs disabled.
1669 : * Returns true if the grace-period kthread needs to be awakened.
1670 : */
1671 9692 : static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
1672 : {
1673 9692 : bool ret = false;
1674 9692 : bool need_qs;
1675 9692 : const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
1676 :
1677 19384 : raw_lockdep_assert_held_rcu_node(rnp);
1678 :
1679 9692 : if (rdp->gp_seq == rnp->gp_seq)
1680 : return false; /* Nothing to do. */
1681 :
1682 : /* Handle the ends of any preceding grace periods first. */
1683 9692 : if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) ||
1684 2070 : unlikely(READ_ONCE(rdp->gpwrap))) {
1685 7622 : if (!offloaded)
1686 7622 : ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */
1687 7622 : rdp->core_needs_qs = false;
1688 7622 : trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend"));
1689 : } else {
1690 2070 : if (!offloaded)
1691 2070 : ret = rcu_accelerate_cbs(rnp, rdp); /* Recent CBs. */
1692 2070 : if (rdp->core_needs_qs)
1693 0 : rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask);
1694 : }
1695 :
1696 : /* Now handle the beginnings of any new-to-this-CPU grace periods. */
1697 9692 : if (rcu_seq_new_gp(rdp->gp_seq, rnp->gp_seq) ||
1698 2064 : unlikely(READ_ONCE(rdp->gpwrap))) {
1699 : /*
1700 : * If the current grace period is waiting for this CPU,
1701 : * set up to detect a quiescent state, otherwise don't
1702 : * go looking for one.
1703 : */
1704 7628 : trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, TPS("cpustart"));
1705 7628 : need_qs = !!(rnp->qsmask & rdp->grpmask);
1706 7628 : rdp->cpu_no_qs.b.norm = need_qs;
1707 7628 : rdp->core_needs_qs = need_qs;
1708 7628 : zero_cpu_stall_ticks(rdp);
1709 : }
1710 9692 : rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */
1711 9692 : if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap)
1712 6955 : WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
1713 9692 : WRITE_ONCE(rdp->gpwrap, false);
1714 9692 : rcu_gpnum_ovf(rnp, rdp);
1715 9692 : return ret;
1716 : }
1717 :
1718 54025 : static void note_gp_changes(struct rcu_data *rdp)
1719 : {
1720 54025 : unsigned long flags;
1721 54025 : bool needwake;
1722 54025 : struct rcu_node *rnp;
1723 :
1724 108130 : local_irq_save(flags);
1725 54109 : rnp = rdp->mynode;
1726 54109 : if ((rdp->gp_seq == rcu_seq_current(&rnp->gp_seq) &&
1727 47761 : !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */
1728 6348 : !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */
1729 48535 : local_irq_restore(flags);
1730 48562 : return;
1731 : }
1732 5653 : needwake = __note_gp_changes(rnp, rdp);
1733 11306 : raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1734 5653 : rcu_strict_gp_check_qs();
1735 5653 : if (needwake)
1736 6 : rcu_gp_kthread_wake();
1737 : }
1738 :
1739 6059 : static void rcu_gp_slow(int delay)
1740 : {
1741 6059 : if (delay > 0 &&
1742 0 : !(rcu_seq_ctr(rcu_state.gp_seq) %
1743 0 : (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
1744 0 : schedule_timeout_idle(delay);
1745 6059 : }
1746 :
1747 : static unsigned long sleep_duration;
1748 :
1749 : /* Allow rcutorture to stall the grace-period kthread. */
1750 0 : void rcu_gp_set_torture_wait(int duration)
1751 : {
1752 0 : if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST) && duration > 0)
1753 0 : WRITE_ONCE(sleep_duration, duration);
1754 0 : }
1755 : EXPORT_SYMBOL_GPL(rcu_gp_set_torture_wait);
1756 :
1757 : /* Actually implement the aforementioned wait. */
1758 6495 : static void rcu_gp_torture_wait(void)
1759 : {
1760 6495 : unsigned long duration;
1761 :
1762 6495 : if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST))
1763 6495 : return;
1764 : duration = xchg(&sleep_duration, 0UL);
1765 : if (duration > 0) {
1766 : pr_alert("%s: Waiting %lu jiffies\n", __func__, duration);
1767 : schedule_timeout_idle(duration);
1768 : pr_alert("%s: Wait complete\n", __func__);
1769 : }
1770 : }
1771 :
1772 : /*
1773 : * Handler for on_each_cpu() to invoke the target CPU's RCU core
1774 : * processing.
1775 : */
1776 : static void rcu_strict_gp_boundary(void *unused)
1777 : {
1778 : invoke_rcu_core();
1779 : }
1780 :
1781 : /*
1782 : * Initialize a new grace period. Return false if no grace period required.
1783 : */
1784 2020 : static bool rcu_gp_init(void)
1785 : {
1786 2020 : unsigned long firstseq;
1787 2020 : unsigned long flags;
1788 2020 : unsigned long oldmask;
1789 2020 : unsigned long mask;
1790 2020 : struct rcu_data *rdp;
1791 2020 : struct rcu_node *rnp = rcu_get_root();
1792 :
1793 2020 : WRITE_ONCE(rcu_state.gp_activity, jiffies);
1794 2020 : raw_spin_lock_irq_rcu_node(rnp);
1795 2020 : if (!READ_ONCE(rcu_state.gp_flags)) {
1796 : /* Spurious wakeup, tell caller to go back to sleep. */
1797 0 : raw_spin_unlock_irq_rcu_node(rnp);
1798 0 : return false;
1799 : }
1800 2020 : WRITE_ONCE(rcu_state.gp_flags, 0); /* Clear all flags: New GP. */
1801 :
1802 2020 : if (WARN_ON_ONCE(rcu_gp_in_progress())) {
1803 : /*
1804 : * Grace period already in progress, don't start another.
1805 : * Not supposed to be able to happen.
1806 : */
1807 0 : raw_spin_unlock_irq_rcu_node(rnp);
1808 0 : return false;
1809 : }
1810 :
1811 : /* Advance to a new grace period and initialize state. */
1812 2020 : record_gp_stall_check_time();
1813 : /* Record GP times before starting GP, hence rcu_seq_start(). */
1814 2020 : rcu_seq_start(&rcu_state.gp_seq);
1815 2020 : ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
1816 2020 : trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
1817 4040 : raw_spin_unlock_irq_rcu_node(rnp);
1818 :
1819 : /*
1820 : * Apply per-leaf buffered online and offline operations to
1821 : * the rcu_node tree. Note that this new grace period need not
1822 : * wait for subsequent online CPUs, and that RCU hooks in the CPU
1823 : * offlining path, when combined with checks in this function,
1824 : * will handle CPUs that are currently going offline or that will
1825 : * go offline later. Please also refer to "Hotplug CPU" section
1826 : * of RCU's Requirements documentation.
1827 : */
1828 2020 : WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF);
1829 4040 : rcu_for_each_leaf_node(rnp) {
1830 2020 : smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values.
1831 2020 : firstseq = READ_ONCE(rnp->ofl_seq);
1832 2020 : if (firstseq & 0x1)
1833 0 : while (firstseq == READ_ONCE(rnp->ofl_seq))
1834 0 : schedule_timeout_idle(1); // Can't wake unless RCU is watching.
1835 2020 : smp_mb(); // Pair with barriers used when updating ->ofl_seq to even values.
1836 2020 : raw_spin_lock(&rcu_state.ofl_lock);
1837 2020 : raw_spin_lock_irq_rcu_node(rnp);
1838 2020 : if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
1839 2017 : !rnp->wait_blkd_tasks) {
1840 : /* Nothing to do on this leaf rcu_node structure. */
1841 4034 : raw_spin_unlock_irq_rcu_node(rnp);
1842 2017 : raw_spin_unlock(&rcu_state.ofl_lock);
1843 2017 : continue;
1844 : }
1845 :
1846 : /* Record old state, apply changes to ->qsmaskinit field. */
1847 3 : oldmask = rnp->qsmaskinit;
1848 3 : rnp->qsmaskinit = rnp->qsmaskinitnext;
1849 :
1850 : /* If zero-ness of ->qsmaskinit changed, propagate up tree. */
1851 3 : if (!oldmask != !rnp->qsmaskinit) {
1852 1 : if (!oldmask) { /* First online CPU for rcu_node. */
1853 1 : if (!rnp->wait_blkd_tasks) /* Ever offline? */
1854 1 : rcu_init_new_rnp(rnp);
1855 0 : } else if (rcu_preempt_has_tasks(rnp)) {
1856 : rnp->wait_blkd_tasks = true; /* blocked tasks */
1857 : } else { /* Last offline CPU and can propagate. */
1858 0 : rcu_cleanup_dead_rnp(rnp);
1859 : }
1860 : }
1861 :
1862 : /*
1863 : * If all waited-on tasks from prior grace period are
1864 : * done, and if all this rcu_node structure's CPUs are
1865 : * still offline, propagate up the rcu_node tree and
1866 : * clear ->wait_blkd_tasks. Otherwise, if one of this
1867 : * rcu_node structure's CPUs has since come back online,
1868 : * simply clear ->wait_blkd_tasks.
1869 : */
1870 3 : if (rnp->wait_blkd_tasks &&
1871 0 : (!rcu_preempt_has_tasks(rnp) || rnp->qsmaskinit)) {
1872 0 : rnp->wait_blkd_tasks = false;
1873 0 : if (!rnp->qsmaskinit)
1874 0 : rcu_cleanup_dead_rnp(rnp);
1875 : }
1876 :
1877 6 : raw_spin_unlock_irq_rcu_node(rnp);
1878 3 : raw_spin_unlock(&rcu_state.ofl_lock);
1879 : }
1880 2020 : rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */
1881 :
1882 : /*
1883 : * Set the quiescent-state-needed bits in all the rcu_node
1884 : * structures for all currently online CPUs in breadth-first
1885 : * order, starting from the root rcu_node structure, relying on the
1886 : * layout of the tree within the rcu_state.node[] array. Note that
1887 : * other CPUs will access only the leaves of the hierarchy, thus
1888 : * seeing that no grace period is in progress, at least until the
1889 : * corresponding leaf node has been initialized.
1890 : *
1891 : * The grace period cannot complete until the initialization
1892 : * process finishes, because this kthread handles both.
1893 : */
1894 2020 : WRITE_ONCE(rcu_state.gp_state, RCU_GP_INIT);
1895 4040 : rcu_for_each_node_breadth_first(rnp) {
1896 2020 : rcu_gp_slow(gp_init_delay);
1897 2020 : raw_spin_lock_irqsave_rcu_node(rnp, flags);
1898 2020 : rdp = this_cpu_ptr(&rcu_data);
1899 2020 : rcu_preempt_check_blocked_tasks(rnp);
1900 2020 : rnp->qsmask = rnp->qsmaskinit;
1901 2020 : WRITE_ONCE(rnp->gp_seq, rcu_state.gp_seq);
1902 2020 : if (rnp == rdp->mynode)
1903 2020 : (void)__note_gp_changes(rnp, rdp);
1904 2020 : rcu_preempt_boost_start_gp(rnp);
1905 2020 : trace_rcu_grace_period_init(rcu_state.name, rnp->gp_seq,
1906 2020 : rnp->level, rnp->grplo,
1907 : rnp->grphi, rnp->qsmask);
1908 : /* Quiescent states for tasks on any now-offline CPUs. */
1909 2020 : mask = rnp->qsmask & ~rnp->qsmaskinitnext;
1910 2020 : rnp->rcu_gp_init_mask = mask;
1911 2020 : if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp))
1912 0 : rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
1913 : else
1914 4040 : raw_spin_unlock_irq_rcu_node(rnp);
1915 2020 : cond_resched_tasks_rcu_qs();
1916 2020 : WRITE_ONCE(rcu_state.gp_activity, jiffies);
1917 : }
1918 :
1919 : // If strict, make all CPUs aware of new grace period.
1920 : if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
1921 : on_each_cpu(rcu_strict_gp_boundary, NULL, 0);
1922 :
1923 : return true;
1924 : }
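The breadth-first loops in rcu_gp_init() (and later in rcu_gp_cleanup()) work because the rcu_node combining tree is stored in a single array with the root first and the leaves last, so walking the array from front to back visits every parent before its children. A toy model of that layout trick, with made-up node and field names rather than the real rcu_state layout:

#include <stdio.h>

#define NNODES 3        /* node[0] is the root; node[1..2] are leaves. */

struct node {
    unsigned long gp_seq;
    unsigned long qsmask;           /* CPUs that still owe a quiescent state. */
    unsigned long qsmaskinit;       /* CPUs online under this node. */
};

static struct node nodes[NNODES];

/* Visiting parents before children == walking the array front to back. */
#define for_each_node_breadth_first(np) \
    for ((np) = &nodes[0]; (np) < &nodes[NNODES]; (np)++)

static void gp_init(unsigned long new_gp_seq)
{
    struct node *np;

    for_each_node_breadth_first(np) {
        np->qsmask = np->qsmaskinit;
        np->gp_seq = new_gp_seq;
        /*
         * Readers look only at leaves, so they cannot see the new
         * gp_seq before their own leaf (visited last) is set up.
         */
    }
}

int main(void)
{
    nodes[1].qsmaskinit = 0x3;      /* Two CPUs on the first leaf. */
    nodes[2].qsmaskinit = 0x1;      /* One CPU on the second leaf. */
    gp_init(9);
    for (int i = 0; i < NNODES; i++)
        printf("node %d: gp_seq=%lu qsmask=%#lx\n", i, nodes[i].gp_seq, nodes[i].qsmask);
    return 0;
}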
1925 :
1926 : /*
1927 : * Helper function for swait_event_idle_exclusive() wakeup at force-quiescent-state
1928 : * time.
1929 : */
1930 12720 : static bool rcu_gp_fqs_check_wake(int *gfp)
1931 : {
1932 12720 : struct rcu_node *rnp = rcu_get_root();
1933 :
1934 : // If under overload conditions, force an immediate FQS scan.
1935 12720 : if (*gfp & RCU_GP_FLAG_OVLD)
1936 : return true;
1937 :
1938 : // Someone like call_rcu() requested a force-quiescent-state scan.
1939 12720 : *gfp = READ_ONCE(rcu_state.gp_flags);
1940 12720 : if (*gfp & RCU_GP_FLAG_FQS)
1941 : return true;
1942 :
1943 : // The current grace period has completed.
1944 11055 : if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp))
1945 353 : return true;
1946 :
1947 : return false;
1948 : }
1949 :
1950 : /*
1951 : * Do one round of quiescent-state forcing.
1952 : */
1953 2456 : static void rcu_gp_fqs(bool first_time)
1954 : {
1955 2456 : struct rcu_node *rnp = rcu_get_root();
1956 :
1957 2456 : WRITE_ONCE(rcu_state.gp_activity, jiffies);
1958 2456 : rcu_state.n_force_qs++;
1959 2456 : if (first_time) {
1960 : /* Collect dyntick-idle snapshots. */
1961 1510 : force_qs_rnp(dyntick_save_progress_counter);
1962 : } else {
1963 : /* Handle dyntick-idle and offline CPUs. */
1964 946 : force_qs_rnp(rcu_implicit_dynticks_qs);
1965 : }
1966 : /* Clear flag to prevent immediate re-entry. */
1967 2456 : if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) {
1968 352 : raw_spin_lock_irq_rcu_node(rnp);
1969 352 : WRITE_ONCE(rcu_state.gp_flags,
1970 : READ_ONCE(rcu_state.gp_flags) & ~RCU_GP_FLAG_FQS);
1971 704 : raw_spin_unlock_irq_rcu_node(rnp);
1972 : }
1973 2456 : }
1974 :
1975 : /*
1976 : * Loop doing repeated quiescent-state forcing until the grace period ends.
1977 : */
1978 2020 : static void rcu_gp_fqs_loop(void)
1979 : {
1980 2020 : bool first_gp_fqs;
1981 2020 : int gf = 0;
1982 2020 : unsigned long j;
1983 2020 : int ret;
1984 2020 : struct rcu_node *rnp = rcu_get_root();
1985 :
1986 2020 : first_gp_fqs = true;
1987 2020 : j = READ_ONCE(jiffies_till_first_fqs);
1988 2020 : if (rcu_state.cbovld)
1989 0 : gf = RCU_GP_FLAG_OVLD;
1990 : ret = 0;
1991 4476 : for (;;) {
1992 4476 : if (!ret) {
1993 4476 : WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j);
1994 : /*
1995 : * jiffies_force_qs before RCU_GP_WAIT_FQS state
1996 : * update; required for stall checks.
1997 : */
1998 4476 : smp_wmb();
1999 4476 : WRITE_ONCE(rcu_state.jiffies_kick_kthreads,
2000 : jiffies + (j ? 3 * j : 2));
2001 : }
2002 4476 : trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
2003 4476 : TPS("fqswait"));
2004 4476 : WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_FQS);
2005 8596 : ret = swait_event_idle_timeout_exclusive(
2006 : rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j);
2007 4475 : rcu_gp_torture_wait();
2008 4475 : WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS);
2009 : /* Locking provides needed memory barriers. */
2010 : /* If grace period done, leave loop. */
2011 4475 : if (!READ_ONCE(rnp->qsmask) &&
2012 2019 : !rcu_preempt_blocked_readers_cgp(rnp))
2013 : break;
2014 : /* If time for quiescent-state forcing, do it. */
2015 2456 : if (!time_after(rcu_state.jiffies_force_qs, jiffies) ||
2016 0 : (gf & (RCU_GP_FLAG_FQS | RCU_GP_FLAG_OVLD))) {
2017 2456 : trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
2018 2456 : TPS("fqsstart"));
2019 2456 : rcu_gp_fqs(first_gp_fqs);
2020 2456 : gf = 0;
2021 2456 : if (first_gp_fqs) {
2022 1510 : first_gp_fqs = false;
2023 3020 : gf = rcu_state.cbovld ? RCU_GP_FLAG_OVLD : 0;
2024 : }
2025 2456 : trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
2026 2456 : TPS("fqsend"));
2027 2456 : cond_resched_tasks_rcu_qs();
2028 2456 : WRITE_ONCE(rcu_state.gp_activity, jiffies);
2029 2456 : ret = 0; /* Force full wait till next FQS. */
2030 2456 : j = READ_ONCE(jiffies_till_next_fqs);
2031 : } else {
2032 : /* Deal with stray signal. */
2033 0 : cond_resched_tasks_rcu_qs();
2034 0 : WRITE_ONCE(rcu_state.gp_activity, jiffies);
2035 0 : WARN_ON(signal_pending(current));
2036 0 : trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
2037 0 : TPS("fqswaitsig"));
2038 0 : ret = 1; /* Keep old FQS timing. */
2039 0 : j = jiffies;
2040 0 : if (time_after(jiffies, rcu_state.jiffies_force_qs))
2041 : j = 1;
2042 : else
2043 0 : j = rcu_state.jiffies_force_qs - j;
2044 0 : gf = 0;
2045 : }
2046 : }
2047 2019 : }
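Stripped of the RCU specifics, the loop above alternates between "sleep until the grace period ends, an FQS request arrives, or the deadline passes" and "force quiescent states and arm a fresh deadline", with the extra twist that a spurious wakeup re-waits only for the time remaining. A small model of just that deadline bookkeeping, with jiffies replaced by a plain counter and the wait itself left out:

#include <stdbool.h>
#include <stdio.h>

/*
 * After a wait that expired (or after actually forcing quiescent states)
 * a fresh full interval is armed; after a spurious early wakeup only the
 * time remaining until the old deadline is used, with a one-tick minimum.
 */
static unsigned long next_timeout(unsigned long now, unsigned long deadline,
                                  unsigned long full_interval, bool rearm)
{
    if (rearm)
        return full_interval;
    if (now >= deadline)
        return 1;
    return deadline - now;
}

int main(void)
{
    unsigned long now = 100, interval = 300;
    unsigned long deadline = now + interval;

    printf("early wakeup at 150: wait %lu more\n",
           next_timeout(150, deadline, interval, false));   /* 250 */
    printf("after forcing QS at 400: wait %lu\n",
           next_timeout(400, deadline, interval, true));    /* 300 */
    return 0;
}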
2048 :
2049 : /*
2050 : * Clean up after the old grace period.
2051 : */
2052 2019 : static void rcu_gp_cleanup(void)
2053 : {
2054 2019 : int cpu;
2055 2019 : bool needgp = false;
2056 2019 : unsigned long gp_duration;
2057 2019 : unsigned long new_gp_seq;
2058 2019 : bool offloaded;
2059 2019 : struct rcu_data *rdp;
2060 2019 : struct rcu_node *rnp = rcu_get_root();
2061 2019 : struct swait_queue_head *sq;
2062 :
2063 2019 : WRITE_ONCE(rcu_state.gp_activity, jiffies);
2064 2019 : raw_spin_lock_irq_rcu_node(rnp);
2065 2019 : rcu_state.gp_end = jiffies;
2066 2019 : gp_duration = rcu_state.gp_end - rcu_state.gp_start;
2067 2019 : if (gp_duration > rcu_state.gp_max)
2068 6 : rcu_state.gp_max = gp_duration;
2069 :
2070 : /*
2071 : * We know the grace period is complete, but to everyone else
2072 : * it appears to still be ongoing. But it is also the case
2073 : * that to everyone else it looks like there is nothing that
2074 : * they can do to advance the grace period. It is therefore
2075 : * safe for us to drop the lock in order to mark the grace
2076 : * period as completed in all of the rcu_node structures.
2077 : */
2078 4038 : raw_spin_unlock_irq_rcu_node(rnp);
2079 :
2080 : /*
2081 : * Propagate new ->gp_seq value to rcu_node structures so that
2082 : * other CPUs don't have to wait until the start of the next grace
2083 : * period to process their callbacks. This also avoids some nasty
2084 : * RCU grace-period initialization races by forcing the end of
2085 : * the current grace period to be completely recorded in all of
2086 : * the rcu_node structures before the beginning of the next grace
2087 : * period is recorded in any of the rcu_node structures.
2088 : */
2089 2019 : new_gp_seq = rcu_state.gp_seq;
2090 2019 : rcu_seq_end(&new_gp_seq);
2091 6057 : rcu_for_each_node_breadth_first(rnp) {
2092 2019 : raw_spin_lock_irq_rcu_node(rnp);
2093 2019 : if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
2094 2019 : dump_blkd_tasks(rnp, 10);
2095 2019 : WARN_ON_ONCE(rnp->qsmask);
2096 2019 : WRITE_ONCE(rnp->gp_seq, new_gp_seq);
2097 2019 : rdp = this_cpu_ptr(&rcu_data);
2098 2019 : if (rnp == rdp->mynode)
2099 4038 : needgp = __note_gp_changes(rnp, rdp) || needgp;
2100 : /* smp_mb() provided by prior unlock-lock pair. */
2101 4038 : needgp = rcu_future_gp_cleanup(rnp) || needgp;
2102 : // Reset overload indication for CPUs no longer overloaded
2103 2019 : if (rcu_is_leaf_node(rnp))
2104 2019 : for_each_leaf_node_cpu_mask(rnp, cpu, rnp->cbovldmask) {
2105 0 : rdp = per_cpu_ptr(&rcu_data, cpu);
2106 0 : check_cb_ovld_locked(rdp, rnp);
2107 : }
2108 2019 : sq = rcu_nocb_gp_get(rnp);
2109 4038 : raw_spin_unlock_irq_rcu_node(rnp);
2110 2019 : rcu_nocb_gp_cleanup(sq);
2111 2019 : cond_resched_tasks_rcu_qs();
2112 2019 : WRITE_ONCE(rcu_state.gp_activity, jiffies);
2113 2019 : rcu_gp_slow(gp_cleanup_delay);
2114 : }
2115 2019 : rnp = rcu_get_root();
2116 2019 : raw_spin_lock_irq_rcu_node(rnp); /* GP before ->gp_seq update. */
2117 :
2118 : /* Declare grace period done, trace first to use old GP number. */
2119 2019 : trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end"));
2120 2019 : rcu_seq_end(&rcu_state.gp_seq);
2121 2019 : ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
2122 2019 : WRITE_ONCE(rcu_state.gp_state, RCU_GP_IDLE);
2123 : /* Check for GP requests since above loop. */
2124 2019 : rdp = this_cpu_ptr(&rcu_data);
2125 2019 : if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) {
2126 0 : trace_rcu_this_gp(rnp, rdp, rnp->gp_seq_needed,
2127 0 : TPS("CleanupMore"));
2128 0 : needgp = true;
2129 : }
2130 : /* Advance CBs to reduce false positives below. */
2131 2019 : offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
2132 2019 : if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) {
2133 2004 : WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT);
2134 2004 : WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
2135 4023 : trace_rcu_grace_period(rcu_state.name,
2136 : rcu_state.gp_seq,
2137 2004 : TPS("newreq"));
2138 : } else {
2139 15 : WRITE_ONCE(rcu_state.gp_flags,
2140 : rcu_state.gp_flags & RCU_GP_FLAG_INIT);
2141 : }
2142 4038 : raw_spin_unlock_irq_rcu_node(rnp);
2143 :
2144 : // If strict, make all CPUs aware of the end of the old grace period.
2145 2019 : if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
2146 : on_each_cpu(rcu_strict_gp_boundary, NULL, 0);
2147 2019 : }
2148 :
2149 : /*
2150 : * Body of kthread that handles grace periods.
2151 : */
2152 1 : static int __noreturn rcu_gp_kthread(void *unused)
2153 : {
2154 1 : rcu_bind_gp_kthread();
2155 4039 : for (;;) {
2156 :
2157 : /* Handle grace-period start. */
2158 2020 : for (;;) {
2159 2020 : trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
2160 2020 : TPS("reqwait"));
2161 2020 : WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_GPS);
2162 2035 : swait_event_idle_exclusive(rcu_state.gp_wq,
2163 : READ_ONCE(rcu_state.gp_flags) &
2164 : RCU_GP_FLAG_INIT);
2165 2020 : rcu_gp_torture_wait();
2166 2020 : WRITE_ONCE(rcu_state.gp_state, RCU_GP_DONE_GPS);
2167 : /* Locking provides needed memory barrier. */
2168 2020 : if (rcu_gp_init())
2169 : break;
2170 0 : cond_resched_tasks_rcu_qs();
2171 0 : WRITE_ONCE(rcu_state.gp_activity, jiffies);
2172 0 : WARN_ON(signal_pending(current));
2173 2020 : trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
2174 2019 : TPS("reqwaitsig"));
2175 : }
2176 :
2177 : /* Handle quiescent-state forcing. */
2178 2020 : rcu_gp_fqs_loop();
2179 :
2180 : /* Handle grace-period end. */
2181 2019 : WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANUP);
2182 2019 : rcu_gp_cleanup();
2183 2019 : WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANED);
2184 : }
2185 : }
2186 :
2187 : /*
2188 : * Report a full set of quiescent states to the rcu_state data structure.
2189 : * Invoke rcu_gp_kthread_wake() to awaken the grace-period kthread if
2190 : * another grace period is required. Whether we wake the grace-period
2191 : * kthread or it awakens itself for the next round of quiescent-state
2192 : * forcing, that kthread will clean up after the just-completed grace
2193 : * period. Note that the caller must hold rnp->lock, which is released
2194 : * before return.
2195 : */
2196 2019 : static void rcu_report_qs_rsp(unsigned long flags)
2197 : __releases(rcu_get_root()->lock)
2198 : {
2199 4038 : raw_lockdep_assert_held_rcu_node(rcu_get_root());
2200 2019 : WARN_ON_ONCE(!rcu_gp_in_progress());
2201 2019 : WRITE_ONCE(rcu_state.gp_flags,
2202 : READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS);
2203 4038 : raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(), flags);
2204 2019 : rcu_gp_kthread_wake();
2205 2019 : }
2206 :
2207 : /*
2208 : * Similar to rcu_report_qs_rdp(), for which it is a helper function.
2209 : * Allows quiescent states for a group of CPUs to be reported at one go
2210 : * to the specified rcu_node structure, though all the CPUs in the group
2211 : * must be represented by the same rcu_node structure (which need not be a
2212 : * leaf rcu_node structure, though it often will be). The gps parameter
2213 : * is the grace-period snapshot, which means that the quiescent states
2214 : * are valid only if rnp->gp_seq is equal to gps. That structure's lock
2215 : * must be held upon entry, and it is released before return.
2216 : *
2217 : * As a special case, if mask is zero, the bit-already-cleared check is
2218 : * disabled. This allows propagating quiescent state due to resumed tasks
2219 : * during grace-period initialization.
2220 : */
2221 7937 : static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
2222 : unsigned long gps, unsigned long flags)
2223 : __releases(rnp->lock)
2224 : {
2225 7937 : unsigned long oldmask = 0;
2226 7937 : struct rcu_node *rnp_c;
2227 :
2228 15874 : raw_lockdep_assert_held_rcu_node(rnp);
2229 :
2230 : /* Walk up the rcu_node hierarchy. */
2231 7937 : for (;;) {
2232 7937 : if ((!(rnp->qsmask & mask) && mask) || rnp->gp_seq != gps) {
2233 :
2234 : /*
2235 : * Our bit has already been cleared, or the
2236 : * relevant grace period is already over, so done.
2237 : */
2238 0 : raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2239 0 : return;
2240 : }
2241 7937 : WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
2242 7937 : WARN_ON_ONCE(!rcu_is_leaf_node(rnp) &&
2243 : rcu_preempt_blocked_readers_cgp(rnp));
2244 7937 : WRITE_ONCE(rnp->qsmask, rnp->qsmask & ~mask);
2245 7937 : trace_rcu_quiescent_state_report(rcu_state.name, rnp->gp_seq,
2246 : mask, rnp->qsmask, rnp->level,
2247 : rnp->grplo, rnp->grphi,
2248 7937 : !!rnp->gp_tasks);
2249 7937 : if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
2250 :
2251 : /* Other bits still set at this level, so done. */
2252 11836 : raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2253 5918 : return;
2254 : }
2255 2019 : rnp->completedqs = rnp->gp_seq;
2256 2019 : mask = rnp->grpmask;
2257 2019 : if (rnp->parent == NULL) {
2258 :
2259 : /* No more levels. Exit loop holding root lock. */
2260 :
2261 : break;
2262 : }
2263 0 : raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2264 0 : rnp_c = rnp;
2265 0 : rnp = rnp->parent;
2266 0 : raw_spin_lock_irqsave_rcu_node(rnp, flags);
2267 0 : oldmask = READ_ONCE(rnp_c->qsmask);
2268 : }
2269 :
2270 : /*
2271 : * Get here if we are the last CPU to pass through a quiescent
2272 : * state for this grace period. Invoke rcu_report_qs_rsp()
2273 : * to clean up and start the next grace period if one is needed.
2274 : */
2275 2019 : rcu_report_qs_rsp(flags); /* releases rnp->lock. */
2276 : }
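The upward walk in rcu_report_qs_rnp() is the classic hierarchical-bitmask pattern: clear this group's bit in the current node, stop if other bits remain, otherwise clear this node's own bit in its parent, and repeat until the root's mask empties. A single-threaded sketch with the locking, gp_seq checks, and blocked-readers test omitted:

#include <stdbool.h>
#include <stdio.h>

struct node {
    unsigned long qsmask;   /* Children (CPUs or nodes) still owing a QS. */
    unsigned long grpmask;  /* This node's bit in its parent's qsmask. */
    struct node *parent;
};

/*
 * Report that the child identified by "mask" has passed through a
 * quiescent state.  Returns true if this was the last report the
 * grace period was waiting for (the root mask is now empty).
 */
static bool report_qs(struct node *np, unsigned long mask)
{
    for (;;) {
        if (!(np->qsmask & mask))
            return false;           /* Already reported. */
        np->qsmask &= ~mask;
        if (np->qsmask)
            return false;           /* Others still pending here. */
        if (!np->parent)
            return true;            /* Root empty: GP may end. */
        mask = np->grpmask;         /* Propagate one level up. */
        np = np->parent;
    }
}

int main(void)
{
    struct node root = { .qsmask = 0x1, .grpmask = 0 };
    struct node leaf = { .qsmask = 0x3, .grpmask = 0x1, .parent = &root };

    printf("%d\n", report_qs(&leaf, 0x1));  /* 0: CPU 1 still pending. */
    printf("%d\n", report_qs(&leaf, 0x2));  /* 1: last QS, root drained. */
    return 0;
}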
2277 :
2278 : /*
2279 : * Record a quiescent state for all tasks that were previously queued
2280 : * on the specified rcu_node structure and that were blocking the current
2281 : * RCU grace period. The caller must hold the corresponding rnp->lock with
2282 : * irqs disabled, and this lock is released upon return, but irqs remain
2283 : * disabled.
2284 : */
2285 : static void __maybe_unused
2286 : rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
2287 : __releases(rnp->lock)
2288 : {
2289 : unsigned long gps;
2290 : unsigned long mask;
2291 : struct rcu_node *rnp_p;
2292 :
2293 : raw_lockdep_assert_held_rcu_node(rnp);
2294 : if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_RCU)) ||
2295 : WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) ||
2296 : rnp->qsmask != 0) {
2297 : raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2298 : return; /* Still need more quiescent states! */
2299 : }
2300 :
2301 : rnp->completedqs = rnp->gp_seq;
2302 : rnp_p = rnp->parent;
2303 : if (rnp_p == NULL) {
2304 : /*
2305 : * Only one rcu_node structure in the tree, so don't
2306 : * try to report up to its nonexistent parent!
2307 : */
2308 : rcu_report_qs_rsp(flags);
2309 : return;
2310 : }
2311 :
2312 : /* Report up the rest of the hierarchy, tracking current ->gp_seq. */
2313 : gps = rnp->gp_seq;
2314 : mask = rnp->grpmask;
2315 : raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
2316 : raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */
2317 : rcu_report_qs_rnp(mask, rnp_p, gps, flags);
2318 : }
2319 :
2320 : /*
2321 : * Record a quiescent state for the specified CPU to that CPU's rcu_data
2322 : * structure. This must be called from the specified CPU.
2323 : */
2324 : static void
2325 7395 : rcu_report_qs_rdp(struct rcu_data *rdp)
2326 : {
2327 7395 : unsigned long flags;
2328 7395 : unsigned long mask;
2329 7395 : bool needwake = false;
2330 7395 : const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
2331 7395 : struct rcu_node *rnp;
2332 :
2333 7395 : WARN_ON_ONCE(rdp->cpu != smp_processor_id());
2334 7395 : rnp = rdp->mynode;
2335 7395 : raw_spin_lock_irqsave_rcu_node(rnp, flags);
2336 7411 : if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq ||
2337 7393 : rdp->gpwrap) {
2338 :
2339 : /*
2340 : * The grace period in which this quiescent state was
2341 : * recorded has ended, so don't report it upwards.
2342 : * We will instead need a new quiescent state that lies
2343 : * within the current grace period.
2344 : */
2345 18 : rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */
2346 36 : raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2347 18 : return;
2348 : }
2349 7393 : mask = rdp->grpmask;
2350 7393 : rdp->core_needs_qs = false;
2351 7393 : if ((rnp->qsmask & mask) == 0) {
2352 156 : raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2353 : } else {
2354 : /*
2355 : * This GP can't end until cpu checks in, so all of our
2356 : * callbacks can be processed during the next GP.
2357 : */
2358 7315 : if (!offloaded)
2359 7315 : needwake = rcu_accelerate_cbs(rnp, rdp);
2360 :
2361 7315 : rcu_disable_urgency_upon_qs(rdp);
2362 7315 : rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
2363 : /* ^^^ Released rnp->lock */
2364 7315 : if (needwake)
2365 0 : rcu_gp_kthread_wake();
2366 : }
2367 : }
2368 :
2369 : /*
2370 : * Check to see if there is a new grace period of which this CPU
2371 : * is not yet aware, and if so, set up local rcu_data state for it.
2372 : * Otherwise, see if this CPU has just passed through its first
2373 : * quiescent state for this grace period, and record that fact if so.
2374 : */
2375 : static void
2376 54066 : rcu_check_quiescent_state(struct rcu_data *rdp)
2377 : {
2378 : /* Check for grace-period ends and beginnings. */
2379 54066 : note_gp_changes(rdp);
2380 :
2381 : /*
2382 : * Does this CPU still need to do its part for current grace period?
2383 : * If no, return and let the other CPUs do their part as well.
2384 : */
2385 54215 : if (!rdp->core_needs_qs)
2386 : return;
2387 :
2388 : /*
2389 : * Was there a quiescent state since the beginning of the grace
2390 : * period? If no, then exit and wait for the next call.
2391 : */
2392 21228 : if (rdp->cpu_no_qs.b.norm)
2393 : return;
2394 :
2395 : /*
2396 : * Tell RCU we are done (but rcu_report_qs_rdp() will be the
2397 : * judge of that).
2398 : */
2399 7408 : rcu_report_qs_rdp(rdp);
2400 : }
2401 :
2402 : /*
2403 : * Near the end of the offline process. Trace the fact that this CPU
2404 : * is going offline.
2405 : */
2406 0 : int rcutree_dying_cpu(unsigned int cpu)
2407 : {
2408 0 : bool blkd;
2409 0 : struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
2410 0 : struct rcu_node *rnp = rdp->mynode;
2411 :
2412 0 : if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
2413 : return 0;
2414 :
2415 0 : blkd = !!(rnp->qsmask & rdp->grpmask);
2416 0 : trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
2417 0 : blkd ? TPS("cpuofl") : TPS("cpuofl-bgp"));
2418 0 : return 0;
2419 : }
2420 :
2421 : /*
2422 : * All CPUs for the specified rcu_node structure have gone offline,
2423 : * and all tasks that were preempted within an RCU read-side critical
2424 : * section while running on one of those CPUs have since exited their RCU
2425 : * read-side critical section. Some other CPU is reporting this fact with
2426 : * the specified rcu_node structure's ->lock held and interrupts disabled.
2427 : * This function therefore goes up the tree of rcu_node structures,
2428 : * clearing the corresponding bits in the ->qsmaskinit fields. Note that
2429 : * the leaf rcu_node structure's ->qsmaskinit field has already been
2430 : * updated.
2431 : *
2432 : * This function does check that the specified rcu_node structure has
2433 : * all CPUs offline and no blocked tasks, so it is OK to invoke it
2434 : * prematurely. That said, invoking it after the fact will cost you
2435 : * a needless lock acquisition. So once it has done its work, don't
2436 : * invoke it again.
2437 : */
2438 0 : static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
2439 : {
2440 0 : long mask;
2441 0 : struct rcu_node *rnp = rnp_leaf;
2442 :
2443 0 : raw_lockdep_assert_held_rcu_node(rnp_leaf);
2444 0 : if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
2445 0 : WARN_ON_ONCE(rnp_leaf->qsmaskinit) ||
2446 0 : WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf)))
2447 : return;
2448 0 : for (;;) {
2449 0 : mask = rnp->grpmask;
2450 0 : rnp = rnp->parent;
2451 0 : if (!rnp)
2452 : break;
2453 0 : raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
2454 0 : rnp->qsmaskinit &= ~mask;
2455 : /* Between grace periods, so better already be zero! */
2456 0 : WARN_ON_ONCE(rnp->qsmask);
2457 0 : if (rnp->qsmaskinit) {
2458 0 : raw_spin_unlock_rcu_node(rnp);
2459 : /* irqs remain disabled. */
2460 0 : return;
2461 : }
2462 0 : raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
2463 : }
2464 : }
2465 :
2466 : /*
2467 : * The CPU has been completely removed, and some other CPU is reporting
2468 : * this fact from process context. Do the remainder of the cleanup.
2469 : * There can only be one CPU hotplug operation at a time, so no need for
2470 : * explicit locking.
2471 : */
2472 0 : int rcutree_dead_cpu(unsigned int cpu)
2473 : {
2474 0 : struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
2475 0 : struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
2476 :
2477 0 : if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
2478 : return 0;
2479 :
2480 0 : WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
2481 : /* Adjust any no-longer-needed kthreads. */
2482 0 : rcu_boost_kthread_setaffinity(rnp, -1);
2483 : /* Do any needed no-CB deferred wakeups from this CPU. */
2484 0 : do_nocb_deferred_wakeup(per_cpu_ptr(&rcu_data, cpu));
2485 :
2486 : // Stop-machine done, so allow nohz_full to disable tick.
2487 0 : tick_dep_clear(TICK_DEP_BIT_RCU);
2488 0 : return 0;
2489 : }
2490 :
2491 : /*
2492 : * Invoke any RCU callbacks that have made it to the end of their grace
2493 : * period. Throttle as specified by rdp->blimit.
2494 : */
2495 48775 : static void rcu_do_batch(struct rcu_data *rdp)
2496 : {
2497 48775 : int div;
2498 48775 : bool __maybe_unused empty;
2499 48775 : unsigned long flags;
2500 48775 : const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
2501 48775 : struct rcu_head *rhp;
2502 48775 : struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
2503 48775 : long bl, count = 0;
2504 48775 : long pending, tlimit = 0;
2505 :
2506 : /* If no callbacks are ready, just return. */
2507 48775 : if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
2508 0 : trace_rcu_batch_start(rcu_state.name,
2509 : rcu_segcblist_n_cbs(&rdp->cblist), 0);
2510 0 : trace_rcu_batch_end(rcu_state.name, 0,
2511 0 : !rcu_segcblist_empty(&rdp->cblist),
2512 0 : need_resched(), is_idle_task(current),
2513 : rcu_is_callbacks_kthread());
2514 0 : return;
2515 : }
2516 :
2517 : /*
2518 : * Extract the list of ready callbacks, disabling interrupts to prevent
2519 : * races with call_rcu() from interrupt handlers. Leave the
2520 : * callback counts, as rcu_barrier() needs to be conservative.
2521 : */
2522 97553 : local_irq_save(flags);
2523 48767 : rcu_nocb_lock(rdp);
2524 48767 : WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
2525 48776 : pending = rcu_segcblist_n_cbs(&rdp->cblist);
2526 48776 : div = READ_ONCE(rcu_divisor);
2527 48776 : div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div;
2528 48776 : bl = max(rdp->blimit, pending >> div);
2529 48776 : if (unlikely(bl > 100)) {
2530 0 : long rrn = READ_ONCE(rcu_resched_ns);
2531 :
2532 0 : rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn;
2533 0 : tlimit = local_clock() + rrn;
2534 : }
2535 48776 : trace_rcu_batch_start(rcu_state.name,
2536 : rcu_segcblist_n_cbs(&rdp->cblist), bl);
2537 48776 : rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
2538 48781 : if (offloaded)
2539 : rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
2540 :
2541 48781 : trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbDequeued"));
2542 48781 : rcu_nocb_unlock_irqrestore(rdp, flags);
2543 :
2544 : /* Invoke callbacks. */
2545 48779 : tick_dep_set_task(current, TICK_DEP_BIT_RCU);
2546 48779 : rhp = rcu_cblist_dequeue(&rcl);
2547 :
2548 678351 : for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
2549 626420 : rcu_callback_t f;
2550 :
2551 626420 : count++;
2552 626420 : debug_rcu_head_unqueue(rhp);
2553 :
2554 624602 : rcu_lock_acquire(&rcu_callback_map);
2555 624982 : trace_rcu_invoke_callback(rcu_state.name, rhp);
2556 :
2557 624982 : f = rhp->func;
2558 624982 : WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
2559 624982 : f(rhp);
2560 :
2561 624982 : rcu_lock_release(&rcu_callback_map);
2562 :
2563 : /*
2564 : * Stop only if limit reached and CPU has something to do.
2565 : */
2566 624639 : if (count >= bl && !offloaded &&
2567 188408 : (need_resched() ||
2568 187137 : (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
2569 : break;
2570 580788 : if (unlikely(tlimit)) {
2571 : /* only call local_clock() every 32 callbacks */
2572 0 : if (likely((count & 31) || local_clock() < tlimit))
2573 0 : continue;
2574 : /* Exceeded the time limit, so leave. */
2575 : break;
2576 : }
2577 580788 : if (!in_serving_softirq()) {
2578 0 : local_bh_enable();
2579 0 : lockdep_assert_irqs_enabled();
2580 0 : cond_resched_tasks_rcu_qs();
2581 0 : lockdep_assert_irqs_enabled();
2582 0 : local_bh_disable();
2583 : }
2584 : }
2585 :
2586 97521 : local_irq_save(flags);
2587 48764 : rcu_nocb_lock(rdp);
2588 48764 : rdp->n_cbs_invoked += count;
2589 48764 : trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
2590 48764 : is_idle_task(current), rcu_is_callbacks_kthread());
2591 :
2592 : /* Update counts and requeue any remaining callbacks. */
2593 48772 : rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
2594 48767 : rcu_segcblist_add_len(&rdp->cblist, -count);
2595 :
2596 : /* Reinstate batch limit if we have worked down the excess. */
2597 48777 : count = rcu_segcblist_n_cbs(&rdp->cblist);
2598 48777 : if (rdp->blimit >= DEFAULT_MAX_RCU_BLIMIT && count <= qlowmark)
2599 0 : rdp->blimit = blimit;
2600 :
2601 : /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
2602 48777 : if (count == 0 && rdp->qlen_last_fqs_check != 0) {
2603 0 : rdp->qlen_last_fqs_check = 0;
2604 0 : rdp->n_force_qs_snap = rcu_state.n_force_qs;
2605 48777 : } else if (count < rdp->qlen_last_fqs_check - qhimark)
2606 0 : rdp->qlen_last_fqs_check = count;
2607 :
2608 : /*
2609 : * The following usually indicates a double call_rcu(). To track
2610 : * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.
2611 : */
2612 48777 : empty = rcu_segcblist_empty(&rdp->cblist);
2613 48777 : WARN_ON_ONCE(count == 0 && !empty);
2614 48777 : WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
2615 : count != 0 && empty);
2616 49035 : WARN_ON_ONCE(count == 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) != 0);
2617 97289 : WARN_ON_ONCE(!empty && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
2618 :
2619 48775 : rcu_nocb_unlock_irqrestore(rdp, flags);
2620 :
2621 : /* Re-invoke RCU core processing if there are callbacks remaining. */
2622 48777 : if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist))
2623 43580 : invoke_rcu_core();
2624 48775 : tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
2625 : }
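The batch-limit setup near the top of rcu_do_batch() packs three policies into a few lines: clamp the divisor to a sane range, let the limit grow with the backlog (pending >> div) while never dropping below rdp->blimit, and give only unusually large batches a wall-clock budget clamped to between one millisecond and one second. A stand-alone restatement of that arithmetic, with local stand-ins for the tunables:

#include <stdio.h>

#define NSEC_PER_MSEC   1000000L
#define NSEC_PER_SEC    1000000000L

/* Compute how many callbacks one rcu_do_batch() pass may invoke. */
static long batch_limit(long blimit, long pending, int div)
{
    /* Clamp the divisor: negative means "default of 7", huge is capped. */
    if (div < 0)
        div = 7;
    else if (div > (int)(sizeof(long) * 8 - 2))
        div = sizeof(long) * 8 - 2;
    return pending >> div > blimit ? pending >> div : blimit;
}

/* Large batches also get a time budget, clamped to [1 ms, 1 s]. */
static long time_budget_ns(long bl, long resched_ns)
{
    if (bl <= 100)
        return 0;                       /* No time limit needed. */
    if (resched_ns < NSEC_PER_MSEC)
        return NSEC_PER_MSEC;
    if (resched_ns > NSEC_PER_SEC)
        return NSEC_PER_SEC;
    return resched_ns;
}

int main(void)
{
    long bl = batch_limit(10, 100000, 7);   /* 100000 >> 7 = 781 */

    printf("bl=%ld budget=%ldns\n", bl, time_budget_ns(bl, 3 * NSEC_PER_MSEC));
    return 0;
}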
2626 :
2627 : /*
2628 : * This function is invoked from each scheduling-clock interrupt,
2629 : * and checks to see if this CPU is in a non-context-switch quiescent
2630 : * state, for example, user mode or idle loop. It also schedules RCU
2631 : * core processing. If the current grace period has gone on too long,
2632 : * it will ask the scheduler to manufacture a context switch for the sole
2633 : * purpose of providing the needed quiescent state.
2634 : */
2635 27707 : void rcu_sched_clock_irq(int user)
2636 : {
2637 27707 : trace_rcu_utilization(TPS("Start scheduler-tick"));
2638 54994 : lockdep_assert_irqs_disabled();
2639 27508 : raw_cpu_inc(rcu_data.ticks_this_gp);
2640 : /* The load-acquire pairs with the store-release setting to true. */
2641 27508 : if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
2642 : /* Idle and userspace execution already are quiescent states. */
2643 24 : if (!rcu_is_cpu_rrupt_from_idle() && !user) {
2644 25 : set_tsk_need_resched(current);
2645 25 : set_preempt_need_resched();
2646 : }
2647 27522 : __this_cpu_write(rcu_data.rcu_urgent_qs, false);
2648 : }
2649 27522 : rcu_flavor_sched_clock_irq(user);
2650 28002 : if (rcu_pending(user))
2651 11374 : invoke_rcu_core();
2652 55673 : lockdep_assert_irqs_disabled();
2653 :
2654 27945 : trace_rcu_utilization(TPS("End scheduler-tick"));
2655 27902 : }
2656 :
2657 : /*
2658 : * Scan the leaf rcu_node structures. For each structure on which all
2659 : * CPUs have reported a quiescent state and on which there are tasks
2660 : * blocking the current grace period, initiate RCU priority boosting.
2661 : * Otherwise, invoke the specified function to check dyntick state for
2662 : * each CPU that has not yet reported a quiescent state.
2663 : */
2664 2456 : static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
2665 : {
2666 2456 : int cpu;
2667 2456 : unsigned long flags;
2668 2456 : unsigned long mask;
2669 2456 : struct rcu_data *rdp;
2670 2456 : struct rcu_node *rnp;
2671 :
2672 2456 : rcu_state.cbovld = rcu_state.cbovldnext;
2673 2456 : rcu_state.cbovldnext = false;
2674 4912 : rcu_for_each_leaf_node(rnp) {
2675 2456 : cond_resched_tasks_rcu_qs();
2676 2456 : mask = 0;
2677 2456 : raw_spin_lock_irqsave_rcu_node(rnp, flags);
2678 2456 : rcu_state.cbovldnext |= !!rnp->cbovldmask;
2679 2456 : if (rnp->qsmask == 0) {
2680 1 : if (rcu_preempt_blocked_readers_cgp(rnp)) {
2681 : /*
2682 : * No point in scanning bits because they
2683 : * are all zero. But we might need to
2684 : * priority-boost blocked readers.
2685 : */
2686 : rcu_initiate_boost(rnp, flags);
2687 : /* rcu_initiate_boost() releases rnp->lock */
2688 : continue;
2689 : }
2690 2 : raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2691 1 : continue;
2692 : }
2693 5839 : for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) {
2694 3384 : rdp = per_cpu_ptr(&rcu_data, cpu);
2695 3384 : if (f(rdp)) {
2696 757 : mask |= rdp->grpmask;
2697 757 : rcu_disable_urgency_upon_qs(rdp);
2698 : }
2699 : }
2700 2455 : if (mask != 0) {
2701 : /* Idle/offline CPUs, report (releases rnp->lock). */
2702 622 : rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
2703 : } else {
2704 : /* Nothing to do here, so just drop the lock. */
2705 4289 : raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2706 : }
2707 : }
2708 2456 : }
2709 :
2710 : /*
2711 : * Force quiescent states on reluctant CPUs, and also detect which
2712 : * CPUs are in dyntick-idle mode.
2713 : */
2714 0 : void rcu_force_quiescent_state(void)
2715 : {
2716 0 : unsigned long flags;
2717 0 : bool ret;
2718 0 : struct rcu_node *rnp;
2719 0 : struct rcu_node *rnp_old = NULL;
2720 :
2721 : /* Funnel through hierarchy to reduce memory contention. */
2722 0 : rnp = __this_cpu_read(rcu_data.mynode);
2723 0 : for (; rnp != NULL; rnp = rnp->parent) {
2724 0 : ret = (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) ||
2725 0 : !raw_spin_trylock(&rnp->fqslock);
2726 0 : if (rnp_old != NULL)
2727 0 : raw_spin_unlock(&rnp_old->fqslock);
2728 0 : if (ret)
2729 : return;
2730 0 : rnp_old = rnp;
2731 : }
2732 : /* rnp_old == rcu_get_root(), rnp == NULL. */
2733 :
2734 : /* Reached the root of the rcu_node tree, acquire lock. */
2735 0 : raw_spin_lock_irqsave_rcu_node(rnp_old, flags);
2736 0 : raw_spin_unlock(&rnp_old->fqslock);
2737 0 : if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) {
2738 0 : raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
2739 0 : return; /* Someone beat us to it. */
2740 : }
2741 0 : WRITE_ONCE(rcu_state.gp_flags,
2742 : READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS);
2743 0 : raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
2744 0 : rcu_gp_kthread_wake();
2745 : }
2746 : EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
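rcu_force_quiescent_state() also funnels through the tree, but with trylocks: each ->fqslock is purely a contention filter, so a failed trylock means another caller is already on its way to the root and this one can simply return. A user-space sketch of that filter, again with pthread mutexes standing in for the kernel's raw spinlocks:

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node {
    pthread_mutex_t fqslock;    /* Pure contention filter. */
    struct node *parent;        /* NULL at the root. */
};

/*
 * Funnel from "leaf" toward the root, holding at most one fqslock at a
 * time.  Unlike the request funnel in rcu_start_this_gp(), a failed
 * trylock here just means someone else got there first.  Returns true
 * if this caller reached the root (where the kernel does the real
 * flag-setting work before dropping the root's fqslock).
 */
static bool funnel_to_root(struct node *leaf)
{
    struct node *np, *held = NULL;

    for (np = leaf; np; np = np->parent) {
        if (pthread_mutex_trylock(&np->fqslock) != 0) {
            if (held)
                pthread_mutex_unlock(&held->fqslock);
            return false;               /* Someone beat us to it. */
        }
        if (held)
            pthread_mutex_unlock(&held->fqslock);
        held = np;
    }
    if (held)
        pthread_mutex_unlock(&held->fqslock);   /* Root reached; work done. */
    return true;
}

int main(void)
{
    struct node root = { PTHREAD_MUTEX_INITIALIZER, NULL };
    struct node leaf = { PTHREAD_MUTEX_INITIALIZER, &root };

    printf("reached root: %d\n", funnel_to_root(&leaf));    /* 1: uncontended. */
    return 0;
}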
2747 :
2748 : // Workqueue handler for an RCU reader for kernels enforcing strict RCU
2749 : // grace periods.
2750 0 : static void strict_work_handler(struct work_struct *work)
2751 : {
2752 0 : rcu_read_lock();
2753 0 : rcu_read_unlock();
2754 0 : }
2755 :
2756 : /* Perform RCU core processing work for the current CPU. */
2757 54020 : static __latent_entropy void rcu_core(void)
2758 : {
2759 54020 : unsigned long flags;
2760 54020 : struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
2761 54051 : struct rcu_node *rnp = rdp->mynode;
2762 54051 : const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist);
2763 :
2764 54051 : if (cpu_is_offline(smp_processor_id()))
2765 : return;
2766 54060 : trace_rcu_utilization(TPS("Start RCU core"));
2767 54153 : WARN_ON_ONCE(!rdp->beenonline);
2768 :
2769 : /* Report any deferred quiescent states if preemption enabled. */
2770 54153 : if (!(preempt_count() & PREEMPT_MASK)) {
2771 38831 : rcu_preempt_deferred_qs(current);
2772 15322 : } else if (rcu_preempt_need_deferred_qs(current)) {
2773 : set_tsk_need_resched(current);
2774 : set_preempt_need_resched();
2775 : }
2776 :
2777 : /* Update RCU state based on any recent quiescent states. */
2778 54153 : rcu_check_quiescent_state(rdp);
2779 :
2780 : /* No grace period and unregistered callbacks? */
2781 54215 : if (!rcu_gp_in_progress() &&
2782 75 : rcu_segcblist_is_enabled(&rdp->cblist) && do_batch) {
2783 150 : rcu_nocb_lock_irqsave(rdp, flags);
2784 75 : if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
2785 14 : rcu_accelerate_cbs_unlocked(rnp, rdp);
2786 75 : rcu_nocb_unlock_irqrestore(rdp, flags);
2787 : }
2788 :
2789 54215 : rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
2790 :
2791 : /* If there are callbacks ready, invoke them. */
2792 54194 : if (do_batch && rcu_segcblist_ready_cbs(&rdp->cblist) &&
2793 48774 : likely(READ_ONCE(rcu_scheduler_fully_active)))
2794 48774 : rcu_do_batch(rdp);
2795 :
2796 : /* Do any needed deferred wakeups of rcuo kthreads. */
2797 54197 : do_nocb_deferred_wakeup(rdp);
2798 54197 : trace_rcu_utilization(TPS("End RCU core"));
2799 :
2800 : // If strict GPs, schedule an RCU reader in a clean environment.
2801 54197 : if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
2802 : queue_work_on(rdp->cpu, rcu_gp_wq, &rdp->strict_work);
2803 : }
2804 :
2805 54063 : static void rcu_core_si(struct softirq_action *h)
2806 : {
2807 54063 : rcu_core();
2808 54215 : }
2809 :
2810 0 : static void rcu_wake_cond(struct task_struct *t, int status)
2811 : {
2812 : /*
2813 : * If the thread is yielding, only wake it when this
2814 : * is invoked from idle
2815 : */
2816 0 : if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
2817 0 : wake_up_process(t);
2818 0 : }
2819 :
2820 0 : static void invoke_rcu_core_kthread(void)
2821 : {
2822 0 : struct task_struct *t;
2823 0 : unsigned long flags;
2824 :
2825 0 : local_irq_save(flags);
2826 0 : __this_cpu_write(rcu_data.rcu_cpu_has_work, 1);
2827 0 : t = __this_cpu_read(rcu_data.rcu_cpu_kthread_task);
2828 0 : if (t != NULL && t != current)
2829 0 : rcu_wake_cond(t, __this_cpu_read(rcu_data.rcu_cpu_kthread_status));
2830 0 : local_irq_restore(flags);
2831 0 : }
2832 :
2833 : /*
2834 : * Wake up this CPU's rcuc kthread to do RCU core processing.
2835 : */
2836 54848 : static void invoke_rcu_core(void)
2837 : {
2838 54848 : if (!cpu_online(smp_processor_id()))
2839 : return;
2840 54855 : if (use_softirq)
2841 54855 : raise_softirq(RCU_SOFTIRQ);
2842 : else
2843 0 : invoke_rcu_core_kthread();
2844 : }
2845 :
2846 0 : static void rcu_cpu_kthread_park(unsigned int cpu)
2847 : {
2848 0 : per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
2849 0 : }
2850 :
2851 0 : static int rcu_cpu_kthread_should_run(unsigned int cpu)
2852 : {
2853 0 : return __this_cpu_read(rcu_data.rcu_cpu_has_work);
2854 : }
2855 :
2856 : /*
2857 : * Per-CPU kernel thread that invokes RCU callbacks. This replaces
2858 : * the RCU softirq used in configurations of RCU that do not support RCU
2859 : * priority boosting.
2860 : */
2861 0 : static void rcu_cpu_kthread(unsigned int cpu)
2862 : {
2863 0 : unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
2864 0 : char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
2865 0 : int spincnt;
2866 :
2867 0 : trace_rcu_utilization(TPS("Start CPU kthread@rcu_run"));
2868 0 : for (spincnt = 0; spincnt < 10; spincnt++) {
2869 0 : local_bh_disable();
2870 0 : *statusp = RCU_KTHREAD_RUNNING;
2871 0 : local_irq_disable();
2872 0 : work = *workp;
2873 0 : *workp = 0;
2874 0 : local_irq_enable();
2875 0 : if (work)
2876 0 : rcu_core();
2877 0 : local_bh_enable();
2878 0 : if (*workp == 0) {
2879 0 : trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
2880 0 : *statusp = RCU_KTHREAD_WAITING;
2881 0 : return;
2882 : }
2883 : }
2884 0 : *statusp = RCU_KTHREAD_YIELDING;
2885 0 : trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
2886 0 : schedule_timeout_idle(2);
2887 0 : trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
2888 0 : *statusp = RCU_KTHREAD_WAITING;
2889 : }
2890 :
2891 : static struct smp_hotplug_thread rcu_cpu_thread_spec = {
2892 : .store = &rcu_data.rcu_cpu_kthread_task,
2893 : .thread_should_run = rcu_cpu_kthread_should_run,
2894 : .thread_fn = rcu_cpu_kthread,
2895 : .thread_comm = "rcuc/%u",
2896 : .setup = rcu_cpu_kthread_setup,
2897 : .park = rcu_cpu_kthread_park,
2898 : };
2899 :
2900 : /*
2901 : * Spawn per-CPU RCU core processing kthreads.
2902 : */
2903 1 : static int __init rcu_spawn_core_kthreads(void)
2904 : {
2905 1 : int cpu;
2906 :
2907 5 : for_each_possible_cpu(cpu)
2908 4 : per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0;
2909 1 : if (!IS_ENABLED(CONFIG_RCU_BOOST) && use_softirq)
2910 : return 0;
2911 0 : WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec),
2912 : "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__);
2913 : return 0;
2914 : }
2915 : early_initcall(rcu_spawn_core_kthreads);
2916 :
2917 : /*
2918 : * Handle any core-RCU processing required by a call_rcu() invocation.
2919 : */
2920 627276 : static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
2921 : unsigned long flags)
2922 : {
2923 : /*
2924 : * If called from an extended quiescent state, invoke the RCU
2925 : * core in order to force a re-evaluation of RCU's idleness.
2926 : */
2927 1254632 : if (!rcu_is_watching())
2928 0 : invoke_rcu_core();
2929 :
2930 : /* If interrupts were disabled or CPU offline, don't invoke RCU core. */
2931 627356 : if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id()))
2932 20388 : return;
2933 :
2934 : /*
2935 : * Force the grace period if too many callbacks or too long waiting.
2936 : * Enforce hysteresis, and don't invoke rcu_force_quiescent_state()
2937 : * if some other CPU has recently done so. Also, don't bother
2938 : * invoking rcu_force_quiescent_state() if the newly enqueued callback
2939 : * is the only one waiting for a grace period to complete.
2940 : */
2941 606826 : if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) >
2942 : rdp->qlen_last_fqs_check + qhimark)) {
2943 :
2944 : /* Are we ignoring a completed grace period? */
2945 0 : note_gp_changes(rdp);
2946 :
2947 : /* Start a new grace period if one not already started. */
2948 0 : if (!rcu_gp_in_progress()) {
2949 0 : rcu_accelerate_cbs_unlocked(rdp->mynode, rdp);
2950 : } else {
2951 : /* Give the grace period a kick. */
2952 0 : rdp->blimit = DEFAULT_MAX_RCU_BLIMIT;
2953 0 : if (rcu_state.n_force_qs == rdp->n_force_qs_snap &&
2954 0 : rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
2955 0 : rcu_force_quiescent_state();
2956 0 : rdp->n_force_qs_snap = rcu_state.n_force_qs;
2957 0 : rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
2958 : }
2959 : }
2960 : }
2961 :
2962 : /*
2963 : * RCU callback function to leak a callback.
2964 : */
2965 0 : static void rcu_leak_callback(struct rcu_head *rhp)
2966 : {
2967 0 : }
2968 :
2969 : /*
2970 : * Check and if necessary update the leaf rcu_node structure's
2971 : * ->cbovldmask bit corresponding to the current CPU based on that CPU's
2972 : * number of queued RCU callbacks. The caller must hold the leaf rcu_node
2973 : * structure's ->lock.
2974 : */
2975 0 : static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp)
2976 : {
2977 0 : raw_lockdep_assert_held_rcu_node(rnp);
2978 0 : if (qovld_calc <= 0)
2979 : return; // Early boot and wildcard value set.
2980 0 : if (rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc)
2981 0 : WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask | rdp->grpmask);
2982 : else
2983 0 : WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask & ~rdp->grpmask);
2984 : }
2985 :
2986 : /*
2987 : * Check and if necessary update the leaf rcu_node structure's
2988 : * ->cbovldmask bit corresponding to the current CPU based on that CPU's
2989 : * number of queued RCU callbacks. No locks need be held, but the
2990 : * caller must have disabled interrupts.
2991 : *
2992 : * Note that this function ignores the possibility that there are a lot
2993 : * of callbacks all of which have already seen the end of their respective
2994 : * grace periods. This omission is due to the need for no-CBs CPUs to
2995 : * be holding ->nocb_lock to do this check, which is too heavy for a
2996 : * common-case operation.
2997 : */
2998 627341 : static void check_cb_ovld(struct rcu_data *rdp)
2999 : {
3000 627341 : struct rcu_node *const rnp = rdp->mynode;
3001 :
3002 627341 : if (qovld_calc <= 0 ||
3003 627338 : ((rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc) ==
3004 627338 : !!(READ_ONCE(rnp->cbovldmask) & rdp->grpmask)))
3005 : return; // Early boot wildcard value or already set correctly.
3006 0 : raw_spin_lock_rcu_node(rnp);
3007 0 : check_cb_ovld_locked(rdp, rnp);
3008 0 : raw_spin_unlock_rcu_node(rnp);
3009 : }
3010 :
3011 : /* Helper function for call_rcu() and friends. */
3012 : static void
3013 627265 : __call_rcu(struct rcu_head *head, rcu_callback_t func)
3014 : {
3015 627265 : static atomic_t doublefrees;
3016 627265 : unsigned long flags;
3017 627265 : struct rcu_data *rdp;
3018 627265 : bool was_alldone;
3019 :
3020 : /* Misaligned rcu_head! */
3021 627265 : WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
3022 :
3023 627265 : if (debug_rcu_head_queue(head)) {
3024 : /*
3025 : * Probable double call_rcu(), so leak the callback.
3026 : * Use rcu:rcu_callback trace event to find the previous
3027 : * time callback was passed to __call_rcu().
3028 : */
3029 0 : if (atomic_inc_return(&doublefrees) < 4) {
3030 0 : pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func);
3031 0 : mem_dump_obj(head);
3032 : }
3033 0 : WRITE_ONCE(head->func, rcu_leak_callback);
3034 0 : return;
3035 : }
3036 627002 : head->func = func;
3037 627002 : head->next = NULL;
3038 1254057 : local_irq_save(flags);
3039 626953 : kasan_record_aux_stack(head);
3040 627350 : rdp = this_cpu_ptr(&rcu_data);
3041 :
3042 : /* Add the callback to our list. */
3043 627356 : if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) {
3044 : // This can trigger due to call_rcu() from offline CPU:
3045 1 : WARN_ON_ONCE(rcu_scheduler_active != RCU_SCHEDULER_INACTIVE);
3046 2 : WARN_ON_ONCE(!rcu_is_watching());
3047 : // Very early boot, before rcu_init(). Initialize if needed
3048 : // and then drop through to queue the callback.
3049 1 : if (rcu_segcblist_empty(&rdp->cblist))
3050 1 : rcu_segcblist_init(&rdp->cblist);
3051 : }
3052 :
3053 627356 : check_cb_ovld(rdp);
3054 627326 : if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
3055 : return; // Enqueued onto ->nocb_bypass, so just leave.
3056 : // If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
3057 627326 : rcu_segcblist_enqueue(&rdp->cblist, head);
3058 627279 : if (__is_kvfree_rcu_offset((unsigned long)func))
3059 0 : trace_rcu_kvfree_callback(rcu_state.name, head,
3060 : (unsigned long)func,
3061 : rcu_segcblist_n_cbs(&rdp->cblist));
3062 : else
3063 627279 : trace_rcu_callback(rcu_state.name, head,
3064 : rcu_segcblist_n_cbs(&rdp->cblist));
3065 :
3066 627279 : trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued"));
3067 :
3068 : /* Go handle any RCU core processing required. */
3069 627279 : if (unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) {
3070 627041 : __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
3071 : } else {
3072 627279 : __call_rcu_core(rdp, head, flags);
3073 627209 : local_irq_restore(flags);
3074 : }
3075 : }
3076 :
3077 : /**
3078 : * call_rcu() - Queue an RCU callback for invocation after a grace period.
3079 : * @head: structure to be used for queueing the RCU updates.
3080 : * @func: actual callback function to be invoked after the grace period
3081 : *
3082 : * The callback function will be invoked some time after a full grace
3083 : * period elapses, in other words after all pre-existing RCU read-side
3084 : * critical sections have completed. However, the callback function
3085 : * might well execute concurrently with RCU read-side critical sections
3086 : * that started after call_rcu() was invoked. RCU read-side critical
3087 : * sections are delimited by rcu_read_lock() and rcu_read_unlock(), and
3088 : * may be nested. In addition, regions of code across which interrupts,
3089 : * preemption, or softirqs have been disabled also serve as RCU read-side
3090 : * critical sections. This includes hardware interrupt handlers, softirq
3091 : * handlers, and NMI handlers.
3092 : *
3093 : * Note that all CPUs must agree that the grace period extended beyond
 3094 : * all pre-existing RCU read-side critical sections. On systems with more
3095 : * than one CPU, this means that when "func()" is invoked, each CPU is
3096 : * guaranteed to have executed a full memory barrier since the end of its
3097 : * last RCU read-side critical section whose beginning preceded the call
3098 : * to call_rcu(). It also means that each CPU executing an RCU read-side
3099 : * critical section that continues beyond the start of "func()" must have
3100 : * executed a memory barrier after the call_rcu() but before the beginning
3101 : * of that RCU read-side critical section. Note that these guarantees
3102 : * include CPUs that are offline, idle, or executing in user mode, as
3103 : * well as CPUs that are executing in the kernel.
3104 : *
3105 : * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
3106 : * resulting RCU callback function "func()", then both CPU A and CPU B are
3107 : * guaranteed to execute a full memory barrier during the time interval
3108 : * between the call to call_rcu() and the invocation of "func()" -- even
3109 : * if CPU A and CPU B are the same CPU (but again only if the system has
3110 : * more than one CPU).
3111 : */
3112 627267 : void call_rcu(struct rcu_head *head, rcu_callback_t func)
3113 : {
3114 627267 : __call_rcu(head, func);
3115 627014 : }
3116 : EXPORT_SYMBOL_GPL(call_rcu);
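 :
 : /*
 :  * Illustrative usage sketch (not part of this file): the usual call_rcu()
 :  * pattern embeds an rcu_head in the protected structure, unpublishes the
 :  * structure under a lock, and frees it from the callback once a grace
 :  * period has elapsed.  "struct foo", "foo_list" and "foo_lock" are
 :  * hypothetical names used only for this example.
 :  *
 :  *	#include <linux/rculist.h>
 :  *	#include <linux/rcupdate.h>
 :  *	#include <linux/slab.h>
 :  *	#include <linux/spinlock.h>
 :  *
 :  *	struct foo {
 :  *		struct list_head list;
 :  *		struct rcu_head rcu;
 :  *		int data;
 :  *	};
 :  *
 :  *	static LIST_HEAD(foo_list);
 :  *	static DEFINE_SPINLOCK(foo_lock);
 :  *
 :  *	// Invoked after a grace period: no reader still holds a reference.
 :  *	static void foo_reclaim(struct rcu_head *rhp)
 :  *	{
 :  *		struct foo *fp = container_of(rhp, struct foo, rcu);
 :  *
 :  *		kfree(fp);
 :  *	}
 :  *
 :  *	static void foo_del(struct foo *fp)
 :  *	{
 :  *		spin_lock(&foo_lock);
 :  *		list_del_rcu(&fp->list);	// Readers may still see fp...
 :  *		spin_unlock(&foo_lock);
 :  *		call_rcu(&fp->rcu, foo_reclaim);	// ...so defer the kfree().
 :  *	}
 :  */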
3117 :
3118 :
3119 : /* Maximum number of jiffies to wait before draining a batch. */
3120 : #define KFREE_DRAIN_JIFFIES (HZ / 50)
3121 : #define KFREE_N_BATCHES 2
3122 : #define FREE_N_CHANNELS 2
3123 :
3124 : /**
3125 : * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
3126 : * @nr_records: Number of active pointers in the array
3127 : * @next: Next bulk object in the block chain
3128 : * @records: Array of the kvfree_rcu() pointers
3129 : */
3130 : struct kvfree_rcu_bulk_data {
3131 : unsigned long nr_records;
3132 : struct kvfree_rcu_bulk_data *next;
3133 : void *records[];
3134 : };
3135 :
3136 : /*
 3137 : * This macro defines how many entries the "records" array
 3138 : * will contain. It is chosen so that the resulting
 3139 : * kvfree_rcu_bulk_data structure fills exactly one page.
3140 : */
3141 : #define KVFREE_BULK_MAX_ENTR \
3142 : ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
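 :
 : /*
 :  * Worked example (values are architecture-dependent): with 4-KiB pages
 :  * and 8-byte pointers, the header (nr_records plus next) occupies 16
 :  * bytes, so KVFREE_BULK_MAX_ENTR = (4096 - 16) / 8 = 510 pointers per
 :  * page-sized block.
 :  */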
3143 :
3144 : /**
3145 : * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
3146 : * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
3147 : * @head_free: List of kfree_rcu() objects waiting for a grace period
3148 : * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
3149 : * @krcp: Pointer to @kfree_rcu_cpu structure
3150 : */
3151 :
3152 : struct kfree_rcu_cpu_work {
3153 : struct rcu_work rcu_work;
3154 : struct rcu_head *head_free;
3155 : struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS];
3156 : struct kfree_rcu_cpu *krcp;
3157 : };
3158 :
3159 : /**
3160 : * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
3161 : * @head: List of kfree_rcu() objects not yet waiting for a grace period
3162 : * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
3163 : * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
3164 : * @lock: Synchronize access to this structure
3165 : * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
3166 : * @monitor_todo: Tracks whether a @monitor_work delayed work is pending
3167 : * @initialized: The @rcu_work fields have been initialized
3168 : * @count: Number of objects for which GP not started
3169 : * @bkvcache:
 3170 : * A simple cache list that contains objects for reuse.
 3171 : * To save some per-CPU space, the list is singly linked.
 3172 : * Even though the list itself is lockless, accesses to it
 3173 : * must be protected by the per-CPU lock.
3174 : * @page_cache_work: A work to refill the cache when it is empty
3175 : * @work_in_progress: Indicates that page_cache_work is running
3176 : * @hrtimer: A hrtimer for scheduling a page_cache_work
3177 : * @nr_bkv_objs: number of allocated objects at @bkvcache.
3178 : *
3179 : * This is a per-CPU structure. The reason that it is not included in
3180 : * the rcu_data structure is to permit this code to be extracted from
3181 : * the RCU files. Such extraction could allow further optimization of
3182 : * the interactions with the slab allocators.
3183 : */
3184 : struct kfree_rcu_cpu {
3185 : struct rcu_head *head;
3186 : struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS];
3187 : struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
3188 : raw_spinlock_t lock;
3189 : struct delayed_work monitor_work;
3190 : bool monitor_todo;
3191 : bool initialized;
3192 : int count;
3193 :
3194 : struct work_struct page_cache_work;
3195 : atomic_t work_in_progress;
3196 : struct hrtimer hrtimer;
3197 :
3198 : struct llist_head bkvcache;
3199 : int nr_bkv_objs;
3200 : };
3201 :
3202 : static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
3203 : .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
3204 : };
3205 :
3206 : static __always_inline void
3207 141 : debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
3208 : {
3209 : #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
3210 141 : int i;
3211 :
3212 389 : for (i = 0; i < bhead->nr_records; i++)
3213 248 : debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i]));
3214 : #endif
3215 : }
3216 :
3217 : static inline struct kfree_rcu_cpu *
3218 252 : krc_this_cpu_lock(unsigned long *flags)
3219 : {
3220 252 : struct kfree_rcu_cpu *krcp;
3221 :
3222 504 : local_irq_save(*flags); // For safely calling this_cpu_ptr().
3223 252 : krcp = this_cpu_ptr(&krc);
3224 252 : raw_spin_lock(&krcp->lock);
3225 :
3226 252 : return krcp;
3227 : }
3228 :
3229 : static inline void
3230 252 : krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
3231 : {
3232 252 : raw_spin_unlock(&krcp->lock);
3233 252 : local_irq_restore(flags);
3234 252 : }
3235 :
3236 : static inline struct kvfree_rcu_bulk_data *
3237 145 : get_cached_bnode(struct kfree_rcu_cpu *krcp)
3238 : {
3239 145 : if (!krcp->nr_bkv_objs)
3240 : return NULL;
3241 :
3242 141 : krcp->nr_bkv_objs--;
3243 141 : return (struct kvfree_rcu_bulk_data *)
3244 141 : llist_del_first(&krcp->bkvcache);
3245 : }
3246 :
3247 : static inline bool
3248 161 : put_cached_bnode(struct kfree_rcu_cpu *krcp,
3249 : struct kvfree_rcu_bulk_data *bnode)
3250 : {
3251 : // Check the limit.
3252 161 : if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
3253 : return false;
3254 :
3255 161 : llist_add((struct llist_node *) bnode, &krcp->bkvcache);
3256 161 : krcp->nr_bkv_objs++;
3257 161 : return true;
3258 :
3259 : }
3260 :
3261 : /*
3262 : * This function is invoked in workqueue context after a grace period.
3263 : * It frees all the objects queued on ->bhead_free or ->head_free.
3264 : */
3265 144 : static void kfree_rcu_work(struct work_struct *work)
3266 : {
3267 144 : unsigned long flags;
3268 144 : struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext;
3269 144 : struct rcu_head *head, *next;
3270 144 : struct kfree_rcu_cpu *krcp;
3271 144 : struct kfree_rcu_cpu_work *krwp;
3272 144 : int i, j;
3273 :
3274 144 : krwp = container_of(to_rcu_work(work),
3275 : struct kfree_rcu_cpu_work, rcu_work);
3276 144 : krcp = krwp->krcp;
3277 :
3278 144 : raw_spin_lock_irqsave(&krcp->lock, flags);
3279 : // Channels 1 and 2.
3280 576 : for (i = 0; i < FREE_N_CHANNELS; i++) {
3281 288 : bkvhead[i] = krwp->bkvhead_free[i];
3282 288 : krwp->bkvhead_free[i] = NULL;
3283 : }
3284 :
3285 : // Channel 3.
3286 144 : head = krwp->head_free;
3287 144 : krwp->head_free = NULL;
3288 144 : raw_spin_unlock_irqrestore(&krcp->lock, flags);
3289 :
3290 : // Handle two first channels.
3291 576 : for (i = 0; i < FREE_N_CHANNELS; i++) {
3292 429 : for (; bkvhead[i]; bkvhead[i] = bnext) {
3293 141 : bnext = bkvhead[i]->next;
3294 141 : debug_rcu_bhead_unqueue(bkvhead[i]);
3295 :
3296 141 : rcu_lock_acquire(&rcu_callback_map);
3297 141 : if (i == 0) { // kmalloc() / kfree().
3298 141 : trace_rcu_invoke_kfree_bulk_callback(
3299 : rcu_state.name, bkvhead[i]->nr_records,
3300 141 : bkvhead[i]->records);
3301 :
3302 141 : kfree_bulk(bkvhead[i]->nr_records,
3303 : bkvhead[i]->records);
3304 : } else { // vmalloc() / vfree().
3305 0 : for (j = 0; j < bkvhead[i]->nr_records; j++) {
3306 0 : trace_rcu_invoke_kvfree_callback(
3307 : rcu_state.name,
3308 0 : bkvhead[i]->records[j], 0);
3309 :
3310 0 : vfree(bkvhead[i]->records[j]);
3311 : }
3312 : }
3313 141 : rcu_lock_release(&rcu_callback_map);
3314 :
3315 141 : raw_spin_lock_irqsave(&krcp->lock, flags);
3316 141 : if (put_cached_bnode(krcp, bkvhead[i]))
3317 141 : bkvhead[i] = NULL;
3318 141 : raw_spin_unlock_irqrestore(&krcp->lock, flags);
3319 :
3320 141 : if (bkvhead[i])
3321 0 : free_page((unsigned long) bkvhead[i]);
3322 :
3323 141 : cond_resched_tasks_rcu_qs();
3324 : }
3325 : }
3326 :
3327 : /*
 3328 : * Emergency case only. It can happen under low-memory
 3329 : * conditions when a page allocation fails, so the "bulk"
 3330 : * path cannot be maintained for the time being.
3331 : */
3332 148 : for (; head; head = next) {
3333 4 : unsigned long offset = (unsigned long)head->func;
3334 4 : void *ptr = (void *)head - offset;
3335 :
3336 4 : next = head->next;
3337 4 : debug_rcu_head_unqueue((struct rcu_head *)ptr);
3338 4 : rcu_lock_acquire(&rcu_callback_map);
3339 4 : trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset);
3340 :
3341 4 : if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
3342 4 : kvfree(ptr);
3343 :
3344 4 : rcu_lock_release(&rcu_callback_map);
3345 4 : cond_resched_tasks_rcu_qs();
3346 : }
3347 144 : }
3348 :
3349 : /*
3350 : * Schedule the kfree batch RCU work to run in workqueue context after a GP.
3351 : *
3352 : * This function is invoked by kfree_rcu_monitor() when the KFREE_DRAIN_JIFFIES
3353 : * timeout has been reached.
3354 : */
3355 165 : static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
3356 : {
3357 165 : struct kfree_rcu_cpu_work *krwp;
3358 165 : bool repeat = false;
3359 165 : int i, j;
3360 :
3361 330 : lockdep_assert_held(&krcp->lock);
3362 :
3363 495 : for (i = 0; i < KFREE_N_BATCHES; i++) {
3364 330 : krwp = &(krcp->krw_arr[i]);
3365 :
3366 : /*
 3367 : * Try to detach bkvhead or head and attach it to the
 3368 : * corresponding free channel, if that channel is available.
 3369 : * A previous RCU batch may still be in progress, in which
 3370 : * case another one cannot be queued immediately, so
 3371 : * return false to tell the caller to retry later.
3372 : */
3373 330 : if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
3374 189 : (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
3375 189 : (krcp->head && !krwp->head_free)) {
3376 : // Channel 1 corresponds to SLAB ptrs.
3377 : // Channel 2 corresponds to vmalloc ptrs.
3378 432 : for (j = 0; j < FREE_N_CHANNELS; j++) {
3379 288 : if (!krwp->bkvhead_free[j]) {
3380 288 : krwp->bkvhead_free[j] = krcp->bkvhead[j];
3381 288 : krcp->bkvhead[j] = NULL;
3382 : }
3383 : }
3384 :
3385 : // Channel 3 corresponds to emergency path.
3386 144 : if (!krwp->head_free) {
3387 144 : krwp->head_free = krcp->head;
3388 144 : krcp->head = NULL;
3389 : }
3390 :
3391 144 : WRITE_ONCE(krcp->count, 0);
3392 :
3393 : /*
 3394 : * There is one work item per batch, so there are
 3395 : * three "free channels" that the batch can handle.
 3396 : * It can happen that the work item is still pending
 3397 : * when the channels have been detached one after
 3398 : * the other.
3399 : */
3400 144 : queue_rcu_work(system_wq, &krwp->rcu_work);
3401 : }
3402 :
3403 : // Repeat if any "free" corresponding channel is still busy.
3404 330 : if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head)
3405 35 : repeat = true;
3406 : }
3407 :
3408 165 : return !repeat;
3409 : }
3410 :
3411 165 : static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
3412 : unsigned long flags)
3413 : {
3414 : // Attempt to start a new batch.
3415 165 : krcp->monitor_todo = false;
3416 165 : if (queue_kfree_rcu_work(krcp)) {
3417 : // Success! Our job is done here.
3418 139 : raw_spin_unlock_irqrestore(&krcp->lock, flags);
3419 139 : return;
3420 : }
3421 :
3422 : // Previous RCU batch still in progress, try again later.
3423 26 : krcp->monitor_todo = true;
3424 26 : schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
3425 26 : raw_spin_unlock_irqrestore(&krcp->lock, flags);
3426 : }
3427 :
3428 : /*
3429 : * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
3430 : * It invokes kfree_rcu_drain_unlock() to attempt to start another batch.
3431 : */
3432 165 : static void kfree_rcu_monitor(struct work_struct *work)
3433 : {
3434 165 : unsigned long flags;
3435 165 : struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu,
3436 : monitor_work.work);
3437 :
3438 165 : raw_spin_lock_irqsave(&krcp->lock, flags);
3439 165 : if (krcp->monitor_todo)
3440 165 : kfree_rcu_drain_unlock(krcp, flags);
3441 : else
3442 0 : raw_spin_unlock_irqrestore(&krcp->lock, flags);
3443 165 : }
3444 :
3445 : static enum hrtimer_restart
3446 4 : schedule_page_work_fn(struct hrtimer *t)
3447 : {
3448 4 : struct kfree_rcu_cpu *krcp =
3449 4 : container_of(t, struct kfree_rcu_cpu, hrtimer);
3450 :
3451 4 : queue_work(system_highpri_wq, &krcp->page_cache_work);
3452 4 : return HRTIMER_NORESTART;
3453 : }
3454 :
3455 4 : static void fill_page_cache_func(struct work_struct *work)
3456 : {
3457 4 : struct kvfree_rcu_bulk_data *bnode;
3458 4 : struct kfree_rcu_cpu *krcp =
3459 4 : container_of(work, struct kfree_rcu_cpu,
3460 : page_cache_work);
3461 4 : unsigned long flags;
3462 4 : bool pushed;
3463 4 : int i;
3464 :
3465 24 : for (i = 0; i < rcu_min_cached_objs; i++) {
3466 40 : bnode = (struct kvfree_rcu_bulk_data *)
3467 20 : __get_free_page(GFP_KERNEL | __GFP_NOWARN);
3468 :
3469 20 : if (bnode) {
3470 20 : raw_spin_lock_irqsave(&krcp->lock, flags);
3471 20 : pushed = put_cached_bnode(krcp, bnode);
3472 20 : raw_spin_unlock_irqrestore(&krcp->lock, flags);
3473 :
3474 20 : if (!pushed) {
3475 0 : free_page((unsigned long) bnode);
3476 0 : break;
3477 : }
3478 : }
3479 : }
3480 :
3481 4 : atomic_set(&krcp->work_in_progress, 0);
3482 4 : }
3483 :
3484 : static void
3485 4 : run_page_cache_worker(struct kfree_rcu_cpu *krcp)
3486 : {
3487 4 : if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
3488 8 : !atomic_xchg(&krcp->work_in_progress, 1)) {
3489 4 : hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
3490 : HRTIMER_MODE_REL);
3491 4 : krcp->hrtimer.function = schedule_page_work_fn;
3492 4 : hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
3493 : }
3494 4 : }
3495 :
3496 : static inline bool
3497 252 : kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
3498 : {
3499 252 : struct kvfree_rcu_bulk_data *bnode;
3500 252 : int idx;
3501 :
3502 252 : if (unlikely(!krcp->initialized))
3503 : return false;
3504 :
3505 504 : lockdep_assert_held(&krcp->lock);
3506 252 : idx = !!is_vmalloc_addr(ptr);
3507 :
3508 : /* Check if a new block is required. */
3509 252 : if (!krcp->bkvhead[idx] ||
3510 107 : krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
3511 145 : bnode = get_cached_bnode(krcp);
3512 : /* Switch to emergency path. */
3513 145 : if (!bnode)
3514 : return false;
3515 :
3516 : /* Initialize the new block. */
3517 141 : bnode->nr_records = 0;
3518 141 : bnode->next = krcp->bkvhead[idx];
3519 :
3520 : /* Attach it to the head. */
3521 141 : krcp->bkvhead[idx] = bnode;
3522 : }
3523 :
3524 : /* Finally insert. */
3525 248 : krcp->bkvhead[idx]->records
3526 248 : [krcp->bkvhead[idx]->nr_records++] = ptr;
3527 :
3528 248 : return true;
3529 : }
3530 :
3531 : /*
 3532 : * Queue a request for lazy invocation of the appropriate free routine
 3533 : * after a grace period. Please note that three paths are maintained: two
 3534 : * main ones that use the array-of-pointers interface, and an emergency
 3535 : * one that is used only when the main path cannot be maintained for the
 3536 : * time being due to memory pressure.
 3537 : *
 3538 : * Each kvfree_call_rcu() request is added to a batch. The batch is drained
 3539 : * every KFREE_DRAIN_JIFFIES jiffies, and all the objects in the batch are
 3540 : * freed in workqueue context. Batching requests together reduces the
 3541 : * number of grace periods needed during heavy kfree_rcu()/kvfree_rcu() load.
3542 : */
3543 252 : void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
3544 : {
3545 252 : unsigned long flags;
3546 252 : struct kfree_rcu_cpu *krcp;
3547 252 : bool success;
3548 252 : void *ptr;
3549 :
3550 252 : if (head) {
3551 252 : ptr = (void *) head - (unsigned long) func;
3552 : } else {
3553 : /*
 3554 : * Please note that the head-less variant has a limitation,
 3555 : * which is why there is a clear rule for such objects:
 3556 : * they may be used from might_sleep() context only. For
 3557 : * other places, please embed an rcu_head in your
 3558 : * data.
3559 : */
3560 0 : might_sleep();
3561 0 : ptr = (unsigned long *) func;
3562 : }
3563 :
3564 252 : krcp = krc_this_cpu_lock(&flags);
3565 :
3566 : // Queue the object but don't yet schedule the batch.
3567 252 : if (debug_rcu_head_queue(ptr)) {
3568 : // Probable double kfree_rcu(), just leak.
3569 0 : WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
3570 : __func__, head);
3571 :
3572 : // Mark as success and leave.
3573 0 : success = true;
3574 0 : goto unlock_return;
3575 : }
3576 :
3577 252 : kasan_record_aux_stack(ptr);
3578 252 : success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
3579 252 : if (!success) {
3580 4 : run_page_cache_worker(krcp);
3581 :
3582 4 : if (head == NULL)
3583 : // Inline if kvfree_rcu(one_arg) call.
3584 0 : goto unlock_return;
3585 :
3586 4 : head->func = func;
3587 4 : head->next = krcp->head;
3588 4 : krcp->head = head;
3589 4 : success = true;
3590 : }
3591 :
3592 252 : WRITE_ONCE(krcp->count, krcp->count + 1);
3593 :
3594 : // Set timer to drain after KFREE_DRAIN_JIFFIES.
3595 252 : if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
3596 252 : !krcp->monitor_todo) {
3597 139 : krcp->monitor_todo = true;
3598 139 : schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
3599 : }
3600 :
3601 113 : unlock_return:
3602 252 : krc_this_cpu_unlock(krcp, flags);
3603 :
3604 : /*
 3605 : * Inline kvfree() after synchronize_rcu(). This can be
 3606 : * done from might_sleep() context only, so the current
 3607 : * CPU can pass through a quiescent state.
3608 : */
3609 252 : if (!success) {
3610 0 : debug_rcu_head_unqueue((struct rcu_head *) ptr);
3611 0 : synchronize_rcu();
3612 0 : kvfree(ptr);
3613 : }
3614 252 : }
3615 : EXPORT_SYMBOL_GPL(kvfree_call_rcu);
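 :
 : /*
 :  * Illustrative usage sketch (not part of this file): callers normally
 :  * reach kvfree_call_rcu() through the kfree_rcu() and kvfree_rcu()
 :  * wrappers rather than invoking it directly.  "struct foo" and the
 :  * function names below are hypothetical.
 :  *
 :  *	#include <linux/rcupdate.h>
 :  *	#include <linux/slab.h>
 :  *
 :  *	struct foo {
 :  *		int data;
 :  *		struct rcu_head rcu;
 :  *	};
 :  *
 :  *	static void foo_release(struct foo *fp, void *buf)
 :  *	{
 :  *		// Two-argument form: the rcu_head offset identifies the object.
 :  *		kfree_rcu(fp, rcu);
 :  *
 :  *		// Single-argument ("head-less") form: may need to block under
 :  *		// memory pressure, so it is legal only in sleepable context.
 :  *		kvfree_rcu(buf);
 :  *	}
 :  */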
3616 :
3617 : static unsigned long
3618 0 : kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
3619 : {
3620 0 : int cpu;
3621 0 : unsigned long count = 0;
3622 :
3623 : /* Snapshot count of all CPUs */
3624 0 : for_each_possible_cpu(cpu) {
3625 0 : struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
3626 :
3627 0 : count += READ_ONCE(krcp->count);
3628 : }
3629 :
3630 0 : return count;
3631 : }
3632 :
3633 : static unsigned long
3634 0 : kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
3635 : {
3636 0 : int cpu, freed = 0;
3637 0 : unsigned long flags;
3638 :
3639 0 : for_each_possible_cpu(cpu) {
3640 0 : int count;
3641 0 : struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
3642 :
3643 0 : count = krcp->count;
3644 0 : raw_spin_lock_irqsave(&krcp->lock, flags);
3645 0 : if (krcp->monitor_todo)
3646 0 : kfree_rcu_drain_unlock(krcp, flags);
3647 : else
3648 0 : raw_spin_unlock_irqrestore(&krcp->lock, flags);
3649 :
3650 0 : sc->nr_to_scan -= count;
3651 0 : freed += count;
3652 :
3653 0 : if (sc->nr_to_scan <= 0)
3654 : break;
3655 : }
3656 :
3657 0 : return freed == 0 ? SHRINK_STOP : freed;
3658 : }
3659 :
3660 : static struct shrinker kfree_rcu_shrinker = {
3661 : .count_objects = kfree_rcu_shrink_count,
3662 : .scan_objects = kfree_rcu_shrink_scan,
3663 : .batch = 0,
3664 : .seeks = DEFAULT_SEEKS,
3665 : };
3666 :
3667 1 : void __init kfree_rcu_scheduler_running(void)
3668 : {
3669 1 : int cpu;
3670 1 : unsigned long flags;
3671 :
3672 6 : for_each_possible_cpu(cpu) {
3673 4 : struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
3674 :
3675 4 : raw_spin_lock_irqsave(&krcp->lock, flags);
3676 4 : if (!krcp->head || krcp->monitor_todo) {
3677 4 : raw_spin_unlock_irqrestore(&krcp->lock, flags);
3678 4 : continue;
3679 : }
3680 0 : krcp->monitor_todo = true;
3681 0 : schedule_delayed_work_on(cpu, &krcp->monitor_work,
3682 : KFREE_DRAIN_JIFFIES);
3683 5 : raw_spin_unlock_irqrestore(&krcp->lock, flags);
3684 : }
3685 1 : }
3686 :
3687 : /*
3688 : * During early boot, any blocking grace-period wait automatically
3689 : * implies a grace period. Later on, this is never the case for PREEMPTION.
3690 : *
3691 : * However, because a context switch is a grace period for !PREEMPTION, any
3692 : * blocking grace-period wait automatically implies a grace period if
 3693 : * there is only one CPU online at any point in time during execution of
3694 : * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to
3695 : * occasionally incorrectly indicate that there are multiple CPUs online
3696 : * when there was in fact only one the whole time, as this just adds some
3697 : * overhead: RCU still operates correctly.
3698 : */
3699 174 : static int rcu_blocking_is_gp(void)
3700 : {
3701 174 : int ret;
3702 :
3703 174 : if (IS_ENABLED(CONFIG_PREEMPTION))
3704 : return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
3705 174 : might_sleep(); /* Check for RCU read-side critical section. */
3706 174 : preempt_disable();
3707 : /*
3708 : * If the rcu_state.n_online_cpus counter is equal to one,
3709 : * there is only one CPU, and that CPU sees all prior accesses
3710 : * made by any CPU that was online at the time of its access.
3711 : * Furthermore, if this counter is equal to one, its value cannot
3712 : * change until after the preempt_enable() below.
3713 : *
3714 : * Furthermore, if rcu_state.n_online_cpus is equal to one here,
3715 : * all later CPUs (both this one and any that come online later
3716 : * on) are guaranteed to see all accesses prior to this point
3717 : * in the code, without the need for additional memory barriers.
3718 : * Those memory barriers are provided by CPU-hotplug code.
3719 : */
3720 174 : ret = READ_ONCE(rcu_state.n_online_cpus) <= 1;
3721 174 : preempt_enable();
3722 174 : return ret;
3723 : }
3724 :
3725 : /**
3726 : * synchronize_rcu - wait until a grace period has elapsed.
3727 : *
3728 : * Control will return to the caller some time after a full grace
3729 : * period has elapsed, in other words after all currently executing RCU
3730 : * read-side critical sections have completed. Note, however, that
3731 : * upon return from synchronize_rcu(), the caller might well be executing
3732 : * concurrently with new RCU read-side critical sections that began while
3733 : * synchronize_rcu() was waiting. RCU read-side critical sections are
3734 : * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
3735 : * In addition, regions of code across which interrupts, preemption, or
3736 : * softirqs have been disabled also serve as RCU read-side critical
3737 : * sections. This includes hardware interrupt handlers, softirq handlers,
3738 : * and NMI handlers.
3739 : *
3740 : * Note that this guarantee implies further memory-ordering guarantees.
3741 : * On systems with more than one CPU, when synchronize_rcu() returns,
3742 : * each CPU is guaranteed to have executed a full memory barrier since
3743 : * the end of its last RCU read-side critical section whose beginning
3744 : * preceded the call to synchronize_rcu(). In addition, each CPU having
3745 : * an RCU read-side critical section that extends beyond the return from
3746 : * synchronize_rcu() is guaranteed to have executed a full memory barrier
3747 : * after the beginning of synchronize_rcu() and before the beginning of
3748 : * that RCU read-side critical section. Note that these guarantees include
3749 : * CPUs that are offline, idle, or executing in user mode, as well as CPUs
3750 : * that are executing in the kernel.
3751 : *
3752 : * Furthermore, if CPU A invoked synchronize_rcu(), which returned
3753 : * to its caller on CPU B, then both CPU A and CPU B are guaranteed
3754 : * to have executed a full memory barrier during the execution of
3755 : * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but
3756 : * again only if the system has more than one CPU).
3757 : */
3758 10 : void synchronize_rcu(void)
3759 : {
3760 31 : RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
3761 : lock_is_held(&rcu_lock_map) ||
3762 : lock_is_held(&rcu_sched_lock_map),
3763 : "Illegal synchronize_rcu() in RCU read-side critical section");
3764 10 : if (rcu_blocking_is_gp())
3765 : return; // Context allows vacuous grace periods.
3766 5 : if (rcu_gp_is_expedited())
3767 4 : synchronize_rcu_expedited();
3768 : else
3769 1 : wait_rcu_gp(call_rcu);
3770 : }
3771 : EXPORT_SYMBOL_GPL(synchronize_rcu);
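 :
 : /*
 :  * Illustrative usage sketch (not part of this file): a typical updater
 :  * unpublishes an object, waits for a full grace period, and only then
 :  * frees it.  "struct foo", "global_foo" and "foo_lock" are hypothetical.
 :  *
 :  *	#include <linux/rcupdate.h>
 :  *	#include <linux/slab.h>
 :  *	#include <linux/spinlock.h>
 :  *
 :  *	struct foo {
 :  *		int data;
 :  *	};
 :  *
 :  *	static struct foo __rcu *global_foo;
 :  *	static DEFINE_SPINLOCK(foo_lock);
 :  *
 :  *	static void foo_replace(struct foo *new_fp)
 :  *	{
 :  *		struct foo *old_fp;
 :  *
 :  *		spin_lock(&foo_lock);
 :  *		old_fp = rcu_dereference_protected(global_foo,
 :  *						   lockdep_is_held(&foo_lock));
 :  *		rcu_assign_pointer(global_foo, new_fp);
 :  *		spin_unlock(&foo_lock);
 :  *
 :  *		synchronize_rcu();	// Wait for pre-existing readers.
 :  *		kfree(old_fp);		// Nothing can still reference old_fp.
 :  *	}
 :  */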
3772 :
3773 : /**
3774 : * get_state_synchronize_rcu - Snapshot current RCU state
3775 : *
3776 : * Returns a cookie that is used by a later call to cond_synchronize_rcu()
3777 : * to determine whether or not a full grace period has elapsed in the
3778 : * meantime.
3779 : */
3780 0 : unsigned long get_state_synchronize_rcu(void)
3781 : {
3782 : /*
3783 : * Any prior manipulation of RCU-protected data must happen
3784 : * before the load from ->gp_seq.
3785 : */
3786 0 : smp_mb(); /* ^^^ */
3787 0 : return rcu_seq_snap(&rcu_state.gp_seq);
3788 : }
3789 : EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
3790 :
3791 : /**
3792 : * cond_synchronize_rcu - Conditionally wait for an RCU grace period
3793 : *
3794 : * @oldstate: return value from earlier call to get_state_synchronize_rcu()
3795 : *
3796 : * If a full RCU grace period has elapsed since the earlier call to
3797 : * get_state_synchronize_rcu(), just return. Otherwise, invoke
3798 : * synchronize_rcu() to wait for a full grace period.
3799 : *
3800 : * Yes, this function does not take counter wrap into account. But
3801 : * counter wrap is harmless. If the counter wraps, we have waited for
3802 : * more than 2 billion grace periods (and way more on a 64-bit system!),
3803 : * so waiting for one additional grace period should be just fine.
3804 : */
3805 0 : void cond_synchronize_rcu(unsigned long oldstate)
3806 : {
3807 0 : if (!rcu_seq_done(&rcu_state.gp_seq, oldstate))
3808 0 : synchronize_rcu();
3809 : else
3810 0 : smp_mb(); /* Ensure GP ends before subsequent accesses. */
3811 0 : }
3812 : EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
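 :
 : /*
 :  * Illustrative usage sketch (not part of this file): a caller that does
 :  * other lengthy work between unpublishing an object and freeing it can
 :  * snapshot the grace-period state first and wait at the end only if a
 :  * full grace period has not already elapsed.  "do_other_work()" is a
 :  * hypothetical placeholder.
 :  *
 :  *	#include <linux/rcupdate.h>
 :  *
 :  *	static void do_other_work(void)
 :  *	{
 :  *		// Hypothetical work that may well span a grace period.
 :  *	}
 :  *
 :  *	static void foo_lazy_wait(void)
 :  *	{
 :  *		unsigned long cookie = get_state_synchronize_rcu();
 :  *
 :  *		do_other_work();
 :  *
 :  *		// Blocks only if no full grace period has elapsed since
 :  *		// the cookie was taken.
 :  *		cond_synchronize_rcu(cookie);
 :  *	}
 :  */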
3813 :
3814 : /*
3815 : * Check to see if there is any immediate RCU-related work to be done by
3816 : * the current CPU, returning 1 if so and zero otherwise. The checks are
3817 : * in order of increasing expense: checks that can be carried out against
3818 : * CPU-local state are performed first. However, we must check for CPU
3819 : * stalls first, else we might not get a chance.
3820 : */
3821 27560 : static int rcu_pending(int user)
3822 : {
3823 27560 : bool gp_in_progress;
3824 27560 : struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
3825 28034 : struct rcu_node *rnp = rdp->mynode;
3826 :
3827 56088 : lockdep_assert_irqs_disabled();
3828 :
3829 : /* Check for CPU stalls, if enabled. */
3830 27984 : check_cpu_stall(rdp);
3831 :
3832 : /* Does this CPU need a deferred NOCB wakeup? */
3833 27740 : if (rcu_nocb_need_deferred_wakeup(rdp))
3834 : return 1;
3835 :
3836 : /* Is this a nohz_full CPU in userspace or idle? (Ignore RCU if so.) */
3837 27740 : if ((user || rcu_is_cpu_rrupt_from_idle()) && rcu_nohz_full_cpu())
3838 : return 0;
3839 :
3840 : /* Is the RCU core waiting for a quiescent state from this CPU? */
3841 28183 : gp_in_progress = rcu_gp_in_progress();
3842 28183 : if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm && gp_in_progress)
3843 : return 1;
3844 :
3845 : /* Does this CPU have callbacks ready to invoke? */
3846 22765 : if (!rcu_segcblist_is_offloaded(&rdp->cblist) &&
3847 22765 : rcu_segcblist_ready_cbs(&rdp->cblist))
3848 : return 1;
3849 :
3850 : /* Has RCU gone idle with this CPU needing another grace period? */
3851 21350 : if (!gp_in_progress && rcu_segcblist_is_enabled(&rdp->cblist) &&
3852 83 : !rcu_segcblist_is_offloaded(&rdp->cblist) &&
3853 83 : !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
3854 : return 1;
3855 :
3856 : /* Have RCU grace period completed or started? */
3857 21330 : if (rcu_seq_current(&rnp->gp_seq) != rdp->gp_seq ||
3858 16144 : unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */
3859 5186 : return 1;
3860 :
3861 : /* nothing to do */
3862 : return 0;
3863 : }
3864 :
3865 : /*
3866 : * Helper function for rcu_barrier() tracing. If tracing is disabled,
3867 : * the compiler is expected to optimize this away.
3868 : */
3869 11 : static void rcu_barrier_trace(const char *s, int cpu, unsigned long done)
3870 : {
3871 11 : trace_rcu_barrier(rcu_state.name, s, cpu,
3872 : atomic_read(&rcu_state.barrier_cpu_count), done);
3873 3 : }
3874 :
3875 : /*
3876 : * RCU callback function for rcu_barrier(). If we are last, wake
3877 : * up the task executing rcu_barrier().
3878 : *
3879 : * Note that the value of rcu_state.barrier_sequence must be captured
3880 : * before the atomic_dec_and_test(). Otherwise, if this CPU is not last,
3881 : * other CPUs might count the value down to zero before this CPU gets
3882 : * around to invoking rcu_barrier_trace(), which might result in bogus
3883 : * data from the next instance of rcu_barrier().
3884 : */
3885 2 : static void rcu_barrier_callback(struct rcu_head *rhp)
3886 : {
3887 2 : unsigned long __maybe_unused s = rcu_state.barrier_sequence;
3888 :
3889 4 : if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) {
3890 1 : rcu_barrier_trace(TPS("LastCB"), -1, s);
3891 1 : complete(&rcu_state.barrier_completion);
3892 : } else {
3893 1 : rcu_barrier_trace(TPS("CB"), -1, s);
3894 : }
3895 2 : }
3896 :
3897 : /*
3898 : * Called with preemption disabled, and from cross-cpu IRQ context.
3899 : */
3900 2 : static void rcu_barrier_func(void *cpu_in)
3901 : {
3902 2 : uintptr_t cpu = (uintptr_t)cpu_in;
3903 2 : struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
3904 :
3905 2 : rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence);
3906 2 : rdp->barrier_head.func = rcu_barrier_callback;
3907 2 : debug_rcu_head_queue(&rdp->barrier_head);
3908 2 : rcu_nocb_lock(rdp);
3909 2 : WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
3910 2 : if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
3911 2 : atomic_inc(&rcu_state.barrier_cpu_count);
3912 : } else {
3913 0 : debug_rcu_head_unqueue(&rdp->barrier_head);
3914 0 : rcu_barrier_trace(TPS("IRQNQ"), -1,
3915 : rcu_state.barrier_sequence);
3916 : }
3917 2 : rcu_nocb_unlock(rdp);
3918 2 : }
3919 :
3920 : /**
3921 : * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
3922 : *
3923 : * Note that this primitive does not necessarily wait for an RCU grace period
3924 : * to complete. For example, if there are no RCU callbacks queued anywhere
3925 : * in the system, then rcu_barrier() is within its rights to return
3926 : * immediately, without waiting for anything, much less an RCU grace period.
3927 : */
3928 1 : void rcu_barrier(void)
3929 : {
3930 1 : uintptr_t cpu;
3931 1 : struct rcu_data *rdp;
3932 1 : unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence);
3933 :
3934 1 : rcu_barrier_trace(TPS("Begin"), -1, s);
3935 :
3936 : /* Take mutex to serialize concurrent rcu_barrier() requests. */
3937 1 : mutex_lock(&rcu_state.barrier_mutex);
3938 :
3939 : /* Did someone else do our work for us? */
3940 1 : if (rcu_seq_done(&rcu_state.barrier_sequence, s)) {
3941 0 : rcu_barrier_trace(TPS("EarlyExit"), -1,
3942 : rcu_state.barrier_sequence);
3943 0 : smp_mb(); /* caller's subsequent code after above check. */
3944 0 : mutex_unlock(&rcu_state.barrier_mutex);
3945 0 : return;
3946 : }
3947 :
3948 : /* Mark the start of the barrier operation. */
3949 1 : rcu_seq_start(&rcu_state.barrier_sequence);
3950 1 : rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence);
3951 :
3952 : /*
3953 : * Initialize the count to two rather than to zero in order
3954 : * to avoid a too-soon return to zero in case of an immediate
3955 : * invocation of the just-enqueued callback (or preemption of
3956 : * this task). Exclude CPU-hotplug operations to ensure that no
3957 : * offline non-offloaded CPU has callbacks queued.
3958 : */
3959 1 : init_completion(&rcu_state.barrier_completion);
3960 1 : atomic_set(&rcu_state.barrier_cpu_count, 2);
3961 1 : get_online_cpus();
3962 :
3963 : /*
3964 : * Force each CPU with callbacks to register a new callback.
3965 : * When that callback is invoked, we will know that all of the
3966 : * corresponding CPU's preceding callbacks have been invoked.
3967 : */
3968 6 : for_each_possible_cpu(cpu) {
3969 4 : rdp = per_cpu_ptr(&rcu_data, cpu);
3970 4 : if (cpu_is_offline(cpu) &&
3971 0 : !rcu_segcblist_is_offloaded(&rdp->cblist))
3972 0 : continue;
3973 4 : if (rcu_segcblist_n_cbs(&rdp->cblist) && cpu_online(cpu)) {
3974 2 : rcu_barrier_trace(TPS("OnlineQ"), cpu,
3975 : rcu_state.barrier_sequence);
3976 2 : smp_call_function_single(cpu, rcu_barrier_func, (void *)cpu, 1);
3977 2 : } else if (rcu_segcblist_n_cbs(&rdp->cblist) &&
3978 0 : cpu_is_offline(cpu)) {
3979 0 : rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu,
3980 : rcu_state.barrier_sequence);
3981 0 : local_irq_disable();
3982 0 : rcu_barrier_func((void *)cpu);
3983 0 : local_irq_enable();
3984 2 : } else if (cpu_is_offline(cpu)) {
3985 0 : rcu_barrier_trace(TPS("OfflineNoCBNoQ"), cpu,
3986 : rcu_state.barrier_sequence);
3987 : } else {
3988 7 : rcu_barrier_trace(TPS("OnlineNQ"), cpu,
3989 : rcu_state.barrier_sequence);
3990 : }
3991 : }
3992 1 : put_online_cpus();
3993 :
3994 : /*
3995 : * Now that we have an rcu_barrier_callback() callback on each
3996 : * CPU, and thus each counted, remove the initial count.
3997 : */
3998 2 : if (atomic_sub_and_test(2, &rcu_state.barrier_cpu_count))
3999 0 : complete(&rcu_state.barrier_completion);
4000 :
4001 : /* Wait for all rcu_barrier_callback() callbacks to be invoked. */
4002 1 : wait_for_completion(&rcu_state.barrier_completion);
4003 :
4004 : /* Mark the end of the barrier operation. */
4005 1 : rcu_barrier_trace(TPS("Inc2"), -1, rcu_state.barrier_sequence);
4006 1 : rcu_seq_end(&rcu_state.barrier_sequence);
4007 :
4008 : /* Other rcu_barrier() invocations can now safely proceed. */
4009 1 : mutex_unlock(&rcu_state.barrier_mutex);
4010 : }
4011 : EXPORT_SYMBOL_GPL(rcu_barrier);
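 :
 : /*
 :  * Illustrative usage sketch (not part of this file): a module that posts
 :  * callbacks with call_rcu() must make sure they have all been invoked
 :  * before its text and data are unloaded.  "foo_exit" and
 :  * "foo_unpublish_all" are hypothetical.
 :  *
 :  *	#include <linux/module.h>
 :  *	#include <linux/rcupdate.h>
 :  *
 :  *	static void foo_unpublish_all(void)
 :  *	{
 :  *		// Hypothetical: unlinks objects and queues them via call_rcu().
 :  *	}
 :  *
 :  *	static void __exit foo_exit(void)
 :  *	{
 :  *		foo_unpublish_all();
 :  *		// Wait for the already-queued callbacks to be invoked; this
 :  *		// does not necessarily wait for a full grace period.
 :  *		rcu_barrier();
 :  *	}
 :  *	module_exit(foo_exit);
 :  */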
4012 :
4013 : /*
4014 : * Propagate ->qsinitmask bits up the rcu_node tree to account for the
4015 : * first CPU in a given leaf rcu_node structure coming online. The caller
 4016 : * must hold the corresponding leaf rcu_node ->lock with interrupts
4017 : * disabled.
4018 : */
4019 1 : static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
4020 : {
4021 1 : long mask;
4022 1 : long oldmask;
4023 1 : struct rcu_node *rnp = rnp_leaf;
4024 :
4025 2 : raw_lockdep_assert_held_rcu_node(rnp_leaf);
4026 1 : WARN_ON_ONCE(rnp->wait_blkd_tasks);
4027 1 : for (;;) {
4028 1 : mask = rnp->grpmask;
4029 1 : rnp = rnp->parent;
4030 1 : if (rnp == NULL)
4031 : return;
4032 0 : raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */
4033 0 : oldmask = rnp->qsmaskinit;
4034 0 : rnp->qsmaskinit |= mask;
4035 0 : raw_spin_unlock_rcu_node(rnp); /* Interrupts remain disabled. */
4036 0 : if (oldmask)
4037 : return;
4038 : }
4039 : }
4040 :
4041 : /*
4042 : * Do boot-time initialization of a CPU's per-CPU RCU data.
4043 : */
4044 : static void __init
4045 4 : rcu_boot_init_percpu_data(int cpu)
4046 : {
4047 4 : struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
4048 :
4049 : /* Set up local state, ensuring consistent view of global state. */
4050 4 : rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
4051 4 : INIT_WORK(&rdp->strict_work, strict_work_handler);
4052 4 : WARN_ON_ONCE(rdp->dynticks_nesting != 1);
4053 4 : WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)));
4054 4 : rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
4055 4 : rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED;
4056 4 : rdp->rcu_onl_gp_seq = rcu_state.gp_seq;
4057 4 : rdp->rcu_onl_gp_flags = RCU_GP_CLEANED;
4058 4 : rdp->cpu = cpu;
4059 4 : rcu_boot_init_nocb_percpu_data(rdp);
4060 4 : }
4061 :
4062 : /*
4063 : * Invoked early in the CPU-online process, when pretty much all services
4064 : * are available. The incoming CPU is not present.
4065 : *
4066 : * Initializes a CPU's per-CPU RCU data. Note that only one online or
4067 : * offline event can be happening at a given time. Note also that we can
4068 : * accept some slop in the rsp->gp_seq access due to the fact that this
4069 : * CPU cannot possibly have any non-offloaded RCU callbacks in flight yet.
4070 : * And any offloaded callbacks are being numbered elsewhere.
4071 : */
4072 4 : int rcutree_prepare_cpu(unsigned int cpu)
4073 : {
4074 4 : unsigned long flags;
4075 4 : struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
4076 4 : struct rcu_node *rnp = rcu_get_root();
4077 :
4078 : /* Set up local state, ensuring consistent view of global state. */
4079 4 : raw_spin_lock_irqsave_rcu_node(rnp, flags);
4080 4 : rdp->qlen_last_fqs_check = 0;
4081 4 : rdp->n_force_qs_snap = rcu_state.n_force_qs;
4082 4 : rdp->blimit = blimit;
4083 4 : rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */
4084 4 : rcu_dynticks_eqs_online();
4085 8 : raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
4086 : /*
4087 : * Lock in case the CB/GP kthreads are still around handling
4088 : * old callbacks (longer term we should flush all callbacks
4089 : * before completing CPU offline)
4090 : */
4091 4 : rcu_nocb_lock(rdp);
4092 4 : if (rcu_segcblist_empty(&rdp->cblist)) /* No early-boot CBs? */
4093 3 : rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
4094 4 : rcu_nocb_unlock(rdp);
4095 :
4096 : /*
4097 : * Add CPU to leaf rcu_node pending-online bitmask. Any needed
4098 : * propagation up the rcu_node tree will happen at the beginning
4099 : * of the next grace period.
4100 : */
4101 4 : rnp = rdp->mynode;
4102 4 : raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
4103 4 : rdp->beenonline = true; /* We have now been online. */
4104 4 : rdp->gp_seq = READ_ONCE(rnp->gp_seq);
4105 4 : rdp->gp_seq_needed = rdp->gp_seq;
4106 4 : rdp->cpu_no_qs.b.norm = true;
4107 4 : rdp->core_needs_qs = false;
4108 4 : rdp->rcu_iw_pending = false;
4109 4 : rdp->rcu_iw = IRQ_WORK_INIT_HARD(rcu_iw_handler);
4110 4 : rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
4111 4 : trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
4112 8 : raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
4113 4 : rcu_prepare_kthreads(cpu);
4114 4 : rcu_spawn_cpu_nocb_kthread(cpu);
4115 4 : WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1);
4116 :
4117 4 : return 0;
4118 : }
4119 :
4120 : /*
4121 : * Update RCU priority boot kthread affinity for CPU-hotplug changes.
4122 : */
4123 3 : static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
4124 : {
4125 3 : struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
4126 :
4127 3 : rcu_boost_kthread_setaffinity(rdp->mynode, outgoing);
4128 : }
4129 :
4130 : /*
4131 : * Near the end of the CPU-online process. Pretty much all services
4132 : * enabled, and the CPU is now very much alive.
4133 : */
4134 4 : int rcutree_online_cpu(unsigned int cpu)
4135 : {
4136 4 : unsigned long flags;
4137 4 : struct rcu_data *rdp;
4138 4 : struct rcu_node *rnp;
4139 :
4140 4 : rdp = per_cpu_ptr(&rcu_data, cpu);
4141 4 : rnp = rdp->mynode;
4142 4 : raw_spin_lock_irqsave_rcu_node(rnp, flags);
4143 4 : rnp->ffmask |= rdp->grpmask;
4144 8 : raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
4145 4 : if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
4146 : return 0; /* Too early in boot for scheduler work. */
4147 3 : sync_sched_exp_online_cleanup(cpu);
4148 3 : rcutree_affinity_setting(cpu, -1);
4149 :
4150 : // Stop-machine done, so allow nohz_full to disable tick.
4151 3 : tick_dep_clear(TICK_DEP_BIT_RCU);
4152 3 : return 0;
4153 : }
4154 :
4155 : /*
4156 : * Near the beginning of the process. The CPU is still very much alive
4157 : * with pretty much all services enabled.
4158 : */
4159 0 : int rcutree_offline_cpu(unsigned int cpu)
4160 : {
4161 0 : unsigned long flags;
4162 0 : struct rcu_data *rdp;
4163 0 : struct rcu_node *rnp;
4164 :
4165 0 : rdp = per_cpu_ptr(&rcu_data, cpu);
4166 0 : rnp = rdp->mynode;
4167 0 : raw_spin_lock_irqsave_rcu_node(rnp, flags);
4168 0 : rnp->ffmask &= ~rdp->grpmask;
4169 0 : raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
4170 :
4171 0 : rcutree_affinity_setting(cpu, cpu);
4172 :
4173 : // nohz_full CPUs need the tick for stop-machine to work quickly
4174 0 : tick_dep_set(TICK_DEP_BIT_RCU);
4175 0 : return 0;
4176 : }
4177 :
4178 : /*
4179 : * Mark the specified CPU as being online so that subsequent grace periods
4180 : * (both expedited and normal) will wait on it. Note that this means that
4181 : * incoming CPUs are not allowed to use RCU read-side critical sections
4182 : * until this function is called. Failing to observe this restriction
4183 : * will result in lockdep splats.
4184 : *
4185 : * Note that this function is special in that it is invoked directly
4186 : * from the incoming CPU rather than from the cpuhp_step mechanism.
4187 : * This is because this function must be invoked at a precise location.
4188 : */
4189 7 : void rcu_cpu_starting(unsigned int cpu)
4190 : {
4191 7 : unsigned long flags;
4192 7 : unsigned long mask;
4193 7 : struct rcu_data *rdp;
4194 7 : struct rcu_node *rnp;
4195 7 : bool newcpu;
4196 :
4197 7 : rdp = per_cpu_ptr(&rcu_data, cpu);
4198 7 : if (rdp->cpu_started)
4199 : return;
4200 4 : rdp->cpu_started = true;
4201 :
4202 4 : rnp = rdp->mynode;
4203 4 : mask = rdp->grpmask;
4204 4 : WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
4205 4 : WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
4206 4 : smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
4207 4 : raw_spin_lock_irqsave_rcu_node(rnp, flags);
4208 4 : WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
4209 4 : newcpu = !(rnp->expmaskinitnext & mask);
4210 4 : rnp->expmaskinitnext |= mask;
4211 : /* Allow lockless access for expedited grace periods. */
4212 4 : smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + newcpu); /* ^^^ */
4213 4 : ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus);
4214 4 : rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */
4215 4 : rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq);
4216 4 : rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags);
4217 :
4218 : /* An incoming CPU should never be blocking a grace period. */
4219 4 : if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */
4220 0 : rcu_disable_urgency_upon_qs(rdp);
4221 : /* Report QS -after- changing ->qsmaskinitnext! */
4222 0 : rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
4223 : } else {
4224 8 : raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
4225 : }
4226 4 : smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
4227 4 : WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
4228 4 : WARN_ON_ONCE(rnp->ofl_seq & 0x1);
4229 4 : smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
4230 : }
4231 :
4232 : /*
 4233 : * The outgoing CPU has no further need of RCU, so remove it from
4234 : * the rcu_node tree's ->qsmaskinitnext bit masks.
4235 : *
4236 : * Note that this function is special in that it is invoked directly
4237 : * from the outgoing CPU rather than from the cpuhp_step mechanism.
4238 : * This is because this function must be invoked at a precise location.
4239 : */
4240 0 : void rcu_report_dead(unsigned int cpu)
4241 : {
4242 0 : unsigned long flags;
4243 0 : unsigned long mask;
4244 0 : struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
4245 0 : struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
4246 :
4247 : // Do any dangling deferred wakeups.
4248 0 : do_nocb_deferred_wakeup(rdp);
4249 :
4250 : /* QS for any half-done expedited grace period. */
4251 0 : preempt_disable();
4252 0 : rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
4253 0 : preempt_enable();
4254 0 : rcu_preempt_deferred_qs(current);
4255 :
4256 : /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
4257 0 : mask = rdp->grpmask;
4258 0 : WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
4259 0 : WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
4260 0 : smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
4261 0 : raw_spin_lock(&rcu_state.ofl_lock);
4262 0 : raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
4263 0 : rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
4264 0 : rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags);
4265 0 : if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */
4266 : /* Report quiescent state -before- changing ->qsmaskinitnext! */
4267 0 : rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
4268 0 : raw_spin_lock_irqsave_rcu_node(rnp, flags);
4269 : }
4270 0 : WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
4271 0 : raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
4272 0 : raw_spin_unlock(&rcu_state.ofl_lock);
4273 0 : smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
4274 0 : WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
4275 0 : WARN_ON_ONCE(rnp->ofl_seq & 0x1);
4276 :
4277 0 : rdp->cpu_started = false;
4278 0 : }
4279 :
4280 : #ifdef CONFIG_HOTPLUG_CPU
4281 : /*
4282 : * The outgoing CPU has just passed through the dying-idle state, and we
4283 : * are being invoked from the CPU that was IPIed to continue the offline
4284 : * operation. Migrate the outgoing CPU's callbacks to the current CPU.
4285 : */
4286 0 : void rcutree_migrate_callbacks(int cpu)
4287 : {
4288 0 : unsigned long flags;
4289 0 : struct rcu_data *my_rdp;
4290 0 : struct rcu_node *my_rnp;
4291 0 : struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
4292 0 : bool needwake;
4293 :
4294 0 : if (rcu_segcblist_is_offloaded(&rdp->cblist) ||
4295 0 : rcu_segcblist_empty(&rdp->cblist))
4296 : return; /* No callbacks to migrate. */
4297 :
4298 0 : local_irq_save(flags);
4299 0 : my_rdp = this_cpu_ptr(&rcu_data);
4300 0 : my_rnp = my_rdp->mynode;
4301 0 : rcu_nocb_lock(my_rdp); /* irqs already disabled. */
4302 0 : WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
4303 0 : raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
4304 : /* Leverage recent GPs and set GP for new callbacks. */
4305 0 : needwake = rcu_advance_cbs(my_rnp, rdp) ||
4306 0 : rcu_advance_cbs(my_rnp, my_rdp);
4307 0 : rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
4308 0 : needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp);
4309 0 : rcu_segcblist_disable(&rdp->cblist);
4310 0 : WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
4311 : !rcu_segcblist_n_cbs(&my_rdp->cblist));
4312 0 : if (rcu_segcblist_is_offloaded(&my_rdp->cblist)) {
4313 : raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
4314 0 : __call_rcu_nocb_wake(my_rdp, true, flags);
4315 : } else {
4316 0 : rcu_nocb_unlock(my_rdp); /* irqs remain disabled. */
4317 0 : raw_spin_unlock_irqrestore_rcu_node(my_rnp, flags);
4318 : }
4319 0 : if (needwake)
4320 0 : rcu_gp_kthread_wake();
4321 0 : lockdep_assert_irqs_enabled();
4322 0 : WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
4323 : !rcu_segcblist_empty(&rdp->cblist),
4324 : "rcutree_migrate_callbacks: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
4325 : cpu, rcu_segcblist_n_cbs(&rdp->cblist),
4326 : rcu_segcblist_first_cb(&rdp->cblist));
4327 : }
4328 : #endif /* CONFIG_HOTPLUG_CPU */
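/*
 * One plausible reading of the callback-migration sequence above (not
 * stated in the source): rcu_advance_cbs() is applied to both lists before
 * the merge so that callbacks whose grace periods have already completed
 * are classified first, and once more after the merge so that the newly
 * combined callbacks are assigned a grace-period number rather than
 * sitting unclassified on the surviving CPU's list.
 */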
4329 :
4330 : /*
4331 : * Use expedited RCU grace periods to make suspend and hibernation
4332 : * run faster.
4333 : */
4334 : static int rcu_pm_notify(struct notifier_block *self,
4335 : unsigned long action, void *hcpu)
4336 : {
4337 : switch (action) {
4338 : case PM_HIBERNATION_PREPARE:
4339 : case PM_SUSPEND_PREPARE:
4340 : rcu_expedite_gp();
4341 : break;
4342 : case PM_POST_HIBERNATION:
4343 : case PM_POST_SUSPEND:
4344 : rcu_unexpedite_gp();
4345 : break;
4346 : default:
4347 : break;
4348 : }
4349 : return NOTIFY_OK;
4350 : }
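/*
 * Illustrative sketch of the PM-notifier pattern used above (hypothetical
 * "example_pm_notify"; rcu_pm_notify() itself is registered the same way
 * via pm_notifier() in rcu_init() below):
 */
static int example_pm_notify(struct notifier_block *self,
			     unsigned long action, void *unused)
{
	if (action == PM_SUSPEND_PREPARE || action == PM_HIBERNATION_PREPARE)
		pr_info("preparing for system sleep\n");
	return NOTIFY_OK;	/* Never blocks the transition. */
}
/* Registered once at boot, for example: pm_notifier(example_pm_notify, 0); */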
4351 :
4352 : /*
4353 : * Spawn the kthreads that handle RCU's grace periods.
4354 : */
4355 1 : static int __init rcu_spawn_gp_kthread(void)
4356 : {
4357 1 : unsigned long flags;
4358 1 : int kthread_prio_in = kthread_prio;
4359 1 : struct rcu_node *rnp;
4360 1 : struct sched_param sp;
4361 1 : struct task_struct *t;
4362 :
4363 : /* Force priority into range. */
4364 1 : if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2
4365 : && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST))
4366 : kthread_prio = 2;
4367 1 : else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
4368 : kthread_prio = 1;
4369 1 : else if (kthread_prio < 0)
4370 0 : kthread_prio = 0;
4371 1 : else if (kthread_prio > 99)
4372 0 : kthread_prio = 99;
4373 :
4374 1 : if (kthread_prio != kthread_prio_in)
4375 0 : pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n",
4376 : kthread_prio, kthread_prio_in);
4377 :
4378 1 : rcu_scheduler_fully_active = 1;
4379 1 : t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
4380 1 : if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__))
4381 : return 0;
4382 1 : if (kthread_prio) {
4383 0 : sp.sched_priority = kthread_prio;
4384 0 : sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
4385 : }
4386 1 : rnp = rcu_get_root();
4387 1 : raw_spin_lock_irqsave_rcu_node(rnp, flags);
4388 1 : WRITE_ONCE(rcu_state.gp_activity, jiffies);
4389 1 : WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
4390 : // Reset .gp_activity and .gp_req_activity before setting .gp_kthread.
4391 1 : smp_store_release(&rcu_state.gp_kthread, t); /* ^^^ */
4392 2 : raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
4393 1 : wake_up_process(t);
4394 1 : rcu_spawn_nocb_kthreads();
4395 1 : rcu_spawn_boost_kthreads();
4396 1 : return 0;
4397 : }
4398 : early_initcall(rcu_spawn_gp_kthread);
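/*
 * Illustrative sketch (hypothetical "example_thread_fn") of the
 * create-then-configure-then-wake pattern used by rcu_spawn_gp_kthread()
 * above: creating the kthread without waking it lets the SCHED_FIFO
 * priority be applied before the thread ever runs.
 */
static int example_thread_fn(void *unused)
{
	return 0;	/* Hypothetical thread body. */
}

static int __init example_spawn_kthread(int prio)
{
	struct sched_param sp = { .sched_priority = prio };
	struct task_struct *t;

	t = kthread_create(example_thread_fn, NULL, "example");
	if (IS_ERR(t))
		return PTR_ERR(t);
	if (prio)
		sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
	wake_up_process(t);
	return 0;
}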
4399 :
4400 : /*
4401 : * This function is invoked towards the end of the scheduler's
4402 : * initialization process. Before this is called, the idle task might
4403 : * invoke synchronous grace-period primitives (at that point the idle
4404 : * task is still booting the system, so such primitives are no-ops). After this
4405 : * function is called, any synchronous grace-period primitives are run as
4406 : * expedited, with the requesting task driving the grace period forward.
4407 : * A later core_initcall() rcu_set_runtime_mode() will switch to full
4408 : * runtime RCU functionality.
4409 : */
4410 1 : void rcu_scheduler_starting(void)
4411 : {
4412 1 : WARN_ON(num_online_cpus() != 1);
4413 1 : WARN_ON(nr_context_switches() > 0);
4414 1 : rcu_test_sync_prims();
4415 1 : rcu_scheduler_active = RCU_SCHEDULER_INIT;
4416 1 : rcu_test_sync_prims();
4417 1 : }
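/*
 * Illustrative timeline (a sketch, not an actual kernel code path) of what
 * a synchronize_rcu() call amounts to in each boot phase described above:
 */
static void __init example_sync_during_boot(void)
{
	synchronize_rcu();
	/*
	 * - Before rcu_scheduler_starting(): a no-op, because only the boot
	 *   idle task is running.
	 * - While rcu_scheduler_active == RCU_SCHEDULER_INIT: behaves as an
	 *   expedited grace period driven by the calling task.
	 * - After the later rcu_set_runtime_mode() core_initcall(): the full
	 *   runtime grace-period machinery is used.
	 */
}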
4418 :
4419 : /*
4420 : * Helper function for rcu_init() that initializes the rcu_state structure.
4421 : */
4422 1 : static void __init rcu_init_one(void)
4423 : {
4424 1 : static const char * const buf[] = RCU_NODE_NAME_INIT;
4425 1 : static const char * const fqs[] = RCU_FQS_NAME_INIT;
4426 1 : static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
4427 1 : static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
4428 :
4429 1 : int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
4430 1 : int cpustride = 1;
4431 1 : int i;
4432 1 : int j;
4433 1 : struct rcu_node *rnp;
4434 :
4435 1 : BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
4436 :
4437 : /* Silence gcc 4.8 false positive about array index out of range. */
4438 1 : if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS)
4439 0 : panic("rcu_init_one: rcu_num_lvls out of range");
4440 :
4441 : /* Initialize the level-tracking arrays. */
4442 :
4443 1 : for (i = 1; i < rcu_num_lvls; i++)
4444 : rcu_state.level[i] =
4445 : rcu_state.level[i - 1] + num_rcu_lvl[i - 1];
4446 1 : rcu_init_levelspread(levelspread, num_rcu_lvl);
4447 :
4448 : /* Initialize the elements themselves, starting from the leaves. */
4449 :
4450 2 : for (i = rcu_num_lvls - 1; i >= 0; i--) {
4451 1 : cpustride *= levelspread[i];
4452 1 : rnp = rcu_state.level[i];
4453 2 : for (j = 0; j < num_rcu_lvl[i]; j++, rnp++) {
4454 1 : raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock));
4455 1 : lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock),
4456 : &rcu_node_class[i], buf[i]);
4457 1 : raw_spin_lock_init(&rnp->fqslock);
4458 1 : lockdep_set_class_and_name(&rnp->fqslock,
4459 : &rcu_fqs_class[i], fqs[i]);
4460 1 : rnp->gp_seq = rcu_state.gp_seq;
4461 1 : rnp->gp_seq_needed = rcu_state.gp_seq;
4462 1 : rnp->completedqs = rcu_state.gp_seq;
4463 1 : rnp->qsmask = 0;
4464 1 : rnp->qsmaskinit = 0;
4465 1 : rnp->grplo = j * cpustride;
4466 1 : rnp->grphi = (j + 1) * cpustride - 1;
4467 1 : if (rnp->grphi >= nr_cpu_ids)
4468 0 : rnp->grphi = nr_cpu_ids - 1;
4469 1 : if (i == 0) {
4470 1 : rnp->grpnum = 0;
4471 1 : rnp->grpmask = 0;
4472 1 : rnp->parent = NULL;
4473 : } else {
4474 0 : rnp->grpnum = j % levelspread[i - 1];
4475 0 : rnp->grpmask = BIT(rnp->grpnum);
4476 0 : rnp->parent = rcu_state.level[i - 1] +
4477 0 : j / levelspread[i - 1];
4478 : }
4479 1 : rnp->level = i;
4480 1 : INIT_LIST_HEAD(&rnp->blkd_tasks);
4481 1 : rcu_init_one_nocb(rnp);
4482 1 : init_waitqueue_head(&rnp->exp_wq[0]);
4483 1 : init_waitqueue_head(&rnp->exp_wq[1]);
4484 1 : init_waitqueue_head(&rnp->exp_wq[2]);
4485 1 : init_waitqueue_head(&rnp->exp_wq[3]);
4486 1 : spin_lock_init(&rnp->exp_lock);
4487 : }
4488 : }
4489 :
4490 1 : init_swait_queue_head(&rcu_state.gp_wq);
4491 1 : init_swait_queue_head(&rcu_state.expedited_wq);
4492 1 : rnp = rcu_first_leaf_node();
4493 5 : for_each_possible_cpu(i) {
4494 4 : while (i > rnp->grphi)
4495 0 : rnp++;
4496 4 : per_cpu_ptr(&rcu_data, i)->mynode = rnp;
4497 4 : rcu_boot_init_percpu_data(i);
4498 : }
4499 1 : }
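/*
 * Worked example for the CPU-to-leaf mapping loop above (assumption: a
 * four-CPU system whose geometry collapsed to a single leaf covering
 * grplo..grphi == 0..3): the "while (i > rnp->grphi)" test never fires and
 * every CPU's rcu_data.mynode points at that one leaf.  On larger machines
 * the loop advances rnp whenever a CPU index passes the current leaf's
 * grphi, relying on leaves covering ascending, contiguous CPU ranges.
 */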
4500 :
4501 : /*
4502 : * Compute the rcu_node tree geometry from kernel parameters. This cannot
4503 : * replace the definitions in tree.h because those are needed to size
4504 : * the ->node array in the rcu_state structure.
4505 : */
4506 1 : static void __init rcu_init_geometry(void)
4507 : {
4508 1 : ulong d;
4509 1 : int i;
4510 1 : int rcu_capacity[RCU_NUM_LVLS];
4511 :
4512 : /*
4513 : * Initialize any unspecified boot parameters.
4514 : * The default values of jiffies_till_first_fqs and
4515 : * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS
4516 : * value (which is a function of HZ), plus one jiffy for each
4517 : * RCU_JIFFIES_FQS_DIV CPUs that might be on the system.
4518 : */
4519 1 : d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
4520 1 : if (jiffies_till_first_fqs == ULONG_MAX)
4521 1 : jiffies_till_first_fqs = d;
4522 1 : if (jiffies_till_next_fqs == ULONG_MAX)
4523 1 : jiffies_till_next_fqs = d;
4524 1 : adjust_jiffies_till_sched_qs();
4525 :
4526 : /* If the compile-time values are accurate, just leave. */
4527 1 : if (rcu_fanout_leaf == RCU_FANOUT_LEAF &&
4528 1 : nr_cpu_ids == NR_CPUS)
4529 : return;
4530 1 : pr_info("Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n",
4531 : rcu_fanout_leaf, nr_cpu_ids);
4532 :
4533 : /*
4534 : * The boot-time rcu_fanout_leaf parameter must be at least two
4535 : * and cannot exceed the number of bits in the rcu_node masks.
4536 : * Complain and fall back to the compile-time values if either
4537 : * constraint is violated.
4538 : */
4539 1 : if (rcu_fanout_leaf < 2 ||
4540 : rcu_fanout_leaf > sizeof(unsigned long) * 8) {
4541 0 : rcu_fanout_leaf = RCU_FANOUT_LEAF;
4542 0 : WARN_ON(1);
4543 0 : return;
4544 : }
4545 :
4546 : /*
4547 : * Compute the number of CPUs that can be handled by an rcu_node
4548 : * tree with the given number of levels.
4549 : */
4550 1 : rcu_capacity[0] = rcu_fanout_leaf;
4551 1 : for (i = 1; i < RCU_NUM_LVLS; i++)
4552 : rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT;
4553 :
4554 : /*
4555 : * The tree must be able to accommodate the configured number of CPUs.
4556 : * If this limit is exceeded, fall back to the compile-time values.
4557 : */
4558 1 : if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) {
4559 0 : rcu_fanout_leaf = RCU_FANOUT_LEAF;
4560 0 : WARN_ON(1);
4561 0 : return;
4562 : }
4563 :
4564 : /* Calculate the number of levels in the tree. */
4565 1 : for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) {
4566 : }
4567 1 : rcu_num_lvls = i + 1;
4568 :
4569 : /* Calculate the number of rcu_nodes at each level of the tree. */
4570 2 : for (i = 0; i < rcu_num_lvls; i++) {
4571 1 : int cap = rcu_capacity[(rcu_num_lvls - 1) - i];
4572 1 : num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap);
4573 : }
4574 :
4575 : /* Calculate the total number of rcu_node structures. */
4576 1 : rcu_num_nodes = 0;
4577 2 : for (i = 0; i < rcu_num_lvls; i++)
4578 1 : rcu_num_nodes += num_rcu_lvl[i];
4579 : }
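/*
 * Worked example of the geometry computation above (assumption: the common
 * 64-bit defaults rcu_fanout_leaf = 16 and RCU_FANOUT = 64, with
 * nr_cpu_ids = 100):
 *
 *	rcu_capacity[0] = 16	(a single leaf)
 *	rcu_capacity[1] = 1024	(one root over up to 64 leaves)
 *
 * Because 100 > 16 but 100 <= 1024, rcu_num_lvls = 2, and then:
 *
 *	num_rcu_lvl[0] = DIV_ROUND_UP(100, 1024) = 1	(root)
 *	num_rcu_lvl[1] = DIV_ROUND_UP(100, 16)   = 7	(leaves)
 *
 * for a total of rcu_num_nodes = 8 rcu_node structures.
 */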
4580 :
4581 : /*
4582 : * Dump out the structure of the rcu_node combining tree associated
4583 : * with the rcu_state structure.
4584 : */
4585 0 : static void __init rcu_dump_rcu_node_tree(void)
4586 : {
4587 0 : int level = 0;
4588 0 : struct rcu_node *rnp;
4589 :
4590 0 : pr_info("rcu_node tree layout dump\n");
4591 0 : pr_info(" ");
4592 0 : rcu_for_each_node_breadth_first(rnp) {
4593 0 : if (rnp->level != level) {
4594 0 : pr_cont("\n");
4595 0 : pr_info(" ");
4596 0 : level = rnp->level;
4597 : }
4598 0 : pr_cont("%d:%d ^%d ", rnp->grplo, rnp->grphi, rnp->grpnum);
4599 : }
4600 0 : pr_cont("\n");
4601 0 : }
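/*
 * Example of the dump format produced above (assumption: 32 CPUs with
 * rcu_fanout_leaf = 16, giving one root over two 16-CPU leaves):
 *
 *	rcu: rcu_node tree layout dump
 *	rcu:  0:31 ^0
 *	rcu:  0:15 ^0 16:31 ^1
 *
 * Each entry is grplo:grphi ^grpnum, one line per tree level.
 */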
4602 :
4603 : struct workqueue_struct *rcu_gp_wq;
4604 : struct workqueue_struct *rcu_par_gp_wq;
4605 :
4606 1 : static void __init kfree_rcu_batch_init(void)
4607 : {
4608 1 : int cpu;
4609 1 : int i;
4610 :
4611 5 : for_each_possible_cpu(cpu) {
4612 4 : struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
4613 :
4614 12 : for (i = 0; i < KFREE_N_BATCHES; i++) {
4615 8 : INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
4616 8 : krcp->krw_arr[i].krcp = krcp;
4617 : }
4618 :
4619 4 : INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
4620 4 : INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
4621 4 : krcp->initialized = true;
4622 : }
4623 1 : if (register_shrinker(&kfree_rcu_shrinker))
4624 0 : pr_err("Failed to register kfree_rcu() shrinker!\n");
4625 1 : }
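/*
 * Usage sketch for the batching machinery initialized above (hypothetical
 * "struct foo"): callers simply embed a struct rcu_head and invoke
 * kfree_rcu(), and the per-CPU kfree_rcu_cpu state set up here batches the
 * resulting frees behind grace periods.
 */
struct foo {
	int data;
	struct rcu_head rh;
};

static void example_release(struct foo *p)
{
	kfree_rcu(p, rh);	/* Frees p only after a full grace period. */
}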
4626 :
4627 1 : void __init rcu_init(void)
4628 : {
4629 1 : int cpu;
4630 :
4631 1 : rcu_early_boot_tests();
4632 :
4633 1 : kfree_rcu_batch_init();
4634 1 : rcu_bootup_announce();
4635 1 : rcu_init_geometry();
4636 1 : rcu_init_one();
4637 1 : if (dump_tree)
4638 0 : rcu_dump_rcu_node_tree();
4639 1 : if (use_softirq)
4640 1 : open_softirq(RCU_SOFTIRQ, rcu_core_si);
4641 :
4642 : /*
4643 : * We don't need protection against CPU-hotplug here because
4644 : * this is called early in boot, before interrupts are enabled
4645 : * and before the scheduler is operational.
4646 : */
4647 : pm_notifier(rcu_pm_notify, 0);
4648 2 : for_each_online_cpu(cpu) {
4649 1 : rcutree_prepare_cpu(cpu);
4650 1 : rcu_cpu_starting(cpu);
4651 1 : rcutree_online_cpu(cpu);
4652 : }
4653 :
4654 : /* Create workqueue for expedited GPs and for Tree SRCU. */
4655 1 : rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
4656 1 : WARN_ON(!rcu_gp_wq);
4657 1 : rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
4658 1 : WARN_ON(!rcu_par_gp_wq);
4659 1 : srcu_init();
4660 :
4661 : /* Fill in default value for rcutree.qovld boot parameter. */
4662 : /* -After- the rcu_node ->lock fields are initialized! */
4663 1 : if (qovld < 0)
4664 0 : qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark;
4665 : else
4666 1 : qovld_calc = qovld;
4667 1 : }
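/*
 * Worked example for the rcutree.qovld default above (assumption: the
 * in-file defaults DEFAULT_RCU_QOVLD_MULT = 2 and qhimark = 10000): leaving
 * the boot parameter negative yields qovld_calc = 2 * 10000 = 20000
 * queued callbacks before RCU treats a CPU's callback list as overloaded.
 */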
4668 :
4669 : #include "tree_stall.h"
4670 : #include "tree_exp.h"
4671 : #include "tree_plugin.h"