Line data Source code
1 : /* SPDX-License-Identifier: GPL-2.0+ */
2 : /*
3 : * Read-Copy Update mechanism for mutual exclusion (tree-based version)
4 : * Internal non-public definitions that provide either classic
5 : * or preemptible semantics.
6 : *
7 : * Copyright Red Hat, 2009
8 : * Copyright IBM Corporation, 2009
9 : *
10 : * Author: Ingo Molnar <mingo@elte.hu>
11 : * Paul E. McKenney <paulmck@linux.ibm.com>
12 : */
13 :
14 : #include "../locking/rtmutex_common.h"
15 :
16 : #ifdef CONFIG_RCU_NOCB_CPU
17 : static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
18 : static bool __read_mostly rcu_nocb_poll; /* Offload kthreads are to poll. */
19 : #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
20 :
21 : /*
22 : * Check the RCU kernel configuration parameters and print informative
23 : * messages about anything out of the ordinary.
24 : */
25 1 : static void __init rcu_bootup_announce_oddness(void)
26 : {
27 1 : if (IS_ENABLED(CONFIG_RCU_TRACE))
28 : pr_info("\tRCU event tracing is enabled.\n");
29 1 : if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) ||
30 : (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32))
31 : pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d.\n",
32 : RCU_FANOUT);
33 1 : if (rcu_fanout_exact)
34 0 : pr_info("\tHierarchical RCU autobalancing is disabled.\n");
35 1 : if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
36 : pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
37 1 : if (IS_ENABLED(CONFIG_PROVE_RCU))
38 1 : pr_info("\tRCU lockdep checking is enabled.\n");
39 1 : if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
40 : pr_info("\tRCU strict (and thus non-scalable) grace periods enabled.\n");
41 1 : if (RCU_NUM_LVLS >= 4)
42 : pr_info("\tFour(or more)-level hierarchy is enabled.\n");
43 1 : if (RCU_FANOUT_LEAF != 16)
44 : pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
45 : RCU_FANOUT_LEAF);
46 1 : if (rcu_fanout_leaf != RCU_FANOUT_LEAF)
47 0 : pr_info("\tBoot-time adjustment of leaf fanout to %d.\n",
48 : rcu_fanout_leaf);
49 1 : if (nr_cpu_ids != NR_CPUS)
50 1 : pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%u.\n", NR_CPUS, nr_cpu_ids);
51 : #ifdef CONFIG_RCU_BOOST
52 : pr_info("\tRCU priority boosting: priority %d delay %d ms.\n",
53 : kthread_prio, CONFIG_RCU_BOOST_DELAY);
54 : #endif
55 1 : if (blimit != DEFAULT_RCU_BLIMIT)
56 0 : pr_info("\tBoot-time adjustment of callback invocation limit to %ld.\n", blimit);
57 1 : if (qhimark != DEFAULT_RCU_QHIMARK)
58 0 : pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark);
59 1 : if (qlowmark != DEFAULT_RCU_QLOMARK)
60 0 : pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark);
61 1 : if (qovld != DEFAULT_RCU_QOVLD)
62 0 : pr_info("\tBoot-time adjustment of callback overload level to %ld.\n", qovld);
63 1 : if (jiffies_till_first_fqs != ULONG_MAX)
64 0 : pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs);
65 1 : if (jiffies_till_next_fqs != ULONG_MAX)
66 0 : pr_info("\tBoot-time adjustment of subsequent FQS scan delay to %ld jiffies.\n", jiffies_till_next_fqs);
67 1 : if (jiffies_till_sched_qs != ULONG_MAX)
68 0 : pr_info("\tBoot-time adjustment of scheduler-enlistment delay to %ld jiffies.\n", jiffies_till_sched_qs);
69 1 : if (rcu_kick_kthreads)
70 0 : pr_info("\tKick kthreads if too-long grace period.\n");
71 1 : if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD))
72 1 : pr_info("\tRCU callback double-/use-after-free debug enabled.\n");
73 1 : if (gp_preinit_delay)
74 0 : pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay);
75 1 : if (gp_init_delay)
76 0 : pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
77 1 : if (gp_cleanup_delay)
78 0 : pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay);
79 1 : if (!use_softirq)
80 0 : pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n");
81 1 : if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
82 : pr_info("\tRCU debug extended QS entry/exit.\n");
83 1 : rcupdate_announce_bootup_oddness();
84 1 : }
85 :
86 : #ifdef CONFIG_PREEMPT_RCU
87 :
88 : static void rcu_report_exp_rnp(struct rcu_node *rnp, bool wake);
89 : static void rcu_read_unlock_special(struct task_struct *t);
90 :
91 : /*
92 : * Tell them what RCU they are running.
93 : */
94 : static void __init rcu_bootup_announce(void)
95 : {
96 : pr_info("Preemptible hierarchical RCU implementation.\n");
97 : rcu_bootup_announce_oddness();
98 : }
99 :
100 : /* Flags for rcu_preempt_ctxt_queue() decision table. */
101 : #define RCU_GP_TASKS 0x8
102 : #define RCU_EXP_TASKS 0x4
103 : #define RCU_GP_BLKD 0x2
104 : #define RCU_EXP_BLKD 0x1
105 :
106 : /*
107 : * Queues a task preempted within an RCU-preempt read-side critical
108 : * section into the appropriate location within the ->blkd_tasks list,
109 : * depending on the states of any ongoing normal and expedited grace
110 : * periods. The ->gp_tasks pointer indicates which element the normal
111 : * grace period is waiting on (NULL if none), and the ->exp_tasks pointer
112 : * indicates which element the expedited grace period is waiting on (again,
113 : * NULL if none). If a grace period is waiting on a given element in the
114 : * ->blkd_tasks list, it also waits on all subsequent elements. Thus,
115 : * adding a task to the tail of the list blocks any grace period that is
116 : * already waiting on one of the elements. In contrast, adding a task
117 : * to the head of the list won't block any grace period that is already
118 : * waiting on one of the elements.
119 : *
120 : * This queuing is imprecise, and can sometimes make an ongoing grace
121 : * period wait for a task that is not strictly speaking blocking it.
122 : * Given the choice, we needlessly block a normal grace period rather than
123 : * blocking an expedited grace period.
124 : *
125 : * Note that an endless sequence of expedited grace periods still cannot
126 : * indefinitely postpone a normal grace period. Eventually, all of the
127 : * fixed number of preempted tasks blocking the normal grace period that are
128 : * not also blocking the expedited grace period will resume and complete
129 : * their RCU read-side critical sections. At that point, the ->gp_tasks
130 : * pointer will equal the ->exp_tasks pointer, at which point the end of
131 : * the corresponding expedited grace period will also be the end of the
132 : * normal grace period.
133 : */
134 : static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
135 : __releases(rnp->lock) /* But leaves interrupts disabled. */
136 : {
137 : int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) +
138 : (rnp->exp_tasks ? RCU_EXP_TASKS : 0) +
139 : (rnp->qsmask & rdp->grpmask ? RCU_GP_BLKD : 0) +
140 : (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0);
141 : struct task_struct *t = current;
142 :
143 : raw_lockdep_assert_held_rcu_node(rnp);
144 : WARN_ON_ONCE(rdp->mynode != rnp);
145 : WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
146 : /* RCU better not be waiting on newly onlined CPUs! */
147 : WARN_ON_ONCE(rnp->qsmaskinitnext & ~rnp->qsmaskinit & rnp->qsmask &
148 : rdp->grpmask);
149 :
150 : /*
151 : * Decide where to queue the newly blocked task. In theory,
152 : * this could be an if-statement. In practice, when I tried
153 : * that, it was quite messy.
154 : */
155 : switch (blkd_state) {
156 : case 0:
157 : case RCU_EXP_TASKS:
158 : case RCU_EXP_TASKS + RCU_GP_BLKD:
159 : case RCU_GP_TASKS:
160 : case RCU_GP_TASKS + RCU_EXP_TASKS:
161 :
162 : /*
163 : * This task blocks neither GP, or it is the first task blocking
164 : * the normal GP but not the already-waiting expedited GP.
165 : * Queue at the head of the list to avoid unnecessarily
166 : * blocking the already-waiting GPs.
167 : */
168 : list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
169 : break;
170 :
171 : case RCU_EXP_BLKD:
172 : case RCU_GP_BLKD:
173 : case RCU_GP_BLKD + RCU_EXP_BLKD:
174 : case RCU_GP_TASKS + RCU_EXP_BLKD:
175 : case RCU_GP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
176 : case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
177 :
178 : /*
179 : * First task arriving that blocks either GP, or first task
180 : * arriving that blocks the expedited GP (with the normal
181 : * GP already waiting), or a task arriving that blocks
182 : * both GPs with both GPs already waiting. Queue at the
183 : * tail of the list to avoid any GP waiting on any of the
184 : * already queued tasks that are not blocking it.
185 : */
186 : list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks);
187 : break;
188 :
189 : case RCU_EXP_TASKS + RCU_EXP_BLKD:
190 : case RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
191 : case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_EXP_BLKD:
192 :
193 : /*
194 : * Second or subsequent task blocking the expedited GP.
195 : * The task either does not block the normal GP, or is the
196 : * first task blocking the normal GP. Queue just after
197 : * the first task blocking the expedited GP.
198 : */
199 : list_add(&t->rcu_node_entry, rnp->exp_tasks);
200 : break;
201 :
202 : case RCU_GP_TASKS + RCU_GP_BLKD:
203 : case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD:
204 :
205 : /*
206 : * Second or subsequent task blocking the normal GP.
207 : * The task does not block the expedited GP. Queue just
208 : * after the first task blocking the normal GP.
209 : */
210 : list_add(&t->rcu_node_entry, rnp->gp_tasks);
211 : break;
212 :
213 : default:
214 :
215 : /* Yet another exercise in excessive paranoia. */
216 : WARN_ON_ONCE(1);
217 : break;
218 : }
219 :
220 : /*
221 : * We have now queued the task. If it was the first one to
222 : * block either grace period, update the ->gp_tasks and/or
223 : * ->exp_tasks pointers, respectively, to reference the newly
224 : * blocked tasks.
225 : */
226 : if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) {
227 : WRITE_ONCE(rnp->gp_tasks, &t->rcu_node_entry);
228 : WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq);
229 : }
230 : if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
231 : WRITE_ONCE(rnp->exp_tasks, &t->rcu_node_entry);
232 : WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) !=
233 : !(rnp->qsmask & rdp->grpmask));
234 : WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) !=
235 : !(rnp->expmask & rdp->grpmask));
236 : raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */
237 :
238 : /*
239 : * Report the quiescent state for the expedited GP. This expedited
240 : * GP should not be able to end until we report, so there should be
241 : * no need to check for a subsequent expedited GP. (Though we are
242 : * still in a quiescent state in any case.)
243 : */
244 : if (blkd_state & RCU_EXP_BLKD && rdp->exp_deferred_qs)
245 : rcu_report_exp_rdp(rdp);
246 : else
247 : WARN_ON_ONCE(rdp->exp_deferred_qs);
248 : }
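/*
 * Illustrative sketch, not part of tree_plugin.h: a stand-alone user-space
 * program that mirrors the blkd_state decision table implemented by the
 * switch statement in rcu_preempt_ctxt_queue() above, showing where a newly
 * preempted task would be queued for one sample state. The flag values are
 * copied from the #defines earlier in this file; queue_position(), main(),
 * and the sample state are editorial assumptions for illustration only.
 */
#include <stdio.h>

#define RCU_GP_TASKS	0x8	/* ->gp_tasks is non-NULL */
#define RCU_EXP_TASKS	0x4	/* ->exp_tasks is non-NULL */
#define RCU_GP_BLKD	0x2	/* this CPU still blocks the normal GP */
#define RCU_EXP_BLKD	0x1	/* this CPU still blocks the expedited GP */

static const char *queue_position(int blkd_state)
{
	switch (blkd_state) {
	case 0:
	case RCU_EXP_TASKS:
	case RCU_EXP_TASKS + RCU_GP_BLKD:
	case RCU_GP_TASKS:
	case RCU_GP_TASKS + RCU_EXP_TASKS:
		return "head of ->blkd_tasks";
	case RCU_EXP_BLKD:
	case RCU_GP_BLKD:
	case RCU_GP_BLKD + RCU_EXP_BLKD:
	case RCU_GP_TASKS + RCU_EXP_BLKD:
	case RCU_GP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
	case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
		return "tail of ->blkd_tasks";
	case RCU_EXP_TASKS + RCU_EXP_BLKD:
	case RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
	case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_EXP_BLKD:
		return "just after ->exp_tasks";
	case RCU_GP_TASKS + RCU_GP_BLKD:
	case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD:
		return "just after ->gp_tasks";
	default:
		return "invalid state";
	}
}

int main(void)
{
	/*
	 * Sample: the normal GP is already waiting on someone (->gp_tasks
	 * set) and this CPU still owes the normal GP a quiescent state, so
	 * the task must not be queued ahead of the tasks already waited on.
	 */
	int blkd_state = RCU_GP_TASKS + RCU_GP_BLKD;

	printf("blkd_state %#x -> queue at %s\n",
	       blkd_state, queue_position(blkd_state));
	return 0;
}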
249 :
250 : /*
251 : * Record a preemptible-RCU quiescent state for the specified CPU.
252 : * Note that this does not necessarily mean that the task currently running
253 : * on the CPU is in a quiescent state: Instead, it means that the current
254 : * grace period need not wait on any RCU read-side critical section that
255 : * starts later on this CPU. It also means that if the current task is
256 : * in an RCU read-side critical section, it has already added itself to
257 : * some leaf rcu_node structure's ->blkd_tasks list. In addition to the
258 : * current task, there might be any number of other tasks blocked while
259 : * in an RCU read-side critical section.
260 : *
261 : * Callers to this function must disable preemption.
262 : */
263 : static void rcu_qs(void)
264 : {
265 : RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!\n");
266 : if (__this_cpu_read(rcu_data.cpu_no_qs.s)) {
267 : trace_rcu_grace_period(TPS("rcu_preempt"),
268 : __this_cpu_read(rcu_data.gp_seq),
269 : TPS("cpuqs"));
270 : __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);
271 : barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */
272 : WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, false);
273 : }
274 : }
275 :
276 : /*
277 : * We have entered the scheduler, and the current task might soon be
278 : * context-switched away from. If this task is in an RCU read-side
279 : * critical section, we will no longer be able to rely on the CPU to
280 : * record that fact, so we enqueue the task on the blkd_tasks list.
281 : * The task will dequeue itself when it exits the outermost enclosing
282 : * RCU read-side critical section. Therefore, the current grace period
283 : * cannot be permitted to complete until the blkd_tasks list entries
284 : * predating the current grace period drain, in other words, until
285 : * rnp->gp_tasks becomes NULL.
286 : *
287 : * Caller must disable interrupts.
288 : */
289 : void rcu_note_context_switch(bool preempt)
290 : {
291 : struct task_struct *t = current;
292 : struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
293 : struct rcu_node *rnp;
294 :
295 : trace_rcu_utilization(TPS("Start context switch"));
296 : lockdep_assert_irqs_disabled();
297 : WARN_ON_ONCE(!preempt && rcu_preempt_depth() > 0);
298 : if (rcu_preempt_depth() > 0 &&
299 : !t->rcu_read_unlock_special.b.blocked) {
300 :
301 : /* Possibly blocking in an RCU read-side critical section. */
302 : rnp = rdp->mynode;
303 : raw_spin_lock_rcu_node(rnp);
304 : t->rcu_read_unlock_special.b.blocked = true;
305 : t->rcu_blocked_node = rnp;
306 :
307 : /*
308 : * Verify the CPU's sanity, trace the preemption, and
309 : * then queue the task as required based on the states
310 : * of any ongoing and expedited grace periods.
311 : */
312 : WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0);
313 : WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
314 : trace_rcu_preempt_task(rcu_state.name,
315 : t->pid,
316 : (rnp->qsmask & rdp->grpmask)
317 : ? rnp->gp_seq
318 : : rcu_seq_snap(&rnp->gp_seq));
319 : rcu_preempt_ctxt_queue(rnp, rdp);
320 : } else {
321 : rcu_preempt_deferred_qs(t);
322 : }
323 :
324 : /*
325 : * Either we were not in an RCU read-side critical section to
326 : * begin with, or we have now recorded that critical section
327 : * globally. Either way, we can now note a quiescent state
328 : * for this CPU. Again, if we were in an RCU read-side critical
329 : * section, and if that critical section was blocking the current
330 : * grace period, then the fact that the task has been enqueued
331 : * means that we continue to block the current grace period.
332 : */
333 : rcu_qs();
334 : if (rdp->exp_deferred_qs)
335 : rcu_report_exp_rdp(rdp);
336 : rcu_tasks_qs(current, preempt);
337 : trace_rcu_utilization(TPS("End context switch"));
338 : }
339 : EXPORT_SYMBOL_GPL(rcu_note_context_switch);
340 :
341 : /*
342 : * Check for preempted RCU readers blocking the current grace period
343 : * for the specified rcu_node structure. If the caller needs a reliable
344 : * answer, it must hold the rcu_node's ->lock.
345 : */
346 : static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
347 : {
348 : return READ_ONCE(rnp->gp_tasks) != NULL;
349 : }
350 :
351 : /* limit value for ->rcu_read_lock_nesting. */
352 : #define RCU_NEST_PMAX (INT_MAX / 2)
353 :
354 : static void rcu_preempt_read_enter(void)
355 : {
356 : current->rcu_read_lock_nesting++;
357 : }
358 :
359 : static int rcu_preempt_read_exit(void)
360 : {
361 : return --current->rcu_read_lock_nesting;
362 : }
363 :
364 : static void rcu_preempt_depth_set(int val)
365 : {
366 : current->rcu_read_lock_nesting = val;
367 : }
368 :
369 : /*
370 : * Preemptible RCU implementation for rcu_read_lock().
371 : * Just increment ->rcu_read_lock_nesting, shared state will be updated
372 : * if we block.
373 : */
374 : void __rcu_read_lock(void)
375 : {
376 : rcu_preempt_read_enter();
377 : if (IS_ENABLED(CONFIG_PROVE_LOCKING))
378 : WARN_ON_ONCE(rcu_preempt_depth() > RCU_NEST_PMAX);
379 : if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && rcu_state.gp_kthread)
380 : WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, true);
381 : barrier(); /* critical section after entry code. */
382 : }
383 : EXPORT_SYMBOL_GPL(__rcu_read_lock);
384 :
385 : /*
386 : * Preemptible RCU implementation for rcu_read_unlock().
387 : * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
388 : * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
389 : * invoke rcu_read_unlock_special() to clean up after a context switch
390 : * in an RCU read-side critical section and other special cases.
391 : */
392 : void __rcu_read_unlock(void)
393 : {
394 : struct task_struct *t = current;
395 :
396 : if (rcu_preempt_read_exit() == 0) {
397 : barrier(); /* critical section before exit code. */
398 : if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
399 : rcu_read_unlock_special(t);
400 : }
401 : if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
402 : int rrln = rcu_preempt_depth();
403 :
404 : WARN_ON_ONCE(rrln < 0 || rrln > RCU_NEST_PMAX);
405 : }
406 : }
407 : EXPORT_SYMBOL_GPL(__rcu_read_unlock);
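/*
 * Illustrative sketch, not kernel code: a user-space mock of the nesting
 * rules implemented by __rcu_read_lock()/__rcu_read_unlock() above. Only
 * the outermost unlock (the nesting count dropping to zero) checks for
 * deferred "special" work; inner unlocks merely decrement the counter.
 * The mock_* names and the two global variables are editorial assumptions
 * standing in for ->rcu_read_lock_nesting and ->rcu_read_unlock_special.
 */
#include <stdbool.h>
#include <stdio.h>

static int nesting;		/* stands in for ->rcu_read_lock_nesting */
static bool special_pending;	/* stands in for ->rcu_read_unlock_special.s */

static void mock_rcu_read_lock(void)
{
	nesting++;
}

static void mock_rcu_read_unlock(void)
{
	if (--nesting == 0 && special_pending) {
		printf("outermost unlock: handling deferred special work\n");
		special_pending = false;
	}
}

int main(void)
{
	mock_rcu_read_lock();		/* nesting = 1 */
	mock_rcu_read_lock();		/* nesting = 2 (nested section) */
	special_pending = true;		/* e.g., the task was preempted */
	mock_rcu_read_unlock();		/* nesting = 1, nothing special yet */
	mock_rcu_read_unlock();		/* nesting = 0, special work runs here */
	return 0;
}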
408 :
409 : /*
410 : * Advance a ->blkd_tasks-list pointer to the next entry, returning
411 : * NULL instead if the pointer is at the end of the list.
412 : */
413 : static struct list_head *rcu_next_node_entry(struct task_struct *t,
414 : struct rcu_node *rnp)
415 : {
416 : struct list_head *np;
417 :
418 : np = t->rcu_node_entry.next;
419 : if (np == &rnp->blkd_tasks)
420 : np = NULL;
421 : return np;
422 : }
423 :
424 : /*
425 : * Return true if the specified rcu_node structure has tasks that were
426 : * preempted within an RCU read-side critical section.
427 : */
428 : static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
429 : {
430 : return !list_empty(&rnp->blkd_tasks);
431 : }
432 :
433 : /*
434 : * Report deferred quiescent states. The deferral time can
435 : * be quite short, for example, in the case of the call from
436 : * rcu_read_unlock_special().
437 : */
438 : static void
439 : rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
440 : {
441 : bool empty_exp;
442 : bool empty_norm;
443 : bool empty_exp_now;
444 : struct list_head *np;
445 : bool drop_boost_mutex = false;
446 : struct rcu_data *rdp;
447 : struct rcu_node *rnp;
448 : union rcu_special special;
449 :
450 : /*
451 : * If RCU core is waiting for this CPU to exit its critical section,
452 : * report the fact that it has exited. Because irqs are disabled,
453 : * t->rcu_read_unlock_special cannot change.
454 : */
455 : special = t->rcu_read_unlock_special;
456 : rdp = this_cpu_ptr(&rcu_data);
457 : if (!special.s && !rdp->exp_deferred_qs) {
458 : local_irq_restore(flags);
459 : return;
460 : }
461 : t->rcu_read_unlock_special.s = 0;
462 : if (special.b.need_qs) {
463 : if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) {
464 : rcu_report_qs_rdp(rdp);
465 : udelay(rcu_unlock_delay);
466 : } else {
467 : rcu_qs();
468 : }
469 : }
470 :
471 : /*
472 : * Respond to a request by an expedited grace period for a
473 : * quiescent state from this CPU. Note that requests from
474 : * tasks are handled when removing the task from the
475 : * blocked-tasks list below.
476 : */
477 : if (rdp->exp_deferred_qs)
478 : rcu_report_exp_rdp(rdp);
479 :
480 : /* Clean up if blocked during RCU read-side critical section. */
481 : if (special.b.blocked) {
482 :
483 : /*
484 : * Remove this task from the list it blocked on. The task
485 : * now remains queued on the rcu_node corresponding to the
486 : * CPU it first blocked on, so there is no longer any need
487 : * to loop. Retain a WARN_ON_ONCE() out of sheer paranoia.
488 : */
489 : rnp = t->rcu_blocked_node;
490 : raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
491 : WARN_ON_ONCE(rnp != t->rcu_blocked_node);
492 : WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
493 : empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
494 : WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq &&
495 : (!empty_norm || rnp->qsmask));
496 : empty_exp = sync_rcu_exp_done(rnp);
497 : smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
498 : np = rcu_next_node_entry(t, rnp);
499 : list_del_init(&t->rcu_node_entry);
500 : t->rcu_blocked_node = NULL;
501 : trace_rcu_unlock_preempted_task(TPS("rcu_preempt"),
502 : rnp->gp_seq, t->pid);
503 : if (&t->rcu_node_entry == rnp->gp_tasks)
504 : WRITE_ONCE(rnp->gp_tasks, np);
505 : if (&t->rcu_node_entry == rnp->exp_tasks)
506 : WRITE_ONCE(rnp->exp_tasks, np);
507 : if (IS_ENABLED(CONFIG_RCU_BOOST)) {
508 : /* Snapshot ->boost_mtx ownership w/rnp->lock held. */
509 : drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
510 : if (&t->rcu_node_entry == rnp->boost_tasks)
511 : WRITE_ONCE(rnp->boost_tasks, np);
512 : }
513 :
514 : /*
515 : * If this was the last task on the current list, and if
516 : * we aren't waiting on any CPUs, report the quiescent state.
517 : * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
518 : * so we must take a snapshot of the expedited state.
519 : */
520 : empty_exp_now = sync_rcu_exp_done(rnp);
521 : if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
522 : trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
523 : rnp->gp_seq,
524 : 0, rnp->qsmask,
525 : rnp->level,
526 : rnp->grplo,
527 : rnp->grphi,
528 : !!rnp->gp_tasks);
529 : rcu_report_unblock_qs_rnp(rnp, flags);
530 : } else {
531 : raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
532 : }
533 :
534 : /* Unboost if we were boosted. */
535 : if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
536 : rt_mutex_futex_unlock(&rnp->boost_mtx);
537 :
538 : /*
539 : * If this was the last task on the expedited lists,
540 : * then we need to report up the rcu_node hierarchy.
541 : */
542 : if (!empty_exp && empty_exp_now)
543 : rcu_report_exp_rnp(rnp, true);
544 : } else {
545 : local_irq_restore(flags);
546 : }
547 : }
548 :
549 : /*
550 : * Is a deferred quiescent-state pending, and are we also not in
551 : * an RCU read-side critical section? It is the caller's responsibility
552 : * to ensure it is otherwise safe to report any deferred quiescent
553 : * states. The reason for this is that it is safe to report a
554 : * quiescent state during context switch even though preemption
555 : * is disabled. This function cannot be expected to understand these
556 : * nuances, so the caller must handle them.
557 : */
558 : static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
559 : {
560 : return (__this_cpu_read(rcu_data.exp_deferred_qs) ||
561 : READ_ONCE(t->rcu_read_unlock_special.s)) &&
562 : rcu_preempt_depth() == 0;
563 : }
564 :
565 : /*
566 : * Report a deferred quiescent state if needed and safe to do so.
567 : * As with rcu_preempt_need_deferred_qs(), "safe" involves only
568 : * not being in an RCU read-side critical section. The caller must
569 : * evaluate safety in terms of interrupt, softirq, and preemption
570 : * disabling.
571 : */
572 : static void rcu_preempt_deferred_qs(struct task_struct *t)
573 : {
574 : unsigned long flags;
575 :
576 : if (!rcu_preempt_need_deferred_qs(t))
577 : return;
578 : local_irq_save(flags);
579 : rcu_preempt_deferred_qs_irqrestore(t, flags);
580 : }
581 :
582 : /*
583 : * Minimal handler to give the scheduler a chance to re-evaluate.
584 : */
585 : static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
586 : {
587 : struct rcu_data *rdp;
588 :
589 : rdp = container_of(iwp, struct rcu_data, defer_qs_iw);
590 : rdp->defer_qs_iw_pending = false;
591 : }
592 :
593 : /*
594 : * Handle special cases during rcu_read_unlock(), such as needing to
595 : * notify RCU core processing or task having blocked during the RCU
596 : * read-side critical section.
597 : */
598 : static void rcu_read_unlock_special(struct task_struct *t)
599 : {
600 : unsigned long flags;
601 : bool preempt_bh_were_disabled =
602 : !!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK));
603 : bool irqs_were_disabled;
604 :
605 : /* NMI handlers cannot block and cannot safely manipulate state. */
606 : if (in_nmi())
607 : return;
608 :
609 : local_irq_save(flags);
610 : irqs_were_disabled = irqs_disabled_flags(flags);
611 : if (preempt_bh_were_disabled || irqs_were_disabled) {
612 : bool exp;
613 : struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
614 : struct rcu_node *rnp = rdp->mynode;
615 :
616 : exp = (t->rcu_blocked_node &&
617 : READ_ONCE(t->rcu_blocked_node->exp_tasks)) ||
618 : (rdp->grpmask & READ_ONCE(rnp->expmask));
619 : // Need to defer quiescent state until everything is enabled.
620 : if (use_softirq && (in_irq() || (exp && !irqs_were_disabled))) {
621 : // Using softirq, safe to awaken, and either the
622 : // wakeup is free or there is an expedited GP.
623 : raise_softirq_irqoff(RCU_SOFTIRQ);
624 : } else {
625 : // Enabling BH or preempt does reschedule, so...
626 : // Also if no expediting, slow is OK.
627 : // Plus nohz_full CPUs eventually get tick enabled.
628 : set_tsk_need_resched(current);
629 : set_preempt_need_resched();
630 : if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
631 : !rdp->defer_qs_iw_pending && exp && cpu_online(rdp->cpu)) {
632 : // Get scheduler to re-evaluate and call hooks.
633 : // If !IRQ_WORK, FQS scan will eventually IPI.
634 : init_irq_work(&rdp->defer_qs_iw,
635 : rcu_preempt_deferred_qs_handler);
636 : rdp->defer_qs_iw_pending = true;
637 : irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
638 : }
639 : }
640 : local_irq_restore(flags);
641 : return;
642 : }
643 : rcu_preempt_deferred_qs_irqrestore(t, flags);
644 : }
645 :
646 : /*
647 : * Check that the list of blocked tasks for the newly completed grace
648 : * period is in fact empty. It is a serious bug to complete a grace
649 : * period that still has RCU readers blocked! This function must be
650 : * invoked -before- updating this rnp's ->gp_seq.
651 : *
652 : * Also, if there are blocked tasks on the list, they automatically
653 : * block the newly created grace period, so set up ->gp_tasks accordingly.
654 : */
655 : static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
656 : {
657 : struct task_struct *t;
658 :
659 : RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
660 : raw_lockdep_assert_held_rcu_node(rnp);
661 : if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
662 : dump_blkd_tasks(rnp, 10);
663 : if (rcu_preempt_has_tasks(rnp) &&
664 : (rnp->qsmaskinit || rnp->wait_blkd_tasks)) {
665 : WRITE_ONCE(rnp->gp_tasks, rnp->blkd_tasks.next);
666 : t = container_of(rnp->gp_tasks, struct task_struct,
667 : rcu_node_entry);
668 : trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"),
669 : rnp->gp_seq, t->pid);
670 : }
671 : WARN_ON_ONCE(rnp->qsmask);
672 : }
673 :
674 : /*
675 : * Check for a quiescent state from the current CPU, including voluntary
676 : * context switches for Tasks RCU. When a task blocks, the task is
677 : * recorded in the corresponding CPU's rcu_node structure, which is checked
678 : * elsewhere, hence this function need only check for quiescent states
679 : * related to the current CPU, not to those related to tasks.
680 : */
681 : static void rcu_flavor_sched_clock_irq(int user)
682 : {
683 : struct task_struct *t = current;
684 :
685 : lockdep_assert_irqs_disabled();
686 : if (user || rcu_is_cpu_rrupt_from_idle()) {
687 : rcu_note_voluntary_context_switch(current);
688 : }
689 : if (rcu_preempt_depth() > 0 ||
690 : (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) {
691 : /* No QS, force context switch if deferred. */
692 : if (rcu_preempt_need_deferred_qs(t)) {
693 : set_tsk_need_resched(t);
694 : set_preempt_need_resched();
695 : }
696 : } else if (rcu_preempt_need_deferred_qs(t)) {
697 : rcu_preempt_deferred_qs(t); /* Report deferred QS. */
698 : return;
699 : } else if (!WARN_ON_ONCE(rcu_preempt_depth())) {
700 : rcu_qs(); /* Report immediate QS. */
701 : return;
702 : }
703 :
704 : /* If GP is oldish, ask for help from rcu_read_unlock_special(). */
705 : if (rcu_preempt_depth() > 0 &&
706 : __this_cpu_read(rcu_data.core_needs_qs) &&
707 : __this_cpu_read(rcu_data.cpu_no_qs.b.norm) &&
708 : !t->rcu_read_unlock_special.b.need_qs &&
709 : time_after(jiffies, rcu_state.gp_start + HZ))
710 : t->rcu_read_unlock_special.b.need_qs = true;
711 : }
712 :
713 : /*
714 : * Check for a task exiting while in a preemptible-RCU read-side
715 : * critical section, clean up if so. No need to issue warnings, as
716 : * debug_check_no_locks_held() already does this if lockdep is enabled.
717 : * Besides, if this function does anything other than just immediately
718 : * return, there was a bug of some sort. Spewing warnings from this
719 : * function is like as not to simply obscure important prior warnings.
720 : */
721 : void exit_rcu(void)
722 : {
723 : struct task_struct *t = current;
724 :
725 : if (unlikely(!list_empty(&current->rcu_node_entry))) {
726 : rcu_preempt_depth_set(1);
727 : barrier();
728 : WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true);
729 : } else if (unlikely(rcu_preempt_depth())) {
730 : rcu_preempt_depth_set(1);
731 : } else {
732 : return;
733 : }
734 : __rcu_read_unlock();
735 : rcu_preempt_deferred_qs(current);
736 : }
737 :
738 : /*
739 : * Dump the blocked-tasks state, but limit the list dump to the
740 : * specified number of elements.
741 : */
742 : static void
743 : dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
744 : {
745 : int cpu;
746 : int i;
747 : struct list_head *lhp;
748 : bool onl;
749 : struct rcu_data *rdp;
750 : struct rcu_node *rnp1;
751 :
752 : raw_lockdep_assert_held_rcu_node(rnp);
753 : pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n",
754 : __func__, rnp->grplo, rnp->grphi, rnp->level,
755 : (long)READ_ONCE(rnp->gp_seq), (long)rnp->completedqs);
756 : for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent)
757 : pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n",
758 : __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext);
759 : pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n",
760 : __func__, READ_ONCE(rnp->gp_tasks), data_race(rnp->boost_tasks),
761 : READ_ONCE(rnp->exp_tasks));
762 : pr_info("%s: ->blkd_tasks", __func__);
763 : i = 0;
764 : list_for_each(lhp, &rnp->blkd_tasks) {
765 : pr_cont(" %p", lhp);
766 : if (++i >= ncheck)
767 : break;
768 : }
769 : pr_cont("\n");
770 : for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
771 : rdp = per_cpu_ptr(&rcu_data, cpu);
772 : onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp));
773 : pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n",
774 : cpu, ".o"[onl],
775 : (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
776 : (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
777 : }
778 : }
779 :
780 : #else /* #ifdef CONFIG_PREEMPT_RCU */
781 :
782 : /*
783 : * If strict grace periods are enabled, and if the calling
784 : * __rcu_read_unlock() marks the beginning of a quiescent state, immediately
785 : * report that quiescent state and, if requested, spin for a bit.
786 : */
787 1156446 : void rcu_read_unlock_strict(void)
788 : {
789 1156446 : struct rcu_data *rdp;
790 :
791 1156446 : if (!IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ||
792 : irqs_disabled() || preempt_count() || !rcu_state.gp_kthread)
793 0 : return;
794 : rdp = this_cpu_ptr(&rcu_data);
795 : rcu_report_qs_rdp(rdp);
796 : udelay(rcu_unlock_delay);
797 : }
798 : EXPORT_SYMBOL_GPL(rcu_read_unlock_strict);
799 :
800 : /*
801 : * Tell them what RCU they are running.
802 : */
803 1 : static void __init rcu_bootup_announce(void)
804 : {
805 1 : pr_info("Hierarchical RCU implementation.\n");
806 1 : rcu_bootup_announce_oddness();
807 1 : }
808 :
809 : /*
810 : * Note a quiescent state for PREEMPTION=n. Because we do not need to know
811 : * how many quiescent states passed, just if there was at least one since
812 : * the start of the grace period, this just sets a flag. The caller must
813 : * have disabled preemption.
814 : */
815 74752 : static void rcu_qs(void)
816 : {
817 74752 : RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!");
818 75032 : if (!__this_cpu_read(rcu_data.cpu_no_qs.s))
819 : return;
820 7820 : trace_rcu_grace_period(TPS("rcu_sched"),
821 7820 : __this_cpu_read(rcu_data.gp_seq), TPS("cpuqs"));
822 7820 : __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);
823 7820 : if (!__this_cpu_read(rcu_data.cpu_no_qs.b.exp))
824 : return;
825 303 : __this_cpu_write(rcu_data.cpu_no_qs.b.exp, false);
826 303 : rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
827 : }
828 :
829 : /*
830 : * Register an urgently needed quiescent state. If there is an
831 : * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight
832 : * dyntick-idle quiescent state visible to other CPUs, which will in
833 : * some cases serve for expedited as well as normal grace periods.
834 : * Either way, register a lightweight quiescent state.
835 : */
836 539554 : void rcu_all_qs(void)
837 : {
838 539554 : unsigned long flags;
839 :
840 539554 : if (!raw_cpu_read(rcu_data.rcu_urgent_qs))
841 : return;
842 6 : preempt_disable();
843 : /* Load rcu_urgent_qs before other flags. */
844 6 : if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
845 0 : preempt_enable();
846 0 : return;
847 : }
848 6 : this_cpu_write(rcu_data.rcu_urgent_qs, false);
849 6 : if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) {
850 0 : local_irq_save(flags);
851 0 : rcu_momentary_dyntick_idle();
852 0 : local_irq_restore(flags);
853 : }
854 6 : rcu_qs();
855 6 : preempt_enable();
856 : }
857 : EXPORT_SYMBOL_GPL(rcu_all_qs);
858 :
859 : /*
860 : * Note a PREEMPTION=n context switch. The caller must have disabled interrupts.
861 : */
862 29816 : void rcu_note_context_switch(bool preempt)
863 : {
864 29816 : trace_rcu_utilization(TPS("Start context switch"));
865 29818 : rcu_qs();
866 : /* Load rcu_urgent_qs before other flags. */
867 29815 : if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs)))
868 29521 : goto out;
869 294 : this_cpu_write(rcu_data.rcu_urgent_qs, false);
870 294 : if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs)))
871 0 : rcu_momentary_dyntick_idle();
872 29815 : rcu_tasks_qs(current, preempt);
873 294 : out:
874 29815 : trace_rcu_utilization(TPS("End context switch"));
875 29818 : }
876 : EXPORT_SYMBOL_GPL(rcu_note_context_switch);
877 :
878 : /*
879 : * Because preemptible RCU does not exist, there are never any preempted
880 : * RCU readers.
881 : */
882 6411 : static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
883 : {
884 4392 : return 0;
885 : }
886 :
887 : /*
888 : * Because there is no preemptible RCU, there can be no readers blocked.
889 : */
890 161 : static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
891 : {
892 161 : return false;
893 : }
894 :
895 : /*
896 : * Because there is no preemptible RCU, there can be no deferred quiescent
897 : * states.
898 : */
899 15322 : static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
900 : {
901 15322 : return false;
902 : }
903 38831 : static void rcu_preempt_deferred_qs(struct task_struct *t) { }
904 :
905 : /*
906 : * Because there is no preemptible RCU, there can be no readers blocked,
907 : * so there is no need to check for blocked tasks. So check only for
908 : * bogus qsmask values.
909 : */
910 2020 : static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
911 : {
912 2020 : WARN_ON_ONCE(rnp->qsmask);
913 2020 : }
914 :
915 : /*
916 : * Check to see if this CPU is in a non-context-switch quiescent state,
917 : * namely user mode and idle loop.
918 : */
919 27394 : static void rcu_flavor_sched_clock_irq(int user)
920 : {
921 27394 : if (user || rcu_is_cpu_rrupt_from_idle()) {
922 :
923 : /*
924 : * Get here if this CPU took its interrupt from user
925 : * mode or from the idle loop, and if this is not a
926 : * nested interrupt. In this case, the CPU is in
927 : * a quiescent state, so note it.
928 : *
929 : * No memory barrier is required here because rcu_qs()
930 : * references only CPU-local variables that other CPUs
931 : * neither access nor modify, at least not while the
932 : * corresponding CPU is online.
933 : */
934 :
935 12051 : rcu_qs();
936 : }
937 28014 : }
938 :
939 : /*
940 : * Because preemptible RCU does not exist, tasks cannot possibly exit
941 : * while in preemptible RCU read-side critical sections.
942 : */
943 1013 : void exit_rcu(void)
944 : {
945 1013 : }
946 :
947 : /*
948 : * Dump the guaranteed-empty blocked-tasks state. Trust but verify.
949 : */
950 : static void
951 : dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
952 : {
953 : WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks));
954 : }
955 :
956 : #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
957 :
958 : /*
959 : * If boosting, set rcuc kthreads to realtime priority.
960 : */
961 0 : static void rcu_cpu_kthread_setup(unsigned int cpu)
962 : {
963 : #ifdef CONFIG_RCU_BOOST
964 : struct sched_param sp;
965 :
966 : sp.sched_priority = kthread_prio;
967 : sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
968 : #endif /* #ifdef CONFIG_RCU_BOOST */
969 0 : }
970 :
971 : #ifdef CONFIG_RCU_BOOST
972 :
973 : /*
974 : * Carry out RCU priority boosting on the task indicated by ->exp_tasks
975 : * or ->boost_tasks, advancing the pointer to the next task in the
976 : * ->blkd_tasks list.
977 : *
978 : * Note that irqs must be enabled: boosting the task can block.
979 : * Returns 1 if there are more tasks needing to be boosted.
980 : */
981 : static int rcu_boost(struct rcu_node *rnp)
982 : {
983 : unsigned long flags;
984 : struct task_struct *t;
985 : struct list_head *tb;
986 :
987 : if (READ_ONCE(rnp->exp_tasks) == NULL &&
988 : READ_ONCE(rnp->boost_tasks) == NULL)
989 : return 0; /* Nothing left to boost. */
990 :
991 : raw_spin_lock_irqsave_rcu_node(rnp, flags);
992 :
993 : /*
994 : * Recheck under the lock: all tasks in need of boosting
995 : * might exit their RCU read-side critical sections on their own.
996 : */
997 : if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
998 : raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
999 : return 0;
1000 : }
1001 :
1002 : /*
1003 : * Preferentially boost tasks blocking expedited grace periods.
1004 : * This cannot starve the normal grace periods because a second
1005 : * expedited grace period must boost all blocked tasks, including
1006 : * those blocking the pre-existing normal grace period.
1007 : */
1008 : if (rnp->exp_tasks != NULL)
1009 : tb = rnp->exp_tasks;
1010 : else
1011 : tb = rnp->boost_tasks;
1012 :
1013 : /*
1014 : * We boost task t by manufacturing an rt_mutex that appears to
1015 : * be held by task t. We leave a pointer to that rt_mutex where
1016 : * task t can find it, and task t will release the mutex when it
1017 : * exits its outermost RCU read-side critical section. Then
1018 : * simply acquiring this artificial rt_mutex will boost task
1019 : * t's priority. (Thanks to tglx for suggesting this approach!)
1020 : *
1021 : * Note that task t must acquire rnp->lock to remove itself from
1022 : * the ->blkd_tasks list, which it will do from exit() if from
1023 : * nowhere else. We therefore are guaranteed that task t will
1024 : * stay around at least until we drop rnp->lock. Note that
1025 : * rnp->lock also resolves races between our priority boosting
1026 : * and task t's exiting its outermost RCU read-side critical
1027 : * section.
1028 : */
1029 : t = container_of(tb, struct task_struct, rcu_node_entry);
1030 : rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
1031 : raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1032 : /* Lock only for side effect: boosts task t's priority. */
1033 : rt_mutex_lock(&rnp->boost_mtx);
1034 : rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
1035 :
1036 : return READ_ONCE(rnp->exp_tasks) != NULL ||
1037 : READ_ONCE(rnp->boost_tasks) != NULL;
1038 : }
1039 :
1040 : /*
1041 : * Priority-boosting kthread, one per leaf rcu_node.
1042 : */
1043 : static int rcu_boost_kthread(void *arg)
1044 : {
1045 : struct rcu_node *rnp = (struct rcu_node *)arg;
1046 : int spincnt = 0;
1047 : int more2boost;
1048 :
1049 : trace_rcu_utilization(TPS("Start boost kthread@init"));
1050 : for (;;) {
1051 : WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_WAITING);
1052 : trace_rcu_utilization(TPS("End boost kthread@rcu_wait"));
1053 : rcu_wait(READ_ONCE(rnp->boost_tasks) ||
1054 : READ_ONCE(rnp->exp_tasks));
1055 : trace_rcu_utilization(TPS("Start boost kthread@rcu_wait"));
1056 : WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_RUNNING);
1057 : more2boost = rcu_boost(rnp);
1058 : if (more2boost)
1059 : spincnt++;
1060 : else
1061 : spincnt = 0;
1062 : if (spincnt > 10) {
1063 : WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_YIELDING);
1064 : trace_rcu_utilization(TPS("End boost kthread@rcu_yield"));
1065 : schedule_timeout_idle(2);
1066 : trace_rcu_utilization(TPS("Start boost kthread@rcu_yield"));
1067 : spincnt = 0;
1068 : }
1069 : }
1070 : /* NOTREACHED */
1071 : trace_rcu_utilization(TPS("End boost kthread@notreached"));
1072 : return 0;
1073 : }
1074 :
1075 : /*
1076 : * Check to see if it is time to start boosting RCU readers that are
1077 : * blocking the current grace period, and, if so, tell the per-rcu_node
1078 : * kthread to start boosting them. If there is an expedited grace
1079 : * period in progress, it is always time to boost.
1080 : *
1081 : * The caller must hold rnp->lock, which this function releases.
1082 : * The ->boost_kthread_task is immortal, so we don't need to worry
1083 : * about it going away.
1084 : */
1085 : static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1086 : __releases(rnp->lock)
1087 : {
1088 : raw_lockdep_assert_held_rcu_node(rnp);
1089 : if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
1090 : raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1091 : return;
1092 : }
1093 : if (rnp->exp_tasks != NULL ||
1094 : (rnp->gp_tasks != NULL &&
1095 : rnp->boost_tasks == NULL &&
1096 : rnp->qsmask == 0 &&
1097 : (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld))) {
1098 : if (rnp->exp_tasks == NULL)
1099 : WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks);
1100 : raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1101 : rcu_wake_cond(rnp->boost_kthread_task,
1102 : READ_ONCE(rnp->boost_kthread_status));
1103 : } else {
1104 : raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1105 : }
1106 : }
1107 :
1108 : /*
1109 : * Is the current CPU running the RCU-callbacks kthread?
1110 : * Caller must have preemption disabled.
1111 : */
1112 : static bool rcu_is_callbacks_kthread(void)
1113 : {
1114 : return __this_cpu_read(rcu_data.rcu_cpu_kthread_task) == current;
1115 : }
1116 :
1117 : #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
1118 :
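/*
 * Editorial worked example, not kernel code: how RCU_BOOST_DELAY_JIFFIES
 * converts the millisecond CONFIG_RCU_BOOST_DELAY value into jiffies for a
 * few HZ settings. DIV_ROUND_UP is redefined locally with the usual
 * semantics, and the sample delay and HZ values are assumptions chosen
 * purely for illustration.
 */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	const int delay_ms = 500;		/* sample CONFIG_RCU_BOOST_DELAY */
	const int hz_values[] = { 100, 250, 1000 };

	for (unsigned int i = 0; i < sizeof(hz_values) / sizeof(hz_values[0]); i++)
		printf("HZ=%4d: boost delay = %d jiffies\n", hz_values[i],
		       DIV_ROUND_UP(delay_ms * hz_values[i], 1000));
	/* Prints 50, 125, and 500 jiffies respectively: always ~500 ms. */
	return 0;
}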
1119 : /*
1120 : * Do priority-boost accounting for the start of a new grace period.
1121 : */
1122 : static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1123 : {
1124 : rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
1125 : }
1126 :
1127 : /*
1128 : * Create an RCU-boost kthread for the specified node if one does not
1129 : * already exist. We only create this kthread for preemptible RCU.
1130 : * A failure to create the kthread is flagged with WARN_ON_ONCE().
1131 : */
1132 : static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
1133 : {
1134 : int rnp_index = rnp - rcu_get_root();
1135 : unsigned long flags;
1136 : struct sched_param sp;
1137 : struct task_struct *t;
1138 :
1139 : if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
1140 : return;
1141 :
1142 : if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
1143 : return;
1144 :
1145 : rcu_state.boost = 1;
1146 :
1147 : if (rnp->boost_kthread_task != NULL)
1148 : return;
1149 :
1150 : t = kthread_create(rcu_boost_kthread, (void *)rnp,
1151 : "rcub/%d", rnp_index);
1152 : if (WARN_ON_ONCE(IS_ERR(t)))
1153 : return;
1154 :
1155 : raw_spin_lock_irqsave_rcu_node(rnp, flags);
1156 : rnp->boost_kthread_task = t;
1157 : raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1158 : sp.sched_priority = kthread_prio;
1159 : sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1160 : wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1161 : }
1162 :
1163 : /*
1164 : * Set the per-rcu_node kthread's affinity to cover all CPUs that are
1165 : * served by the rcu_node in question. The CPU hotplug lock is still
1166 : * held, so the value of rnp->qsmaskinit will be stable.
1167 : *
1168 : * We don't include outgoingcpu in the affinity set; callers pass -1 when
1169 : * there is no outgoing CPU. If there are no CPUs left in the affinity set,
1170 : * this function allows the kthread to execute on any CPU.
1171 : */
1172 : static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1173 : {
1174 : struct task_struct *t = rnp->boost_kthread_task;
1175 : unsigned long mask = rcu_rnp_online_cpus(rnp);
1176 : cpumask_var_t cm;
1177 : int cpu;
1178 :
1179 : if (!t)
1180 : return;
1181 : if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
1182 : return;
1183 : for_each_leaf_node_possible_cpu(rnp, cpu)
1184 : if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
1185 : cpu != outgoingcpu)
1186 : cpumask_set_cpu(cpu, cm);
1187 : if (cpumask_weight(cm) == 0)
1188 : cpumask_setall(cm);
1189 : set_cpus_allowed_ptr(t, cm);
1190 : free_cpumask_var(cm);
1191 : }
1192 :
1193 : /*
1194 : * Spawn boost kthreads -- called as soon as the scheduler is running.
1195 : */
1196 : static void __init rcu_spawn_boost_kthreads(void)
1197 : {
1198 : struct rcu_node *rnp;
1199 :
1200 : rcu_for_each_leaf_node(rnp)
1201 : rcu_spawn_one_boost_kthread(rnp);
1202 : }
1203 :
1204 : static void rcu_prepare_kthreads(int cpu)
1205 : {
1206 : struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
1207 : struct rcu_node *rnp = rdp->mynode;
1208 :
1209 : /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
1210 : if (rcu_scheduler_fully_active)
1211 : rcu_spawn_one_boost_kthread(rnp);
1212 : }
1213 :
1214 : #else /* #ifdef CONFIG_RCU_BOOST */
1215 :
1216 0 : static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1217 : __releases(rnp->lock)
1218 : {
1219 0 : raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1220 0 : }
1221 :
1222 48764 : static bool rcu_is_callbacks_kthread(void)
1223 : {
1224 48764 : return false;
1225 : }
1226 :
1227 2020 : static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1228 : {
1229 2020 : }
1230 :
1231 3 : static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1232 : {
1233 3 : }
1234 :
1235 1 : static void __init rcu_spawn_boost_kthreads(void)
1236 : {
1237 1 : }
1238 :
1239 4 : static void rcu_prepare_kthreads(int cpu)
1240 : {
1241 4 : }
1242 :
1243 : #endif /* #else #ifdef CONFIG_RCU_BOOST */
1244 :
1245 : #if !defined(CONFIG_RCU_FAST_NO_HZ)
1246 :
1247 : /*
1248 : * Check to see if any future non-offloaded RCU-related work will need
1249 : * to be done by the current CPU, even if none need be done immediately,
1250 : * returning 1 if so. This function is part of the RCU implementation;
1251 : * it is -not- an exported member of the RCU API.
1252 : *
1253 : * Because we not have RCU_FAST_NO_HZ, just check whether or not this
1254 : * CPU has RCU callbacks queued.
1255 : */
1256 17225 : int rcu_needs_cpu(u64 basemono, u64 *nextevt)
1257 : {
1258 17225 : *nextevt = KTIME_MAX;
1259 17225 : return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) &&
1260 16103 : !rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist);
1261 : }
1262 :
1263 : /*
1264 : * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
1265 : * after it.
1266 : */
1267 : static void rcu_cleanup_after_idle(void)
1268 : {
1269 : }
1270 :
1271 : /*
1272 : * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
1273 : * is nothing.
1274 : */
1275 : static void rcu_prepare_for_idle(void)
1276 : {
1277 : }
1278 :
1279 : #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1280 :
1281 : /*
1282 : * This code is invoked when a CPU goes idle, at which point we want
1283 : * to have the CPU do everything required for RCU so that it can enter
1284 : * the energy-efficient dyntick-idle mode.
1285 : *
1286 : * The following preprocessor symbol controls this:
1287 : *
1288 : * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
1289 : * to sleep in dyntick-idle mode with RCU callbacks pending. This
1290 : * is sized to be roughly one RCU grace period. Those energy-efficiency
1291 : * benchmarkers who might otherwise be tempted to set this to a large
1292 : * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
1293 : * system. And if you are -that- concerned about energy efficiency,
1294 : * just power the system down and be done with it!
1295 : *
1296 : * The value below works well in practice. If future workloads require
1297 : * adjustment, they can be converted into kernel config parameters, though
1298 : * making the state machine smarter might be a better option.
1299 : */
1300 : #define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
1301 :
1302 : static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
1303 : module_param(rcu_idle_gp_delay, int, 0644);
1304 :
1305 : /*
1306 : * Try to advance callbacks on the current CPU, but only if it has been
1307 : * awhile since the last time we did so. Afterwards, if there are any
1308 : * callbacks ready for immediate invocation, return true.
1309 : */
1310 : static bool __maybe_unused rcu_try_advance_all_cbs(void)
1311 : {
1312 : bool cbs_ready = false;
1313 : struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
1314 : struct rcu_node *rnp;
1315 :
1316 : /* Exit early if we advanced recently. */
1317 : if (jiffies == rdp->last_advance_all)
1318 : return false;
1319 : rdp->last_advance_all = jiffies;
1320 :
1321 : rnp = rdp->mynode;
1322 :
1323 : /*
1324 : * Don't bother checking unless a grace period has
1325 : * completed since we last checked and there are
1326 : * callbacks not yet ready to invoke.
1327 : */
1328 : if ((rcu_seq_completed_gp(rdp->gp_seq,
1329 : rcu_seq_current(&rnp->gp_seq)) ||
1330 : unlikely(READ_ONCE(rdp->gpwrap))) &&
1331 : rcu_segcblist_pend_cbs(&rdp->cblist))
1332 : note_gp_changes(rdp);
1333 :
1334 : if (rcu_segcblist_ready_cbs(&rdp->cblist))
1335 : cbs_ready = true;
1336 : return cbs_ready;
1337 : }
1338 :
1339 : /*
1340 : * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
1341 : * to invoke. If the CPU has callbacks, try to advance them and tell the
1342 : * caller what to set the timeout to.
1343 : *
1344 : * The caller must have disabled interrupts.
1345 : */
1346 : int rcu_needs_cpu(u64 basemono, u64 *nextevt)
1347 : {
1348 : struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
1349 : unsigned long dj;
1350 :
1351 : lockdep_assert_irqs_disabled();
1352 :
1353 : /* If no non-offloaded callbacks, RCU doesn't need the CPU. */
1354 : if (rcu_segcblist_empty(&rdp->cblist) ||
1355 : rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist)) {
1356 : *nextevt = KTIME_MAX;
1357 : return 0;
1358 : }
1359 :
1360 : /* Attempt to advance callbacks. */
1361 : if (rcu_try_advance_all_cbs()) {
1362 : /* Some ready to invoke, so initiate later invocation. */
1363 : invoke_rcu_core();
1364 : return 1;
1365 : }
1366 : rdp->last_accelerate = jiffies;
1367 :
1368 : /* Request timer and round. */
1369 : dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies;
1370 :
1371 : *nextevt = basemono + dj * TICK_NSEC;
1372 : return 0;
1373 : }
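/*
 * Editorial worked example, not kernel code: the effect of the round_up()
 * calculation in rcu_needs_cpu() above. With the default rcu_idle_gp_delay
 * of 4 jiffies, the requested wakeup lands on the next multiple of 4 that
 * is at least 4 jiffies away, so dj always ends up between 4 and 7 jiffies.
 * round_up() is redefined locally with the kernel's power-of-two rounding
 * behavior, and the jiffies samples are arbitrary illustrations.
 */
#include <stdio.h>

#define round_up(x, y)	((((x) - 1) | ((y) - 1)) + 1)

int main(void)
{
	const unsigned long rcu_idle_gp_delay = 4;
	const unsigned long samples[] = { 1000, 1001, 1002, 1003 };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		unsigned long j = samples[i];
		unsigned long dj = round_up(rcu_idle_gp_delay + j,
					    rcu_idle_gp_delay) - j;

		printf("jiffies=%lu -> dj=%lu\n", j, dj);
	}
	/* Prints dj = 4, 7, 6, 5 for the samples above. */
	return 0;
}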
1374 :
1375 : /*
1376 : * Prepare a CPU for idle from an RCU perspective. The first major task is to
1377 : * sense whether nohz mode has been enabled or disabled via sysfs. The second
1378 : * major task is to accelerate (that is, assign grace-period numbers to) any
1379 : * recently arrived callbacks.
1380 : *
1381 : * The caller must have disabled interrupts.
1382 : */
1383 : static void rcu_prepare_for_idle(void)
1384 : {
1385 : bool needwake;
1386 : struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
1387 : struct rcu_node *rnp;
1388 : int tne;
1389 :
1390 : lockdep_assert_irqs_disabled();
1391 : if (rcu_segcblist_is_offloaded(&rdp->cblist))
1392 : return;
1393 :
1394 : /* Handle nohz enablement switches conservatively. */
1395 : tne = READ_ONCE(tick_nohz_active);
1396 : if (tne != rdp->tick_nohz_enabled_snap) {
1397 : if (!rcu_segcblist_empty(&rdp->cblist))
1398 : invoke_rcu_core(); /* force nohz to see update. */
1399 : rdp->tick_nohz_enabled_snap = tne;
1400 : return;
1401 : }
1402 : if (!tne)
1403 : return;
1404 :
1405 : /*
1406 : * If we have not yet accelerated this jiffy, accelerate all
1407 : * callbacks on this CPU.
1408 : */
1409 : if (rdp->last_accelerate == jiffies)
1410 : return;
1411 : rdp->last_accelerate = jiffies;
1412 : if (rcu_segcblist_pend_cbs(&rdp->cblist)) {
1413 : rnp = rdp->mynode;
1414 : raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
1415 : needwake = rcu_accelerate_cbs(rnp, rdp);
1416 : raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
1417 : if (needwake)
1418 : rcu_gp_kthread_wake();
1419 : }
1420 : }
1421 :
1422 : /*
1423 : * Clean up for exit from idle. Attempt to advance callbacks based on
1424 : * any grace periods that elapsed while the CPU was idle, and if any
1425 : * callbacks are now ready to invoke, initiate invocation.
1426 : */
1427 : static void rcu_cleanup_after_idle(void)
1428 : {
1429 : struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
1430 :
1431 : lockdep_assert_irqs_disabled();
1432 : if (rcu_segcblist_is_offloaded(&rdp->cblist))
1433 : return;
1434 : if (rcu_try_advance_all_cbs())
1435 : invoke_rcu_core();
1436 : }
1437 :
1438 : #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1439 :
1440 : #ifdef CONFIG_RCU_NOCB_CPU
1441 :
1442 : /*
1443 : * Offload callback processing from the boot-time-specified set of CPUs
1444 : * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads
1445 : * created that pull the callbacks from the corresponding CPU, wait for
1446 : * a grace period to elapse, and invoke the callbacks. These kthreads
1447 : * are organized into GP kthreads, which manage incoming callbacks, wait for
1448 : * grace periods, and awaken CB kthreads, and the CB kthreads, which only
1449 : * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs
1450 : * do a wake_up() on their GP kthread when they insert a callback into any
1451 : * empty list, unless the rcu_nocb_poll boot parameter has been specified,
1452 : * in which case each kthread actively polls its CPU. (Which isn't so great
1453 : * for energy efficiency, but which does reduce RCU's overhead on that CPU.)
1454 : *
1455 : * This is intended to be used in conjunction with Frederic Weisbecker's
1456 : * adaptive-idle work, which would seriously reduce OS jitter on CPUs
1457 : * running CPU-bound user-mode computations.
1458 : *
1459 : * Offloading of callbacks can also be used as an energy-efficiency
1460 : * measure because CPUs with no RCU callbacks queued are more aggressive
1461 : * about entering dyntick-idle mode.
1462 : */
1463 :
1464 :
1465 : /*
1466 : * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
1467 : * The string after the "rcu_nocbs=" is either "all" for all CPUs, or a
1468 : * comma-separated list of CPUs and/or CPU ranges. If an invalid list is
1469 : * given, a warning is emitted and all CPUs are offloaded.
1470 : */
1471 : static int __init rcu_nocb_setup(char *str)
1472 : {
1473 : alloc_bootmem_cpumask_var(&rcu_nocb_mask);
1474 : if (!strcasecmp(str, "all"))
1475 : cpumask_setall(rcu_nocb_mask);
1476 : else
1477 : if (cpulist_parse(str, rcu_nocb_mask)) {
1478 : pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
1479 : cpumask_setall(rcu_nocb_mask);
1480 : }
1481 : return 1;
1482 : }
1483 : __setup("rcu_nocbs=", rcu_nocb_setup);
1484 :
1485 : static int __init parse_rcu_nocb_poll(char *arg)
1486 : {
1487 : rcu_nocb_poll = true;
1488 : return 0;
1489 : }
1490 : early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
1491 :
1492 : /*
1493 : * Don't bother bypassing ->cblist if the call_rcu() rate is low.
1494 : * After all, the main point of bypassing is to avoid lock contention
1495 : * on ->nocb_lock, which only can happen at high call_rcu() rates.
1496 : */
1497 : int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ;
1498 : module_param(nocb_nobypass_lim_per_jiffy, int, 0);
1499 :
1500 : /*
1501 : * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the
1502 : * lock isn't immediately available, increment ->nocb_lock_contended to
1503 : * flag the contention.
1504 : */
1505 : static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
1506 : __acquires(&rdp->nocb_bypass_lock)
1507 : {
1508 : lockdep_assert_irqs_disabled();
1509 : if (raw_spin_trylock(&rdp->nocb_bypass_lock))
1510 : return;
1511 : atomic_inc(&rdp->nocb_lock_contended);
1512 : WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
1513 : smp_mb__after_atomic(); /* atomic_inc() before lock. */
1514 : raw_spin_lock(&rdp->nocb_bypass_lock);
1515 : smp_mb__before_atomic(); /* atomic_dec() after lock. */
1516 : atomic_dec(&rdp->nocb_lock_contended);
1517 : }
1518 :
1519 : /*
1520 : * Spinwait until the specified rcu_data structure's ->nocb_lock is
1521 : * not contended. Please note that this is extremely special-purpose,
1522 : * relying on the fact that at most two kthreads and one CPU contend for
1523 : * this lock, and also that the two kthreads are guaranteed to have frequent
1524 : * grace-period-duration time intervals between successive acquisitions
1525 : * of the lock. This allows us to use an extremely simple throttling
1526 : * mechanism, and further to apply it only to the CPU doing floods of
1527 : * call_rcu() invocations. Don't try this at home!
1528 : */
1529 : static void rcu_nocb_wait_contended(struct rcu_data *rdp)
1530 : {
1531 : WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
1532 : while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended)))
1533 : cpu_relax();
1534 : }
1535 :
1536 : /*
1537 : * Conditionally acquire the specified rcu_data structure's
1538 : * ->nocb_bypass_lock.
1539 : */
1540 : static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp)
1541 : {
1542 : lockdep_assert_irqs_disabled();
1543 : return raw_spin_trylock(&rdp->nocb_bypass_lock);
1544 : }
1545 :
1546 : /*
1547 : * Release the specified rcu_data structure's ->nocb_bypass_lock.
1548 : */
1549 : static void rcu_nocb_bypass_unlock(struct rcu_data *rdp)
1550 : __releases(&rdp->nocb_bypass_lock)
1551 : {
1552 : lockdep_assert_irqs_disabled();
1553 : raw_spin_unlock(&rdp->nocb_bypass_lock);
1554 : }
1555 :
1556 : /*
1557 : * Acquire the specified rcu_data structure's ->nocb_lock, but only
1558 : * if it corresponds to a no-CBs CPU.
1559 : */
1560 : static void rcu_nocb_lock(struct rcu_data *rdp)
1561 : {
1562 : lockdep_assert_irqs_disabled();
1563 : if (!rcu_segcblist_is_offloaded(&rdp->cblist))
1564 : return;
1565 : raw_spin_lock(&rdp->nocb_lock);
1566 : }
1567 :
1568 : /*
1569 : * Release the specified rcu_data structure's ->nocb_lock, but only
1570 : * if it corresponds to a no-CBs CPU.
1571 : */
1572 : static void rcu_nocb_unlock(struct rcu_data *rdp)
1573 : {
1574 : if (rcu_segcblist_is_offloaded(&rdp->cblist)) {
1575 : lockdep_assert_irqs_disabled();
1576 : raw_spin_unlock(&rdp->nocb_lock);
1577 : }
1578 : }
1579 :
1580 : /*
1581 : * Release the specified rcu_data structure's ->nocb_lock and restore
1582 : * interrupts, but only if it corresponds to a no-CBs CPU.
1583 : */
1584 : static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
1585 : unsigned long flags)
1586 : {
1587 : if (rcu_segcblist_is_offloaded(&rdp->cblist)) {
1588 : lockdep_assert_irqs_disabled();
1589 : raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
1590 : } else {
1591 : local_irq_restore(flags);
1592 : }
1593 : }
1594 :
1595 : /* Lockdep check that ->cblist may be safely accessed. */
1596 : static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
1597 : {
1598 : lockdep_assert_irqs_disabled();
1599 : if (rcu_segcblist_is_offloaded(&rdp->cblist))
1600 : lockdep_assert_held(&rdp->nocb_lock);
1601 : }
1602 :
1603 : /*
1604 : * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
1605 : * grace period.
1606 : */
1607 : static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
1608 : {
1609 : swake_up_all(sq);
1610 : }
1611 :
1612 : static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
1613 : {
1614 : return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1];
1615 : }
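/*
 * Editor's note: rcu_seq_ctr() yields the grace-period counter portion
 * of ->gp_seq, so the "& 0x1" above makes successive grace periods
 * alternate between nocb_gp_wq[0] and nocb_gp_wq[1]. nocb_gp_wait()
 * below sleeps on the element matching the grace period it needs, and
 * rcu_nocb_gp_cleanup() wakes all waiters on the element it is handed,
 * leaving waiters for the following grace period undisturbed.
 */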
1616 :
1617 : static void rcu_init_one_nocb(struct rcu_node *rnp)
1618 : {
1619 : init_swait_queue_head(&rnp->nocb_gp_wq[0]);
1620 : init_swait_queue_head(&rnp->nocb_gp_wq[1]);
1621 : }
1622 :
1623 : /* Is the specified CPU a no-CBs CPU? */
1624 : bool rcu_is_nocb_cpu(int cpu)
1625 : {
1626 : if (cpumask_available(rcu_nocb_mask))
1627 : return cpumask_test_cpu(cpu, rcu_nocb_mask);
1628 : return false;
1629 : }
1630 :
1631 : /*
1632 : * Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock
1633 : * and this function releases it.
1634 : */
1635 : static bool wake_nocb_gp(struct rcu_data *rdp, bool force,
1636 : unsigned long flags)
1637 : __releases(rdp->nocb_lock)
1638 : {
1639 : bool needwake = false;
1640 : struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
1641 :
1642 : lockdep_assert_held(&rdp->nocb_lock);
1643 : if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) {
1644 : trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1645 : TPS("AlreadyAwake"));
1646 : rcu_nocb_unlock_irqrestore(rdp, flags);
1647 : return false;
1648 : }
1649 : del_timer(&rdp->nocb_timer);
1650 : rcu_nocb_unlock_irqrestore(rdp, flags);
1651 : raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
1652 : if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
1653 : WRITE_ONCE(rdp_gp->nocb_gp_sleep, false);
1654 : needwake = true;
1655 : trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
1656 : }
1657 : raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
1658 : if (needwake)
1659 : wake_up_process(rdp_gp->nocb_gp_kthread);
1660 :
1661 : return needwake;
1662 : }
1663 :
1664 : /*
1665 : * Arrange to wake the GP kthread for this NOCB group at some future
1666 : * time when it is safe to do so.
1667 : */
1668 : static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
1669 : const char *reason)
1670 : {
1671 : if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_OFF)
1672 : return;
1673 : if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
1674 : mod_timer(&rdp->nocb_timer, jiffies + 1);
1675 : if (rdp->nocb_defer_wakeup < waketype)
1676 : WRITE_ONCE(rdp->nocb_defer_wakeup, waketype);
1677 : trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
1678 : }
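/*
 * Editor's note: callers in this file defer with either RCU_NOCB_WAKE
 * ("WakeEmptyIsDeferred") or RCU_NOCB_WAKE_FORCE ("WakeOvfIsDeferred").
 * The "<" comparison above only ever escalates a pending deferred
 * wakeup, never downgrades it, and the one-jiffy ->nocb_timer armed
 * here is serviced by do_nocb_deferred_wakeup_timer(), which turns a
 * FORCE request into a forced wake_nocb_gp().
 */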
1679 :
1680 : /*
1681 : * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
1682 : * However, if there is a callback to be enqueued and if ->nocb_bypass
1683 : * proves to be initially empty, just return false because the no-CB GP
1684 : * kthread may need to be awakened in this case.
1685 : *
1686 : * Note that this function always returns true if rhp is NULL.
1687 : */
1688 : static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
1689 : unsigned long j)
1690 : {
1691 : struct rcu_cblist rcl;
1692 :
1693 : WARN_ON_ONCE(!rcu_segcblist_is_offloaded(&rdp->cblist));
1694 : rcu_lockdep_assert_cblist_protected(rdp);
1695 : lockdep_assert_held(&rdp->nocb_bypass_lock);
1696 : if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) {
1697 : raw_spin_unlock(&rdp->nocb_bypass_lock);
1698 : return false;
1699 : }
1700 : /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */
1701 : if (rhp)
1702 : rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
1703 : rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
1704 : rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl);
1705 : WRITE_ONCE(rdp->nocb_bypass_first, j);
1706 : rcu_nocb_bypass_unlock(rdp);
1707 : return true;
1708 : }
1709 :
1710 : /*
1711 : * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
1712 : * However, if there is a callback to be enqueued and if ->nocb_bypass
1713 : * proves to be initially empty, just return false because the no-CB GP
1714 : * kthread may need to be awakened in this case.
1715 : *
1716 : * Note that this function always returns true if rhp is NULL.
1717 : */
1718 : static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
1719 : unsigned long j)
1720 : {
1721 : if (!rcu_segcblist_is_offloaded(&rdp->cblist))
1722 : return true;
1723 : rcu_lockdep_assert_cblist_protected(rdp);
1724 : rcu_nocb_bypass_lock(rdp);
1725 : return rcu_nocb_do_flush_bypass(rdp, rhp, j);
1726 : }
1727 :
1728 : /*
1729 : * If the ->nocb_bypass_lock is immediately available, flush the
1730 : * ->nocb_bypass queue into ->cblist.
1731 : */
1732 : static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
1733 : {
1734 : rcu_lockdep_assert_cblist_protected(rdp);
1735 : if (!rcu_segcblist_is_offloaded(&rdp->cblist) ||
1736 : !rcu_nocb_bypass_trylock(rdp))
1737 : return;
1738 : WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j));
1739 : }
1740 :
1741 : /*
1742 : * See whether it is appropriate to use the ->nocb_bypass list in order
1743 : * to control contention on ->nocb_lock. A limited number of direct
1744 : * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass
1745 : * is non-empty, further callbacks must be placed into ->nocb_bypass,
1746 : * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch
1747 : * back to direct use of ->cblist. However, ->nocb_bypass should not be
1748 : * used if ->cblist is empty, because otherwise callbacks can be stranded
1749 : * on ->nocb_bypass because we cannot count on the current CPU ever again
1750 : * invoking call_rcu(). The general rule is that if ->nocb_bypass is
1751 : * non-empty, the corresponding no-CBs grace-period kthread must not be
1752 : * in an indefinite sleep state.
1753 : *
1754 : * Finally, it is not permitted to use the bypass during early boot,
1755 : * as doing so would confuse the auto-initialization code. Besides
1756 : * which, there is no point in worrying about lock contention while
1757 : * there is only one CPU in operation.
1758 : */
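/*
 * Editor's summary of the decisions made below (the code itself is
 * authoritative): an rdp that is not offloaded, or a call arriving
 * before the scheduler is fully running, gets "return false" so that
 * the caller enqueues directly onto ->cblist. If fewer than
 * nocb_nobypass_lim_per_jiffy callbacks have arrived this jiffy, the
 * bypass is flushed and the caller again enqueues directly. If the
 * bypass is stale or holds at least qhimark callbacks, it is flushed
 * together with the new callback, normally returning true. Otherwise
 * the new callback is appended to ->nocb_bypass, a GP-kthread wakeup
 * is considered if the bypass had been empty, and true is returned.
 */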
1759 : static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
1760 : bool *was_alldone, unsigned long flags)
1761 : {
1762 : unsigned long c;
1763 : unsigned long cur_gp_seq;
1764 : unsigned long j = jiffies;
1765 : long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
1766 :
1767 : if (!rcu_segcblist_is_offloaded(&rdp->cblist)) {
1768 : *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
1769 : return false; /* Not offloaded, no bypassing. */
1770 : }
1771 : lockdep_assert_irqs_disabled();
1772 :
1773 : // Don't use ->nocb_bypass during early boot.
1774 : if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
1775 : rcu_nocb_lock(rdp);
1776 : WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
1777 : *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
1778 : return false;
1779 : }
1780 :
1781 : // If we have advanced to a new jiffy, reset counts to allow moving
1782 : // back from ->nocb_bypass to ->cblist; otherwise count this enqueue.
1783 : if (j == rdp->nocb_nobypass_last) {
1784 : c = rdp->nocb_nobypass_count + 1;
1785 : } else {
1786 : WRITE_ONCE(rdp->nocb_nobypass_last, j);
1787 : c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy;
1788 : if (ULONG_CMP_LT(rdp->nocb_nobypass_count,
1789 : nocb_nobypass_lim_per_jiffy))
1790 : c = 0;
1791 : else if (c > nocb_nobypass_lim_per_jiffy)
1792 : c = nocb_nobypass_lim_per_jiffy;
1793 : }
1794 : WRITE_ONCE(rdp->nocb_nobypass_count, c);
1795 :
1796 : // If there hasn't yet been all that many ->cblist enqueues
1797 : // this jiffy, tell the caller to enqueue onto ->cblist. But flush
1798 : // ->nocb_bypass first.
1799 : if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) {
1800 : rcu_nocb_lock(rdp);
1801 : *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
1802 : if (*was_alldone)
1803 : trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1804 : TPS("FirstQ"));
1805 : WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j));
1806 : WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
1807 : return false; // Caller must enqueue the callback.
1808 : }
1809 :
1810 : // If ->nocb_bypass has been used too long or is too full,
1811 : // flush ->nocb_bypass to ->cblist.
1812 : if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) ||
1813 : ncbs >= qhimark) {
1814 : rcu_nocb_lock(rdp);
1815 : if (!rcu_nocb_flush_bypass(rdp, rhp, j)) {
1816 : *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
1817 : if (*was_alldone)
1818 : trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1819 : TPS("FirstQ"));
1820 : WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
1821 : return false; // Caller must enqueue the callback.
1822 : }
1823 : if (j != rdp->nocb_gp_adv_time &&
1824 : rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
1825 : rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
1826 : rcu_advance_cbs_nowake(rdp->mynode, rdp);
1827 : rdp->nocb_gp_adv_time = j;
1828 : }
1829 : rcu_nocb_unlock_irqrestore(rdp, flags);
1830 : return true; // Callback already enqueued.
1831 : }
1832 :
1833 : // We need to use the bypass.
1834 : rcu_nocb_wait_contended(rdp);
1835 : rcu_nocb_bypass_lock(rdp);
1836 : ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
1837 : rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
1838 : rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
1839 : if (!ncbs) {
1840 : WRITE_ONCE(rdp->nocb_bypass_first, j);
1841 : trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
1842 : }
1843 : rcu_nocb_bypass_unlock(rdp);
1844 : smp_mb(); /* Order enqueue before wake. */
1845 : if (ncbs) {
1846 : local_irq_restore(flags);
1847 : } else {
1848 : // No-CBs GP kthread might be indefinitely asleep; if so, wake it.
1849 : rcu_nocb_lock(rdp); // Rare during call_rcu() flood.
1850 : if (!rcu_segcblist_pend_cbs(&rdp->cblist)) {
1851 : trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1852 : TPS("FirstBQwake"));
1853 : __call_rcu_nocb_wake(rdp, true, flags);
1854 : } else {
1855 : trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1856 : TPS("FirstBQnoWake"));
1857 : rcu_nocb_unlock_irqrestore(rdp, flags);
1858 : }
1859 : }
1860 : return true; // Callback already enqueued.
1861 : }
1862 :
1863 : /*
1864 : * Awaken the no-CBs grace-period kthread if needed, either due to it
1865 : * legitimately being asleep or due to overload conditions.
1866 : *
1867 : * If warranted, also wake up the kthread servicing this CPU's queues.
1868 : */
1869 : static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
1870 : unsigned long flags)
1871 : __releases(rdp->nocb_lock)
1872 : {
1873 : unsigned long cur_gp_seq;
1874 : unsigned long j;
1875 : long len;
1876 : struct task_struct *t;
1877 :
1878 : // If we are being polled or there is no kthread, just leave.
1879 : t = READ_ONCE(rdp->nocb_gp_kthread);
1880 : if (rcu_nocb_poll || !t) {
1881 : trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1882 : TPS("WakeNotPoll"));
1883 : rcu_nocb_unlock_irqrestore(rdp, flags);
1884 : return;
1885 : }
1886 : // Need to actually do a wakeup.
1887 : len = rcu_segcblist_n_cbs(&rdp->cblist);
1888 : if (was_alldone) {
1889 : rdp->qlen_last_fqs_check = len;
1890 : if (!irqs_disabled_flags(flags)) {
1891 : /* ... if queue was empty ... */
1892 : wake_nocb_gp(rdp, false, flags);
1893 : trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1894 : TPS("WakeEmpty"));
1895 : } else {
1896 : wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
1897 : TPS("WakeEmptyIsDeferred"));
1898 : rcu_nocb_unlock_irqrestore(rdp, flags);
1899 : }
1900 : } else if (len > rdp->qlen_last_fqs_check + qhimark) {
1901 : /* ... or if many callbacks queued. */
1902 : rdp->qlen_last_fqs_check = len;
1903 : j = jiffies;
1904 : if (j != rdp->nocb_gp_adv_time &&
1905 : rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
1906 : rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
1907 : rcu_advance_cbs_nowake(rdp->mynode, rdp);
1908 : rdp->nocb_gp_adv_time = j;
1909 : }
1910 : smp_mb(); /* Enqueue before timer_pending(). */
1911 : if ((rdp->nocb_cb_sleep ||
1912 : !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
1913 : !timer_pending(&rdp->nocb_bypass_timer))
1914 : wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
1915 : TPS("WakeOvfIsDeferred"));
1916 : rcu_nocb_unlock_irqrestore(rdp, flags);
1917 : } else {
1918 : trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
1919 : rcu_nocb_unlock_irqrestore(rdp, flags);
1920 : }
1921 : return;
1922 : }
1923 :
1924 : /* Wake up the no-CBs GP kthread to flush ->nocb_bypass. */
1925 : static void do_nocb_bypass_wakeup_timer(struct timer_list *t)
1926 : {
1927 : unsigned long flags;
1928 : struct rcu_data *rdp = from_timer(rdp, t, nocb_bypass_timer);
1929 :
1930 : trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
1931 : rcu_nocb_lock_irqsave(rdp, flags);
1932 : smp_mb__after_spinlock(); /* Timer expire before wakeup. */
1933 : __call_rcu_nocb_wake(rdp, true, flags);
1934 : }
1935 :
1936 : /*
1937 : * Check if we ignore this rdp.
1938 : *
1939 : * We check this without holding the nocb lock, but we make
1940 : * sure not to miss a freshly offloaded rdp thanks to the
1941 : * following ordering:
1942 : *
1943 : * rdp_offload_toggle() nocb_gp_enabled_cb()
1944 : * ------------------------- ----------------------------
1945 : * WRITE flags LOCK nocb_gp_lock
1946 : * LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep
1947 : * READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock
1948 : * UNLOCK nocb_gp_lock READ flags
1949 : */
1950 : static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp)
1951 : {
1952 : u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP;
1953 :
1954 : return rcu_segcblist_test_flags(&rdp->cblist, flags);
1955 : }
1956 :
1957 : static inline bool nocb_gp_update_state(struct rcu_data *rdp, bool *needwake_state)
1958 : {
1959 : struct rcu_segcblist *cblist = &rdp->cblist;
1960 :
1961 : if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
1962 : if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
1963 : rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
1964 : if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
1965 : *needwake_state = true;
1966 : }
1967 : return true;
1968 : }
1969 :
1970 : /*
1971 : * De-offloading. Clear our flag and notify the de-offload worker.
1972 : * We will ignore this rdp until it is re-offloaded.
1973 : */
1974 : WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
1975 : rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
1976 : if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
1977 : *needwake_state = true;
1978 : return false;
1979 : }
1980 :
1981 :
1982 : /*
1983 : * No-CBs GP kthreads come here to wait for additional callbacks to show up
1984 : * or for grace periods to end.
1985 : */
1986 : static void nocb_gp_wait(struct rcu_data *my_rdp)
1987 : {
1988 : bool bypass = false;
1989 : long bypass_ncbs;
1990 : int __maybe_unused cpu = my_rdp->cpu;
1991 : unsigned long cur_gp_seq;
1992 : unsigned long flags;
1993 : bool gotcbs = false;
1994 : unsigned long j = jiffies;
1995 : bool needwait_gp = false; // This prevents actual uninitialized use.
1996 : bool needwake;
1997 : bool needwake_gp;
1998 : struct rcu_data *rdp;
1999 : struct rcu_node *rnp;
2000 : unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
2001 : bool wasempty = false;
2002 :
2003 : /*
2004 : * Each pass through the following loop checks for CBs and for the
2005 : * nearest grace period (if any) to wait for next. The CB kthreads
2006 : * and the global grace-period kthread are awakened if needed.
2007 : */
2008 : WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp);
2009 : for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) {
2010 : bool needwake_state = false;
2011 :
2012 : if (!nocb_gp_enabled_cb(rdp))
2013 : continue;
2014 : trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
2015 : rcu_nocb_lock_irqsave(rdp, flags);
2016 : if (!nocb_gp_update_state(rdp, &needwake_state)) {
2017 : rcu_nocb_unlock_irqrestore(rdp, flags);
2018 : if (needwake_state)
2019 : swake_up_one(&rdp->nocb_state_wq);
2020 : continue;
2021 : }
2022 : bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
2023 : if (bypass_ncbs &&
2024 : (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
2025 : bypass_ncbs > 2 * qhimark)) {
2026 : // Bypass full or old, so flush it.
2027 : (void)rcu_nocb_try_flush_bypass(rdp, j);
2028 : bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
2029 : } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
2030 : rcu_nocb_unlock_irqrestore(rdp, flags);
2031 : if (needwake_state)
2032 : swake_up_one(&rdp->nocb_state_wq);
2033 : continue; /* No callbacks here, try next. */
2034 : }
2035 : if (bypass_ncbs) {
2036 : trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
2037 : TPS("Bypass"));
2038 : bypass = true;
2039 : }
2040 : rnp = rdp->mynode;
2041 : if (bypass) { // Avoid race with first bypass CB.
2042 : WRITE_ONCE(my_rdp->nocb_defer_wakeup,
2043 : RCU_NOCB_WAKE_NOT);
2044 : del_timer(&my_rdp->nocb_timer);
2045 : }
2046 : // Advance callbacks if helpful and contention is low.
2047 : needwake_gp = false;
2048 : if (!rcu_segcblist_restempty(&rdp->cblist,
2049 : RCU_NEXT_READY_TAIL) ||
2050 : (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
2051 : rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) {
2052 : raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
2053 : needwake_gp = rcu_advance_cbs(rnp, rdp);
2054 : wasempty = rcu_segcblist_restempty(&rdp->cblist,
2055 : RCU_NEXT_READY_TAIL);
2056 : raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */
2057 : }
2058 : // Need to wait on some grace period?
2059 : WARN_ON_ONCE(wasempty &&
2060 : !rcu_segcblist_restempty(&rdp->cblist,
2061 : RCU_NEXT_READY_TAIL));
2062 : if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) {
2063 : if (!needwait_gp ||
2064 : ULONG_CMP_LT(cur_gp_seq, wait_gp_seq))
2065 : wait_gp_seq = cur_gp_seq;
2066 : needwait_gp = true;
2067 : trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
2068 : TPS("NeedWaitGP"));
2069 : }
2070 : if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
2071 : needwake = rdp->nocb_cb_sleep;
2072 : WRITE_ONCE(rdp->nocb_cb_sleep, false);
2073 : smp_mb(); /* CB invocation -after- GP end. */
2074 : } else {
2075 : needwake = false;
2076 : }
2077 : rcu_nocb_unlock_irqrestore(rdp, flags);
2078 : if (needwake) {
2079 : swake_up_one(&rdp->nocb_cb_wq);
2080 : gotcbs = true;
2081 : }
2082 : if (needwake_gp)
2083 : rcu_gp_kthread_wake();
2084 : if (needwake_state)
2085 : swake_up_one(&rdp->nocb_state_wq);
2086 : }
2087 :
2088 : my_rdp->nocb_gp_bypass = bypass;
2089 : my_rdp->nocb_gp_gp = needwait_gp;
2090 : my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
2091 : if (bypass && !rcu_nocb_poll) {
2092 : // At least one child with non-empty ->nocb_bypass, so set
2093 : // timer in order to avoid stranding its callbacks.
2094 : raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
2095 : mod_timer(&my_rdp->nocb_bypass_timer, j + 2);
2096 : raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
2097 : }
2098 : if (rcu_nocb_poll) {
2099 : /* Polling, so trace if first poll in the series. */
2100 : if (gotcbs)
2101 : trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
2102 : schedule_timeout_idle(1);
2103 : } else if (!needwait_gp) {
2104 : /* Wait for callbacks to appear. */
2105 : trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
2106 : swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
2107 : !READ_ONCE(my_rdp->nocb_gp_sleep));
2108 : trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
2109 : } else {
2110 : rnp = my_rdp->mynode;
2111 : trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
2112 : swait_event_interruptible_exclusive(
2113 : rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1],
2114 : rcu_seq_done(&rnp->gp_seq, wait_gp_seq) ||
2115 : !READ_ONCE(my_rdp->nocb_gp_sleep));
2116 : trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
2117 : }
2118 : if (!rcu_nocb_poll) {
2119 : raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
2120 : if (bypass)
2121 : del_timer(&my_rdp->nocb_bypass_timer);
2122 : WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
2123 : raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
2124 : }
2125 : my_rdp->nocb_gp_seq = -1;
2126 : WARN_ON(signal_pending(current));
2127 : }
2128 :
2129 : /*
2130 : * No-CBs grace-period-wait kthread. There is one of these per group
2131 : * of CPUs, created only after at least one CPU in that group has come
2132 : * online since boot. This kthread checks for newly posted
2133 : * callbacks from any of the CPUs it is responsible for, waits for a
2134 : * grace period, then awakens all of the rcu_nocb_cb_kthread() instances
2135 : * that then have callback-invocation work to do.
2136 : */
2137 : static int rcu_nocb_gp_kthread(void *arg)
2138 : {
2139 : struct rcu_data *rdp = arg;
2140 :
2141 : for (;;) {
2142 : WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1);
2143 : nocb_gp_wait(rdp);
2144 : cond_resched_tasks_rcu_qs();
2145 : }
2146 : return 0;
2147 : }
2148 :
2149 : static inline bool nocb_cb_can_run(struct rcu_data *rdp)
2150 : {
2151 : u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB;
2152 : return rcu_segcblist_test_flags(&rdp->cblist, flags);
2153 : }
2154 :
2155 : static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
2156 : {
2157 : return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep);
2158 : }
2159 :
2160 : /*
2161 : * Invoke any ready callbacks from the corresponding no-CBs CPU,
2162 : * then, if there are no more, wait for more to appear.
2163 : */
2164 : static void nocb_cb_wait(struct rcu_data *rdp)
2165 : {
2166 : struct rcu_segcblist *cblist = &rdp->cblist;
2167 : unsigned long cur_gp_seq;
2168 : unsigned long flags;
2169 : bool needwake_state = false;
2170 : bool needwake_gp = false;
2171 : struct rcu_node *rnp = rdp->mynode;
2172 :
2173 : local_irq_save(flags);
2174 : rcu_momentary_dyntick_idle();
2175 : local_irq_restore(flags);
2176 : local_bh_disable();
2177 : rcu_do_batch(rdp);
2178 : local_bh_enable();
2179 : lockdep_assert_irqs_enabled();
2180 : rcu_nocb_lock_irqsave(rdp, flags);
2181 : if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) &&
2182 : rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
2183 : raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
2184 : needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
2185 : raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
2186 : }
2187 :
2188 : WRITE_ONCE(rdp->nocb_cb_sleep, true);
2189 :
2190 : if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
2191 : if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) {
2192 : rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB);
2193 : if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
2194 : needwake_state = true;
2195 : }
2196 : if (rcu_segcblist_ready_cbs(cblist))
2197 : WRITE_ONCE(rdp->nocb_cb_sleep, false);
2198 : } else {
2199 : /*
2200 : * De-offloading. Clear our flag and notify the de-offload worker.
2201 : * We won't touch the callbacks, and will keep sleeping until we
2202 : * are re-offloaded.
2203 : */
2204 : WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB));
2205 : rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB);
2206 : if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
2207 : needwake_state = true;
2208 : }
2209 :
2210 : if (rdp->nocb_cb_sleep)
2211 : trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
2212 :
2213 : rcu_nocb_unlock_irqrestore(rdp, flags);
2214 : if (needwake_gp)
2215 : rcu_gp_kthread_wake();
2216 :
2217 : if (needwake_state)
2218 : swake_up_one(&rdp->nocb_state_wq);
2219 :
2220 : do {
2221 : swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
2222 : nocb_cb_wait_cond(rdp));
2223 :
2224 : // VVV Ensure CB invocation follows _sleep test.
2225 : if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^
2226 : WARN_ON(signal_pending(current));
2227 : trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
2228 : }
2229 : } while (!nocb_cb_can_run(rdp));
2230 : }
2231 :
2232 : /*
2233 : * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke
2234 : * nocb_cb_wait() to do the dirty work.
2235 : */
2236 : static int rcu_nocb_cb_kthread(void *arg)
2237 : {
2238 : struct rcu_data *rdp = arg;
2239 :
2240 : // Each pass through this loop does one callback batch, and,
2241 : // if there are no more ready callbacks, waits for them.
2242 : for (;;) {
2243 : nocb_cb_wait(rdp);
2244 : cond_resched_tasks_rcu_qs();
2245 : }
2246 : return 0;
2247 : }
2248 :
2249 : /* Is a deferred wakeup of rcu_nocb_kthread() required? */
2250 : static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
2251 : {
2252 : return READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT;
2253 : }
2254 :
2255 : /* Do a deferred wakeup of rcu_nocb_kthread(). */
2256 : static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
2257 : {
2258 : unsigned long flags;
2259 : int ndw;
2260 : int ret;
2261 :
2262 : rcu_nocb_lock_irqsave(rdp, flags);
2263 : if (!rcu_nocb_need_deferred_wakeup(rdp)) {
2264 : rcu_nocb_unlock_irqrestore(rdp, flags);
2265 : return false;
2266 : }
2267 : ndw = READ_ONCE(rdp->nocb_defer_wakeup);
2268 : WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
2269 : ret = wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
2270 : trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
2271 :
2272 : return ret;
2273 : }
2274 :
2275 : /* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
2276 : static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
2277 : {
2278 : struct rcu_data *rdp = from_timer(rdp, t, nocb_timer);
2279 :
2280 : do_nocb_deferred_wakeup_common(rdp);
2281 : }
2282 :
2283 : /*
2284 : * Do a deferred wakeup of rcu_nocb_kthread() from fastpath.
2285 : * This means we do an inexact common-case check. Note that if
2286 : * we miss, ->nocb_timer will eventually clean things up.
2287 : */
2288 : static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
2289 : {
2290 : if (rcu_nocb_need_deferred_wakeup(rdp))
2291 : return do_nocb_deferred_wakeup_common(rdp);
2292 : return false;
2293 : }
2294 :
2295 : void rcu_nocb_flush_deferred_wakeup(void)
2296 : {
2297 : do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data));
2298 : }
2299 : EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup);
2300 :
2301 : static int rdp_offload_toggle(struct rcu_data *rdp,
2302 : bool offload, unsigned long flags)
2303 : __releases(rdp->nocb_lock)
2304 : {
2305 : struct rcu_segcblist *cblist = &rdp->cblist;
2306 : struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
2307 : bool wake_gp = false;
2308 :
2309 : rcu_segcblist_offload(cblist, offload);
2310 :
2311 : if (rdp->nocb_cb_sleep)
2312 : rdp->nocb_cb_sleep = false;
2313 : rcu_nocb_unlock_irqrestore(rdp, flags);
2314 :
2315 : /*
2316 : * Ignore former value of nocb_cb_sleep and force wake up as it could
2317 : * have been spuriously set to false already.
2318 : */
2319 : swake_up_one(&rdp->nocb_cb_wq);
2320 :
2321 : raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
2322 : if (rdp_gp->nocb_gp_sleep) {
2323 : rdp_gp->nocb_gp_sleep = false;
2324 : wake_gp = true;
2325 : }
2326 : raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
2327 :
2328 : if (wake_gp)
2329 : wake_up_process(rdp_gp->nocb_gp_kthread);
2330 :
2331 : return 0;
2332 : }
2333 :
2334 : static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp)
2335 : {
2336 : struct rcu_segcblist *cblist = &rdp->cblist;
2337 : unsigned long flags;
2338 : int ret;
2339 :
2340 : pr_info("De-offloading %d\n", rdp->cpu);
2341 :
2342 : rcu_nocb_lock_irqsave(rdp, flags);
2343 : /*
2344 : * If there is still offloaded work pending, the offline
2345 : * CPU won't be much help in handling it.
2346 : */
2347 : if (cpu_is_offline(rdp->cpu) && !rcu_segcblist_empty(&rdp->cblist)) {
2348 : rcu_nocb_unlock_irqrestore(rdp, flags);
2349 : return -EBUSY;
2350 : }
2351 :
2352 : ret = rdp_offload_toggle(rdp, false, flags);
2353 : swait_event_exclusive(rdp->nocb_state_wq,
2354 : !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
2355 : SEGCBLIST_KTHREAD_GP));
2356 : rcu_nocb_lock_irqsave(rdp, flags);
2357 : /* Make sure nocb timer won't stay around */
2358 : WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_OFF);
2359 : rcu_nocb_unlock_irqrestore(rdp, flags);
2360 : del_timer_sync(&rdp->nocb_timer);
2361 :
2362 : /*
2363 : * Flush bypass. While IRQs are disabled and once we set
2364 : * SEGCBLIST_SOFTIRQ_ONLY, no callback is supposed to be
2365 : * enqueued on bypass.
2366 : */
2367 : rcu_nocb_lock_irqsave(rdp, flags);
2368 : rcu_nocb_flush_bypass(rdp, NULL, jiffies);
2369 : rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY);
2370 : /*
2371 : * With SEGCBLIST_SOFTIRQ_ONLY, we can't use
2372 : * rcu_nocb_unlock_irqrestore() anymore. Theoretically we
2373 : * could set SEGCBLIST_SOFTIRQ_ONLY with the ->nocb_lock released and IRQs
2374 : * disabled now, but let's be paranoid.
2375 : */
2376 : raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
2377 :
2378 : return ret;
2379 : }
2380 :
2381 : static long rcu_nocb_rdp_deoffload(void *arg)
2382 : {
2383 : struct rcu_data *rdp = arg;
2384 :
2385 : WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
2386 : return __rcu_nocb_rdp_deoffload(rdp);
2387 : }
2388 :
2389 : int rcu_nocb_cpu_deoffload(int cpu)
2390 : {
2391 : struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
2392 : int ret = 0;
2393 :
2394 : if (rdp == rdp->nocb_gp_rdp) {
2395 : pr_info("Can't deoffload an rdp GP leader (yet)\n");
2396 : return -EINVAL;
2397 : }
2398 : mutex_lock(&rcu_state.barrier_mutex);
2399 : cpus_read_lock();
2400 : if (rcu_segcblist_is_offloaded(&rdp->cblist)) {
2401 : if (cpu_online(cpu))
2402 : ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp);
2403 : else
2404 : ret = __rcu_nocb_rdp_deoffload(rdp);
2405 : if (!ret)
2406 : cpumask_clear_cpu(cpu, rcu_nocb_mask);
2407 : }
2408 : cpus_read_unlock();
2409 : mutex_unlock(&rcu_state.barrier_mutex);
2410 :
2411 : return ret;
2412 : }
2413 : EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
2414 :
2415 : static int __rcu_nocb_rdp_offload(struct rcu_data *rdp)
2416 : {
2417 : struct rcu_segcblist *cblist = &rdp->cblist;
2418 : unsigned long flags;
2419 : int ret;
2420 :
2421 : /*
2422 : * For now we only support re-offload, i.e., the rdp must have been
2423 : * offloaded at boot first.
2424 : */
2425 : if (!rdp->nocb_gp_rdp)
2426 : return -EINVAL;
2427 :
2428 : pr_info("Offloading %d\n", rdp->cpu);
2429 : /*
2430 : * Can't use rcu_nocb_lock_irqsave() while we are in
2431 : * SEGCBLIST_SOFTIRQ_ONLY mode.
2432 : */
2433 : raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
2434 : /* Re-enable nocb timer */
2435 : WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
2436 : /*
2437 : * We didn't take the nocb lock while working on the
2438 : * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode.
2439 : * All modifications previously made to rdp->cblist must be
2440 : * remotely visible to the nocb kthreads once they wake up
2441 : * and read the cblist flags.
2442 : *
2443 : * The layout against nocb_lock enforces that ordering:
2444 : *
2445 : * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait()
2446 : * ------------------------- ----------------------------
2447 : * WRITE callbacks rcu_nocb_lock()
2448 : * rcu_nocb_lock() READ flags
2449 : * WRITE flags READ callbacks
2450 : * rcu_nocb_unlock() rcu_nocb_unlock()
2451 : */
2452 : ret = rdp_offload_toggle(rdp, true, flags);
2453 : swait_event_exclusive(rdp->nocb_state_wq,
2454 : rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
2455 : rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
2456 :
2457 : return ret;
2458 : }
2459 :
2460 : static long rcu_nocb_rdp_offload(void *arg)
2461 : {
2462 : struct rcu_data *rdp = arg;
2463 :
2464 : WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
2465 : return __rcu_nocb_rdp_offload(rdp);
2466 : }
2467 :
2468 : int rcu_nocb_cpu_offload(int cpu)
2469 : {
2470 : struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
2471 : int ret = 0;
2472 :
2473 : mutex_lock(&rcu_state.barrier_mutex);
2474 : cpus_read_lock();
2475 : if (!rcu_segcblist_is_offloaded(&rdp->cblist)) {
2476 : if (cpu_online(cpu))
2477 : ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp);
2478 : else
2479 : ret = __rcu_nocb_rdp_offload(rdp);
2480 : if (!ret)
2481 : cpumask_set_cpu(cpu, rcu_nocb_mask);
2482 : }
2483 : cpus_read_unlock();
2484 : mutex_unlock(&rcu_state.barrier_mutex);
2485 :
2486 : return ret;
2487 : }
2488 : EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload);
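/*
 * Hedged usage sketch (editor's addition, not upstream code): a caller
 * such as a test module could bounce a CPU out of and back into the
 * offloaded state using the two exported helpers above. The helper
 * name below is hypothetical; error returns follow the functions
 * above, for example -EINVAL for a GP-leader rdp and -EBUSY for an
 * offline CPU that still has callbacks queued.
 */
static int __maybe_unused example_nocb_toggle(int cpu)
{
	int ret;

	ret = rcu_nocb_cpu_deoffload(cpu);	/* Back to softirq processing. */
	if (ret)
		return ret;
	return rcu_nocb_cpu_offload(cpu);	/* And offload it once more. */
}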
2489 :
2490 : void __init rcu_init_nohz(void)
2491 : {
2492 : int cpu;
2493 : bool need_rcu_nocb_mask = false;
2494 : struct rcu_data *rdp;
2495 :
2496 : #if defined(CONFIG_NO_HZ_FULL)
2497 : if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
2498 : need_rcu_nocb_mask = true;
2499 : #endif /* #if defined(CONFIG_NO_HZ_FULL) */
2500 :
2501 : if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) {
2502 : if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
2503 : pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
2504 : return;
2505 : }
2506 : }
2507 : if (!cpumask_available(rcu_nocb_mask))
2508 : return;
2509 :
2510 : #if defined(CONFIG_NO_HZ_FULL)
2511 : if (tick_nohz_full_running)
2512 : cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
2513 : #endif /* #if defined(CONFIG_NO_HZ_FULL) */
2514 :
2515 : if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
2516 : pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
2517 : cpumask_and(rcu_nocb_mask, cpu_possible_mask,
2518 : rcu_nocb_mask);
2519 : }
2520 : if (cpumask_empty(rcu_nocb_mask))
2521 : pr_info("\tOffload RCU callbacks from CPUs: (none).\n");
2522 : else
2523 : pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
2524 : cpumask_pr_args(rcu_nocb_mask));
2525 : if (rcu_nocb_poll)
2526 : pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
2527 :
2528 : for_each_cpu(cpu, rcu_nocb_mask) {
2529 : rdp = per_cpu_ptr(&rcu_data, cpu);
2530 : if (rcu_segcblist_empty(&rdp->cblist))
2531 : rcu_segcblist_init(&rdp->cblist);
2532 : rcu_segcblist_offload(&rdp->cblist, true);
2533 : rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
2534 : rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
2535 : }
2536 : rcu_organize_nocb_kthreads();
2537 : }
2538 :
2539 : /* Initialize per-rcu_data variables for no-CBs CPUs. */
2540 : static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2541 : {
2542 : init_swait_queue_head(&rdp->nocb_cb_wq);
2543 : init_swait_queue_head(&rdp->nocb_gp_wq);
2544 : init_swait_queue_head(&rdp->nocb_state_wq);
2545 : raw_spin_lock_init(&rdp->nocb_lock);
2546 : raw_spin_lock_init(&rdp->nocb_bypass_lock);
2547 : raw_spin_lock_init(&rdp->nocb_gp_lock);
2548 : timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
2549 : timer_setup(&rdp->nocb_bypass_timer, do_nocb_bypass_wakeup_timer, 0);
2550 : rcu_cblist_init(&rdp->nocb_bypass);
2551 : }
2552 :
2553 : /*
2554 : * If the specified CPU is a no-CBs CPU that does not already have its
2555 : * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread
2556 : * for this CPU's group has not yet been created, spawn it as well.
2557 : */
2558 : static void rcu_spawn_one_nocb_kthread(int cpu)
2559 : {
2560 : struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
2561 : struct rcu_data *rdp_gp;
2562 : struct task_struct *t;
2563 :
2564 : /*
2565 : * If this isn't a no-CBs CPU or if it already has an rcuo kthread,
2566 : * then nothing to do.
2567 : */
2568 : if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread)
2569 : return;
2570 :
2571 : /* If we didn't spawn the GP kthread first, reorganize! */
2572 : rdp_gp = rdp->nocb_gp_rdp;
2573 : if (!rdp_gp->nocb_gp_kthread) {
2574 : t = kthread_run(rcu_nocb_gp_kthread, rdp_gp,
2575 : "rcuog/%d", rdp_gp->cpu);
2576 : if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__))
2577 : return;
2578 : WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
2579 : }
2580 :
2581 : /* Spawn the kthread for this CPU. */
2582 : t = kthread_run(rcu_nocb_cb_kthread, rdp,
2583 : "rcuo%c/%d", rcu_state.abbr, cpu);
2584 : if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
2585 : return;
2586 : WRITE_ONCE(rdp->nocb_cb_kthread, t);
2587 : WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
2588 : }
2589 :
2590 : /*
2591 : * If the specified CPU is a no-CBs CPU that does not already have its
2592 : * rcuo kthread, spawn it.
2593 : */
2594 : static void rcu_spawn_cpu_nocb_kthread(int cpu)
2595 : {
2596 : if (rcu_scheduler_fully_active)
2597 : rcu_spawn_one_nocb_kthread(cpu);
2598 : }
2599 :
2600 : /*
2601 : * Once the scheduler is running, spawn rcuo kthreads for all online
2602 : * no-CBs CPUs. This assumes that the early_initcall()s happen before
2603 : * non-boot CPUs come online -- if this changes, we will need to add
2604 : * some mutual exclusion.
2605 : */
2606 : static void __init rcu_spawn_nocb_kthreads(void)
2607 : {
2608 : int cpu;
2609 :
2610 : for_each_online_cpu(cpu)
2611 : rcu_spawn_cpu_nocb_kthread(cpu);
2612 : }
2613 :
2614 : /* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */
2615 : static int rcu_nocb_gp_stride = -1;
2616 : module_param(rcu_nocb_gp_stride, int, 0444);
2617 :
2618 : /*
2619 : * Initialize GP-CB relationships for all no-CBs CPUs.
2620 : */
2621 : static void __init rcu_organize_nocb_kthreads(void)
2622 : {
2623 : int cpu;
2624 : bool firsttime = true;
2625 : bool gotnocbs = false;
2626 : bool gotnocbscbs = true;
2627 : int ls = rcu_nocb_gp_stride;
2628 : int nl = 0; /* Next GP kthread. */
2629 : struct rcu_data *rdp;
2630 : struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */
2631 : struct rcu_data *rdp_prev = NULL;
2632 :
2633 : if (!cpumask_available(rcu_nocb_mask))
2634 : return;
2635 : if (ls == -1) {
2636 : ls = nr_cpu_ids / int_sqrt(nr_cpu_ids);
2637 : rcu_nocb_gp_stride = ls;
2638 : }
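	/*
	 * Editor's note, worked example: with nr_cpu_ids == 64 the default
	 * stride is 64 / int_sqrt(64) = 8, so each rcuog GP kthread leads a
	 * group of up to eight CPUs; with nr_cpu_ids == 6 it is 6 / 2 = 3.
	 */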
2639 :
2640 : /*
2641 : * Each pass through this loop sets up one rcu_data structure.
2642 : * Should the corresponding CPU come online in the future, then
2643 : * we will spawn the needed set of rcu_nocb_kthread() kthreads.
2644 : */
2645 : for_each_cpu(cpu, rcu_nocb_mask) {
2646 : rdp = per_cpu_ptr(&rcu_data, cpu);
2647 : if (rdp->cpu >= nl) {
2648 : /* New GP kthread, set up for CBs & next GP. */
2649 : gotnocbs = true;
2650 : nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
2651 : rdp->nocb_gp_rdp = rdp;
2652 : rdp_gp = rdp;
2653 : if (dump_tree) {
2654 : if (!firsttime)
2655 : pr_cont("%s\n", gotnocbscbs
2656 : ? "" : " (self only)");
2657 : gotnocbscbs = false;
2658 : firsttime = false;
2659 : pr_alert("%s: No-CB GP kthread CPU %d:",
2660 : __func__, cpu);
2661 : }
2662 : } else {
2663 : /* Another CB kthread, link to previous GP kthread. */
2664 : gotnocbscbs = true;
2665 : rdp->nocb_gp_rdp = rdp_gp;
2666 : rdp_prev->nocb_next_cb_rdp = rdp;
2667 : if (dump_tree)
2668 : pr_cont(" %d", cpu);
2669 : }
2670 : rdp_prev = rdp;
2671 : }
2672 : if (gotnocbs && dump_tree)
2673 : pr_cont("%s\n", gotnocbscbs ? "" : " (self only)");
2674 : }
2675 :
2676 : /*
2677 : * Bind the current task to the offloaded CPUs. If there are no offloaded
2678 : * CPUs, leave the task unbound. Splat if the bind attempt fails.
2679 : */
2680 : void rcu_bind_current_to_nocb(void)
2681 : {
2682 : if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask))
2683 : WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask));
2684 : }
2685 : EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
2686 :
2687 : // The ->on_cpu field is available only in CONFIG_SMP=y, so...
2688 : #ifdef CONFIG_SMP
2689 : static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
2690 : {
2691 : return tsp && tsp->state == TASK_RUNNING && !tsp->on_cpu ? "!" : "";
2692 : }
2693 : #else // #ifdef CONFIG_SMP
2694 : static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
2695 : {
2696 : return "";
2697 : }
2698 : #endif // #else #ifdef CONFIG_SMP
2699 :
2700 : /*
2701 : * Dump out nocb grace-period kthread state for the specified rcu_data
2702 : * structure.
2703 : */
2704 : static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
2705 : {
2706 : struct rcu_node *rnp = rdp->mynode;
2707 :
2708 : pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n",
2709 : rdp->cpu,
2710 : "kK"[!!rdp->nocb_gp_kthread],
2711 : "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
2712 : "dD"[!!rdp->nocb_defer_wakeup],
2713 : "tT"[timer_pending(&rdp->nocb_timer)],
2714 : "bB"[timer_pending(&rdp->nocb_bypass_timer)],
2715 : "sS"[!!rdp->nocb_gp_sleep],
2716 : ".W"[swait_active(&rdp->nocb_gp_wq)],
2717 : ".W"[swait_active(&rnp->nocb_gp_wq[0])],
2718 : ".W"[swait_active(&rnp->nocb_gp_wq[1])],
2719 : ".B"[!!rdp->nocb_gp_bypass],
2720 : ".G"[!!rdp->nocb_gp_gp],
2721 : (long)rdp->nocb_gp_seq,
2722 : rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops),
2723 : rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.',
2724 : rdp->nocb_gp_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
2725 : show_rcu_should_be_on_cpu(rdp->nocb_gp_kthread));
2726 : }
2727 :
2728 : /* Dump out nocb kthread state for the specified rcu_data structure. */
2729 : static void show_rcu_nocb_state(struct rcu_data *rdp)
2730 : {
2731 : char bufw[20];
2732 : char bufr[20];
2733 : struct rcu_segcblist *rsclp = &rdp->cblist;
2734 : bool waslocked;
2735 : bool wastimer;
2736 : bool wassleep;
2737 :
2738 : if (rdp->nocb_gp_rdp == rdp)
2739 : show_rcu_nocb_gp_state(rdp);
2740 :
2741 : sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]);
2742 : sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
2743 : pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
2744 : rdp->cpu, rdp->nocb_gp_rdp->cpu,
2745 : rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1,
2746 : "kK"[!!rdp->nocb_cb_kthread],
2747 : "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
2748 : "cC"[!!atomic_read(&rdp->nocb_lock_contended)],
2749 : "lL"[raw_spin_is_locked(&rdp->nocb_lock)],
2750 : "sS"[!!rdp->nocb_cb_sleep],
2751 : ".W"[swait_active(&rdp->nocb_cb_wq)],
2752 : jiffies - rdp->nocb_bypass_first,
2753 : jiffies - rdp->nocb_nobypass_last,
2754 : rdp->nocb_nobypass_count,
2755 : ".D"[rcu_segcblist_ready_cbs(rsclp)],
2756 : ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)],
2757 : rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw,
2758 : ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)],
2759 : rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr,
2760 : ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)],
2761 : ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
2762 : rcu_segcblist_n_cbs(&rdp->cblist),
2763 : rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.',
2764 : rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1,
2765 : show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
2766 :
2767 : /* It is OK for GP kthreads to have GP state. */
2768 : if (rdp->nocb_gp_rdp == rdp)
2769 : return;
2770 :
2771 : waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock);
2772 : wastimer = timer_pending(&rdp->nocb_bypass_timer);
2773 : wassleep = swait_active(&rdp->nocb_gp_wq);
2774 : if (!rdp->nocb_gp_sleep && !waslocked && !wastimer && !wassleep)
2775 : return; /* Nothing untoward. */
2776 :
2777 : pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c%c %c\n",
2778 : "lL"[waslocked],
2779 : "dD"[!!rdp->nocb_defer_wakeup],
2780 : "tT"[wastimer],
2781 : "sS"[!!rdp->nocb_gp_sleep],
2782 : ".W"[wassleep]);
2783 : }
2784 :
2785 : #else /* #ifdef CONFIG_RCU_NOCB_CPU */
2786 :
2787 : /* No ->nocb_lock to acquire. */
2788 97537 : static void rcu_nocb_lock(struct rcu_data *rdp)
2789 : {
2790 97537 : }
2791 :
2792 : /* No ->nocb_lock to release. */
2793 6 : static void rcu_nocb_unlock(struct rcu_data *rdp)
2794 : {
2795 4 : }
2796 :
2797 : /* No ->nocb_lock to release. */
2798 97605 : static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
2799 : unsigned long flags)
2800 : {
2801 97605 : local_irq_restore(flags);
2802 97631 : }
2803 :
2804 : /* Lockdep check that ->cblist may be safely accessed. */
2805 26320 : static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
2806 : {
2807 52640 : lockdep_assert_irqs_disabled();
2808 26320 : }
2809 :
2810 2019 : static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
2811 : {
2812 2019 : }
2813 :
2814 2019 : static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
2815 : {
2816 2019 : return NULL;
2817 : }
2818 :
2819 1 : static void rcu_init_one_nocb(struct rcu_node *rnp)
2820 : {
2821 1 : }
2822 :
2823 2 : static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
2824 : unsigned long j)
2825 : {
2826 2 : return true;
2827 : }
2828 :
2829 627326 : static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
2830 : bool *was_alldone, unsigned long flags)
2831 : {
2832 627326 : return false;
2833 : }
2834 :
2835 : static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
2836 : unsigned long flags)
2837 : {
2838 : WARN_ON_ONCE(1); /* Should be dead code! */
2839 : }
2840 :
2841 4 : static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2842 : {
2843 4 : }
2844 :
2845 27740 : static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
2846 : {
2847 27740 : return false;
2848 : }
2849 :
2850 54197 : static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
2851 : {
2852 54197 : return false;
2853 : }
2854 :
2855 4 : static void rcu_spawn_cpu_nocb_kthread(int cpu)
2856 : {
2857 4 : }
2858 :
2859 1 : static void __init rcu_spawn_nocb_kthreads(void)
2860 : {
2861 1 : }
2862 :
2863 : static void show_rcu_nocb_state(struct rcu_data *rdp)
2864 : {
2865 : }
2866 :
2867 : #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
2868 :
2869 : /*
2870 : * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
2871 : * grace-period kthread will do force_quiescent_state() processing?
2872 : * The idea is to avoid waking up RCU core processing on such a
2873 : * CPU unless the grace period has extended for too long.
2874 : *
2875 : * This code relies on the fact that all NO_HZ_FULL CPUs are also
2876 : * CONFIG_RCU_NOCB_CPU CPUs.
2877 : */
2878 : static bool rcu_nohz_full_cpu(void)
2879 : {
2880 : #ifdef CONFIG_NO_HZ_FULL
2881 : if (tick_nohz_full_cpu(smp_processor_id()) &&
2882 : (!rcu_gp_in_progress() ||
2883 : time_before(jiffies, READ_ONCE(rcu_state.gp_start) + HZ)))
2884 : return true;
2885 : #endif /* #ifdef CONFIG_NO_HZ_FULL */
2886 : return false;
2887 : }
2888 :
2889 : /*
2890 : * Bind the RCU grace-period kthreads to the housekeeping CPU.
2891 : */
2892 1 : static void rcu_bind_gp_kthread(void)
2893 : {
2894 1 : if (!tick_nohz_full_enabled())
2895 1 : return;
2896 1 : housekeeping_affine(current, HK_FLAG_RCU);
2897 : }
2898 :
2899 : /* Record the current task on dyntick-idle entry. */
2900 : static void noinstr rcu_dynticks_task_enter(void)
2901 : {
2902 : #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
2903 : WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
2904 : #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
2905 : }
2906 :
2907 : /* Record no current task on dyntick-idle exit. */
2908 : static void noinstr rcu_dynticks_task_exit(void)
2909 : {
2910 : #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
2911 : WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
2912 : #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
2913 : }
2914 :
2915 : /* Turn on heavyweight RCU tasks trace readers on idle/user entry. */
2916 : static void rcu_dynticks_task_trace_enter(void)
2917 : {
2918 : #ifdef CONFIG_TASKS_RCU_TRACE
2919 : if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
2920 : current->trc_reader_special.b.need_mb = true;
2921 : #endif /* #ifdef CONFIG_TASKS_RCU_TRACE */
2922 : }
2923 :
2924 : /* Turn off heavyweight RCU tasks trace readers on idle/user exit. */
2925 : static void rcu_dynticks_task_trace_exit(void)
2926 : {
2927 : #ifdef CONFIG_TASKS_RCU_TRACE
2928 : if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
2929 : current->trc_reader_special.b.need_mb = false;
2930 : #endif /* #ifdef CONFIG_TASKS_RCU_TRACE */
2931 : }
|