Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
4 : * policies)
5 : */
6 : #include "sched.h"
7 :
8 : #include "pelt.h"
9 :
10 : int sched_rr_timeslice = RR_TIMESLICE;
11 : int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
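
/*
 * Illustrative userspace sketch, not kernel code: the RR timeslice is kept
 * in jiffies internally (sched_rr_timeslice) and exposed in milliseconds via
 * sysctl (sysctl_sched_rr_timeslice). Assumes the stock definition
 * RR_TIMESLICE == (100 * HZ / 1000), i.e. 100 ms, and an example HZ of 250.
 */
#include <stdio.h>

#define HZ            250                 /* example CONFIG_HZ value */
#define MSEC_PER_SEC  1000
#define RR_TIMESLICE  (100 * HZ / 1000)   /* 100 ms worth of jiffies */

int main(void)
{
	int timeslice_jiffies = RR_TIMESLICE;
	int timeslice_msec    = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;

	/* With HZ=250: 25 jiffies internally, reported as 100 ms by sysctl. */
	printf("jiffies=%d msec=%d\n", timeslice_jiffies, timeslice_msec);
	return 0;
}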
12 : /* More than 4 hours if BW_SHIFT equals 20. */
13 : static const u64 max_rt_runtime = MAX_BW;
14 :
15 : static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
16 :
17 : struct rt_bandwidth def_rt_bandwidth;
18 :
19 0 : static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
20 : {
21 0 : struct rt_bandwidth *rt_b =
22 0 : container_of(timer, struct rt_bandwidth, rt_period_timer);
23 0 : int idle = 0;
24 0 : int overrun;
25 :
26 0 : raw_spin_lock(&rt_b->rt_runtime_lock);
27 0 : for (;;) {
28 0 : overrun = hrtimer_forward_now(timer, rt_b->rt_period);
29 0 : if (!overrun)
30 : break;
31 :
32 0 : raw_spin_unlock(&rt_b->rt_runtime_lock);
33 0 : idle = do_sched_rt_period_timer(rt_b, overrun);
34 0 : raw_spin_lock(&rt_b->rt_runtime_lock);
35 : }
36 0 : if (idle)
37 0 : rt_b->rt_period_active = 0;
38 0 : raw_spin_unlock(&rt_b->rt_runtime_lock);
39 :
40 0 : return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
41 : }
42 :
43 1 : void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
44 : {
45 1 : rt_b->rt_period = ns_to_ktime(period);
46 1 : rt_b->rt_runtime = runtime;
47 :
48 1 : raw_spin_lock_init(&rt_b->rt_runtime_lock);
49 :
50 1 : hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
51 : HRTIMER_MODE_REL_HARD);
52 1 : rt_b->rt_period_timer.function = sched_rt_period_timer;
53 1 : }
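
/*
 * Illustrative userspace sketch, not kernel code: init_rt_bandwidth() above
 * takes a period and a runtime in nanoseconds. Assuming the usual defaults
 * of sched_rt_period_us = 1000000 and sched_rt_runtime_us = 950000, RT tasks
 * may consume at most 95% of each 1 s period before being throttled.
 */
#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC 1000ULL

int main(void)
{
	uint64_t period_us  = 1000000;	/* kernel.sched_rt_period_us  (assumed default) */
	uint64_t runtime_us = 950000;	/* kernel.sched_rt_runtime_us (assumed default) */
	uint64_t period_ns  = period_us * NSEC_PER_USEC;
	uint64_t runtime_ns = runtime_us * NSEC_PER_USEC;

	/* These are the two values def_rt_bandwidth would be initialized with. */
	printf("period=%llu ns runtime=%llu ns (%.0f%% of the period)\n",
	       (unsigned long long)period_ns, (unsigned long long)runtime_ns,
	       100.0 * (double)runtime_ns / (double)period_ns);
	return 0;
}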
54 :
55 0 : static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
56 : {
57 0 : if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
58 : return;
59 :
60 0 : raw_spin_lock(&rt_b->rt_runtime_lock);
61 0 : if (!rt_b->rt_period_active) {
62 0 : rt_b->rt_period_active = 1;
63 : /*
64 : * SCHED_DEADLINE updates the bandwidth, as a runaway
65 : * RT task with a DL task could hog a CPU. But DL does
66 : * not reset the period. If a deadline task was running
67 : * without an RT task running, it can cause RT tasks to
68 : * throttle when they start up. Kick the timer right away
69 : * to update the period.
70 : */
71 0 : hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
72 0 : hrtimer_start_expires(&rt_b->rt_period_timer,
73 : HRTIMER_MODE_ABS_PINNED_HARD);
74 : }
75 0 : raw_spin_unlock(&rt_b->rt_runtime_lock);
76 : }
77 :
78 4 : void init_rt_rq(struct rt_rq *rt_rq)
79 : {
80 4 : struct rt_prio_array *array;
81 4 : int i;
82 :
83 4 : array = &rt_rq->active;
84 404 : for (i = 0; i < MAX_RT_PRIO; i++) {
85 400 : INIT_LIST_HEAD(array->queue + i);
86 400 : __clear_bit(i, array->bitmap);
87 : }
88 : /* delimiter for bitsearch: */
89 4 : __set_bit(MAX_RT_PRIO, array->bitmap);
90 :
91 : #if defined CONFIG_SMP
92 4 : rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
93 4 : rt_rq->highest_prio.next = MAX_RT_PRIO-1;
94 4 : rt_rq->rt_nr_migratory = 0;
95 4 : rt_rq->overloaded = 0;
96 4 : plist_head_init(&rt_rq->pushable_tasks);
97 : #endif /* CONFIG_SMP */
98 : /* We start in dequeued state, because no RT tasks are queued */
99 4 : rt_rq->rt_queued = 0;
100 :
101 4 : rt_rq->rt_time = 0;
102 4 : rt_rq->rt_throttled = 0;
103 4 : rt_rq->rt_runtime = 0;
104 4 : raw_spin_lock_init(&rt_rq->rt_runtime_lock);
105 4 : }
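
/*
 * Illustrative userspace sketch, not kernel code, of the rt_prio_array
 * bitmap set up above: one bit per RT priority plus one extra "delimiter"
 * bit at MAX_RT_PRIO that is always set, so a find-first-set scan always
 * terminates and a result of MAX_RT_PRIO means "no RT task queued".
 */
#include <stdio.h>
#include <stdbool.h>

#define MAX_RT_PRIO 100

static bool bitmap[MAX_RT_PRIO + 1];

static int find_first_set(void)
{
	int i;

	for (i = 0; i <= MAX_RT_PRIO; i++)
		if (bitmap[i])
			return i;
	return MAX_RT_PRIO;	/* not reached: the delimiter is always set */
}

int main(void)
{
	bitmap[MAX_RT_PRIO] = true;			/* delimiter for bitsearch */

	printf("empty rq   -> %d\n", find_first_set());	/* 100: nothing queued    */

	bitmap[42] = true;				/* queue a prio-42 entity */
	printf("one entity -> %d\n", find_first_set());	/* 42                     */
	return 0;
}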
106 :
107 : #ifdef CONFIG_RT_GROUP_SCHED
108 : static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
109 : {
110 : hrtimer_cancel(&rt_b->rt_period_timer);
111 : }
112 :
113 : #define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
114 :
115 : static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
116 : {
117 : #ifdef CONFIG_SCHED_DEBUG
118 : WARN_ON_ONCE(!rt_entity_is_task(rt_se));
119 : #endif
120 : return container_of(rt_se, struct task_struct, rt);
121 : }
122 :
123 : static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
124 : {
125 : return rt_rq->rq;
126 : }
127 :
128 : static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
129 : {
130 : return rt_se->rt_rq;
131 : }
132 :
133 : static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
134 : {
135 : struct rt_rq *rt_rq = rt_se->rt_rq;
136 :
137 : return rt_rq->rq;
138 : }
139 :
140 : void free_rt_sched_group(struct task_group *tg)
141 : {
142 : int i;
143 :
144 : if (tg->rt_se)
145 : destroy_rt_bandwidth(&tg->rt_bandwidth);
146 :
147 : for_each_possible_cpu(i) {
148 : if (tg->rt_rq)
149 : kfree(tg->rt_rq[i]);
150 : if (tg->rt_se)
151 : kfree(tg->rt_se[i]);
152 : }
153 :
154 : kfree(tg->rt_rq);
155 : kfree(tg->rt_se);
156 : }
157 :
158 : void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
159 : struct sched_rt_entity *rt_se, int cpu,
160 : struct sched_rt_entity *parent)
161 : {
162 : struct rq *rq = cpu_rq(cpu);
163 :
164 : rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
165 : rt_rq->rt_nr_boosted = 0;
166 : rt_rq->rq = rq;
167 : rt_rq->tg = tg;
168 :
169 : tg->rt_rq[cpu] = rt_rq;
170 : tg->rt_se[cpu] = rt_se;
171 :
172 : if (!rt_se)
173 : return;
174 :
175 : if (!parent)
176 : rt_se->rt_rq = &rq->rt;
177 : else
178 : rt_se->rt_rq = parent->my_q;
179 :
180 : rt_se->my_q = rt_rq;
181 : rt_se->parent = parent;
182 : INIT_LIST_HEAD(&rt_se->run_list);
183 : }
184 :
185 : int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
186 : {
187 : struct rt_rq *rt_rq;
188 : struct sched_rt_entity *rt_se;
189 : int i;
190 :
191 : tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
192 : if (!tg->rt_rq)
193 : goto err;
194 : tg->rt_se = kcalloc(nr_cpu_ids, sizeof(rt_se), GFP_KERNEL);
195 : if (!tg->rt_se)
196 : goto err;
197 :
198 : init_rt_bandwidth(&tg->rt_bandwidth,
199 : ktime_to_ns(def_rt_bandwidth.rt_period), 0);
200 :
201 : for_each_possible_cpu(i) {
202 : rt_rq = kzalloc_node(sizeof(struct rt_rq),
203 : GFP_KERNEL, cpu_to_node(i));
204 : if (!rt_rq)
205 : goto err;
206 :
207 : rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
208 : GFP_KERNEL, cpu_to_node(i));
209 : if (!rt_se)
210 : goto err_free_rq;
211 :
212 : init_rt_rq(rt_rq);
213 : rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
214 : init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
215 : }
216 :
217 : return 1;
218 :
219 : err_free_rq:
220 : kfree(rt_rq);
221 : err:
222 : return 0;
223 : }
224 :
225 : #else /* CONFIG_RT_GROUP_SCHED */
226 :
227 : #define rt_entity_is_task(rt_se) (1)
228 :
229 0 : static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
230 : {
231 0 : return container_of(rt_se, struct task_struct, rt);
232 : }
233 :
234 4 : static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
235 : {
236 4 : return container_of(rt_rq, struct rq, rt);
237 : }
238 :
239 0 : static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
240 : {
241 0 : struct task_struct *p = rt_task_of(rt_se);
242 :
243 0 : return task_rq(p);
244 : }
245 :
246 0 : static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
247 : {
248 0 : struct rq *rq = rq_of_rt_se(rt_se);
249 :
250 0 : return &rq->rt;
251 : }
252 :
253 0 : void free_rt_sched_group(struct task_group *tg) { }
254 :
255 0 : int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
256 : {
257 0 : return 1;
258 : }
259 : #endif /* CONFIG_RT_GROUP_SCHED */
260 :
261 : #ifdef CONFIG_SMP
262 :
263 : static void pull_rt_task(struct rq *this_rq);
264 :
265 39 : static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
266 : {
267 : /* Try to pull RT tasks here if we lower this rq's prio */
268 39 : return rq->online && rq->rt.highest_prio.curr > prev->prio;
269 : }
270 :
271 36 : static inline int rt_overloaded(struct rq *rq)
272 : {
273 72 : return atomic_read(&rq->rd->rto_count);
274 : }
275 :
276 0 : static inline void rt_set_overload(struct rq *rq)
277 : {
278 0 : if (!rq->online)
279 : return;
280 :
281 0 : cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
282 : /*
283 : * Make sure the mask is visible before we set
284 : * the overload count. That is checked to determine
285 : * if we should look at the mask. It would be a shame
286 : * if we looked at the mask, but the mask was not
287 : * updated yet.
288 : *
289 : * Matched by the barrier in pull_rt_task().
290 : */
291 0 : smp_wmb();
292 0 : atomic_inc(&rq->rd->rto_count);
293 : }
294 :
295 0 : static inline void rt_clear_overload(struct rq *rq)
296 : {
297 0 : if (!rq->online)
298 : return;
299 :
300 : /* the order here really doesn't matter */
301 0 : atomic_dec(&rq->rd->rto_count);
302 0 : cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
303 : }
304 :
305 0 : static void update_rt_migration(struct rt_rq *rt_rq)
306 : {
307 0 : if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
308 0 : if (!rt_rq->overloaded) {
309 0 : rt_set_overload(rq_of_rt_rq(rt_rq));
310 0 : rt_rq->overloaded = 1;
311 : }
312 0 : } else if (rt_rq->overloaded) {
313 0 : rt_clear_overload(rq_of_rt_rq(rt_rq));
314 0 : rt_rq->overloaded = 0;
315 : }
316 0 : }
317 :
318 0 : static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
319 : {
320 0 : struct task_struct *p;
321 :
322 0 : if (!rt_entity_is_task(rt_se))
323 : return;
324 :
325 0 : p = rt_task_of(rt_se);
326 0 : rt_rq = &rq_of_rt_rq(rt_rq)->rt;
327 :
328 0 : rt_rq->rt_nr_total++;
329 0 : if (p->nr_cpus_allowed > 1)
330 0 : rt_rq->rt_nr_migratory++;
331 :
332 0 : update_rt_migration(rt_rq);
333 : }
334 :
335 0 : static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
336 : {
337 0 : struct task_struct *p;
338 :
339 0 : if (!rt_entity_is_task(rt_se))
340 : return;
341 :
342 0 : p = rt_task_of(rt_se);
343 0 : rt_rq = &rq_of_rt_rq(rt_rq)->rt;
344 :
345 0 : rt_rq->rt_nr_total--;
346 0 : if (p->nr_cpus_allowed > 1)
347 0 : rt_rq->rt_nr_migratory--;
348 :
349 0 : update_rt_migration(rt_rq);
350 : }
351 :
352 0 : static inline int has_pushable_tasks(struct rq *rq)
353 : {
354 0 : return !plist_head_empty(&rq->rt.pushable_tasks);
355 : }
356 :
357 : static DEFINE_PER_CPU(struct callback_head, rt_push_head);
358 : static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
359 :
360 : static void push_rt_tasks(struct rq *);
361 : static void pull_rt_task(struct rq *);
362 :
363 0 : static inline void rt_queue_push_tasks(struct rq *rq)
364 : {
365 0 : if (!has_pushable_tasks(rq))
366 : return;
367 :
368 0 : queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
369 : }
370 :
371 0 : static inline void rt_queue_pull_task(struct rq *rq)
372 : {
373 0 : queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
374 0 : }
375 :
376 0 : static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
377 : {
378 0 : plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
379 0 : plist_node_init(&p->pushable_tasks, p->prio);
380 0 : plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
381 :
382 : /* Update the highest prio pushable task */
383 0 : if (p->prio < rq->rt.highest_prio.next)
384 0 : rq->rt.highest_prio.next = p->prio;
385 0 : }
386 :
387 0 : static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
388 : {
389 0 : plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
390 :
391 : /* Update the new highest prio pushable task */
392 0 : if (has_pushable_tasks(rq)) {
393 0 : p = plist_first_entry(&rq->rt.pushable_tasks,
394 : struct task_struct, pushable_tasks);
395 0 : rq->rt.highest_prio.next = p->prio;
396 : } else {
397 0 : rq->rt.highest_prio.next = MAX_RT_PRIO-1;
398 : }
399 0 : }
400 :
401 : #else
402 :
403 : static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
404 : {
405 : }
406 :
407 : static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
408 : {
409 : }
410 :
411 : static inline
412 : void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
413 : {
414 : }
415 :
416 : static inline
417 : void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
418 : {
419 : }
420 :
421 : static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
422 : {
423 : return false;
424 : }
425 :
426 : static inline void pull_rt_task(struct rq *this_rq)
427 : {
428 : }
429 :
430 : static inline void rt_queue_push_tasks(struct rq *rq)
431 : {
432 : }
433 : #endif /* CONFIG_SMP */
434 :
435 : static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
436 : static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
437 :
438 41 : static inline int on_rt_rq(struct sched_rt_entity *rt_se)
439 : {
440 41 : return rt_se->on_rq;
441 : }
442 :
443 : #ifdef CONFIG_UCLAMP_TASK
444 : /*
445 : * Verify the fitness of task @p to run on @cpu taking into account the uclamp
446 : * settings.
447 : *
448 : * This check is only important for heterogeneous systems where uclamp_min value
449 : * is higher than the capacity of a @cpu. For non-heterogeneous system this
450 : * function will always return true.
451 : *
452 : * The function will return true if the capacity of the @cpu is >= the
453 : * uclamp_min and false otherwise.
454 : *
455 : * Note that uclamp_min will be clamped to uclamp_max if uclamp_min
456 : * > uclamp_max.
457 : */
458 : static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
459 : {
460 : unsigned int min_cap;
461 : unsigned int max_cap;
462 : unsigned int cpu_cap;
463 :
464 : /* Only heterogeneous systems can benefit from this check */
465 : if (!static_branch_unlikely(&sched_asym_cpucapacity))
466 : return true;
467 :
468 : min_cap = uclamp_eff_value(p, UCLAMP_MIN);
469 : max_cap = uclamp_eff_value(p, UCLAMP_MAX);
470 :
471 : cpu_cap = capacity_orig_of(cpu);
472 :
473 : return cpu_cap >= min(min_cap, max_cap);
474 : }
475 : #else
476 0 : static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
477 : {
478 0 : return true;
479 : }
480 : #endif
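
/*
 * Illustrative userspace sketch, not kernel code, of the uclamp fitness rule
 * described above: a CPU fits the task if its capacity covers the task's
 * effective uclamp_min, with uclamp_min first clamped to uclamp_max.
 * Capacity values use the usual 1024-based scale.
 */
#include <stdio.h>

static int fits_capacity(unsigned int min_cap, unsigned int max_cap,
			 unsigned int cpu_cap)
{
	unsigned int req = min_cap < max_cap ? min_cap : max_cap;

	return cpu_cap >= req;
}

int main(void)
{
	printf("little: %d\n", fits_capacity(768, 1024, 512));	/* 0: CPU too small      */
	printf("big:    %d\n", fits_capacity(768, 1024, 1024));	/* 1: fits               */
	printf("clamped:%d\n", fits_capacity(768, 256, 512));	/* 1: min clamped to 256 */
	return 0;
}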
481 :
482 : #ifdef CONFIG_RT_GROUP_SCHED
483 :
484 : static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
485 : {
486 : if (!rt_rq->tg)
487 : return RUNTIME_INF;
488 :
489 : return rt_rq->rt_runtime;
490 : }
491 :
492 : static inline u64 sched_rt_period(struct rt_rq *rt_rq)
493 : {
494 : return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
495 : }
496 :
497 : typedef struct task_group *rt_rq_iter_t;
498 :
499 : static inline struct task_group *next_task_group(struct task_group *tg)
500 : {
501 : do {
502 : tg = list_entry_rcu(tg->list.next,
503 : typeof(struct task_group), list);
504 : } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
505 :
506 : if (&tg->list == &task_groups)
507 : tg = NULL;
508 :
509 : return tg;
510 : }
511 :
512 : #define for_each_rt_rq(rt_rq, iter, rq) \
513 : for (iter = container_of(&task_groups, typeof(*iter), list); \
514 : (iter = next_task_group(iter)) && \
515 : (rt_rq = iter->rt_rq[cpu_of(rq)]);)
516 :
517 : #define for_each_sched_rt_entity(rt_se) \
518 : for (; rt_se; rt_se = rt_se->parent)
519 :
520 : static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
521 : {
522 : return rt_se->my_q;
523 : }
524 :
525 : static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
526 : static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
527 :
528 : static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
529 : {
530 : struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
531 : struct rq *rq = rq_of_rt_rq(rt_rq);
532 : struct sched_rt_entity *rt_se;
533 :
534 : int cpu = cpu_of(rq);
535 :
536 : rt_se = rt_rq->tg->rt_se[cpu];
537 :
538 : if (rt_rq->rt_nr_running) {
539 : if (!rt_se)
540 : enqueue_top_rt_rq(rt_rq);
541 : else if (!on_rt_rq(rt_se))
542 : enqueue_rt_entity(rt_se, 0);
543 :
544 : if (rt_rq->highest_prio.curr < curr->prio)
545 : resched_curr(rq);
546 : }
547 : }
548 :
549 : static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
550 : {
551 : struct sched_rt_entity *rt_se;
552 : int cpu = cpu_of(rq_of_rt_rq(rt_rq));
553 :
554 : rt_se = rt_rq->tg->rt_se[cpu];
555 :
556 : if (!rt_se) {
557 : dequeue_top_rt_rq(rt_rq);
558 : /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
559 : cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
560 : }
561 : else if (on_rt_rq(rt_se))
562 : dequeue_rt_entity(rt_se, 0);
563 : }
564 :
565 : static inline int rt_rq_throttled(struct rt_rq *rt_rq)
566 : {
567 : return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
568 : }
569 :
570 : static int rt_se_boosted(struct sched_rt_entity *rt_se)
571 : {
572 : struct rt_rq *rt_rq = group_rt_rq(rt_se);
573 : struct task_struct *p;
574 :
575 : if (rt_rq)
576 : return !!rt_rq->rt_nr_boosted;
577 :
578 : p = rt_task_of(rt_se);
579 : return p->prio != p->normal_prio;
580 : }
581 :
582 : #ifdef CONFIG_SMP
583 : static inline const struct cpumask *sched_rt_period_mask(void)
584 : {
585 : return this_rq()->rd->span;
586 : }
587 : #else
588 : static inline const struct cpumask *sched_rt_period_mask(void)
589 : {
590 : return cpu_online_mask;
591 : }
592 : #endif
593 :
594 : static inline
595 : struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
596 : {
597 : return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
598 : }
599 :
600 : static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
601 : {
602 : return &rt_rq->tg->rt_bandwidth;
603 : }
604 :
605 : #else /* !CONFIG_RT_GROUP_SCHED */
606 :
607 0 : static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
608 : {
609 0 : return rt_rq->rt_runtime;
610 : }
611 :
612 0 : static inline u64 sched_rt_period(struct rt_rq *rt_rq)
613 : {
614 0 : return ktime_to_ns(def_rt_bandwidth.rt_period);
615 : }
616 :
617 : typedef struct rt_rq *rt_rq_iter_t;
618 :
619 : #define for_each_rt_rq(rt_rq, iter, rq) \
620 : for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
621 :
622 : #define for_each_sched_rt_entity(rt_se) \
623 : for (; rt_se; rt_se = NULL)
624 :
625 0 : static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
626 : {
627 0 : return NULL;
628 : }
629 :
630 4 : static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
631 : {
632 4 : struct rq *rq = rq_of_rt_rq(rt_rq);
633 :
634 4 : if (!rt_rq->rt_nr_running)
635 : return;
636 :
637 0 : enqueue_top_rt_rq(rt_rq);
638 0 : resched_curr(rq);
639 : }
640 :
641 0 : static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
642 : {
643 0 : dequeue_top_rt_rq(rt_rq);
644 : }
645 :
646 0 : static inline int rt_rq_throttled(struct rt_rq *rt_rq)
647 : {
648 0 : return rt_rq->rt_throttled;
649 : }
650 :
651 0 : static inline const struct cpumask *sched_rt_period_mask(void)
652 : {
653 0 : return cpu_online_mask;
654 : }
655 :
656 : static inline
657 0 : struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
658 : {
659 0 : return &cpu_rq(cpu)->rt;
660 : }
661 :
662 11 : static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
663 : {
664 11 : return &def_rt_bandwidth;
665 : }
666 :
667 : #endif /* CONFIG_RT_GROUP_SCHED */
668 :
669 0 : bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
670 : {
671 0 : struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
672 :
673 0 : return (hrtimer_active(&rt_b->rt_period_timer) ||
674 0 : rt_rq->rt_time < rt_b->rt_runtime);
675 : }
676 :
677 : #ifdef CONFIG_SMP
678 : /*
679 : * We ran out of runtime, see if we can borrow some from our neighbours.
680 : */
681 : static void do_balance_runtime(struct rt_rq *rt_rq)
682 : {
683 : struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
684 : struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
685 : int i, weight;
686 : u64 rt_period;
687 :
688 : weight = cpumask_weight(rd->span);
689 :
690 : raw_spin_lock(&rt_b->rt_runtime_lock);
691 : rt_period = ktime_to_ns(rt_b->rt_period);
692 : for_each_cpu(i, rd->span) {
693 : struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
694 : s64 diff;
695 :
696 : if (iter == rt_rq)
697 : continue;
698 :
699 : raw_spin_lock(&iter->rt_runtime_lock);
700 : /*
701 : * Either all rqs have inf runtime and there's nothing to steal
702 : * or __disable_runtime() below sets a specific rq to inf to
703 : * indicate it's been disabled and disallow stealing.
704 : */
705 : if (iter->rt_runtime == RUNTIME_INF)
706 : goto next;
707 :
708 : /*
709 : * From runqueues with spare time, take 1/n part of their
710 : * spare time, but no more than our period.
711 : */
712 : diff = iter->rt_runtime - iter->rt_time;
713 : if (diff > 0) {
714 : diff = div_u64((u64)diff, weight);
715 : if (rt_rq->rt_runtime + diff > rt_period)
716 : diff = rt_period - rt_rq->rt_runtime;
717 : iter->rt_runtime -= diff;
718 : rt_rq->rt_runtime += diff;
719 : if (rt_rq->rt_runtime == rt_period) {
720 : raw_spin_unlock(&iter->rt_runtime_lock);
721 : break;
722 : }
723 : }
724 : next:
725 : raw_spin_unlock(&iter->rt_runtime_lock);
726 : }
727 : raw_spin_unlock(&rt_b->rt_runtime_lock);
728 : }
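
/*
 * Illustrative userspace sketch, not kernel code, of the borrowing rule in
 * do_balance_runtime() above: from every neighbour with spare budget we take
 * 1/weight of its spare time, but never let our own runtime grow past the
 * period. The numbers in main() are made up for the example.
 */
#include <stdio.h>
#include <stdint.h>

struct toy_rt_rq { int64_t rt_runtime; int64_t rt_time; };

static void borrow(struct toy_rt_rq *self, struct toy_rt_rq *iter,
		   int weight, int64_t rt_period)
{
	int64_t diff = iter->rt_runtime - iter->rt_time;	/* neighbour's spare time */

	if (diff <= 0)
		return;
	diff /= weight;						/* take 1/n of it         */
	if (self->rt_runtime + diff > rt_period)
		diff = rt_period - self->rt_runtime;		/* cap at the period      */
	iter->rt_runtime -= diff;
	self->rt_runtime += diff;
}

int main(void)
{
	int64_t period = 1000000000;	/* 1 s, in ns */
	struct toy_rt_rq self = { .rt_runtime = 950000000, .rt_time = 950000000 };
	struct toy_rt_rq peer = { .rt_runtime = 950000000, .rt_time = 100000000 };

	borrow(&self, &peer, 2, period);	/* a 2-CPU root domain */

	/* self ends up with the full period (1 s), peer drops to 900 ms. */
	printf("self=%lld peer=%lld\n",
	       (long long)self.rt_runtime, (long long)peer.rt_runtime);
	return 0;
}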
729 :
730 : /*
731 : * Ensure this RQ takes back all the runtime it lent to its neighbours.
732 : */
733 4 : static void __disable_runtime(struct rq *rq)
734 : {
735 4 : struct root_domain *rd = rq->rd;
736 4 : rt_rq_iter_t iter;
737 4 : struct rt_rq *rt_rq;
738 :
739 4 : if (unlikely(!scheduler_running))
740 : return;
741 :
742 4 : for_each_rt_rq(rt_rq, iter, rq) {
743 4 : struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
744 4 : s64 want;
745 4 : int i;
746 :
747 4 : raw_spin_lock(&rt_b->rt_runtime_lock);
748 4 : raw_spin_lock(&rt_rq->rt_runtime_lock);
749 : /*
750 : * Either we're all inf and nobody needs to borrow, or we're
751 : * already disabled and thus have nothing to do, or we have
752 : * exactly the right amount of runtime to take out.
753 : */
754 4 : if (rt_rq->rt_runtime == RUNTIME_INF ||
755 4 : rt_rq->rt_runtime == rt_b->rt_runtime)
756 4 : goto balanced;
757 0 : raw_spin_unlock(&rt_rq->rt_runtime_lock);
758 :
759 : /*
760 : * Calculate the difference between what we started out with
761 : * and what we currently have, that's the amount of runtime
762 : * we lent and now have to reclaim.
763 : */
764 0 : want = rt_b->rt_runtime - rt_rq->rt_runtime;
765 :
766 : /*
767 : * Greedy reclaim, take back as much as we can.
768 : */
769 0 : for_each_cpu(i, rd->span) {
770 0 : struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
771 0 : s64 diff;
772 :
773 : /*
774 : * Can't reclaim from ourselves or disabled runqueues.
775 : */
776 0 : if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
777 0 : continue;
778 :
779 0 : raw_spin_lock(&iter->rt_runtime_lock);
780 0 : if (want > 0) {
781 0 : diff = min_t(s64, iter->rt_runtime, want);
782 0 : iter->rt_runtime -= diff;
783 0 : want -= diff;
784 : } else {
785 0 : iter->rt_runtime -= want;
786 0 : want -= want;
787 : }
788 0 : raw_spin_unlock(&iter->rt_runtime_lock);
789 :
790 0 : if (!want)
791 : break;
792 : }
793 :
794 0 : raw_spin_lock(&rt_rq->rt_runtime_lock);
795 : /*
796 : * We cannot be left wanting - that would mean some runtime
797 : * leaked out of the system.
798 : */
799 0 : BUG_ON(want);
800 0 : balanced:
801 : /*
802 : * Disable all the borrow logic by pretending we have inf
803 : * runtime - in which case borrowing doesn't make sense.
804 : */
805 4 : rt_rq->rt_runtime = RUNTIME_INF;
806 4 : rt_rq->rt_throttled = 0;
807 4 : raw_spin_unlock(&rt_rq->rt_runtime_lock);
808 4 : raw_spin_unlock(&rt_b->rt_runtime_lock);
809 :
810 : /* Make rt_rq available for pick_next_task() */
811 4 : sched_rt_rq_enqueue(rt_rq);
812 : }
813 : }
814 :
815 8 : static void __enable_runtime(struct rq *rq)
816 : {
817 8 : rt_rq_iter_t iter;
818 8 : struct rt_rq *rt_rq;
819 :
820 8 : if (unlikely(!scheduler_running))
821 : return;
822 :
823 : /*
824 : * Reset each runqueue's bandwidth settings
825 : */
826 7 : for_each_rt_rq(rt_rq, iter, rq) {
827 7 : struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
828 :
829 7 : raw_spin_lock(&rt_b->rt_runtime_lock);
830 7 : raw_spin_lock(&rt_rq->rt_runtime_lock);
831 7 : rt_rq->rt_runtime = rt_b->rt_runtime;
832 7 : rt_rq->rt_time = 0;
833 7 : rt_rq->rt_throttled = 0;
834 7 : raw_spin_unlock(&rt_rq->rt_runtime_lock);
835 7 : raw_spin_unlock(&rt_b->rt_runtime_lock);
836 : }
837 : }
838 :
839 0 : static void balance_runtime(struct rt_rq *rt_rq)
840 : {
841 0 : if (!sched_feat(RT_RUNTIME_SHARE))
842 0 : return;
843 :
844 : if (rt_rq->rt_time > rt_rq->rt_runtime) {
845 : raw_spin_unlock(&rt_rq->rt_runtime_lock);
846 : do_balance_runtime(rt_rq);
847 : raw_spin_lock(&rt_rq->rt_runtime_lock);
848 : }
849 : }
850 : #else /* !CONFIG_SMP */
851 : static inline void balance_runtime(struct rt_rq *rt_rq) {}
852 : #endif /* CONFIG_SMP */
853 :
854 0 : static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
855 : {
856 0 : int i, idle = 1, throttled = 0;
857 0 : const struct cpumask *span;
858 :
859 0 : span = sched_rt_period_mask();
860 : #ifdef CONFIG_RT_GROUP_SCHED
861 : /*
862 : * FIXME: isolated CPUs should really leave the root task group,
863 : * whether they are isolcpus or were isolated via cpusets, lest
864 : * the timer run on a CPU which does not service all runqueues,
865 : * potentially leaving other CPUs indefinitely throttled. If
866 : * isolation is really required, the user will turn the throttle
867 : * off to kill the perturbations it causes anyway. Meanwhile,
868 : * this maintains functionality for boot and/or troubleshooting.
869 : */
870 : if (rt_b == &root_task_group.rt_bandwidth)
871 : span = cpu_online_mask;
872 : #endif
873 0 : for_each_cpu(i, span) {
874 0 : int enqueue = 0;
875 0 : struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
876 0 : struct rq *rq = rq_of_rt_rq(rt_rq);
877 0 : int skip;
878 :
879 : /*
880 : * When span == cpu_online_mask, taking each rq->lock
881 : * can be time-consuming. Try to avoid it when possible.
882 : */
883 0 : raw_spin_lock(&rt_rq->rt_runtime_lock);
884 0 : if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF)
885 0 : rt_rq->rt_runtime = rt_b->rt_runtime;
886 0 : skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
887 0 : raw_spin_unlock(&rt_rq->rt_runtime_lock);
888 0 : if (skip)
889 0 : continue;
890 :
891 0 : raw_spin_lock(&rq->lock);
892 0 : update_rq_clock(rq);
893 :
894 0 : if (rt_rq->rt_time) {
895 0 : u64 runtime;
896 :
897 0 : raw_spin_lock(&rt_rq->rt_runtime_lock);
898 0 : if (rt_rq->rt_throttled)
899 0 : balance_runtime(rt_rq);
900 0 : runtime = rt_rq->rt_runtime;
901 0 : rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
902 0 : if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
903 0 : rt_rq->rt_throttled = 0;
904 0 : enqueue = 1;
905 :
906 : /*
907 : * When we're idle and a woken (rt) task is
908 : * throttled, check_preempt_curr() will set
909 : * skip_update and the time between the wakeup
910 : * and this unthrottle will get accounted as
911 : * 'runtime'.
912 : */
913 0 : if (rt_rq->rt_nr_running && rq->curr == rq->idle)
914 0 : rq_clock_cancel_skipupdate(rq);
915 : }
916 0 : if (rt_rq->rt_time || rt_rq->rt_nr_running)
917 0 : idle = 0;
918 0 : raw_spin_unlock(&rt_rq->rt_runtime_lock);
919 0 : } else if (rt_rq->rt_nr_running) {
920 0 : idle = 0;
921 0 : if (!rt_rq_throttled(rt_rq))
922 0 : enqueue = 1;
923 : }
924 0 : if (rt_rq->rt_throttled)
925 0 : throttled = 1;
926 :
927 0 : if (enqueue)
928 0 : sched_rt_rq_enqueue(rt_rq);
929 0 : raw_spin_unlock(&rq->lock);
930 : }
931 :
932 0 : if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
933 0 : return 1;
934 :
935 : return idle;
936 : }
937 :
938 0 : static inline int rt_se_prio(struct sched_rt_entity *rt_se)
939 : {
940 : #ifdef CONFIG_RT_GROUP_SCHED
941 : struct rt_rq *rt_rq = group_rt_rq(rt_se);
942 :
943 : if (rt_rq)
944 : return rt_rq->highest_prio.curr;
945 : #endif
946 :
947 0 : return rt_task_of(rt_se)->prio;
948 : }
949 :
950 0 : static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
951 : {
952 0 : u64 runtime = sched_rt_runtime(rt_rq);
953 :
954 0 : if (rt_rq->rt_throttled)
955 0 : return rt_rq_throttled(rt_rq);
956 :
957 0 : if (runtime >= sched_rt_period(rt_rq))
958 : return 0;
959 :
960 0 : balance_runtime(rt_rq);
961 0 : runtime = sched_rt_runtime(rt_rq);
962 0 : if (runtime == RUNTIME_INF)
963 : return 0;
964 :
965 0 : if (rt_rq->rt_time > runtime) {
966 0 : struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
967 :
968 : /*
969 : * Don't actually throttle groups that have no runtime assigned
970 : * but accrue some time due to boosting.
971 : */
972 0 : if (likely(rt_b->rt_runtime)) {
973 0 : rt_rq->rt_throttled = 1;
974 0 : printk_deferred_once("sched: RT throttling activated\n");
975 : } else {
976 : /*
977 : * In case we did anyway, make it go away,
978 : * replenishment is a joke, since it will replenish us
979 : * with exactly 0 ns.
980 : */
981 0 : rt_rq->rt_time = 0;
982 : }
983 :
984 0 : if (rt_rq_throttled(rt_rq)) {
985 0 : sched_rt_rq_dequeue(rt_rq);
986 0 : return 1;
987 : }
988 : }
989 :
990 : return 0;
991 : }
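
/*
 * Illustrative userspace sketch, not kernel code, of the throttle cycle
 * implemented above: update_curr_rt() accumulates rt_time; once it exceeds
 * rt_runtime the queue is throttled, and each period-timer expiry
 * replenishes it with rt_time -= min(rt_time, overrun * runtime),
 * unthrottling once rt_time drops below runtime again. Units are
 * milliseconds here just to keep the numbers small.
 */
#include <stdio.h>
#include <stdint.h>

struct toy_rt_rq { uint64_t rt_time; uint64_t rt_runtime; int rt_throttled; };

static int runtime_exceeded(struct toy_rt_rq *rq)
{
	if (rq->rt_time > rq->rt_runtime)
		rq->rt_throttled = 1;
	return rq->rt_throttled;
}

static void period_timer(struct toy_rt_rq *rq, unsigned int overrun)
{
	uint64_t credit = (uint64_t)overrun * rq->rt_runtime;

	rq->rt_time -= credit < rq->rt_time ? credit : rq->rt_time;
	if (rq->rt_throttled && rq->rt_time < rq->rt_runtime)
		rq->rt_throttled = 0;			/* re-enqueue the rt_rq */
}

int main(void)
{
	struct toy_rt_rq rq = { .rt_runtime = 950 };	/* 950 ms budget per period */

	rq.rt_time += 980;				/* RT tasks ran for 980 ms  */
	printf("throttled=%d\n", runtime_exceeded(&rq));	/* 1 */

	period_timer(&rq, 1);				/* one period elapsed       */
	printf("throttled=%d rt_time=%llu\n", rq.rt_throttled,
	       (unsigned long long)rq.rt_time);		/* 0, 30 */
	return 0;
}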
992 :
993 : /*
994 : * Update the current task's runtime statistics. Skip current tasks that
995 : * are not in our scheduling class.
996 : */
997 0 : static void update_curr_rt(struct rq *rq)
998 : {
999 0 : struct task_struct *curr = rq->curr;
1000 0 : struct sched_rt_entity *rt_se = &curr->rt;
1001 0 : u64 delta_exec;
1002 0 : u64 now;
1003 :
1004 0 : if (curr->sched_class != &rt_sched_class)
1005 : return;
1006 :
1007 0 : now = rq_clock_task(rq);
1008 0 : delta_exec = now - curr->se.exec_start;
1009 0 : if (unlikely((s64)delta_exec <= 0))
1010 : return;
1011 :
1012 0 : schedstat_set(curr->se.statistics.exec_max,
1013 : max(curr->se.statistics.exec_max, delta_exec));
1014 :
1015 0 : curr->se.sum_exec_runtime += delta_exec;
1016 0 : account_group_exec_runtime(curr, delta_exec);
1017 :
1018 0 : curr->se.exec_start = now;
1019 0 : cgroup_account_cputime(curr, delta_exec);
1020 :
1021 0 : if (!rt_bandwidth_enabled())
1022 : return;
1023 :
1024 0 : for_each_sched_rt_entity(rt_se) {
1025 0 : struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1026 :
1027 0 : if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
1028 0 : raw_spin_lock(&rt_rq->rt_runtime_lock);
1029 0 : rt_rq->rt_time += delta_exec;
1030 0 : if (sched_rt_runtime_exceeded(rt_rq))
1031 0 : resched_curr(rq);
1032 0 : raw_spin_unlock(&rt_rq->rt_runtime_lock);
1033 : }
1034 : }
1035 : }
1036 :
1037 : static void
1038 0 : dequeue_top_rt_rq(struct rt_rq *rt_rq)
1039 : {
1040 0 : struct rq *rq = rq_of_rt_rq(rt_rq);
1041 :
1042 0 : BUG_ON(&rq->rt != rt_rq);
1043 :
1044 0 : if (!rt_rq->rt_queued)
1045 : return;
1046 :
1047 0 : BUG_ON(!rq->nr_running);
1048 :
1049 0 : sub_nr_running(rq, rt_rq->rt_nr_running);
1050 0 : rt_rq->rt_queued = 0;
1051 :
1052 : }
1053 :
1054 : static void
1055 0 : enqueue_top_rt_rq(struct rt_rq *rt_rq)
1056 : {
1057 0 : struct rq *rq = rq_of_rt_rq(rt_rq);
1058 :
1059 0 : BUG_ON(&rq->rt != rt_rq);
1060 :
1061 0 : if (rt_rq->rt_queued)
1062 : return;
1063 :
1064 0 : if (rt_rq_throttled(rt_rq))
1065 : return;
1066 :
1067 0 : if (rt_rq->rt_nr_running) {
1068 0 : add_nr_running(rq, rt_rq->rt_nr_running);
1069 0 : rt_rq->rt_queued = 1;
1070 : }
1071 :
1072 : /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
1073 0 : cpufreq_update_util(rq, 0);
1074 : }
1075 :
1076 : #if defined CONFIG_SMP
1077 :
1078 : static void
1079 0 : inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1080 : {
1081 0 : struct rq *rq = rq_of_rt_rq(rt_rq);
1082 :
1083 : #ifdef CONFIG_RT_GROUP_SCHED
1084 : /*
1085 : * Change rq's cpupri only if rt_rq is the top queue.
1086 : */
1087 : if (&rq->rt != rt_rq)
1088 : return;
1089 : #endif
1090 0 : if (rq->online && prio < prev_prio)
1091 0 : cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
1092 0 : }
1093 :
1094 : static void
1095 0 : dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1096 : {
1097 0 : struct rq *rq = rq_of_rt_rq(rt_rq);
1098 :
1099 : #ifdef CONFIG_RT_GROUP_SCHED
1100 : /*
1101 : * Change rq's cpupri only if rt_rq is the top queue.
1102 : */
1103 : if (&rq->rt != rt_rq)
1104 : return;
1105 : #endif
1106 0 : if (rq->online && rt_rq->highest_prio.curr != prev_prio)
1107 0 : cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
1108 0 : }
1109 :
1110 : #else /* CONFIG_SMP */
1111 :
1112 : static inline
1113 : void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1114 : static inline
1115 : void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1116 :
1117 : #endif /* CONFIG_SMP */
1118 :
1119 : #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
1120 : static void
1121 0 : inc_rt_prio(struct rt_rq *rt_rq, int prio)
1122 : {
1123 0 : int prev_prio = rt_rq->highest_prio.curr;
1124 :
1125 0 : if (prio < prev_prio)
1126 0 : rt_rq->highest_prio.curr = prio;
1127 :
1128 0 : inc_rt_prio_smp(rt_rq, prio, prev_prio);
1129 0 : }
1130 :
1131 : static void
1132 0 : dec_rt_prio(struct rt_rq *rt_rq, int prio)
1133 : {
1134 0 : int prev_prio = rt_rq->highest_prio.curr;
1135 :
1136 0 : if (rt_rq->rt_nr_running) {
1137 :
1138 0 : WARN_ON(prio < prev_prio);
1139 :
1140 : /*
1141 : * This may have been our highest task, and therefore
1142 : * we may have some recomputation to do
1143 : */
1144 0 : if (prio == prev_prio) {
1145 0 : struct rt_prio_array *array = &rt_rq->active;
1146 :
1147 0 : rt_rq->highest_prio.curr =
1148 0 : sched_find_first_bit(array->bitmap);
1149 : }
1150 :
1151 : } else {
1152 0 : rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
1153 : }
1154 :
1155 0 : dec_rt_prio_smp(rt_rq, prio, prev_prio);
1156 0 : }
1157 :
1158 : #else
1159 :
1160 : static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
1161 : static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
1162 :
1163 : #endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
1164 :
1165 : #ifdef CONFIG_RT_GROUP_SCHED
1166 :
1167 : static void
1168 : inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1169 : {
1170 : if (rt_se_boosted(rt_se))
1171 : rt_rq->rt_nr_boosted++;
1172 :
1173 : if (rt_rq->tg)
1174 : start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
1175 : }
1176 :
1177 : static void
1178 : dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1179 : {
1180 : if (rt_se_boosted(rt_se))
1181 : rt_rq->rt_nr_boosted--;
1182 :
1183 : WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
1184 : }
1185 :
1186 : #else /* CONFIG_RT_GROUP_SCHED */
1187 :
1188 : static void
1189 0 : inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1190 : {
1191 0 : start_rt_bandwidth(&def_rt_bandwidth);
1192 : }
1193 :
1194 : static inline
1195 0 : void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1196 :
1197 : #endif /* CONFIG_RT_GROUP_SCHED */
1198 :
1199 : static inline
1200 0 : unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1201 : {
1202 0 : struct rt_rq *group_rq = group_rt_rq(rt_se);
1203 :
1204 0 : if (group_rq)
1205 : return group_rq->rt_nr_running;
1206 : else
1207 0 : return 1;
1208 : }
1209 :
1210 : static inline
1211 0 : unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
1212 : {
1213 0 : struct rt_rq *group_rq = group_rt_rq(rt_se);
1214 0 : struct task_struct *tsk;
1215 :
1216 0 : if (group_rq)
1217 : return group_rq->rr_nr_running;
1218 :
1219 0 : tsk = rt_task_of(rt_se);
1220 :
1221 0 : return (tsk->policy == SCHED_RR) ? 1 : 0;
1222 : }
1223 :
1224 : static inline
1225 0 : void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1226 : {
1227 0 : int prio = rt_se_prio(rt_se);
1228 :
1229 0 : WARN_ON(!rt_prio(prio));
1230 0 : rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1231 0 : rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
1232 :
1233 0 : inc_rt_prio(rt_rq, prio);
1234 0 : inc_rt_migration(rt_se, rt_rq);
1235 0 : inc_rt_group(rt_se, rt_rq);
1236 0 : }
1237 :
1238 : static inline
1239 0 : void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1240 : {
1241 0 : WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1242 0 : WARN_ON(!rt_rq->rt_nr_running);
1243 0 : rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1244 0 : rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
1245 :
1246 0 : dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1247 0 : dec_rt_migration(rt_se, rt_rq);
1248 0 : dec_rt_group(rt_se, rt_rq);
1249 0 : }
1250 :
1251 : /*
1252 : * Change rt_se->run_list location unless SAVE && !MOVE
1253 : *
1254 : * assumes ENQUEUE/DEQUEUE flags match
1255 : */
1256 0 : static inline bool move_entity(unsigned int flags)
1257 : {
1258 0 : if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
1259 : return false;
1260 :
1261 : return true;
1262 : }
1263 :
1264 0 : static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
1265 : {
1266 0 : list_del_init(&rt_se->run_list);
1267 :
1268 0 : if (list_empty(array->queue + rt_se_prio(rt_se)))
1269 0 : __clear_bit(rt_se_prio(rt_se), array->bitmap);
1270 :
1271 0 : rt_se->on_list = 0;
1272 0 : }
1273 :
1274 0 : static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1275 : {
1276 0 : struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1277 0 : struct rt_prio_array *array = &rt_rq->active;
1278 0 : struct rt_rq *group_rq = group_rt_rq(rt_se);
1279 0 : struct list_head *queue = array->queue + rt_se_prio(rt_se);
1280 :
1281 : /*
1282 : * Don't enqueue the group if it's throttled, or when empty.
1283 : * The latter is a consequence of the former when a child group
1284 : * gets throttled and the current group doesn't have any other
1285 : * active members.
1286 : */
1287 0 : if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
1288 : if (rt_se->on_list)
1289 : __delist_rt_entity(rt_se, array);
1290 : return;
1291 : }
1292 :
1293 0 : if (move_entity(flags)) {
1294 0 : WARN_ON_ONCE(rt_se->on_list);
1295 0 : if (flags & ENQUEUE_HEAD)
1296 0 : list_add(&rt_se->run_list, queue);
1297 : else
1298 0 : list_add_tail(&rt_se->run_list, queue);
1299 :
1300 0 : __set_bit(rt_se_prio(rt_se), array->bitmap);
1301 0 : rt_se->on_list = 1;
1302 : }
1303 0 : rt_se->on_rq = 1;
1304 :
1305 0 : inc_rt_tasks(rt_se, rt_rq);
1306 : }
1307 :
1308 0 : static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1309 : {
1310 0 : struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1311 0 : struct rt_prio_array *array = &rt_rq->active;
1312 :
1313 0 : if (move_entity(flags)) {
1314 0 : WARN_ON_ONCE(!rt_se->on_list);
1315 0 : __delist_rt_entity(rt_se, array);
1316 : }
1317 0 : rt_se->on_rq = 0;
1318 :
1319 0 : dec_rt_tasks(rt_se, rt_rq);
1320 0 : }
1321 :
1322 : /*
1323 : * Because the prio of an upper entry depends on the lower
1324 : * entries, we must remove entries top-down.
1325 : */
1326 0 : static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
1327 : {
1328 0 : struct sched_rt_entity *back = NULL;
1329 :
1330 0 : for_each_sched_rt_entity(rt_se) {
1331 0 : rt_se->back = back;
1332 0 : back = rt_se;
1333 : }
1334 :
1335 0 : dequeue_top_rt_rq(rt_rq_of_se(back));
1336 :
1337 0 : for (rt_se = back; rt_se; rt_se = rt_se->back) {
1338 0 : if (on_rt_rq(rt_se))
1339 0 : __dequeue_rt_entity(rt_se, flags);
1340 : }
1341 0 : }
1342 :
1343 0 : static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1344 : {
1345 0 : struct rq *rq = rq_of_rt_se(rt_se);
1346 :
1347 0 : dequeue_rt_stack(rt_se, flags);
1348 0 : for_each_sched_rt_entity(rt_se)
1349 0 : __enqueue_rt_entity(rt_se, flags);
1350 0 : enqueue_top_rt_rq(&rq->rt);
1351 0 : }
1352 :
1353 0 : static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1354 : {
1355 0 : struct rq *rq = rq_of_rt_se(rt_se);
1356 :
1357 0 : dequeue_rt_stack(rt_se, flags);
1358 :
1359 0 : for_each_sched_rt_entity(rt_se) {
1360 0 : struct rt_rq *rt_rq = group_rt_rq(rt_se);
1361 :
1362 : if (rt_rq && rt_rq->rt_nr_running)
1363 : __enqueue_rt_entity(rt_se, flags);
1364 : }
1365 0 : enqueue_top_rt_rq(&rq->rt);
1366 0 : }
1367 :
1368 : /*
1369 : * Adding/removing a task to/from a priority array:
1370 : */
1371 : static void
1372 0 : enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1373 : {
1374 0 : struct sched_rt_entity *rt_se = &p->rt;
1375 :
1376 0 : if (flags & ENQUEUE_WAKEUP)
1377 0 : rt_se->timeout = 0;
1378 :
1379 0 : enqueue_rt_entity(rt_se, flags);
1380 :
1381 0 : if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1382 0 : enqueue_pushable_task(rq, p);
1383 0 : }
1384 :
1385 0 : static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1386 : {
1387 0 : struct sched_rt_entity *rt_se = &p->rt;
1388 :
1389 0 : update_curr_rt(rq);
1390 0 : dequeue_rt_entity(rt_se, flags);
1391 :
1392 0 : dequeue_pushable_task(rq, p);
1393 0 : }
1394 :
1395 : /*
1396 : * Put a task at the head or the end of the run list without the overhead of
1397 : * dequeue followed by enqueue.
1398 : */
1399 : static void
1400 0 : requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
1401 : {
1402 0 : if (on_rt_rq(rt_se)) {
1403 0 : struct rt_prio_array *array = &rt_rq->active;
1404 0 : struct list_head *queue = array->queue + rt_se_prio(rt_se);
1405 :
1406 0 : if (head)
1407 0 : list_move(&rt_se->run_list, queue);
1408 : else
1409 0 : list_move_tail(&rt_se->run_list, queue);
1410 : }
1411 0 : }
1412 :
1413 0 : static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
1414 : {
1415 0 : struct sched_rt_entity *rt_se = &p->rt;
1416 0 : struct rt_rq *rt_rq;
1417 :
1418 0 : for_each_sched_rt_entity(rt_se) {
1419 0 : rt_rq = rt_rq_of_se(rt_se);
1420 0 : requeue_rt_entity(rt_rq, rt_se, head);
1421 : }
1422 0 : }
1423 :
1424 0 : static void yield_task_rt(struct rq *rq)
1425 : {
1426 0 : requeue_task_rt(rq, rq->curr, 0);
1427 0 : }
1428 :
1429 : #ifdef CONFIG_SMP
1430 : static int find_lowest_rq(struct task_struct *task);
1431 :
1432 : static int
1433 0 : select_task_rq_rt(struct task_struct *p, int cpu, int flags)
1434 : {
1435 0 : struct task_struct *curr;
1436 0 : struct rq *rq;
1437 0 : bool test;
1438 :
1439 : /* For anything but wake ups, just return the task_cpu */
1440 0 : if (!(flags & (WF_TTWU | WF_FORK)))
1441 0 : goto out;
1442 :
1443 0 : rq = cpu_rq(cpu);
1444 :
1445 0 : rcu_read_lock();
1446 0 : curr = READ_ONCE(rq->curr); /* unlocked access */
1447 :
1448 : /*
1449 : * If the current task on @p's runqueue is an RT task, then
1450 : * try to see if we can wake this RT task up on another
1451 : * runqueue. Otherwise simply start this RT task
1452 : * on its current runqueue.
1453 : *
1454 : * We want to avoid overloading runqueues. If the woken
1455 : * task is a higher priority, then it will stay on this CPU
1456 : * and the lower prio task should be moved to another CPU.
1457 : * Even though this will probably make the lower prio task
1458 : * lose its cache, we do not want to bounce a higher task
1459 : * around just because it gave up its CPU, perhaps for a
1460 : * lock?
1461 : *
1462 : * For equal prio tasks, we just let the scheduler sort it out.
1463 : *
1464 : * Otherwise, just let it ride on the affined RQ and the
1465 : * post-schedule router will push the preempted task away
1466 : *
1467 : * This test is optimistic, if we get it wrong the load-balancer
1468 : * will have to sort it out.
1469 : *
1470 : * We take into account the capacity of the CPU to ensure it fits the
1471 : * requirement of the task - which is only important on heterogeneous
1472 : * systems like big.LITTLE.
1473 : */
1474 0 : test = curr &&
1475 0 : unlikely(rt_task(curr)) &&
1476 0 : (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio);
1477 :
1478 0 : if (test || !rt_task_fits_capacity(p, cpu)) {
1479 0 : int target = find_lowest_rq(p);
1480 :
1481 : /*
1482 : * Bail out if we were forcing a migration to find a better
1483 : * fitting CPU but our search failed.
1484 : */
1485 0 : if (!test && target != -1 && !rt_task_fits_capacity(p, target))
1486 : goto out_unlock;
1487 :
1488 : /*
1489 : * Don't bother moving it if the destination CPU is
1490 : * not running a lower priority task.
1491 : */
1492 0 : if (target != -1 &&
1493 0 : p->prio < cpu_rq(target)->rt.highest_prio.curr)
1494 0 : cpu = target;
1495 : }
1496 :
1497 0 : out_unlock:
1498 0 : rcu_read_unlock();
1499 :
1500 0 : out:
1501 0 : return cpu;
1502 : }
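
/*
 * Illustrative userspace sketch, not kernel code, of the wakeup placement
 * test above: only when the task currently running on the previous CPU is an
 * RT task that cannot move (or whose priority is numerically <= the waking
 * task's, i.e. equal or higher priority), or when the waking task does not
 * fit the CPU's capacity, do we search for the lowest-priority CPU instead
 * of waking in place.
 */
#include <stdio.h>
#include <stdbool.h>

struct toy_task { bool is_rt; int prio; int nr_cpus_allowed; };

static bool search_lowest_rq(const struct toy_task *curr,
			     const struct toy_task *p, bool fits_capacity)
{
	bool test = curr && curr->is_rt &&
		    (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio);

	return test || !fits_capacity;
}

int main(void)
{
	struct toy_task curr = { .is_rt = true, .prio = 10, .nr_cpus_allowed = 8 };
	struct toy_task p    = { .is_rt = true, .prio = 20, .nr_cpus_allowed = 8 };

	/* curr outranks p: look for a CPU running something lower priority. */
	printf("%d\n", search_lowest_rq(&curr, &p, true));	/* 1 */

	p.prio = 5;	/* p outranks curr: wake in place, p will simply preempt */
	printf("%d\n", search_lowest_rq(&curr, &p, true));	/* 0 */
	return 0;
}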
1503 :
1504 0 : static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1505 : {
1506 : /*
1507 : * Current can't be migrated, useless to reschedule,
1508 : * let's hope p can move out.
1509 : */
1510 0 : if (rq->curr->nr_cpus_allowed == 1 ||
1511 0 : !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1512 0 : return;
1513 :
1514 : /*
1515 : * p is migratable, so let's not schedule it and
1516 : * see if it is pushed or pulled somewhere else.
1517 : */
1518 0 : if (p->nr_cpus_allowed != 1 &&
1519 0 : cpupri_find(&rq->rd->cpupri, p, NULL))
1520 : return;
1521 :
1522 : /*
1523 : * There appear to be other CPUs that can accept
1524 : * the current task but none can run 'p', so let's reschedule
1525 : * to try and push the current task away:
1526 : */
1527 0 : requeue_task_rt(rq, p, 1);
1528 0 : resched_curr(rq);
1529 : }
1530 :
1531 41 : static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
1532 : {
1533 80 : if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
1534 : /*
1535 : * This is OK, because current is on_cpu, which avoids it being
1536 : * picked for load-balance and preemption/IRQs are still
1537 : * disabled avoiding further scheduler activity on it and we've
1538 : * not yet started the picking loop.
1539 : */
1540 37 : rq_unpin_lock(rq, rf);
1541 37 : pull_rt_task(rq);
1542 36 : rq_repin_lock(rq, rf);
1543 : }
1544 :
1545 80 : return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq);
1546 : }
1547 : #endif /* CONFIG_SMP */
1548 :
1549 : /*
1550 : * Preempt the current task with a newly woken task if needed:
1551 : */
1552 0 : static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1553 : {
1554 0 : if (p->prio < rq->curr->prio) {
1555 0 : resched_curr(rq);
1556 0 : return;
1557 : }
1558 :
1559 : #ifdef CONFIG_SMP
1560 : /*
1561 : * If:
1562 : *
1563 : * - the newly woken task is of equal priority to the current task
1564 : * - the newly woken task is non-migratable while current is migratable
1565 : * - current will be preempted on the next reschedule
1566 : *
1567 : * we should check to see if current can readily move to a different
1568 : * cpu. If so, we will reschedule to allow the push logic to try
1569 : * to move current somewhere else, making room for our non-migratable
1570 : * task.
1571 : */
1572 0 : if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1573 0 : check_preempt_equal_prio(rq, p);
1574 : #endif
1575 : }
1576 :
1577 0 : static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
1578 : {
1579 0 : p->se.exec_start = rq_clock_task(rq);
1580 :
1581 : /* The running task is never eligible for pushing */
1582 0 : dequeue_pushable_task(rq, p);
1583 :
1584 0 : if (!first)
1585 : return;
1586 :
1587 : /*
1588 : * If prev task was rt, put_prev_task() has already updated the
1589 : * utilization. We only care about the case where we start to schedule an
1590 : * RT task
1591 : */
1592 0 : if (rq->curr->sched_class != &rt_sched_class)
1593 0 : update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
1594 :
1595 0 : rt_queue_push_tasks(rq);
1596 : }
1597 :
1598 0 : static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
1599 : struct rt_rq *rt_rq)
1600 : {
1601 0 : struct rt_prio_array *array = &rt_rq->active;
1602 0 : struct sched_rt_entity *next = NULL;
1603 0 : struct list_head *queue;
1604 0 : int idx;
1605 :
1606 0 : idx = sched_find_first_bit(array->bitmap);
1607 0 : BUG_ON(idx >= MAX_RT_PRIO);
1608 :
1609 0 : queue = array->queue + idx;
1610 0 : next = list_entry(queue->next, struct sched_rt_entity, run_list);
1611 :
1612 0 : return next;
1613 : }
1614 :
1615 0 : static struct task_struct *_pick_next_task_rt(struct rq *rq)
1616 : {
1617 0 : struct sched_rt_entity *rt_se;
1618 0 : struct rt_rq *rt_rq = &rq->rt;
1619 :
1620 0 : do {
1621 0 : rt_se = pick_next_rt_entity(rq, rt_rq);
1622 0 : BUG_ON(!rt_se);
1623 0 : rt_rq = group_rt_rq(rt_se);
1624 0 : } while (rt_rq);
1625 :
1626 0 : return rt_task_of(rt_se);
1627 : }
1628 :
1629 39 : static struct task_struct *pick_next_task_rt(struct rq *rq)
1630 : {
1631 39 : struct task_struct *p;
1632 :
1633 39 : if (!sched_rt_runnable(rq))
1634 : return NULL;
1635 :
1636 0 : p = _pick_next_task_rt(rq);
1637 0 : set_next_task_rt(rq, p, true);
1638 0 : return p;
1639 : }
1640 :
1641 0 : static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1642 : {
1643 0 : update_curr_rt(rq);
1644 :
1645 0 : update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
1646 :
1647 : /*
1648 : * The previous task needs to be made eligible for pushing
1649 : * if it is still active
1650 : */
1651 0 : if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1652 0 : enqueue_pushable_task(rq, p);
1653 0 : }
1654 :
1655 : #ifdef CONFIG_SMP
1656 :
1657 : /* Only try algorithms three times */
1658 : #define RT_MAX_TRIES 3
1659 :
1660 : static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1661 : {
1662 : if (!task_running(rq, p) &&
1663 : cpumask_test_cpu(cpu, &p->cpus_mask))
1664 : return 1;
1665 :
1666 : return 0;
1667 : }
1668 :
1669 : /*
1670 : * Return the rq's highest-priority pushable task that is suitable to run
1671 : * on the CPU, or NULL if there is none.
1672 : */
1673 : static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1674 : {
1675 : struct plist_head *head = &rq->rt.pushable_tasks;
1676 : struct task_struct *p;
1677 :
1678 : if (!has_pushable_tasks(rq))
1679 : return NULL;
1680 :
1681 : plist_for_each_entry(p, head, pushable_tasks) {
1682 : if (pick_rt_task(rq, p, cpu))
1683 : return p;
1684 : }
1685 :
1686 : return NULL;
1687 : }
1688 :
1689 : static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1690 :
1691 0 : static int find_lowest_rq(struct task_struct *task)
1692 : {
1693 0 : struct sched_domain *sd;
1694 0 : struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
1695 0 : int this_cpu = smp_processor_id();
1696 0 : int cpu = task_cpu(task);
1697 0 : int ret;
1698 :
1699 : /* Make sure the mask is initialized first */
1700 0 : if (unlikely(!lowest_mask))
1701 : return -1;
1702 :
1703 0 : if (task->nr_cpus_allowed == 1)
1704 : return -1; /* No other targets possible */
1705 :
1706 : /*
1707 : * If we're on asym system ensure we consider the different capacities
1708 : * of the CPUs when searching for the lowest_mask.
1709 : */
1710 0 : if (static_branch_unlikely(&sched_asym_cpucapacity)) {
1711 :
1712 0 : ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri,
1713 : task, lowest_mask,
1714 : rt_task_fits_capacity);
1715 : } else {
1716 :
1717 0 : ret = cpupri_find(&task_rq(task)->rd->cpupri,
1718 : task, lowest_mask);
1719 : }
1720 :
1721 0 : if (!ret)
1722 : return -1; /* No targets found */
1723 :
1724 : /*
1725 : * At this point we have built a mask of CPUs representing the
1726 : * lowest priority tasks in the system. Now we want to elect
1727 : * the best one based on our affinity and topology.
1728 : *
1729 : * We prioritize the last CPU that the task executed on since
1730 : * it is most likely cache-hot in that location.
1731 : */
1732 0 : if (cpumask_test_cpu(cpu, lowest_mask))
1733 : return cpu;
1734 :
1735 : /*
1736 : * Otherwise, we consult the sched_domains span maps to figure
1737 : * out which CPU is logically closest to our hot cache data.
1738 : */
1739 0 : if (!cpumask_test_cpu(this_cpu, lowest_mask))
1740 0 : this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1741 :
1742 0 : rcu_read_lock();
1743 0 : for_each_domain(cpu, sd) {
1744 0 : if (sd->flags & SD_WAKE_AFFINE) {
1745 0 : int best_cpu;
1746 :
1747 : /*
1748 : * "this_cpu" is cheaper to preempt than a
1749 : * remote processor.
1750 : */
1751 0 : if (this_cpu != -1 &&
1752 0 : cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1753 0 : rcu_read_unlock();
1754 0 : return this_cpu;
1755 : }
1756 :
1757 0 : best_cpu = cpumask_any_and_distribute(lowest_mask,
1758 0 : sched_domain_span(sd));
1759 0 : if (best_cpu < nr_cpu_ids) {
1760 0 : rcu_read_unlock();
1761 0 : return best_cpu;
1762 : }
1763 : }
1764 : }
1765 0 : rcu_read_unlock();
1766 :
1767 : /*
1768 : * And finally, if there were no matches within the domains
1769 : * just give the caller *something* to work with from the compatible
1770 : * locations.
1771 : */
1772 0 : if (this_cpu != -1)
1773 : return this_cpu;
1774 :
1775 0 : cpu = cpumask_any_distribute(lowest_mask);
1776 0 : if (cpu < nr_cpu_ids)
1777 0 : return cpu;
1778 :
1779 : return -1;
1780 : }
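
/*
 * Illustrative userspace sketch, not kernel code, of the preference order in
 * find_lowest_rq() above, with the sched_domain walk collapsed away: prefer
 * the task's previous CPU when it is in the lowest-priority mask (likely
 * cache-hot), then the searching CPU itself (cheaper to preempt locally),
 * then anything else from the mask, and -1 when the mask is empty.
 */
#include <stdio.h>
#include <stdbool.h>

#define NR_CPUS 4

static int pick_lowest(const bool lowest_mask[NR_CPUS], int prev_cpu, int this_cpu)
{
	int cpu;

	if (lowest_mask[prev_cpu])
		return prev_cpu;		/* most likely cache-hot      */
	if (lowest_mask[this_cpu])
		return this_cpu;		/* cheaper to preempt locally */
	for (cpu = 0; cpu < NR_CPUS; cpu++)	/* anything from the mask     */
		if (lowest_mask[cpu])
			return cpu;
	return -1;				/* no target found            */
}

int main(void)
{
	bool mask[NR_CPUS] = { false, true, true, false };

	printf("%d\n", pick_lowest(mask, 2, 0));	/* 2: previous CPU wins     */
	printf("%d\n", pick_lowest(mask, 3, 1));	/* 1: fall back to this CPU */
	printf("%d\n", pick_lowest(mask, 3, 0));	/* 1: first CPU in the mask */
	return 0;
}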
1781 :
1782 : /* Will lock the rq it finds */
1783 0 : static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1784 : {
1785 0 : struct rq *lowest_rq = NULL;
1786 0 : int tries;
1787 0 : int cpu;
1788 :
1789 0 : for (tries = 0; tries < RT_MAX_TRIES; tries++) {
1790 0 : cpu = find_lowest_rq(task);
1791 :
1792 0 : if ((cpu == -1) || (cpu == rq->cpu))
1793 : break;
1794 :
1795 0 : lowest_rq = cpu_rq(cpu);
1796 :
1797 0 : if (lowest_rq->rt.highest_prio.curr <= task->prio) {
1798 : /*
1799 : * Target rq has tasks of equal or higher priority,
1800 : * retrying does not release any lock and is unlikely
1801 : * to yield a different result.
1802 : */
1803 : lowest_rq = NULL;
1804 : break;
1805 : }
1806 :
1807 : /* if the prio of this runqueue changed, try again */
1808 0 : if (double_lock_balance(rq, lowest_rq)) {
1809 : /*
1810 : * We had to unlock the run queue. In
1811 : * the meantime, the task could have
1812 : * migrated already or had its affinity changed.
1813 : * Also make sure that it wasn't scheduled on its rq.
1814 : */
1815 0 : if (unlikely(task_rq(task) != rq ||
1816 : !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
1817 : task_running(rq, task) ||
1818 : !rt_task(task) ||
1819 : !task_on_rq_queued(task))) {
1820 :
1821 0 : double_unlock_balance(rq, lowest_rq);
1822 0 : lowest_rq = NULL;
1823 0 : break;
1824 : }
1825 : }
1826 :
1827 : /* If this rq is still suitable use it. */
1828 0 : if (lowest_rq->rt.highest_prio.curr > task->prio)
1829 : break;
1830 :
1831 : /* try again */
1832 0 : double_unlock_balance(rq, lowest_rq);
1833 0 : lowest_rq = NULL;
1834 : }
1835 :
1836 0 : return lowest_rq;
1837 : }
1838 :
1839 0 : static struct task_struct *pick_next_pushable_task(struct rq *rq)
1840 : {
1841 0 : struct task_struct *p;
1842 :
1843 0 : if (!has_pushable_tasks(rq))
1844 : return NULL;
1845 :
1846 0 : p = plist_first_entry(&rq->rt.pushable_tasks,
1847 : struct task_struct, pushable_tasks);
1848 :
1849 0 : BUG_ON(rq->cpu != task_cpu(p));
1850 0 : BUG_ON(task_current(rq, p));
1851 0 : BUG_ON(p->nr_cpus_allowed <= 1);
1852 :
1853 0 : BUG_ON(!task_on_rq_queued(p));
1854 0 : BUG_ON(!rt_task(p));
1855 :
1856 : return p;
1857 : }
1858 :
1859 : /*
1860 : * If the current CPU has more than one RT task, see if the non-
1861 : * running task can migrate over to a CPU that is running a task
1862 : * of lesser priority.
1863 : */
1864 0 : static int push_rt_task(struct rq *rq, bool pull)
1865 : {
1866 0 : struct task_struct *next_task;
1867 0 : struct rq *lowest_rq;
1868 0 : int ret = 0;
1869 :
1870 0 : if (!rq->rt.overloaded)
1871 : return 0;
1872 :
1873 0 : next_task = pick_next_pushable_task(rq);
1874 0 : if (!next_task)
1875 : return 0;
1876 :
1877 0 : retry:
1878 0 : if (is_migration_disabled(next_task)) {
1879 0 : struct task_struct *push_task = NULL;
1880 0 : int cpu;
1881 :
1882 0 : if (!pull || rq->push_busy)
1883 : return 0;
1884 :
1885 0 : cpu = find_lowest_rq(rq->curr);
1886 0 : if (cpu == -1 || cpu == rq->cpu)
1887 : return 0;
1888 :
1889 : /*
1890 : * We found a CPU with lower priority than @next_task,
1891 : * therefore it should be running. However we cannot migrate it
1892 : * to this other CPU, instead attempt to push the current
1893 : * running task on this CPU away.
1894 : */
1895 0 : push_task = get_push_task(rq);
1896 0 : if (push_task) {
1897 0 : raw_spin_unlock(&rq->lock);
1898 0 : stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
1899 : push_task, &rq->push_work);
1900 0 : raw_spin_lock(&rq->lock);
1901 : }
1902 :
1903 0 : return 0;
1904 : }
1905 :
1906 0 : if (WARN_ON(next_task == rq->curr))
1907 : return 0;
1908 :
1909 : /*
1910 : * It's possible that the next_task slipped in with a
1911 : * higher priority than current. If that's the case
1912 : * just reschedule current.
1913 : */
1914 0 : if (unlikely(next_task->prio < rq->curr->prio)) {
1915 0 : resched_curr(rq);
1916 0 : return 0;
1917 : }
1918 :
1919 : /* We might release rq lock */
1920 0 : get_task_struct(next_task);
1921 :
1922 : /* find_lock_lowest_rq locks the rq if found */
1923 0 : lowest_rq = find_lock_lowest_rq(next_task, rq);
1924 0 : if (!lowest_rq) {
1925 0 : struct task_struct *task;
1926 : /*
1927 : * find_lock_lowest_rq releases rq->lock
1928 : * so it is possible that next_task has migrated.
1929 : *
1930 : * We need to make sure that the task is still on the same
1931 : * run-queue and is also still the next task eligible for
1932 : * pushing.
1933 : */
1934 0 : task = pick_next_pushable_task(rq);
1935 0 : if (task == next_task) {
1936 : /*
1937 : * The task hasn't migrated, and is still the next
1938 : * eligible task, but we failed to find a run-queue
1939 : * to push it to. Do not retry in this case, since
1940 : * other CPUs will pull from us when ready.
1941 : */
1942 0 : goto out;
1943 : }
1944 :
1945 0 : if (!task)
1946 : /* No more tasks, just exit */
1947 0 : goto out;
1948 :
1949 : /*
1950 : * Something has shifted, try again.
1951 : */
1952 0 : put_task_struct(next_task);
1953 0 : next_task = task;
1954 0 : goto retry;
1955 : }
1956 :
1957 0 : deactivate_task(rq, next_task, 0);
1958 0 : set_task_cpu(next_task, lowest_rq->cpu);
1959 0 : activate_task(lowest_rq, next_task, 0);
1960 0 : resched_curr(lowest_rq);
1961 0 : ret = 1;
1962 :
1963 0 : double_unlock_balance(rq, lowest_rq);
1964 0 : out:
1965 0 : put_task_struct(next_task);
1966 :
1967 0 : return ret;
1968 : }
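
/*
 * Illustrative user-space sketch (not kernel code): a simplified model of
 * the push decision above. Lower numeric values mean higher priority, as
 * in the kernel. The per-CPU priority array and the helper name
 * find_lowest_cpu() are hypothetical and exist only for this example.
 */
#include <stdio.h>

#define NR_CPUS 4

/* Priority of the task currently running on each CPU (smaller = higher). */
static int curr_prio[NR_CPUS] = { 10, 50, 90, 30 };

/* Return the CPU running the lowest-priority task, or -1 if none beats @prio. */
static int find_lowest_cpu(int prio)
{
	int cpu, best = -1, best_prio = prio;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (curr_prio[cpu] > best_prio) {
			best_prio = curr_prio[cpu];
			best = cpu;
		}
	}
	return best;
}

int main(void)
{
	int waiting_prio = 40;	/* a queued-but-not-running RT task */
	int target = find_lowest_cpu(waiting_prio);

	if (target >= 0)
		printf("push prio-%d task to CPU %d (running prio %d)\n",
		       waiting_prio, target, curr_prio[target]);
	else
		printf("no CPU is running a lower-priority task; keep it here\n");
	return 0;
}
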
1969 :
1970 0 : static void push_rt_tasks(struct rq *rq)
1971 : {
1972 : /* push_rt_task() will return true if it moved an RT task */
1973 0 : while (push_rt_task(rq, false))
1974 0 : ;
1975 0 : }
1976 :
1977 : #ifdef HAVE_RT_PUSH_IPI
1978 :
1979 : /*
1980 : * When a high priority task schedules out from a CPU and a lower priority
1981 : * task is scheduled in, a check is made to see if there's any RT tasks
1982 : * on other CPUs that are waiting to run because a higher priority RT task
1983 : * is currently running on its CPU. In this case, the CPU with multiple RT
1984 : * tasks queued on it (overloaded) needs to be notified that a CPU has opened
1985 : * up that may be able to run one of its non-running queued RT tasks.
1986 : *
1987 : * All CPUs with overloaded RT tasks need to be notified as there is currently
1988 : * no way to know which of these CPUs has the highest priority task waiting
1989 : * to run. Instead of trying to take a spinlock on each of these CPUs,
1990 : * which has been shown to cause large latencies on machines with many
1991 : * CPUs, an IPI is sent to the CPUs to have them push off the overloaded
1992 : * RT tasks waiting to run.
1993 : *
1994 : * Just sending an IPI to each of the CPUs is also an issue: on machines
1995 : * with a large CPU count this can cause an IPI storm on a CPU, especially
1996 : * if it is the only CPU with multiple RT tasks queued and a large number
1997 : * of CPUs schedule a lower priority task at the same time.
1998 : *
1999 : * Each root domain has its own irq work function that can iterate over
2000 : * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
2001 : * tasks must be checked regardless of whether one or many CPUs are lowering
2002 : * their priority, there's a single irq work iterator that will try to
2003 : * push off RT tasks that are waiting to run.
2004 : *
2005 : * When a CPU schedules a lower priority task, it will kick off the
2006 : * irq work iterator that will jump to each CPU with overloaded RT tasks.
2007 : * As it only takes the first CPU that schedules a lower priority task
2008 : * to start the process, the rto_loop_start variable is claimed atomically
2009 : * and only the CPU that wins it will try to take the rto_lock.
2010 : * This prevents high contention on the lock as the process handles all
2011 : * CPUs scheduling lower priority tasks.
2012 : *
2013 : * All CPUs that are scheduling a lower priority task will increment the
2014 : * rto_loop_next variable. This will make sure that the irq work iterator
2015 : * checks all RT overloaded CPUs whenever a CPU schedules a new lower
2016 : * priority task, even if the iterator is in the middle of a scan. Incrementing
2017 : * the rt_loop_next will cause the iterator to perform another scan.
2018 : *
2019 : */
2020 0 : static int rto_next_cpu(struct root_domain *rd)
2021 : {
2022 0 : int next;
2023 0 : int cpu;
2024 :
2025 : /*
2026 : * When starting the IPI RT pushing, the rto_cpu is set to -1,
2027 : * rto_next_cpu() will simply return the first CPU found in
2028 : * the rto_mask.
2029 : *
2030 : * If rto_next_cpu() is called with rto_cpu set to a valid CPU, it
2031 : * will return the next CPU found in the rto_mask.
2032 : *
2033 : * If there are no more CPUs left in the rto_mask, then a check is made
2034 : * against rto_loop and rto_loop_next. rto_loop is only updated with
2035 : * the rto_lock held, but any CPU may increment the rto_loop_next
2036 : * without any locking.
2037 : */
2038 0 : for (;;) {
2039 :
2040 : /* When rto_cpu is -1 this acts like cpumask_first() */
2041 0 : cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
2042 :
2043 0 : rd->rto_cpu = cpu;
2044 :
2045 0 : if (cpu < nr_cpu_ids)
2046 0 : return cpu;
2047 :
2048 0 : rd->rto_cpu = -1;
2049 :
2050 : /*
2051 : * ACQUIRE ensures we see the @rto_mask changes
2052 : * made prior to the @next value observed.
2053 : *
2054 : * Matches WMB in rt_set_overload().
2055 : */
2056 0 : next = atomic_read_acquire(&rd->rto_loop_next);
2057 :
2058 0 : if (rd->rto_loop == next)
2059 : break;
2060 :
2061 0 : rd->rto_loop = next;
2062 : }
2063 :
2064 : return -1;
2065 : }
2066 :
2067 0 : static inline bool rto_start_trylock(atomic_t *v)
2068 : {
2069 0 : return !atomic_cmpxchg_acquire(v, 0, 1);
2070 : }
2071 :
2072 0 : static inline void rto_start_unlock(atomic_t *v)
2073 : {
2074 0 : atomic_set_release(v, 0);
2075 : }
2076 :
2077 0 : static void tell_cpu_to_push(struct rq *rq)
2078 : {
2079 0 : int cpu = -1;
2080 :
2081 : /* Keep the loop going if the IPI is currently active */
2082 0 : atomic_inc(&rq->rd->rto_loop_next);
2083 :
2084 : /* Only one CPU can initiate a loop at a time */
2085 0 : if (!rto_start_trylock(&rq->rd->rto_loop_start))
2086 : return;
2087 :
2088 0 : raw_spin_lock(&rq->rd->rto_lock);
2089 :
2090 : /*
2091 : * The rto_cpu is updated under the lock. If it holds a valid CPU,
2092 : * then the IPI is still running and will continue due to the
2093 : * update to loop_next, and nothing needs to be done here.
2094 : * Otherwise it is finishing up and an IPI needs to be sent.
2095 : */
2096 0 : if (rq->rd->rto_cpu < 0)
2097 0 : cpu = rto_next_cpu(rq->rd);
2098 :
2099 0 : raw_spin_unlock(&rq->rd->rto_lock);
2100 :
2101 0 : rto_start_unlock(&rq->rd->rto_loop_start);
2102 :
2103 0 : if (cpu >= 0) {
2104 : /* Make sure the rd does not get freed while pushing */
2105 0 : sched_get_rd(rq->rd);
2106 0 : irq_work_queue_on(&rq->rd->rto_push_work, cpu);
2107 : }
2108 : }
2109 :
2110 : /* Called from hardirq context */
2111 0 : void rto_push_irq_work_func(struct irq_work *work)
2112 : {
2113 0 : struct root_domain *rd =
2114 0 : container_of(work, struct root_domain, rto_push_work);
2115 0 : struct rq *rq;
2116 0 : int cpu;
2117 :
2118 0 : rq = this_rq();
2119 :
2120 : /*
2121 : * We do not need to grab the lock to check for has_pushable_tasks.
2122 : * When it gets updated, a check is made to see whether a push is possible.
2123 : */
2124 0 : if (has_pushable_tasks(rq)) {
2125 0 : raw_spin_lock(&rq->lock);
2126 0 : while (push_rt_task(rq, true))
2127 0 : ;
2128 0 : raw_spin_unlock(&rq->lock);
2129 : }
2130 :
2131 0 : raw_spin_lock(&rd->rto_lock);
2132 :
2133 : /* Pass the IPI to the next rt overloaded queue */
2134 0 : cpu = rto_next_cpu(rd);
2135 :
2136 0 : raw_spin_unlock(&rd->rto_lock);
2137 :
2138 0 : if (cpu < 0) {
2139 0 : sched_put_rd(rd);
2140 0 : return;
2141 : }
2142 :
2143 : /* Try the next RT overloaded CPU */
2144 0 : irq_work_queue_on(&rd->rto_push_work, cpu);
2145 : }
2146 : #endif /* HAVE_RT_PUSH_IPI */
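
/*
 * Illustrative user-space sketch (not kernel code): a minimal model of the
 * rto_loop_start/rto_loop_next handshake described in the comment block
 * above, built on C11 atomics. The struct layout and the helper name
 * rto_kick() are hypothetical simplifications of the root_domain fields,
 * not the kernel's actual data structures.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct rto {
	atomic_int loop_start;	/* 0 = no iterator running */
	atomic_int loop_next;	/* bumped whenever a CPU opens up */
	int loop;		/* round the iterator has processed */
};

/* A CPU that schedules a lower-priority task calls this. Only the first
 * caller gets to start the iterator; everyone else just bumps loop_next. */
static bool rto_kick(struct rto *r)
{
	int expected = 0;

	atomic_fetch_add(&r->loop_next, 1);
	return atomic_compare_exchange_strong(&r->loop_start, &expected, 1);
}

int main(void)
{
	struct rto r = { 0 };

	if (rto_kick(&r))
		printf("CPU 0 starts the iterator\n");

	/* First pass over the overloaded CPUs. */
	r.loop = atomic_load(&r.loop_next);
	printf("iterator: scan for round %d\n", r.loop);

	/* CPU 1 opens up mid-scan: the trylock fails, but loop_next moves on. */
	if (!rto_kick(&r))
		printf("CPU 1 piggybacks on the running iterator\n");

	/* The iterator rescans until loop catches up with loop_next. */
	while (r.loop != atomic_load(&r.loop_next)) {
		r.loop = atomic_load(&r.loop_next);
		printf("iterator: loop_next moved, rescan for round %d\n", r.loop);
	}

	atomic_store(&r.loop_start, 0);	/* the next kick restarts the iterator */
	return 0;
}
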
2147 :
2148 36 : static void pull_rt_task(struct rq *this_rq)
2149 : {
2150 36 : int this_cpu = this_rq->cpu, cpu;
2151 36 : bool resched = false;
2152 36 : struct task_struct *p, *push_task;
2153 36 : struct rq *src_rq;
2154 36 : int rt_overload_count = rt_overloaded(this_rq);
2155 :
2156 36 : if (likely(!rt_overload_count))
2157 : return;
2158 :
2159 : /*
2160 : * Match the barrier from rt_set_overload(); this guarantees that if we
2161 : * see overloaded we must also see the rto_mask bit.
2162 : */
2163 0 : smp_rmb();
2164 :
2165 : /* If we are the only overloaded CPU do nothing */
2166 0 : if (rt_overload_count == 1 &&
2167 0 : cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
2168 : return;
2169 :
2170 : #ifdef HAVE_RT_PUSH_IPI
2171 0 : if (sched_feat(RT_PUSH_IPI)) {
2172 0 : tell_cpu_to_push(this_rq);
2173 0 : return;
2174 : }
2175 : #endif
2176 :
2177 : for_each_cpu(cpu, this_rq->rd->rto_mask) {
2178 : if (this_cpu == cpu)
2179 : continue;
2180 :
2181 : src_rq = cpu_rq(cpu);
2182 :
2183 : /*
2184 : * Don't bother taking the src_rq->lock if the next highest
2185 : * task is known to be lower-priority than our current task.
2186 : * This may look racy, but if this value is about to go
2187 : * logically higher, the src_rq will push this task away.
2188 : * And if it's going logically lower, we do not care.
2189 : */
2190 : if (src_rq->rt.highest_prio.next >=
2191 : this_rq->rt.highest_prio.curr)
2192 : continue;
2193 :
2194 : /*
2195 : * We can potentially drop this_rq's lock in
2196 : * double_lock_balance, and another CPU could
2197 : * alter this_rq
2198 : */
2199 : push_task = NULL;
2200 : double_lock_balance(this_rq, src_rq);
2201 :
2202 : /*
2203 : * We can only pull a task that is pushable
2204 : * on its rq, and no others.
2205 : */
2206 : p = pick_highest_pushable_task(src_rq, this_cpu);
2207 :
2208 : /*
2209 : * Do we have an RT task that preempts
2210 : * the to-be-scheduled task?
2211 : */
2212 : if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
2213 : WARN_ON(p == src_rq->curr);
2214 : WARN_ON(!task_on_rq_queued(p));
2215 :
2216 : /*
2217 : * There's a chance that p is higher in priority
2218 : * than what's currently running on its CPU.
2219 : * This just means that p is waking up and hasn't
2220 : * had a chance to schedule. We only pull
2221 : * p if it is lower in priority than the
2222 : * current task on the run queue
2223 : */
2224 : if (p->prio < src_rq->curr->prio)
2225 : goto skip;
2226 :
2227 : if (is_migration_disabled(p)) {
2228 : push_task = get_push_task(src_rq);
2229 : } else {
2230 : deactivate_task(src_rq, p, 0);
2231 : set_task_cpu(p, this_cpu);
2232 : activate_task(this_rq, p, 0);
2233 : resched = true;
2234 : }
2235 : /*
2236 : * We continue with the search, just in
2237 : * case there's an even higher prio task
2238 : * in another runqueue. (low likelihood
2239 : * but possible)
2240 : */
2241 : }
2242 : skip:
2243 : double_unlock_balance(this_rq, src_rq);
2244 :
2245 : if (push_task) {
2246 : raw_spin_unlock(&this_rq->lock);
2247 : stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
2248 : push_task, &src_rq->push_work);
2249 : raw_spin_lock(&this_rq->lock);
2250 : }
2251 : }
2252 :
2253 : if (resched)
2254 : resched_curr(this_rq);
2255 : }
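
/*
 * Illustrative user-space sketch (not kernel code): the two per-source-CPU
 * priority checks made by the pull loop above, with made-up numbers. Lower
 * numeric value means higher priority, matching the kernel's convention;
 * should_pull() is a hypothetical helper, and src_next_prio stands in for
 * both the cached highest_prio.next and the picked task's priority.
 */
#include <stdbool.h>
#include <stdio.h>

/* Would pulling the source CPU's best pushable task be worthwhile and safe? */
static bool should_pull(int src_next_prio, int src_curr_prio, int dst_curr_prio)
{
	/* Not worth taking src_rq->lock: their waiter can't preempt us. */
	if (src_next_prio >= dst_curr_prio)
		return false;

	/* The waiter outranks what the source CPU is running; it is only
	 * waiting because it just woke up, so leave it where it is. */
	if (src_next_prio < src_curr_prio)
		return false;

	return true;
}

int main(void)
{
	/* src runs prio 20, its best waiter is prio 40, we run prio 60: pull. */
	printf("pull? %s\n", should_pull(40, 20, 60) ? "yes" : "no");
	/* src's waiter (prio 70) would not preempt our prio-60 task: skip. */
	printf("pull? %s\n", should_pull(70, 20, 60) ? "yes" : "no");
	return 0;
}
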
2256 :
2257 : /*
2258 : * If we are not running and we are not going to reschedule soon, we should
2259 : * try to push tasks away now
2260 : */
2261 0 : static void task_woken_rt(struct rq *rq, struct task_struct *p)
2262 : {
2263 0 : bool need_to_push = !task_running(rq, p) &&
2264 0 : !test_tsk_need_resched(rq->curr) &&
2265 0 : p->nr_cpus_allowed > 1 &&
2266 0 : (dl_task(rq->curr) || rt_task(rq->curr)) &&
2267 0 : (rq->curr->nr_cpus_allowed < 2 ||
2268 0 : rq->curr->prio <= p->prio);
2269 :
2270 0 : if (need_to_push)
2271 0 : push_rt_tasks(rq);
2272 0 : }
2273 :
2274 : /* Assumes rq->lock is held */
2275 8 : static void rq_online_rt(struct rq *rq)
2276 : {
2277 8 : if (rq->rt.overloaded)
2278 0 : rt_set_overload(rq);
2279 :
2280 8 : __enable_runtime(rq);
2281 :
2282 8 : cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
2283 8 : }
2284 :
2285 : /* Assumes rq->lock is held */
2286 4 : static void rq_offline_rt(struct rq *rq)
2287 : {
2288 4 : if (rq->rt.overloaded)
2289 0 : rt_clear_overload(rq);
2290 :
2291 4 : __disable_runtime(rq);
2292 :
2293 4 : cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
2294 4 : }
2295 :
2296 : /*
2297 : * When switching from the rt queue, we bring ourselves to a position
2298 : * where we might want to pull RT tasks from other runqueues.
2299 : */
2300 0 : static void switched_from_rt(struct rq *rq, struct task_struct *p)
2301 : {
2302 : /*
2303 : * If there are other RT tasks then we will reschedule
2304 : * and the scheduling of the other RT tasks will handle
2305 : * the balancing. But if we are the last RT task
2306 : * we may need to handle the pulling of RT tasks
2307 : * now.
2308 : */
2309 0 : if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
2310 : return;
2311 :
2312 0 : rt_queue_pull_task(rq);
2313 : }
2314 :
2315 1 : void __init init_sched_rt_class(void)
2316 : {
2317 1 : unsigned int i;
2318 :
2319 6 : for_each_possible_cpu(i) {
2320 5 : zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
2321 : GFP_KERNEL, cpu_to_node(i));
2322 : }
2323 1 : }
2324 : #endif /* CONFIG_SMP */
2325 :
2326 : /*
2327 : * When switching a task to RT, we may overload the runqueue
2328 : * with RT tasks. In this case we try to push them off to
2329 : * other runqueues.
2330 : */
2331 4 : static void switched_to_rt(struct rq *rq, struct task_struct *p)
2332 : {
2333 : /*
2334 : * If we are already running, then there's nothing
2335 : * that needs to be done. But if we are not running
2336 : * we may need to preempt the current running task.
2337 : * If that current running task is also an RT task
2338 : * then see if we can move to another run queue.
2339 : */
2340 4 : if (task_on_rq_queued(p) && rq->curr != p) {
2341 : #ifdef CONFIG_SMP
2342 0 : if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
2343 0 : rt_queue_push_tasks(rq);
2344 : #endif /* CONFIG_SMP */
2345 0 : if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
2346 0 : resched_curr(rq);
2347 : }
2348 4 : }
2349 :
2350 : /*
2351 : * Priority of the task has changed. This may cause
2352 : * us to initiate a push or pull.
2353 : */
2354 : static void
2355 0 : prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2356 : {
2357 0 : if (!task_on_rq_queued(p))
2358 : return;
2359 :
2360 0 : if (task_current(rq, p)) {
2361 : #ifdef CONFIG_SMP
2362 : /*
2363 : * If our priority decreases while running, we
2364 : * may need to pull tasks to this runqueue.
2365 : */
2366 0 : if (oldprio < p->prio)
2367 0 : rt_queue_pull_task(rq);
2368 :
2369 : /*
2370 : * If there's a higher priority task waiting to run
2371 : * then reschedule.
2372 : */
2373 0 : if (p->prio > rq->rt.highest_prio.curr)
2374 0 : resched_curr(rq);
2375 : #else
2376 : /* For UP simply resched on drop of prio */
2377 : if (oldprio < p->prio)
2378 : resched_curr(rq);
2379 : #endif /* CONFIG_SMP */
2380 : } else {
2381 : /*
2382 : * This task is not running, but if its priority is
2383 : * higher than that of the current running task,
2384 : * then reschedule.
2385 : */
2386 0 : if (p->prio < rq->curr->prio)
2387 0 : resched_curr(rq);
2388 : }
2389 : }
2390 :
2391 : #ifdef CONFIG_POSIX_TIMERS
2392 0 : static void watchdog(struct rq *rq, struct task_struct *p)
2393 : {
2394 0 : unsigned long soft, hard;
2395 :
2396 : /* max may change after cur was read, this will be fixed next tick */
2397 0 : soft = task_rlimit(p, RLIMIT_RTTIME);
2398 0 : hard = task_rlimit_max(p, RLIMIT_RTTIME);
2399 :
2400 0 : if (soft != RLIM_INFINITY) {
2401 0 : unsigned long next;
2402 :
2403 0 : if (p->rt.watchdog_stamp != jiffies) {
2404 0 : p->rt.timeout++;
2405 0 : p->rt.watchdog_stamp = jiffies;
2406 : }
2407 :
2408 0 : next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
2409 0 : if (p->rt.timeout > next) {
2410 0 : posix_cputimers_rt_watchdog(&p->posix_cputimers,
2411 : p->se.sum_exec_runtime);
2412 : }
2413 : }
2414 0 : }
2415 : #else
2416 : static inline void watchdog(struct rq *rq, struct task_struct *p) { }
2417 : #endif
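
/*
 * Illustrative user-space sketch (not kernel code): how an RLIMIT_RTTIME
 * budget given in microseconds maps to the tick threshold computed in the
 * watchdog above. HZ and the limit value are made-up example numbers.
 */
#include <stdio.h>

#define HZ		250
#define USEC_PER_SEC	1000000UL

int main(void)
{
	unsigned long soft_us = 500000;			/* 0.5 s of RT runtime */
	unsigned long usec_per_tick = USEC_PER_SEC / HZ;

	/* DIV_ROUND_UP(soft, USEC_PER_SEC/HZ): round up to whole ticks. */
	unsigned long next = (soft_us + usec_per_tick - 1) / usec_per_tick;

	printf("limit of %lu us = %lu ticks at HZ=%d\n", soft_us, next, HZ);
	return 0;
}
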
2418 :
2419 : /*
2420 : * scheduler tick hitting a task of our scheduling class.
2421 : *
2422 : * NOTE: This function can be called remotely by the tick offload that
2423 : * goes along with full dynticks. Therefore no local assumption can be made
2424 : * and everything must be accessed through the @rq and @curr passed in
2425 : * parameters.
2426 : */
2427 0 : static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2428 : {
2429 0 : struct sched_rt_entity *rt_se = &p->rt;
2430 :
2431 0 : update_curr_rt(rq);
2432 0 : update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
2433 :
2434 0 : watchdog(rq, p);
2435 :
2436 : /*
2437 : * RR tasks need a special form of timeslice management.
2438 : * FIFO tasks have no timeslices.
2439 : */
2440 0 : if (p->policy != SCHED_RR)
2441 : return;
2442 :
2443 0 : if (--p->rt.time_slice)
2444 : return;
2445 :
2446 0 : p->rt.time_slice = sched_rr_timeslice;
2447 :
2448 : /*
2449 : * Requeue to the end of the queue if we (and all of our ancestors) are not
2450 : * the only element on the queue
2451 : */
2452 0 : for_each_sched_rt_entity(rt_se) {
2453 0 : if (rt_se->run_list.prev != rt_se->run_list.next) {
2454 0 : requeue_task_rt(rq, p, 0);
2455 0 : resched_curr(rq);
2456 0 : return;
2457 : }
2458 : }
2459 : }
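
/*
 * Illustrative user-space sketch (not kernel code): the SCHED_RR timeslice
 * bookkeeping done on each tick above, modeled with a plain counter and a
 * queue length instead of the kernel's rt_se run lists. RR_SLICE and the
 * struct are hypothetical example values.
 */
#include <stdbool.h>
#include <stdio.h>

#define RR_SLICE 4	/* hypothetical timeslice in ticks */

struct rr_task {
	int time_slice;
	int queue_len;	/* tasks at this priority, including this one */
};

/* Returns true when the task should be requeued behind its peers. */
static bool rr_tick(struct rr_task *t)
{
	if (--t->time_slice)
		return false;		/* slice not used up yet */

	t->time_slice = RR_SLICE;	/* refill for the next round */
	return t->queue_len > 1;	/* only rotate if someone is waiting */
}

int main(void)
{
	struct rr_task t = { .time_slice = RR_SLICE, .queue_len = 2 };

	for (int tick = 1; tick <= 5; tick++)
		printf("tick %d: %s\n", tick,
		       rr_tick(&t) ? "requeue + resched" : "keep running");
	return 0;
}
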
2460 :
2461 0 : static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2462 : {
2463 : /*
2464 : * Time slice is 0 for SCHED_FIFO tasks
2465 : */
2466 0 : if (task->policy == SCHED_RR)
2467 0 : return sched_rr_timeslice;
2468 : else
2469 : return 0;
2470 : }
2471 :
2472 : DEFINE_SCHED_CLASS(rt) = {
2473 :
2474 : .enqueue_task = enqueue_task_rt,
2475 : .dequeue_task = dequeue_task_rt,
2476 : .yield_task = yield_task_rt,
2477 :
2478 : .check_preempt_curr = check_preempt_curr_rt,
2479 :
2480 : .pick_next_task = pick_next_task_rt,
2481 : .put_prev_task = put_prev_task_rt,
2482 : .set_next_task = set_next_task_rt,
2483 :
2484 : #ifdef CONFIG_SMP
2485 : .balance = balance_rt,
2486 : .select_task_rq = select_task_rq_rt,
2487 : .set_cpus_allowed = set_cpus_allowed_common,
2488 : .rq_online = rq_online_rt,
2489 : .rq_offline = rq_offline_rt,
2490 : .task_woken = task_woken_rt,
2491 : .switched_from = switched_from_rt,
2492 : .find_lock_rq = find_lock_lowest_rq,
2493 : #endif
2494 :
2495 : .task_tick = task_tick_rt,
2496 :
2497 : .get_rr_interval = get_rr_interval_rt,
2498 :
2499 : .prio_changed = prio_changed_rt,
2500 : .switched_to = switched_to_rt,
2501 :
2502 : .update_curr = update_curr_rt,
2503 :
2504 : #ifdef CONFIG_UCLAMP_TASK
2505 : .uclamp_enabled = 1,
2506 : #endif
2507 : };
2508 :
2509 : #ifdef CONFIG_RT_GROUP_SCHED
2510 : /*
2511 : * Ensure that the real time constraints are schedulable.
2512 : */
2513 : static DEFINE_MUTEX(rt_constraints_mutex);
2514 :
2515 : static inline int tg_has_rt_tasks(struct task_group *tg)
2516 : {
2517 : struct task_struct *task;
2518 : struct css_task_iter it;
2519 : int ret = 0;
2520 :
2521 : /*
2522 : * Autogroups do not have RT tasks; see autogroup_create().
2523 : */
2524 : if (task_group_is_autogroup(tg))
2525 : return 0;
2526 :
2527 : css_task_iter_start(&tg->css, 0, &it);
2528 : while (!ret && (task = css_task_iter_next(&it)))
2529 : ret |= rt_task(task);
2530 : css_task_iter_end(&it);
2531 :
2532 : return ret;
2533 : }
2534 :
2535 : struct rt_schedulable_data {
2536 : struct task_group *tg;
2537 : u64 rt_period;
2538 : u64 rt_runtime;
2539 : };
2540 :
2541 : static int tg_rt_schedulable(struct task_group *tg, void *data)
2542 : {
2543 : struct rt_schedulable_data *d = data;
2544 : struct task_group *child;
2545 : unsigned long total, sum = 0;
2546 : u64 period, runtime;
2547 :
2548 : period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2549 : runtime = tg->rt_bandwidth.rt_runtime;
2550 :
2551 : if (tg == d->tg) {
2552 : period = d->rt_period;
2553 : runtime = d->rt_runtime;
2554 : }
2555 :
2556 : /*
2557 : * Cannot have more runtime than the period.
2558 : */
2559 : if (runtime > period && runtime != RUNTIME_INF)
2560 : return -EINVAL;
2561 :
2562 : /*
2563 : * Ensure we don't starve existing RT tasks if runtime turns zero.
2564 : */
2565 : if (rt_bandwidth_enabled() && !runtime &&
2566 : tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
2567 : return -EBUSY;
2568 :
2569 : total = to_ratio(period, runtime);
2570 :
2571 : /*
2572 : * Nobody can have more than the global setting allows.
2573 : */
2574 : if (total > to_ratio(global_rt_period(), global_rt_runtime()))
2575 : return -EINVAL;
2576 :
2577 : /*
2578 : * The sum of our children's runtime should not exceed our own.
2579 : */
2580 : list_for_each_entry_rcu(child, &tg->children, siblings) {
2581 : period = ktime_to_ns(child->rt_bandwidth.rt_period);
2582 : runtime = child->rt_bandwidth.rt_runtime;
2583 :
2584 : if (child == d->tg) {
2585 : period = d->rt_period;
2586 : runtime = d->rt_runtime;
2587 : }
2588 :
2589 : sum += to_ratio(period, runtime);
2590 : }
2591 :
2592 : if (sum > total)
2593 : return -EINVAL;
2594 :
2595 : return 0;
2596 : }
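
/*
 * Illustrative user-space sketch (not kernel code): the fixed-point
 * bandwidth comparison performed by the schedulability walk above,
 * assuming the kernel's 20-bit BW_SHIFT. The group layout and numbers
 * are made up for the example, and ratio() only approximates to_ratio().
 */
#include <inttypes.h>
#include <stdio.h>

#define BW_SHIFT 20

/* runtime/period as a 20-bit fixed-point fraction, like to_ratio(). */
static uint64_t ratio(uint64_t period_ns, uint64_t runtime_ns)
{
	return period_ns ? (runtime_ns << BW_SHIFT) / period_ns : 0;
}

int main(void)
{
	/* Parent group: 400 ms of RT runtime every 1 s. */
	uint64_t parent = ratio(1000000000ULL, 400000000ULL);

	/* Two children: 250 ms/1 s and 200 ms/1 s -> 45% combined. */
	uint64_t sum = ratio(1000000000ULL, 250000000ULL) +
		       ratio(1000000000ULL, 200000000ULL);

	printf("parent=%" PRIu64 " children=%" PRIu64 " -> %s\n",
	       parent, sum, sum > parent ? "-EINVAL" : "schedulable");
	return 0;
}
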
2597 :
2598 : static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
2599 : {
2600 : int ret;
2601 :
2602 : struct rt_schedulable_data data = {
2603 : .tg = tg,
2604 : .rt_period = period,
2605 : .rt_runtime = runtime,
2606 : };
2607 :
2608 : rcu_read_lock();
2609 : ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
2610 : rcu_read_unlock();
2611 :
2612 : return ret;
2613 : }
2614 :
2615 : static int tg_set_rt_bandwidth(struct task_group *tg,
2616 : u64 rt_period, u64 rt_runtime)
2617 : {
2618 : int i, err = 0;
2619 :
2620 : /*
2621 : * Disallowing the root group RT runtime is BAD; it would prevent the
2622 : * kernel from creating (and/or operating) RT threads.
2623 : */
2624 : if (tg == &root_task_group && rt_runtime == 0)
2625 : return -EINVAL;
2626 :
2627 : /* A zero period doesn't make any sense. */
2628 : if (rt_period == 0)
2629 : return -EINVAL;
2630 :
2631 : /*
2632 : * Bound the quota to defend against overflow during the bandwidth shift.
2633 : */
2634 : if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
2635 : return -EINVAL;
2636 :
2637 : mutex_lock(&rt_constraints_mutex);
2638 : err = __rt_schedulable(tg, rt_period, rt_runtime);
2639 : if (err)
2640 : goto unlock;
2641 :
2642 : raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2643 : tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
2644 : tg->rt_bandwidth.rt_runtime = rt_runtime;
2645 :
2646 : for_each_possible_cpu(i) {
2647 : struct rt_rq *rt_rq = tg->rt_rq[i];
2648 :
2649 : raw_spin_lock(&rt_rq->rt_runtime_lock);
2650 : rt_rq->rt_runtime = rt_runtime;
2651 : raw_spin_unlock(&rt_rq->rt_runtime_lock);
2652 : }
2653 : raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2654 : unlock:
2655 : mutex_unlock(&rt_constraints_mutex);
2656 :
2657 : return err;
2658 : }
2659 :
2660 : int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
2661 : {
2662 : u64 rt_runtime, rt_period;
2663 :
2664 : rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2665 : rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
2666 : if (rt_runtime_us < 0)
2667 : rt_runtime = RUNTIME_INF;
2668 : else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
2669 : return -EINVAL;
2670 :
2671 : return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2672 : }
2673 :
2674 : long sched_group_rt_runtime(struct task_group *tg)
2675 : {
2676 : u64 rt_runtime_us;
2677 :
2678 : if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
2679 : return -1;
2680 :
2681 : rt_runtime_us = tg->rt_bandwidth.rt_runtime;
2682 : do_div(rt_runtime_us, NSEC_PER_USEC);
2683 : return rt_runtime_us;
2684 : }
2685 :
2686 : int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
2687 : {
2688 : u64 rt_runtime, rt_period;
2689 :
2690 : if (rt_period_us > U64_MAX / NSEC_PER_USEC)
2691 : return -EINVAL;
2692 :
2693 : rt_period = rt_period_us * NSEC_PER_USEC;
2694 : rt_runtime = tg->rt_bandwidth.rt_runtime;
2695 :
2696 : return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2697 : }
2698 :
2699 : long sched_group_rt_period(struct task_group *tg)
2700 : {
2701 : u64 rt_period_us;
2702 :
2703 : rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
2704 : do_div(rt_period_us, NSEC_PER_USEC);
2705 : return rt_period_us;
2706 : }
2707 :
2708 : static int sched_rt_global_constraints(void)
2709 : {
2710 : int ret = 0;
2711 :
2712 : mutex_lock(&rt_constraints_mutex);
2713 : ret = __rt_schedulable(NULL, 0, 0);
2714 : mutex_unlock(&rt_constraints_mutex);
2715 :
2716 : return ret;
2717 : }
2718 :
2719 : int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
2720 : {
2721 : /* Don't accept realtime tasks when there is no way for them to run */
2722 : if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
2723 : return 0;
2724 :
2725 : return 1;
2726 : }
2727 :
2728 : #else /* !CONFIG_RT_GROUP_SCHED */
2729 0 : static int sched_rt_global_constraints(void)
2730 : {
2731 0 : unsigned long flags;
2732 0 : int i;
2733 :
2734 0 : raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
2735 0 : for_each_possible_cpu(i) {
2736 0 : struct rt_rq *rt_rq = &cpu_rq(i)->rt;
2737 :
2738 0 : raw_spin_lock(&rt_rq->rt_runtime_lock);
2739 0 : rt_rq->rt_runtime = global_rt_runtime();
2740 0 : raw_spin_unlock(&rt_rq->rt_runtime_lock);
2741 : }
2742 0 : raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
2743 :
2744 0 : return 0;
2745 : }
2746 : #endif /* CONFIG_RT_GROUP_SCHED */
2747 :
2748 0 : static int sched_rt_global_validate(void)
2749 : {
2750 0 : if (sysctl_sched_rt_period <= 0)
2751 : return -EINVAL;
2752 :
2753 0 : if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
2754 0 : ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
2755 0 : ((u64)sysctl_sched_rt_runtime *
2756 : NSEC_PER_USEC > max_rt_runtime)))
2757 0 : return -EINVAL;
2758 :
2759 : return 0;
2760 : }
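
/*
 * Illustrative user-space sketch (not kernel code): the same sanity checks
 * applied to the usual defaults (period 1,000,000 us, runtime 950,000 us).
 * The -1 sentinel stands in for RUNTIME_INF, validate() is a hypothetical
 * helper, and the max_rt_runtime bound is omitted for brevity.
 */
#include <stdio.h>

static int validate(long long period_us, long long runtime_us)
{
	if (period_us <= 0)
		return -1;			/* -EINVAL */
	if (runtime_us != -1 && runtime_us > period_us)
		return -1;			/* more runtime than period */
	return 0;
}

int main(void)
{
	printf("defaults:    %d\n", validate(1000000, 950000));	/* 0 */
	printf("unthrottled: %d\n", validate(1000000, -1));	/* 0 */
	printf("bogus:       %d\n", validate(1000000, 2000000));	/* -1 */
	return 0;
}
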
2761 :
2762 0 : static void sched_rt_do_global(void)
2763 : {
2764 0 : def_rt_bandwidth.rt_runtime = global_rt_runtime();
2765 0 : def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
2766 0 : }
2767 :
2768 0 : int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
2769 : size_t *lenp, loff_t *ppos)
2770 : {
2771 0 : int old_period, old_runtime;
2772 0 : static DEFINE_MUTEX(mutex);
2773 0 : int ret;
2774 :
2775 0 : mutex_lock(&mutex);
2776 0 : old_period = sysctl_sched_rt_period;
2777 0 : old_runtime = sysctl_sched_rt_runtime;
2778 :
2779 0 : ret = proc_dointvec(table, write, buffer, lenp, ppos);
2780 :
2781 0 : if (!ret && write) {
2782 0 : ret = sched_rt_global_validate();
2783 0 : if (ret)
2784 0 : goto undo;
2785 :
2786 0 : ret = sched_dl_global_validate();
2787 0 : if (ret)
2788 0 : goto undo;
2789 :
2790 0 : ret = sched_rt_global_constraints();
2791 0 : if (ret)
2792 0 : goto undo;
2793 :
2794 0 : sched_rt_do_global();
2795 0 : sched_dl_do_global();
2796 : }
2797 : if (0) {
2798 0 : undo:
2799 0 : sysctl_sched_rt_period = old_period;
2800 0 : sysctl_sched_rt_runtime = old_runtime;
2801 : }
2802 0 : mutex_unlock(&mutex);
2803 :
2804 0 : return ret;
2805 : }
2806 :
2807 0 : int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
2808 : size_t *lenp, loff_t *ppos)
2809 : {
2810 0 : int ret;
2811 0 : static DEFINE_MUTEX(mutex);
2812 :
2813 0 : mutex_lock(&mutex);
2814 0 : ret = proc_dointvec(table, write, buffer, lenp, ppos);
2815 : /*
2816 : * Make sure that internally we keep the value in jiffies.
2817 : * Also, writing zero resets the timeslice to default:
2818 : */
2819 0 : if (!ret && write) {
2820 0 : sched_rr_timeslice =
2821 0 : sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
2822 0 : msecs_to_jiffies(sysctl_sched_rr_timeslice);
2823 : }
2824 0 : mutex_unlock(&mutex);
2825 :
2826 0 : return ret;
2827 : }
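
/*
 * Illustrative user-space sketch (not kernel code): what the handler above
 * does with a value written to the RR timeslice sysctl, assuming HZ=250 and
 * using a 100 ms stand-in for the default timeslice. The helper names and
 * the rounding are simplifications of msecs_to_jiffies().
 */
#include <stdio.h>

#define HZ		250
#define DEF_SLICE_MS	100			/* stand-in for the default slice */

static int msecs_to_ticks(int ms)
{
	return (ms * HZ + 999) / 1000;		/* round up to whole ticks */
}

static int rr_sysctl_write(int ms)
{
	/* Zero or a negative value restores the default timeslice. */
	return ms <= 0 ? msecs_to_ticks(DEF_SLICE_MS) : msecs_to_ticks(ms);
}

int main(void)
{
	printf("write 30 -> %d ticks\n", rr_sysctl_write(30));	/* 8 */
	printf("write 0  -> %d ticks\n", rr_sysctl_write(0));	/* 25 */
	return 0;
}
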
2828 :
2829 : #ifdef CONFIG_SCHED_DEBUG
2830 : void print_rt_stats(struct seq_file *m, int cpu)
2831 : {
2832 : rt_rq_iter_t iter;
2833 : struct rt_rq *rt_rq;
2834 :
2835 : rcu_read_lock();
2836 : for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
2837 : print_rt_rq(m, cpu, rt_rq);
2838 : rcu_read_unlock();
2839 : }
2840 : #endif /* CONFIG_SCHED_DEBUG */
|