Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
4 : *
5 : * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
6 : *
7 : * Interactivity improvements by Mike Galbraith
8 : * (C) 2007 Mike Galbraith <efault@gmx.de>
9 : *
10 : * Various enhancements by Dmitry Adamushko.
11 : * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
12 : *
13 : * Group scheduling enhancements by Srivatsa Vaddagiri
14 : * Copyright IBM Corporation, 2007
15 : * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
16 : *
17 : * Scaled math optimizations by Thomas Gleixner
18 : * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
19 : *
20 : * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
21 : * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
22 : */
23 : #include "sched.h"
24 :
25 : /*
26 : * Targeted preemption latency for CPU-bound tasks:
27 : *
28 : * NOTE: this latency value is not the same as the concept of
29 : * 'timeslice length' - timeslices in CFS are of variable length
30 : * and have no persistent notion like in traditional, time-slice
31 : * based scheduling concepts.
32 : *
33 : * (to see the precise effective timeslice length of your workload,
34 : * run vmstat and monitor the context-switches (cs) field)
35 : *
36 : * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
37 : */
38 : unsigned int sysctl_sched_latency = 6000000ULL;
39 : static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
40 :
41 : /*
42 : * The initial- and re-scaling of tunables is configurable
43 : *
44 : * Options are:
45 : *
46 : * SCHED_TUNABLESCALING_NONE - unscaled, always *1
47 : * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
48 : * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
49 : *
50 : * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
51 : */
52 : enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
53 :
54 : /*
55 : * Minimal preemption granularity for CPU-bound tasks:
56 : *
57 : * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
58 : */
59 : unsigned int sysctl_sched_min_granularity = 750000ULL;
60 : static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
61 :
62 : /*
63 : * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
64 : */
65 : static unsigned int sched_nr_latency = 8;
66 :
67 : /*
68 : * After fork, child runs first. If set to 0 (default) then
69 : * parent will (try to) run first.
70 : */
71 : unsigned int sysctl_sched_child_runs_first __read_mostly;
72 :
73 : /*
74 : * SCHED_OTHER wake-up granularity.
75 : *
76 : * This option delays the preemption effects of decoupled workloads
77 : * and reduces their over-scheduling. Synchronous workloads will still
78 : * have immediate wakeup/sleep latencies.
79 : *
80 : * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
81 : */
82 : unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
83 : static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
84 :
85 : const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
86 :
87 : int sched_thermal_decay_shift;
88 0 : static int __init setup_sched_thermal_decay_shift(char *str)
89 : {
90 0 : int _shift = 0;
91 :
92 0 : if (kstrtoint(str, 0, &_shift))
93 0 : pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");
94 :
95 0 : sched_thermal_decay_shift = clamp(_shift, 0, 10);
96 0 : return 1;
97 : }
98 : __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
99 :
100 : #ifdef CONFIG_SMP
101 : /*
102 : * For asym packing, by default the lower numbered CPU has higher priority.
103 : */
104 0 : int __weak arch_asym_cpu_priority(int cpu)
105 : {
106 0 : return -cpu;
107 : }
108 :
109 : /*
110 : * The margin used when comparing utilization with CPU capacity.
111 : *
112 : * (default: ~20%)
113 : */
114 : #define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
115 :
116 : #endif
117 :
118 : #ifdef CONFIG_CFS_BANDWIDTH
119 : /*
120 : * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
121 : * each time a cfs_rq requests quota.
122 : *
123 : * Note: in the case that the slice exceeds the runtime remaining (either due
124 : * to consumption or the quota being specified to be smaller than the slice)
125 : * we will always only issue the remaining available time.
126 : *
127 : * (default: 5 msec, units: microseconds)
128 : */
129 : unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
130 : #endif
131 :
132 16754 : static inline void update_load_add(struct load_weight *lw, unsigned long inc)
133 : {
134 16754 : lw->weight += inc;
135 16754 : lw->inv_weight = 0;
136 0 : }
137 :
138 15767 : static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
139 : {
140 15767 : lw->weight -= dec;
141 15767 : lw->inv_weight = 0;
142 0 : }
143 :
144 25 : static inline void update_load_set(struct load_weight *lw, unsigned long w)
145 : {
146 25 : lw->weight = w;
147 25 : lw->inv_weight = 0;
148 : }
149 :
150 : /*
151 : * Increase the granularity value when there are more CPUs,
152 : * because with more CPUs the 'effective latency' as visible
153 : * to users decreases. But the relationship is not linear,
154 : * so pick a second-best guess by going with the log2 of the
155 : * number of CPUs.
156 : *
157 : * This idea comes from the SD scheduler of Con Kolivas:
158 : */
159 13 : static unsigned int get_update_sysctl_factor(void)
160 : {
161 13 : unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
162 13 : unsigned int factor;
163 :
164 13 : switch (sysctl_sched_tunable_scaling) {
165 : case SCHED_TUNABLESCALING_NONE:
166 : factor = 1;
167 : break;
168 0 : case SCHED_TUNABLESCALING_LINEAR:
169 0 : factor = cpus;
170 0 : break;
171 13 : case SCHED_TUNABLESCALING_LOG:
172 : default:
173 13 : factor = 1 + ilog2(cpus);
174 : break;
175 : }
176 :
177 13 : return factor;
178 : }
179 :
180 13 : static void update_sysctl(void)
181 : {
182 13 : unsigned int factor = get_update_sysctl_factor();
183 :
184 : #define SET_SYSCTL(name) \
185 : (sysctl_##name = (factor) * normalized_sysctl_##name)
186 13 : SET_SYSCTL(sched_min_granularity);
187 13 : SET_SYSCTL(sched_latency);
188 13 : SET_SYSCTL(sched_wakeup_granularity);
189 : #undef SET_SYSCTL
190 13 : }
191 :
192 1 : void __init sched_init_granularity(void)
193 : {
194 1 : update_sysctl();
195 1 : }
196 :
197 : #define WMULT_CONST (~0U)
198 : #define WMULT_SHIFT 32
199 :
200 7846 : static void __update_inv_weight(struct load_weight *lw)
201 : {
202 7846 : unsigned long w;
203 :
204 7846 : if (likely(lw->inv_weight))
205 : return;
206 :
207 3512 : w = scale_load_down(lw->weight);
208 :
209 3512 : if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
210 0 : lw->inv_weight = 1;
211 3512 : else if (unlikely(!w))
212 0 : lw->inv_weight = WMULT_CONST;
213 : else
214 3512 : lw->inv_weight = WMULT_CONST / w;
215 : }
216 :
217 : /*
218 : * delta_exec * weight / lw.weight
219 : * OR
220 : * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
221 : *
222 : * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
223 : * we're guaranteed shift stays positive because inv_weight is guaranteed to
224 : * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
225 : *
226 : * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
227 : * weight/lw.weight <= 1, and therefore our shift will also be positive.
228 : */
229 7859 : static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
230 : {
231 7859 : u64 fact = scale_load_down(weight);
232 7859 : int shift = WMULT_SHIFT;
233 :
234 7859 : __update_inv_weight(lw);
235 :
236 7854 : if (unlikely(fact >> 32)) {
237 0 : while (fact >> 32) {
238 0 : fact >>= 1;
239 0 : shift--;
240 : }
241 : }
242 :
243 7854 : fact = mul_u32_u32(fact, lw->inv_weight);
244 :
245 23833 : while (fact >> 32) {
246 15979 : fact >>= 1;
247 15979 : shift--;
248 : }
249 :
250 7854 : return mul_u64_u32_shr(delta_exec, fact, shift);
251 : }
252 :
253 :
254 : const struct sched_class fair_sched_class;
255 :
256 : /**************************************************************
257 : * CFS operations on generic schedulable entities:
258 : */
259 :
260 : #ifdef CONFIG_FAIR_GROUP_SCHED
261 : static inline struct task_struct *task_of(struct sched_entity *se)
262 : {
263 : SCHED_WARN_ON(!entity_is_task(se));
264 : return container_of(se, struct task_struct, se);
265 : }
266 :
267 : /* Walk up scheduling entities hierarchy */
268 : #define for_each_sched_entity(se) \
269 : for (; se; se = se->parent)
270 :
271 : static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
272 : {
273 : return p->se.cfs_rq;
274 : }
275 :
276 : /* runqueue on which this entity is (to be) queued */
277 : static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
278 : {
279 : return se->cfs_rq;
280 : }
281 :
282 : /* runqueue "owned" by this group */
283 : static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
284 : {
285 : return grp->my_q;
286 : }
287 :
288 : static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
289 : {
290 : if (!path)
291 : return;
292 :
293 : if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
294 : autogroup_path(cfs_rq->tg, path, len);
295 : else if (cfs_rq && cfs_rq->tg->css.cgroup)
296 : cgroup_path(cfs_rq->tg->css.cgroup, path, len);
297 : else
298 : strlcpy(path, "(null)", len);
299 : }
300 :
301 : static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
302 : {
303 : struct rq *rq = rq_of(cfs_rq);
304 : int cpu = cpu_of(rq);
305 :
306 : if (cfs_rq->on_list)
307 : return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
308 :
309 : cfs_rq->on_list = 1;
310 :
311 : /*
312 : * Ensure we either appear before our parent (if already
313 : * enqueued) or force our parent to appear after us when it is
314 : * enqueued. The fact that we always enqueue bottom-up
315 : * reduces this to two cases and a special case for the root
316 : * cfs_rq. Furthermore, it also means that we will always reset
317 : * tmp_alone_branch either when the branch is connected
318 : * to a tree or when we reach the top of the tree
319 : */
320 : if (cfs_rq->tg->parent &&
321 : cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
322 : /*
323 : * If parent is already on the list, we add the child
324 : * just before. Thanks to circular linked property of
325 : * the list, this means to put the child at the tail
326 : * of the list that starts by parent.
327 : */
328 : list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
329 : &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
330 : /*
331 : * The branch is now connected to its tree so we can
332 : * reset tmp_alone_branch to the beginning of the
333 : * list.
334 : */
335 : rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
336 : return true;
337 : }
338 :
339 : if (!cfs_rq->tg->parent) {
340 : /*
341 : * cfs rq without parent should be put
342 : * at the tail of the list.
343 : */
344 : list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
345 : &rq->leaf_cfs_rq_list);
346 : /*
347 : * We have reach the top of a tree so we can reset
348 : * tmp_alone_branch to the beginning of the list.
349 : */
350 : rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
351 : return true;
352 : }
353 :
354 : /*
355 : * The parent has not already been added so we want to
356 : * make sure that it will be put after us.
357 : * tmp_alone_branch points to the begin of the branch
358 : * where we will add parent.
359 : */
360 : list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
361 : /*
362 : * update tmp_alone_branch to points to the new begin
363 : * of the branch
364 : */
365 : rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
366 : return false;
367 : }
368 :
369 : static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
370 : {
371 : if (cfs_rq->on_list) {
372 : struct rq *rq = rq_of(cfs_rq);
373 :
374 : /*
375 : * With cfs_rq being unthrottled/throttled during an enqueue,
376 : * it can happen the tmp_alone_branch points the a leaf that
377 : * we finally want to del. In this case, tmp_alone_branch moves
378 : * to the prev element but it will point to rq->leaf_cfs_rq_list
379 : * at the end of the enqueue.
380 : */
381 : if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
382 : rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
383 :
384 : list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
385 : cfs_rq->on_list = 0;
386 : }
387 : }
388 :
389 : static inline void assert_list_leaf_cfs_rq(struct rq *rq)
390 : {
391 : SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
392 : }
393 :
394 : /* Iterate thr' all leaf cfs_rq's on a runqueue */
395 : #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
396 : list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
397 : leaf_cfs_rq_list)
398 :
399 : /* Do the two (enqueued) entities belong to the same group ? */
400 : static inline struct cfs_rq *
401 : is_same_group(struct sched_entity *se, struct sched_entity *pse)
402 : {
403 : if (se->cfs_rq == pse->cfs_rq)
404 : return se->cfs_rq;
405 :
406 : return NULL;
407 : }
408 :
409 : static inline struct sched_entity *parent_entity(struct sched_entity *se)
410 : {
411 : return se->parent;
412 : }
413 :
414 : static void
415 : find_matching_se(struct sched_entity **se, struct sched_entity **pse)
416 : {
417 : int se_depth, pse_depth;
418 :
419 : /*
420 : * preemption test can be made between sibling entities who are in the
421 : * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
422 : * both tasks until we find their ancestors who are siblings of common
423 : * parent.
424 : */
425 :
426 : /* First walk up until both entities are at same depth */
427 : se_depth = (*se)->depth;
428 : pse_depth = (*pse)->depth;
429 :
430 : while (se_depth > pse_depth) {
431 : se_depth--;
432 : *se = parent_entity(*se);
433 : }
434 :
435 : while (pse_depth > se_depth) {
436 : pse_depth--;
437 : *pse = parent_entity(*pse);
438 : }
439 :
440 : while (!is_same_group(*se, *pse)) {
441 : *se = parent_entity(*se);
442 : *pse = parent_entity(*pse);
443 : }
444 : }
445 :
446 : #else /* !CONFIG_FAIR_GROUP_SCHED */
447 :
448 203536 : static inline struct task_struct *task_of(struct sched_entity *se)
449 : {
450 203536 : return container_of(se, struct task_struct, se);
451 : }
452 :
453 : #define for_each_sched_entity(se) \
454 : for (; se; se = NULL)
455 :
456 9362 : static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
457 : {
458 990 : return &task_rq(p)->cfs;
459 : }
460 :
461 104734 : static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
462 : {
463 104734 : struct task_struct *p = task_of(se);
464 0 : struct rq *rq = task_rq(p);
465 :
466 7763 : return &rq->cfs;
467 : }
468 :
469 : /* runqueue "owned" by this group */
470 21076 : static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
471 : {
472 21076 : return NULL;
473 : }
474 :
475 0 : static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
476 : {
477 0 : if (path)
478 0 : strlcpy(path, "(null)", len);
479 : }
480 :
481 : static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
482 : {
483 : return true;
484 : }
485 :
486 : static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
487 : {
488 : }
489 :
490 15764 : static inline void assert_list_leaf_cfs_rq(struct rq *rq)
491 : {
492 15764 : }
493 :
494 : #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
495 : for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
496 :
497 : static inline struct sched_entity *parent_entity(struct sched_entity *se)
498 : {
499 : return NULL;
500 : }
501 :
502 : static inline void
503 7714 : find_matching_se(struct sched_entity **se, struct sched_entity **pse)
504 : {
505 7714 : }
506 :
507 : #endif /* CONFIG_FAIR_GROUP_SCHED */
508 :
509 : static __always_inline
510 : void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
511 :
512 : /**************************************************************
513 : * Scheduling class tree data structure manipulation methods:
514 : */
515 :
516 71722 : static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
517 : {
518 71722 : s64 delta = (s64)(vruntime - max_vruntime);
519 71722 : if (delta > 0)
520 29432 : max_vruntime = vruntime;
521 :
522 71722 : return max_vruntime;
523 : }
524 :
525 16236 : static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
526 : {
527 16236 : s64 delta = (s64)(vruntime - min_vruntime);
528 16236 : if (delta < 0)
529 6350 : min_vruntime = vruntime;
530 :
531 : return min_vruntime;
532 : }
533 :
534 10321 : static inline bool entity_before(struct sched_entity *a,
535 : struct sched_entity *b)
536 : {
537 10321 : return (s64)(a->vruntime - b->vruntime) < 0;
538 : }
539 :
540 : #define __node_2_se(node) \
541 : rb_entry((node), struct sched_entity, run_node)
542 :
543 56851 : static void update_min_vruntime(struct cfs_rq *cfs_rq)
544 : {
545 56851 : struct sched_entity *curr = cfs_rq->curr;
546 56851 : struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
547 :
548 56851 : u64 vruntime = cfs_rq->min_vruntime;
549 :
550 56851 : if (curr) {
551 56813 : if (curr->on_rq)
552 41458 : vruntime = curr->vruntime;
553 : else
554 : curr = NULL;
555 : }
556 :
557 56851 : if (leftmost) { /* non-empty tree */
558 23725 : struct sched_entity *se = __node_2_se(leftmost);
559 :
560 23725 : if (!curr)
561 7489 : vruntime = se->vruntime;
562 : else
563 16236 : vruntime = min_vruntime(vruntime, se->vruntime);
564 : }
565 :
566 : /* ensure we never gain time by being placed backwards. */
567 56851 : cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
568 : #ifndef CONFIG_64BIT
569 : smp_wmb();
570 : cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
571 : #endif
572 56851 : }
573 :
574 10321 : static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
575 : {
576 10321 : return entity_before(__node_2_se(a), __node_2_se(b));
577 : }
578 :
579 : /*
580 : * Enqueue an entity into the rb-tree:
581 : */
582 21979 : static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
583 : {
584 21979 : rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
585 21977 : }
586 :
587 21981 : static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
588 : {
589 21981 : rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
590 887 : }
591 :
592 23017 : struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
593 : {
594 23017 : struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
595 :
596 0 : if (!left)
597 : return NULL;
598 :
599 1942 : return __node_2_se(left);
600 : }
601 :
602 0 : static struct sched_entity *__pick_next_entity(struct sched_entity *se)
603 : {
604 0 : struct rb_node *next = rb_next(&se->run_node);
605 :
606 0 : if (!next)
607 : return NULL;
608 :
609 0 : return __node_2_se(next);
610 : }
611 :
612 : #ifdef CONFIG_SCHED_DEBUG
613 : struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
614 : {
615 : struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
616 :
617 : if (!last)
618 : return NULL;
619 :
620 : return __node_2_se(last);
621 : }
622 :
623 : /**************************************************************
624 : * Scheduling class statistics methods:
625 : */
626 :
627 : int sched_proc_update_handler(struct ctl_table *table, int write,
628 : void *buffer, size_t *lenp, loff_t *ppos)
629 : {
630 : int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
631 : unsigned int factor = get_update_sysctl_factor();
632 :
633 : if (ret || !write)
634 : return ret;
635 :
636 : sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
637 : sysctl_sched_min_granularity);
638 :
639 : #define WRT_SYSCTL(name) \
640 : (normalized_sysctl_##name = sysctl_##name / (factor))
641 : WRT_SYSCTL(sched_min_granularity);
642 : WRT_SYSCTL(sched_latency);
643 : WRT_SYSCTL(sched_wakeup_granularity);
644 : #undef WRT_SYSCTL
645 :
646 : return 0;
647 : }
648 : #endif
649 :
650 : /*
651 : * delta /= w
652 : */
653 48421 : static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
654 : {
655 48421 : if (unlikely(se->load.weight != NICE_0_LOAD))
656 2787 : delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
657 :
658 48420 : return delta;
659 : }
660 :
661 : /*
662 : * The idea is to set a period in which each task runs once.
663 : *
664 : * When there are too many tasks (sched_nr_latency) we have to stretch
665 : * this period because otherwise the slices get too small.
666 : *
667 : * p = (nr <= nl) ? l : l*nr/nl
668 : */
669 5097 : static u64 __sched_period(unsigned long nr_running)
670 : {
671 5097 : if (unlikely(nr_running > sched_nr_latency))
672 0 : return nr_running * sysctl_sched_min_granularity;
673 : else
674 5097 : return sysctl_sched_latency;
675 : }
676 :
677 : /*
678 : * We calculate the wall-time slice from the period by taking a part
679 : * proportional to the weight.
680 : *
681 : * s = p*P[w/rw]
682 : */
683 5097 : static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
684 : {
685 5097 : u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
686 :
687 5097 : for_each_sched_entity(se) {
688 5097 : struct load_weight *load;
689 5097 : struct load_weight lw;
690 :
691 5097 : cfs_rq = cfs_rq_of(se);
692 5097 : load = &cfs_rq->load;
693 :
694 5097 : if (unlikely(!se->on_rq)) {
695 990 : lw = cfs_rq->load;
696 :
697 990 : update_load_add(&lw, se->load.weight);
698 990 : load = &lw;
699 : }
700 5097 : slice = __calc_delta(slice, se->load.weight, load);
701 : }
702 5115 : return slice;
703 : }
704 :
705 : /*
706 : * We calculate the vruntime slice of a to-be-inserted task.
707 : *
708 : * vs = s/w
709 : */
710 990 : static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
711 : {
712 990 : return calc_delta_fair(sched_slice(cfs_rq, se), se);
713 : }
714 :
715 : #include "pelt.h"
716 : #ifdef CONFIG_SMP
717 :
718 : static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
719 : static unsigned long task_h_load(struct task_struct *p);
720 : static unsigned long capacity_of(int cpu);
721 :
722 : /* Give new sched_entity start runnable values to heavy its load in infant time */
723 990 : void init_entity_runnable_average(struct sched_entity *se)
724 : {
725 990 : struct sched_avg *sa = &se->avg;
726 :
727 990 : memset(sa, 0, sizeof(*sa));
728 :
729 : /*
730 : * Tasks are initialized with full load to be seen as heavy tasks until
731 : * they get a chance to stabilize to their real load level.
732 : * Group entities are initialized with zero load to reflect the fact that
733 : * nothing has been attached to the task group yet.
734 : */
735 990 : if (entity_is_task(se))
736 990 : sa->load_avg = scale_load_down(se->load.weight);
737 :
738 : /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
739 990 : }
740 :
741 : static void attach_entity_cfs_rq(struct sched_entity *se);
742 :
743 : /*
744 : * With new tasks being created, their initial util_avgs are extrapolated
745 : * based on the cfs_rq's current util_avg:
746 : *
747 : * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
748 : *
749 : * However, in many cases, the above util_avg does not give a desired
750 : * value. Moreover, the sum of the util_avgs may be divergent, such
751 : * as when the series is a harmonic series.
752 : *
753 : * To solve this problem, we also cap the util_avg of successive tasks to
754 : * only 1/2 of the left utilization budget:
755 : *
756 : * util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
757 : *
758 : * where n denotes the nth task and cpu_scale the CPU capacity.
759 : *
760 : * For example, for a CPU with 1024 of capacity, a simplest series from
761 : * the beginning would be like:
762 : *
763 : * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
764 : * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
765 : *
766 : * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
767 : * if util_avg > util_avg_cap.
768 : */
769 987 : void post_init_entity_util_avg(struct task_struct *p)
770 : {
771 987 : struct sched_entity *se = &p->se;
772 987 : struct cfs_rq *cfs_rq = cfs_rq_of(se);
773 987 : struct sched_avg *sa = &se->avg;
774 987 : long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
775 987 : long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
776 :
777 987 : if (cap > 0) {
778 970 : if (cfs_rq->avg.util_avg != 0) {
779 964 : sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
780 964 : sa->util_avg /= (cfs_rq->avg.load_avg + 1);
781 :
782 964 : if (sa->util_avg > cap)
783 962 : sa->util_avg = cap;
784 : } else {
785 6 : sa->util_avg = cap;
786 : }
787 : }
788 :
789 987 : sa->runnable_avg = sa->util_avg;
790 :
791 987 : if (p->sched_class != &fair_sched_class) {
792 : /*
793 : * For !fair tasks do:
794 : *
795 : update_cfs_rq_load_avg(now, cfs_rq);
796 : attach_entity_load_avg(cfs_rq, se);
797 : switched_from_fair(rq, p);
798 : *
799 : * such that the next switched_to_fair() has the
800 : * expected state.
801 : */
802 0 : se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
803 0 : return;
804 : }
805 :
806 987 : attach_entity_cfs_rq(se);
807 : }
808 :
809 : #else /* !CONFIG_SMP */
810 : void init_entity_runnable_average(struct sched_entity *se)
811 : {
812 : }
813 : void post_init_entity_util_avg(struct task_struct *p)
814 : {
815 : }
816 : static void update_tg_load_avg(struct cfs_rq *cfs_rq)
817 : {
818 : }
819 : #endif /* CONFIG_SMP */
820 :
821 : /*
822 : * Update the current task's runtime statistics.
823 : */
824 61115 : static void update_curr(struct cfs_rq *cfs_rq)
825 : {
826 61115 : struct sched_entity *curr = cfs_rq->curr;
827 61115 : u64 now = rq_clock_task(rq_of(cfs_rq));
828 61385 : u64 delta_exec;
829 :
830 61385 : if (unlikely(!curr))
831 : return;
832 :
833 53789 : delta_exec = now - curr->exec_start;
834 53789 : if (unlikely((s64)delta_exec <= 0))
835 : return;
836 :
837 41065 : curr->exec_start = now;
838 :
839 41065 : schedstat_set(curr->statistics.exec_max,
840 : max(delta_exec, curr->statistics.exec_max));
841 :
842 41065 : curr->sum_exec_runtime += delta_exec;
843 41065 : schedstat_add(cfs_rq->exec_clock, delta_exec);
844 :
845 41065 : curr->vruntime += calc_delta_fair(delta_exec, curr);
846 41074 : update_min_vruntime(cfs_rq);
847 :
848 41098 : if (entity_is_task(curr)) {
849 41098 : struct task_struct *curtask = task_of(curr);
850 :
851 41098 : trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
852 40958 : cgroup_account_cputime(curtask, delta_exec);
853 41013 : account_group_exec_runtime(curtask, delta_exec);
854 : }
855 :
856 102358 : account_cfs_rq_runtime(cfs_rq, delta_exec);
857 : }
858 :
859 49 : static void update_curr_fair(struct rq *rq)
860 : {
861 49 : update_curr(cfs_rq_of(&rq->curr->se));
862 49 : }
863 :
864 : static inline void
865 6213 : update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
866 : {
867 6213 : u64 wait_start, prev_wait_start;
868 :
869 6213 : if (!schedstat_enabled())
870 6213 : return;
871 :
872 : wait_start = rq_clock(rq_of(cfs_rq));
873 : prev_wait_start = schedstat_val(se->statistics.wait_start);
874 :
875 : if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
876 : likely(wait_start > prev_wait_start))
877 : wait_start -= prev_wait_start;
878 :
879 6213 : __schedstat_set(se->statistics.wait_start, wait_start);
880 : }
881 :
882 : static inline void
883 21093 : update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
884 : {
885 21093 : struct task_struct *p;
886 21093 : u64 delta;
887 :
888 21093 : if (!schedstat_enabled())
889 21093 : return;
890 :
891 : /*
892 : * When the sched_schedstat changes from 0 to 1, some sched se
893 : * maybe already in the runqueue, the se->statistics.wait_start
894 : * will be 0.So it will let the delta wrong. We need to avoid this
895 : * scenario.
896 : */
897 : if (unlikely(!schedstat_val(se->statistics.wait_start)))
898 : return;
899 :
900 : delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
901 :
902 : if (entity_is_task(se)) {
903 : p = task_of(se);
904 : if (task_on_rq_migrating(p)) {
905 : /*
906 : * Preserve migrating task's wait time so wait_start
907 : * time stamp can be adjusted to accumulate wait time
908 : * prior to migration.
909 : */
910 : __schedstat_set(se->statistics.wait_start, delta);
911 : return;
912 : }
913 : trace_sched_stat_wait(p, delta);
914 : }
915 :
916 21093 : __schedstat_set(se->statistics.wait_max,
917 : max(schedstat_val(se->statistics.wait_max), delta));
918 21093 : __schedstat_inc(se->statistics.wait_count);
919 21093 : __schedstat_add(se->statistics.wait_sum, delta);
920 21093 : __schedstat_set(se->statistics.wait_start, 0);
921 : }
922 :
923 : static inline void
924 : update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
925 : {
926 : struct task_struct *tsk = NULL;
927 : u64 sleep_start, block_start;
928 :
929 : if (!schedstat_enabled())
930 : return;
931 :
932 : sleep_start = schedstat_val(se->statistics.sleep_start);
933 : block_start = schedstat_val(se->statistics.block_start);
934 :
935 : if (entity_is_task(se))
936 : tsk = task_of(se);
937 :
938 : if (sleep_start) {
939 : u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
940 :
941 : if ((s64)delta < 0)
942 : delta = 0;
943 :
944 : if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
945 : __schedstat_set(se->statistics.sleep_max, delta);
946 :
947 : __schedstat_set(se->statistics.sleep_start, 0);
948 : __schedstat_add(se->statistics.sum_sleep_runtime, delta);
949 :
950 : if (tsk) {
951 : account_scheduler_latency(tsk, delta >> 10, 1);
952 : trace_sched_stat_sleep(tsk, delta);
953 : }
954 : }
955 : if (block_start) {
956 : u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
957 :
958 : if ((s64)delta < 0)
959 : delta = 0;
960 :
961 : if (unlikely(delta > schedstat_val(se->statistics.block_max)))
962 : __schedstat_set(se->statistics.block_max, delta);
963 :
964 : __schedstat_set(se->statistics.block_start, 0);
965 : __schedstat_add(se->statistics.sum_sleep_runtime, delta);
966 :
967 : if (tsk) {
968 : if (tsk->in_iowait) {
969 : __schedstat_add(se->statistics.iowait_sum, delta);
970 : __schedstat_inc(se->statistics.iowait_count);
971 : trace_sched_stat_iowait(tsk, delta);
972 : }
973 :
974 : trace_sched_stat_blocked(tsk, delta);
975 :
976 : /*
977 : * Blocking time is in units of nanosecs, so shift by
978 : * 20 to get a milliseconds-range estimation of the
979 : * amount of time that the task spent sleeping:
980 : */
981 : if (unlikely(prof_on == SLEEP_PROFILING)) {
982 : profile_hits(SLEEP_PROFILING,
983 : (void *)get_wchan(tsk),
984 : delta >> 20);
985 : }
986 : account_scheduler_latency(tsk, delta >> 10, 0);
987 : }
988 : }
989 : }
990 :
991 : /*
992 : * Task is being enqueued - update stats:
993 : */
994 : static inline void
995 15767 : update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
996 : {
997 15767 : if (!schedstat_enabled())
998 15767 : return;
999 :
1000 : /*
1001 : * Are we enqueueing a waiting task? (for current tasks
1002 : * a dequeue/enqueue event is a NOP)
1003 : */
1004 : if (se != cfs_rq->curr)
1005 : update_stats_wait_start(cfs_rq, se);
1006 :
1007 : if (flags & ENQUEUE_WAKEUP)
1008 : update_stats_enqueue_sleeper(cfs_rq, se);
1009 : }
1010 :
1011 : static inline void
1012 15768 : update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1013 : {
1014 :
1015 15768 : if (!schedstat_enabled())
1016 15768 : return;
1017 :
1018 : /*
1019 : * Mark the end of the wait period if dequeueing a
1020 : * waiting task:
1021 : */
1022 : if (se != cfs_rq->curr)
1023 : update_stats_wait_end(cfs_rq, se);
1024 :
1025 : if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
1026 : struct task_struct *tsk = task_of(se);
1027 :
1028 : if (tsk->state & TASK_INTERRUPTIBLE)
1029 : __schedstat_set(se->statistics.sleep_start,
1030 : rq_clock(rq_of(cfs_rq)));
1031 : if (tsk->state & TASK_UNINTERRUPTIBLE)
1032 15768 : __schedstat_set(se->statistics.block_start,
1033 : rq_clock(rq_of(cfs_rq)));
1034 : }
1035 : }
1036 :
1037 : /*
1038 : * We are picking a new current task - update its stats:
1039 : */
1040 : static inline void
1041 21087 : update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
1042 : {
1043 : /*
1044 : * We are starting a new run period:
1045 : */
1046 42179 : se->exec_start = rq_clock_task(rq_of(cfs_rq));
1047 : }
1048 :
1049 : /**************************************************
1050 : * Scheduling class queueing methods:
1051 : */
1052 :
1053 : #ifdef CONFIG_NUMA_BALANCING
1054 : /*
1055 : * Approximate time to scan a full NUMA task in ms. The task scan period is
1056 : * calculated based on the tasks virtual memory size and
1057 : * numa_balancing_scan_size.
1058 : */
1059 : unsigned int sysctl_numa_balancing_scan_period_min = 1000;
1060 : unsigned int sysctl_numa_balancing_scan_period_max = 60000;
1061 :
1062 : /* Portion of address space to scan in MB */
1063 : unsigned int sysctl_numa_balancing_scan_size = 256;
1064 :
1065 : /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
1066 : unsigned int sysctl_numa_balancing_scan_delay = 1000;
1067 :
1068 : struct numa_group {
1069 : refcount_t refcount;
1070 :
1071 : spinlock_t lock; /* nr_tasks, tasks */
1072 : int nr_tasks;
1073 : pid_t gid;
1074 : int active_nodes;
1075 :
1076 : struct rcu_head rcu;
1077 : unsigned long total_faults;
1078 : unsigned long max_faults_cpu;
1079 : /*
1080 : * Faults_cpu is used to decide whether memory should move
1081 : * towards the CPU. As a consequence, these stats are weighted
1082 : * more by CPU use than by memory faults.
1083 : */
1084 : unsigned long *faults_cpu;
1085 : unsigned long faults[];
1086 : };
1087 :
1088 : /*
1089 : * For functions that can be called in multiple contexts that permit reading
1090 : * ->numa_group (see struct task_struct for locking rules).
1091 : */
1092 : static struct numa_group *deref_task_numa_group(struct task_struct *p)
1093 : {
1094 : return rcu_dereference_check(p->numa_group, p == current ||
1095 : (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu)));
1096 : }
1097 :
1098 : static struct numa_group *deref_curr_numa_group(struct task_struct *p)
1099 : {
1100 : return rcu_dereference_protected(p->numa_group, p == current);
1101 : }
1102 :
1103 : static inline unsigned long group_faults_priv(struct numa_group *ng);
1104 : static inline unsigned long group_faults_shared(struct numa_group *ng);
1105 :
1106 : static unsigned int task_nr_scan_windows(struct task_struct *p)
1107 : {
1108 : unsigned long rss = 0;
1109 : unsigned long nr_scan_pages;
1110 :
1111 : /*
1112 : * Calculations based on RSS as non-present and empty pages are skipped
1113 : * by the PTE scanner and NUMA hinting faults should be trapped based
1114 : * on resident pages
1115 : */
1116 : nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
1117 : rss = get_mm_rss(p->mm);
1118 : if (!rss)
1119 : rss = nr_scan_pages;
1120 :
1121 : rss = round_up(rss, nr_scan_pages);
1122 : return rss / nr_scan_pages;
1123 : }
1124 :
1125 : /* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
1126 : #define MAX_SCAN_WINDOW 2560
1127 :
1128 : static unsigned int task_scan_min(struct task_struct *p)
1129 : {
1130 : unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
1131 : unsigned int scan, floor;
1132 : unsigned int windows = 1;
1133 :
1134 : if (scan_size < MAX_SCAN_WINDOW)
1135 : windows = MAX_SCAN_WINDOW / scan_size;
1136 : floor = 1000 / windows;
1137 :
1138 : scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1139 : return max_t(unsigned int, floor, scan);
1140 : }
1141 :
1142 : static unsigned int task_scan_start(struct task_struct *p)
1143 : {
1144 : unsigned long smin = task_scan_min(p);
1145 : unsigned long period = smin;
1146 : struct numa_group *ng;
1147 :
1148 : /* Scale the maximum scan period with the amount of shared memory. */
1149 : rcu_read_lock();
1150 : ng = rcu_dereference(p->numa_group);
1151 : if (ng) {
1152 : unsigned long shared = group_faults_shared(ng);
1153 : unsigned long private = group_faults_priv(ng);
1154 :
1155 : period *= refcount_read(&ng->refcount);
1156 : period *= shared + 1;
1157 : period /= private + shared + 1;
1158 : }
1159 : rcu_read_unlock();
1160 :
1161 : return max(smin, period);
1162 : }
1163 :
1164 : static unsigned int task_scan_max(struct task_struct *p)
1165 : {
1166 : unsigned long smin = task_scan_min(p);
1167 : unsigned long smax;
1168 : struct numa_group *ng;
1169 :
1170 : /* Watch for min being lower than max due to floor calculations */
1171 : smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1172 :
1173 : /* Scale the maximum scan period with the amount of shared memory. */
1174 : ng = deref_curr_numa_group(p);
1175 : if (ng) {
1176 : unsigned long shared = group_faults_shared(ng);
1177 : unsigned long private = group_faults_priv(ng);
1178 : unsigned long period = smax;
1179 :
1180 : period *= refcount_read(&ng->refcount);
1181 : period *= shared + 1;
1182 : period /= private + shared + 1;
1183 :
1184 : smax = max(smax, period);
1185 : }
1186 :
1187 : return max(smin, smax);
1188 : }
1189 :
1190 : static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1191 : {
1192 : rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
1193 : rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1194 : }
1195 :
1196 : static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1197 : {
1198 : rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
1199 : rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1200 : }
1201 :
1202 : /* Shared or private faults. */
1203 : #define NR_NUMA_HINT_FAULT_TYPES 2
1204 :
1205 : /* Memory and CPU locality */
1206 : #define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1207 :
1208 : /* Averaged statistics, and temporary buffers. */
1209 : #define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1210 :
1211 : pid_t task_numa_group_id(struct task_struct *p)
1212 : {
1213 : struct numa_group *ng;
1214 : pid_t gid = 0;
1215 :
1216 : rcu_read_lock();
1217 : ng = rcu_dereference(p->numa_group);
1218 : if (ng)
1219 : gid = ng->gid;
1220 : rcu_read_unlock();
1221 :
1222 : return gid;
1223 : }
1224 :
1225 : /*
1226 : * The averaged statistics, shared & private, memory & CPU,
1227 : * occupy the first half of the array. The second half of the
1228 : * array is for current counters, which are averaged into the
1229 : * first set by task_numa_placement.
1230 : */
1231 : static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1232 : {
1233 : return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
1234 : }
1235 :
1236 : static inline unsigned long task_faults(struct task_struct *p, int nid)
1237 : {
1238 : if (!p->numa_faults)
1239 : return 0;
1240 :
1241 : return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1242 : p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1243 : }
1244 :
1245 : static inline unsigned long group_faults(struct task_struct *p, int nid)
1246 : {
1247 : struct numa_group *ng = deref_task_numa_group(p);
1248 :
1249 : if (!ng)
1250 : return 0;
1251 :
1252 : return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1253 : ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1254 : }
1255 :
1256 : static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1257 : {
1258 : return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
1259 : group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
1260 : }
1261 :
1262 : static inline unsigned long group_faults_priv(struct numa_group *ng)
1263 : {
1264 : unsigned long faults = 0;
1265 : int node;
1266 :
1267 : for_each_online_node(node) {
1268 : faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
1269 : }
1270 :
1271 : return faults;
1272 : }
1273 :
1274 : static inline unsigned long group_faults_shared(struct numa_group *ng)
1275 : {
1276 : unsigned long faults = 0;
1277 : int node;
1278 :
1279 : for_each_online_node(node) {
1280 : faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
1281 : }
1282 :
1283 : return faults;
1284 : }
1285 :
1286 : /*
1287 : * A node triggering more than 1/3 as many NUMA faults as the maximum is
1288 : * considered part of a numa group's pseudo-interleaving set. Migrations
1289 : * between these nodes are slowed down, to allow things to settle down.
1290 : */
1291 : #define ACTIVE_NODE_FRACTION 3
1292 :
1293 : static bool numa_is_active_node(int nid, struct numa_group *ng)
1294 : {
1295 : return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1296 : }
1297 :
1298 : /* Handle placement on systems where not all nodes are directly connected. */
1299 : static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1300 : int maxdist, bool task)
1301 : {
1302 : unsigned long score = 0;
1303 : int node;
1304 :
1305 : /*
1306 : * All nodes are directly connected, and the same distance
1307 : * from each other. No need for fancy placement algorithms.
1308 : */
1309 : if (sched_numa_topology_type == NUMA_DIRECT)
1310 : return 0;
1311 :
1312 : /*
1313 : * This code is called for each node, introducing N^2 complexity,
1314 : * which should be ok given the number of nodes rarely exceeds 8.
1315 : */
1316 : for_each_online_node(node) {
1317 : unsigned long faults;
1318 : int dist = node_distance(nid, node);
1319 :
1320 : /*
1321 : * The furthest away nodes in the system are not interesting
1322 : * for placement; nid was already counted.
1323 : */
1324 : if (dist == sched_max_numa_distance || node == nid)
1325 : continue;
1326 :
1327 : /*
1328 : * On systems with a backplane NUMA topology, compare groups
1329 : * of nodes, and move tasks towards the group with the most
1330 : * memory accesses. When comparing two nodes at distance
1331 : * "hoplimit", only nodes closer by than "hoplimit" are part
1332 : * of each group. Skip other nodes.
1333 : */
1334 : if (sched_numa_topology_type == NUMA_BACKPLANE &&
1335 : dist >= maxdist)
1336 : continue;
1337 :
1338 : /* Add up the faults from nearby nodes. */
1339 : if (task)
1340 : faults = task_faults(p, node);
1341 : else
1342 : faults = group_faults(p, node);
1343 :
1344 : /*
1345 : * On systems with a glueless mesh NUMA topology, there are
1346 : * no fixed "groups of nodes". Instead, nodes that are not
1347 : * directly connected bounce traffic through intermediate
1348 : * nodes; a numa_group can occupy any set of nodes.
1349 : * The further away a node is, the less the faults count.
1350 : * This seems to result in good task placement.
1351 : */
1352 : if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1353 : faults *= (sched_max_numa_distance - dist);
1354 : faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1355 : }
1356 :
1357 : score += faults;
1358 : }
1359 :
1360 : return score;
1361 : }
1362 :
1363 : /*
1364 : * These return the fraction of accesses done by a particular task, or
1365 : * task group, on a particular numa node. The group weight is given a
1366 : * larger multiplier, in order to group tasks together that are almost
1367 : * evenly spread out between numa nodes.
1368 : */
1369 : static inline unsigned long task_weight(struct task_struct *p, int nid,
1370 : int dist)
1371 : {
1372 : unsigned long faults, total_faults;
1373 :
1374 : if (!p->numa_faults)
1375 : return 0;
1376 :
1377 : total_faults = p->total_numa_faults;
1378 :
1379 : if (!total_faults)
1380 : return 0;
1381 :
1382 : faults = task_faults(p, nid);
1383 : faults += score_nearby_nodes(p, nid, dist, true);
1384 :
1385 : return 1000 * faults / total_faults;
1386 : }
1387 :
1388 : static inline unsigned long group_weight(struct task_struct *p, int nid,
1389 : int dist)
1390 : {
1391 : struct numa_group *ng = deref_task_numa_group(p);
1392 : unsigned long faults, total_faults;
1393 :
1394 : if (!ng)
1395 : return 0;
1396 :
1397 : total_faults = ng->total_faults;
1398 :
1399 : if (!total_faults)
1400 : return 0;
1401 :
1402 : faults = group_faults(p, nid);
1403 : faults += score_nearby_nodes(p, nid, dist, false);
1404 :
1405 : return 1000 * faults / total_faults;
1406 : }
1407 :
1408 : bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1409 : int src_nid, int dst_cpu)
1410 : {
1411 : struct numa_group *ng = deref_curr_numa_group(p);
1412 : int dst_nid = cpu_to_node(dst_cpu);
1413 : int last_cpupid, this_cpupid;
1414 :
1415 : this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1416 : last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1417 :
1418 : /*
1419 : * Allow first faults or private faults to migrate immediately early in
1420 : * the lifetime of a task. The magic number 4 is based on waiting for
1421 : * two full passes of the "multi-stage node selection" test that is
1422 : * executed below.
1423 : */
1424 : if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
1425 : (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
1426 : return true;
1427 :
1428 : /*
1429 : * Multi-stage node selection is used in conjunction with a periodic
1430 : * migration fault to build a temporal task<->page relation. By using
1431 : * a two-stage filter we remove short/unlikely relations.
1432 : *
1433 : * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1434 : * a task's usage of a particular page (n_p) per total usage of this
1435 : * page (n_t) (in a given time-span) to a probability.
1436 : *
1437 : * Our periodic faults will sample this probability and getting the
1438 : * same result twice in a row, given these samples are fully
1439 : * independent, is then given by P(n)^2, provided our sample period
1440 : * is sufficiently short compared to the usage pattern.
1441 : *
1442 : * This quadric squishes small probabilities, making it less likely we
1443 : * act on an unlikely task<->page relation.
1444 : */
1445 : if (!cpupid_pid_unset(last_cpupid) &&
1446 : cpupid_to_nid(last_cpupid) != dst_nid)
1447 : return false;
1448 :
1449 : /* Always allow migrate on private faults */
1450 : if (cpupid_match_pid(p, last_cpupid))
1451 : return true;
1452 :
1453 : /* A shared fault, but p->numa_group has not been set up yet. */
1454 : if (!ng)
1455 : return true;
1456 :
1457 : /*
1458 : * Destination node is much more heavily used than the source
1459 : * node? Allow migration.
1460 : */
1461 : if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1462 : ACTIVE_NODE_FRACTION)
1463 : return true;
1464 :
1465 : /*
1466 : * Distribute memory according to CPU & memory use on each node,
1467 : * with 3/4 hysteresis to avoid unnecessary memory migrations:
1468 : *
1469 : * faults_cpu(dst) 3 faults_cpu(src)
1470 : * --------------- * - > ---------------
1471 : * faults_mem(dst) 4 faults_mem(src)
1472 : */
1473 : return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
1474 : group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
1475 : }
1476 :
1477 : /*
1478 : * 'numa_type' describes the node at the moment of load balancing.
1479 : */
1480 : enum numa_type {
1481 : /* The node has spare capacity that can be used to run more tasks. */
1482 : node_has_spare = 0,
1483 : /*
1484 : * The node is fully used and the tasks don't compete for more CPU
1485 : * cycles. Nevertheless, some tasks might wait before running.
1486 : */
1487 : node_fully_busy,
1488 : /*
1489 : * The node is overloaded and can't provide expected CPU cycles to all
1490 : * tasks.
1491 : */
1492 : node_overloaded
1493 : };
1494 :
1495 : /* Cached statistics for all CPUs within a node */
1496 : struct numa_stats {
1497 : unsigned long load;
1498 : unsigned long runnable;
1499 : unsigned long util;
1500 : /* Total compute capacity of CPUs on a node */
1501 : unsigned long compute_capacity;
1502 : unsigned int nr_running;
1503 : unsigned int weight;
1504 : enum numa_type node_type;
1505 : int idle_cpu;
1506 : };
1507 :
1508 : static inline bool is_core_idle(int cpu)
1509 : {
1510 : #ifdef CONFIG_SCHED_SMT
1511 : int sibling;
1512 :
1513 : for_each_cpu(sibling, cpu_smt_mask(cpu)) {
1514 : if (cpu == sibling)
1515 : continue;
1516 :
1517 : if (!idle_cpu(cpu))
1518 : return false;
1519 : }
1520 : #endif
1521 :
1522 : return true;
1523 : }
1524 :
1525 : struct task_numa_env {
1526 : struct task_struct *p;
1527 :
1528 : int src_cpu, src_nid;
1529 : int dst_cpu, dst_nid;
1530 :
1531 : struct numa_stats src_stats, dst_stats;
1532 :
1533 : int imbalance_pct;
1534 : int dist;
1535 :
1536 : struct task_struct *best_task;
1537 : long best_imp;
1538 : int best_cpu;
1539 : };
1540 :
1541 : static unsigned long cpu_load(struct rq *rq);
1542 : static unsigned long cpu_runnable(struct rq *rq);
1543 : static unsigned long cpu_util(int cpu);
1544 : static inline long adjust_numa_imbalance(int imbalance,
1545 : int dst_running, int dst_weight);
1546 :
1547 : static inline enum
1548 : numa_type numa_classify(unsigned int imbalance_pct,
1549 : struct numa_stats *ns)
1550 : {
1551 : if ((ns->nr_running > ns->weight) &&
1552 : (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
1553 : ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
1554 : return node_overloaded;
1555 :
1556 : if ((ns->nr_running < ns->weight) ||
1557 : (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
1558 : ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
1559 : return node_has_spare;
1560 :
1561 : return node_fully_busy;
1562 : }
1563 :
1564 : #ifdef CONFIG_SCHED_SMT
1565 : /* Forward declarations of select_idle_sibling helpers */
1566 : static inline bool test_idle_cores(int cpu, bool def);
1567 : static inline int numa_idle_core(int idle_core, int cpu)
1568 : {
1569 : if (!static_branch_likely(&sched_smt_present) ||
1570 : idle_core >= 0 || !test_idle_cores(cpu, false))
1571 : return idle_core;
1572 :
1573 : /*
1574 : * Prefer cores instead of packing HT siblings
1575 : * and triggering future load balancing.
1576 : */
1577 : if (is_core_idle(cpu))
1578 : idle_core = cpu;
1579 :
1580 : return idle_core;
1581 : }
1582 : #else
1583 : static inline int numa_idle_core(int idle_core, int cpu)
1584 : {
1585 : return idle_core;
1586 : }
1587 : #endif
1588 :
1589 : /*
1590 : * Gather all necessary information to make NUMA balancing placement
1591 : * decisions that are compatible with standard load balancer. This
1592 : * borrows code and logic from update_sg_lb_stats but sharing a
1593 : * common implementation is impractical.
1594 : */
1595 : static void update_numa_stats(struct task_numa_env *env,
1596 : struct numa_stats *ns, int nid,
1597 : bool find_idle)
1598 : {
1599 : int cpu, idle_core = -1;
1600 :
1601 : memset(ns, 0, sizeof(*ns));
1602 : ns->idle_cpu = -1;
1603 :
1604 : rcu_read_lock();
1605 : for_each_cpu(cpu, cpumask_of_node(nid)) {
1606 : struct rq *rq = cpu_rq(cpu);
1607 :
1608 : ns->load += cpu_load(rq);
1609 : ns->runnable += cpu_runnable(rq);
1610 : ns->util += cpu_util(cpu);
1611 : ns->nr_running += rq->cfs.h_nr_running;
1612 : ns->compute_capacity += capacity_of(cpu);
1613 :
1614 : if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
1615 : if (READ_ONCE(rq->numa_migrate_on) ||
1616 : !cpumask_test_cpu(cpu, env->p->cpus_ptr))
1617 : continue;
1618 :
1619 : if (ns->idle_cpu == -1)
1620 : ns->idle_cpu = cpu;
1621 :
1622 : idle_core = numa_idle_core(idle_core, cpu);
1623 : }
1624 : }
1625 : rcu_read_unlock();
1626 :
1627 : ns->weight = cpumask_weight(cpumask_of_node(nid));
1628 :
1629 : ns->node_type = numa_classify(env->imbalance_pct, ns);
1630 :
1631 : if (idle_core >= 0)
1632 : ns->idle_cpu = idle_core;
1633 : }
1634 :
1635 : static void task_numa_assign(struct task_numa_env *env,
1636 : struct task_struct *p, long imp)
1637 : {
1638 : struct rq *rq = cpu_rq(env->dst_cpu);
1639 :
1640 : /* Check if run-queue part of active NUMA balance. */
1641 : if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
1642 : int cpu;
1643 : int start = env->dst_cpu;
1644 :
1645 : /* Find alternative idle CPU. */
1646 : for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) {
1647 : if (cpu == env->best_cpu || !idle_cpu(cpu) ||
1648 : !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
1649 : continue;
1650 : }
1651 :
1652 : env->dst_cpu = cpu;
1653 : rq = cpu_rq(env->dst_cpu);
1654 : if (!xchg(&rq->numa_migrate_on, 1))
1655 : goto assign;
1656 : }
1657 :
1658 : /* Failed to find an alternative idle CPU */
1659 : return;
1660 : }
1661 :
1662 : assign:
1663 : /*
1664 : * Clear previous best_cpu/rq numa-migrate flag, since task now
1665 : * found a better CPU to move/swap.
1666 : */
1667 : if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
1668 : rq = cpu_rq(env->best_cpu);
1669 : WRITE_ONCE(rq->numa_migrate_on, 0);
1670 : }
1671 :
1672 : if (env->best_task)
1673 : put_task_struct(env->best_task);
1674 : if (p)
1675 : get_task_struct(p);
1676 :
1677 : env->best_task = p;
1678 : env->best_imp = imp;
1679 : env->best_cpu = env->dst_cpu;
1680 : }
1681 :
1682 : static bool load_too_imbalanced(long src_load, long dst_load,
1683 : struct task_numa_env *env)
1684 : {
1685 : long imb, old_imb;
1686 : long orig_src_load, orig_dst_load;
1687 : long src_capacity, dst_capacity;
1688 :
1689 : /*
1690 : * The load is corrected for the CPU capacity available on each node.
1691 : *
1692 : * src_load dst_load
1693 : * ------------ vs ---------
1694 : * src_capacity dst_capacity
1695 : */
1696 : src_capacity = env->src_stats.compute_capacity;
1697 : dst_capacity = env->dst_stats.compute_capacity;
1698 :
1699 : imb = abs(dst_load * src_capacity - src_load * dst_capacity);
1700 :
1701 : orig_src_load = env->src_stats.load;
1702 : orig_dst_load = env->dst_stats.load;
1703 :
1704 : old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
1705 :
1706 : /* Would this change make things worse? */
1707 : return (imb > old_imb);
1708 : }
1709 :
1710 : /*
1711 : * Maximum NUMA importance can be 1998 (2*999);
1712 : * SMALLIMP @ 30 would be close to 1998/64.
1713 : * Used to deter task migration.
1714 : */
1715 : #define SMALLIMP 30
1716 :
1717 : /*
1718 : * This checks if the overall compute and NUMA accesses of the system would
1719 : * be improved if the source tasks was migrated to the target dst_cpu taking
1720 : * into account that it might be best if task running on the dst_cpu should
1721 : * be exchanged with the source task
1722 : */
1723 : static bool task_numa_compare(struct task_numa_env *env,
1724 : long taskimp, long groupimp, bool maymove)
1725 : {
1726 : struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
1727 : struct rq *dst_rq = cpu_rq(env->dst_cpu);
1728 : long imp = p_ng ? groupimp : taskimp;
1729 : struct task_struct *cur;
1730 : long src_load, dst_load;
1731 : int dist = env->dist;
1732 : long moveimp = imp;
1733 : long load;
1734 : bool stopsearch = false;
1735 :
1736 : if (READ_ONCE(dst_rq->numa_migrate_on))
1737 : return false;
1738 :
1739 : rcu_read_lock();
1740 : cur = rcu_dereference(dst_rq->curr);
1741 : if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
1742 : cur = NULL;
1743 :
1744 : /*
1745 : * Because we have preemption enabled we can get migrated around and
1746 : * end try selecting ourselves (current == env->p) as a swap candidate.
1747 : */
1748 : if (cur == env->p) {
1749 : stopsearch = true;
1750 : goto unlock;
1751 : }
1752 :
1753 : if (!cur) {
1754 : if (maymove && moveimp >= env->best_imp)
1755 : goto assign;
1756 : else
1757 : goto unlock;
1758 : }
1759 :
1760 : /* Skip this swap candidate if cannot move to the source cpu. */
1761 : if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
1762 : goto unlock;
1763 :
1764 : /*
1765 : * Skip this swap candidate if it is not moving to its preferred
1766 : * node and the best task is.
1767 : */
1768 : if (env->best_task &&
1769 : env->best_task->numa_preferred_nid == env->src_nid &&
1770 : cur->numa_preferred_nid != env->src_nid) {
1771 : goto unlock;
1772 : }
1773 :
1774 : /*
1775 : * "imp" is the fault differential for the source task between the
1776 : * source and destination node. Calculate the total differential for
1777 : * the source task and potential destination task. The more negative
1778 : * the value is, the more remote accesses that would be expected to
1779 : * be incurred if the tasks were swapped.
1780 : *
1781 : * If dst and source tasks are in the same NUMA group, or not
1782 : * in any group then look only at task weights.
1783 : */
1784 : cur_ng = rcu_dereference(cur->numa_group);
1785 : if (cur_ng == p_ng) {
1786 : imp = taskimp + task_weight(cur, env->src_nid, dist) -
1787 : task_weight(cur, env->dst_nid, dist);
1788 : /*
1789 : * Add some hysteresis to prevent swapping the
1790 : * tasks within a group over tiny differences.
1791 : */
1792 : if (cur_ng)
1793 : imp -= imp / 16;
1794 : } else {
1795 : /*
1796 : * Compare the group weights. If a task is all by itself
1797 : * (not part of a group), use the task weight instead.
1798 : */
1799 : if (cur_ng && p_ng)
1800 : imp += group_weight(cur, env->src_nid, dist) -
1801 : group_weight(cur, env->dst_nid, dist);
1802 : else
1803 : imp += task_weight(cur, env->src_nid, dist) -
1804 : task_weight(cur, env->dst_nid, dist);
1805 : }
1806 :
1807 : /* Discourage picking a task already on its preferred node */
1808 : if (cur->numa_preferred_nid == env->dst_nid)
1809 : imp -= imp / 16;
1810 :
1811 : /*
1812 : * Encourage picking a task that moves to its preferred node.
1813 : * This potentially makes imp larger than it's maximum of
1814 : * 1998 (see SMALLIMP and task_weight for why) but in this
1815 : * case, it does not matter.
1816 : */
1817 : if (cur->numa_preferred_nid == env->src_nid)
1818 : imp += imp / 8;
1819 :
1820 : if (maymove && moveimp > imp && moveimp > env->best_imp) {
1821 : imp = moveimp;
1822 : cur = NULL;
1823 : goto assign;
1824 : }
1825 :
1826 : /*
1827 : * Prefer swapping with a task moving to its preferred node over a
1828 : * task that is not.
1829 : */
1830 : if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
1831 : env->best_task->numa_preferred_nid != env->src_nid) {
1832 : goto assign;
1833 : }
1834 :
1835 : /*
1836 : * If the NUMA importance is less than SMALLIMP,
1837 : * task migration might only result in ping pong
1838 : * of tasks and also hurt performance due to cache
1839 : * misses.
1840 : */
1841 : if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
1842 : goto unlock;
1843 :
1844 : /*
1845 : * In the overloaded case, try and keep the load balanced.
1846 : */
1847 : load = task_h_load(env->p) - task_h_load(cur);
1848 : if (!load)
1849 : goto assign;
1850 :
1851 : dst_load = env->dst_stats.load + load;
1852 : src_load = env->src_stats.load - load;
1853 :
1854 : if (load_too_imbalanced(src_load, dst_load, env))
1855 : goto unlock;
1856 :
1857 : assign:
1858 : /* Evaluate an idle CPU for a task numa move. */
1859 : if (!cur) {
1860 : int cpu = env->dst_stats.idle_cpu;
1861 :
1862 : /* Nothing cached so current CPU went idle since the search. */
1863 : if (cpu < 0)
1864 : cpu = env->dst_cpu;
1865 :
1866 : /*
1867 : * If the CPU is no longer truly idle and the previous best CPU
1868 : * is, keep using it.
1869 : */
1870 : if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
1871 : idle_cpu(env->best_cpu)) {
1872 : cpu = env->best_cpu;
1873 : }
1874 :
1875 : env->dst_cpu = cpu;
1876 : }
1877 :
1878 : task_numa_assign(env, cur, imp);
1879 :
1880 : /*
1881 : * If a move to idle is allowed because there is capacity or load
1882 : * balance improves then stop the search. While a better swap
1883 : * candidate may exist, a search is not free.
1884 : */
1885 : if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
1886 : stopsearch = true;
1887 :
1888 : /*
1889 : * If a swap candidate must be identified and the current best task
1890 : * moves its preferred node then stop the search.
1891 : */
1892 : if (!maymove && env->best_task &&
1893 : env->best_task->numa_preferred_nid == env->src_nid) {
1894 : stopsearch = true;
1895 : }
1896 : unlock:
1897 : rcu_read_unlock();
1898 :
1899 : return stopsearch;
1900 : }
1901 :
1902 : static void task_numa_find_cpu(struct task_numa_env *env,
1903 : long taskimp, long groupimp)
1904 : {
1905 : bool maymove = false;
1906 : int cpu;
1907 :
1908 : /*
1909 : * If dst node has spare capacity, then check if there is an
1910 : * imbalance that would be overruled by the load balancer.
1911 : */
1912 : if (env->dst_stats.node_type == node_has_spare) {
1913 : unsigned int imbalance;
1914 : int src_running, dst_running;
1915 :
1916 : /*
1917 : * Would movement cause an imbalance? Note that if src has
1918 : * more running tasks that the imbalance is ignored as the
1919 : * move improves the imbalance from the perspective of the
1920 : * CPU load balancer.
1921 : * */
1922 : src_running = env->src_stats.nr_running - 1;
1923 : dst_running = env->dst_stats.nr_running + 1;
1924 : imbalance = max(0, dst_running - src_running);
1925 : imbalance = adjust_numa_imbalance(imbalance, dst_running,
1926 : env->dst_stats.weight);
1927 :
1928 : /* Use idle CPU if there is no imbalance */
1929 : if (!imbalance) {
1930 : maymove = true;
1931 : if (env->dst_stats.idle_cpu >= 0) {
1932 : env->dst_cpu = env->dst_stats.idle_cpu;
1933 : task_numa_assign(env, NULL, 0);
1934 : return;
1935 : }
1936 : }
1937 : } else {
1938 : long src_load, dst_load, load;
1939 : /*
1940 : * If the improvement from just moving env->p direction is better
1941 : * than swapping tasks around, check if a move is possible.
1942 : */
1943 : load = task_h_load(env->p);
1944 : dst_load = env->dst_stats.load + load;
1945 : src_load = env->src_stats.load - load;
1946 : maymove = !load_too_imbalanced(src_load, dst_load, env);
1947 : }
1948 :
1949 : for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1950 : /* Skip this CPU if the source task cannot migrate */
1951 : if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
1952 : continue;
1953 :
1954 : env->dst_cpu = cpu;
1955 : if (task_numa_compare(env, taskimp, groupimp, maymove))
1956 : break;
1957 : }
1958 : }
1959 :
1960 : static int task_numa_migrate(struct task_struct *p)
1961 : {
1962 : struct task_numa_env env = {
1963 : .p = p,
1964 :
1965 : .src_cpu = task_cpu(p),
1966 : .src_nid = task_node(p),
1967 :
1968 : .imbalance_pct = 112,
1969 :
1970 : .best_task = NULL,
1971 : .best_imp = 0,
1972 : .best_cpu = -1,
1973 : };
1974 : unsigned long taskweight, groupweight;
1975 : struct sched_domain *sd;
1976 : long taskimp, groupimp;
1977 : struct numa_group *ng;
1978 : struct rq *best_rq;
1979 : int nid, ret, dist;
1980 :
1981 : /*
1982 : * Pick the lowest SD_NUMA domain, as that would have the smallest
1983 : * imbalance and would be the first to start moving tasks about.
1984 : *
1985 : * And we want to avoid any moving of tasks about, as that would create
1986 : * random movement of tasks -- counter the numa conditions we're trying
1987 : * to satisfy here.
1988 : */
1989 : rcu_read_lock();
1990 : sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1991 : if (sd)
1992 : env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1993 : rcu_read_unlock();
1994 :
1995 : /*
1996 : * Cpusets can break the scheduler domain tree into smaller
1997 : * balance domains, some of which do not cross NUMA boundaries.
1998 : * Tasks that are "trapped" in such domains cannot be migrated
1999 : * elsewhere, so there is no point in (re)trying.
2000 : */
2001 : if (unlikely(!sd)) {
2002 : sched_setnuma(p, task_node(p));
2003 : return -EINVAL;
2004 : }
2005 :
2006 : env.dst_nid = p->numa_preferred_nid;
2007 : dist = env.dist = node_distance(env.src_nid, env.dst_nid);
2008 : taskweight = task_weight(p, env.src_nid, dist);
2009 : groupweight = group_weight(p, env.src_nid, dist);
2010 : update_numa_stats(&env, &env.src_stats, env.src_nid, false);
2011 : taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
2012 : groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
2013 : update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
2014 :
2015 : /* Try to find a spot on the preferred nid. */
2016 : task_numa_find_cpu(&env, taskimp, groupimp);
2017 :
2018 : /*
2019 : * Look at other nodes in these cases:
2020 : * - there is no space available on the preferred_nid
2021 : * - the task is part of a numa_group that is interleaved across
2022 : * multiple NUMA nodes; in order to better consolidate the group,
2023 : * we need to check other locations.
2024 : */
2025 : ng = deref_curr_numa_group(p);
2026 : if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
2027 : for_each_online_node(nid) {
2028 : if (nid == env.src_nid || nid == p->numa_preferred_nid)
2029 : continue;
2030 :
2031 : dist = node_distance(env.src_nid, env.dst_nid);
2032 : if (sched_numa_topology_type == NUMA_BACKPLANE &&
2033 : dist != env.dist) {
2034 : taskweight = task_weight(p, env.src_nid, dist);
2035 : groupweight = group_weight(p, env.src_nid, dist);
2036 : }
2037 :
2038 : /* Only consider nodes where both task and groups benefit */
2039 : taskimp = task_weight(p, nid, dist) - taskweight;
2040 : groupimp = group_weight(p, nid, dist) - groupweight;
2041 : if (taskimp < 0 && groupimp < 0)
2042 : continue;
2043 :
2044 : env.dist = dist;
2045 : env.dst_nid = nid;
2046 : update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
2047 : task_numa_find_cpu(&env, taskimp, groupimp);
2048 : }
2049 : }
2050 :
2051 : /*
2052 : * If the task is part of a workload that spans multiple NUMA nodes,
2053 : * and is migrating into one of the workload's active nodes, remember
2054 : * this node as the task's preferred numa node, so the workload can
2055 : * settle down.
2056 : * A task that migrated to a second choice node will be better off
2057 : * trying for a better one later. Do not set the preferred node here.
2058 : */
2059 : if (ng) {
2060 : if (env.best_cpu == -1)
2061 : nid = env.src_nid;
2062 : else
2063 : nid = cpu_to_node(env.best_cpu);
2064 :
2065 : if (nid != p->numa_preferred_nid)
2066 : sched_setnuma(p, nid);
2067 : }
2068 :
2069 : /* No better CPU than the current one was found. */
2070 : if (env.best_cpu == -1) {
2071 : trace_sched_stick_numa(p, env.src_cpu, NULL, -1);
2072 : return -EAGAIN;
2073 : }
2074 :
2075 : best_rq = cpu_rq(env.best_cpu);
2076 : if (env.best_task == NULL) {
2077 : ret = migrate_task_to(p, env.best_cpu);
2078 : WRITE_ONCE(best_rq->numa_migrate_on, 0);
2079 : if (ret != 0)
2080 : trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
2081 : return ret;
2082 : }
2083 :
2084 : ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
2085 : WRITE_ONCE(best_rq->numa_migrate_on, 0);
2086 :
2087 : if (ret != 0)
2088 : trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
2089 : put_task_struct(env.best_task);
2090 : return ret;
2091 : }
2092 :
2093 : /* Attempt to migrate a task to a CPU on the preferred node. */
2094 : static void numa_migrate_preferred(struct task_struct *p)
2095 : {
2096 : unsigned long interval = HZ;
2097 :
2098 : /* This task has no NUMA fault statistics yet */
2099 : if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
2100 : return;
2101 :
2102 : /* Periodically retry migrating the task to the preferred node */
2103 : interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
2104 : p->numa_migrate_retry = jiffies + interval;
2105 :
2106 : /* Success if task is already running on preferred CPU */
2107 : if (task_node(p) == p->numa_preferred_nid)
2108 : return;
2109 :
2110 : /* Otherwise, try migrate to a CPU on the preferred node */
2111 : task_numa_migrate(p);
2112 : }
2113 :
2114 : /*
2115 : * Find out how many nodes on the workload is actively running on. Do this by
2116 : * tracking the nodes from which NUMA hinting faults are triggered. This can
2117 : * be different from the set of nodes where the workload's memory is currently
2118 : * located.
2119 : */
2120 : static void numa_group_count_active_nodes(struct numa_group *numa_group)
2121 : {
2122 : unsigned long faults, max_faults = 0;
2123 : int nid, active_nodes = 0;
2124 :
2125 : for_each_online_node(nid) {
2126 : faults = group_faults_cpu(numa_group, nid);
2127 : if (faults > max_faults)
2128 : max_faults = faults;
2129 : }
2130 :
2131 : for_each_online_node(nid) {
2132 : faults = group_faults_cpu(numa_group, nid);
2133 : if (faults * ACTIVE_NODE_FRACTION > max_faults)
2134 : active_nodes++;
2135 : }
2136 :
2137 : numa_group->max_faults_cpu = max_faults;
2138 : numa_group->active_nodes = active_nodes;
2139 : }
2140 :
2141 : /*
2142 : * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
2143 : * increments. The more local the fault statistics are, the higher the scan
2144 : * period will be for the next scan window. If local/(local+remote) ratio is
2145 : * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
2146 : * the scan period will decrease. Aim for 70% local accesses.
2147 : */
2148 : #define NUMA_PERIOD_SLOTS 10
2149 : #define NUMA_PERIOD_THRESHOLD 7
2150 :
2151 : /*
2152 : * Increase the scan period (slow down scanning) if the majority of
2153 : * our memory is already on our local node, or if the majority of
2154 : * the page accesses are shared with other processes.
2155 : * Otherwise, decrease the scan period.
2156 : */
2157 : static void update_task_scan_period(struct task_struct *p,
2158 : unsigned long shared, unsigned long private)
2159 : {
2160 : unsigned int period_slot;
2161 : int lr_ratio, ps_ratio;
2162 : int diff;
2163 :
2164 : unsigned long remote = p->numa_faults_locality[0];
2165 : unsigned long local = p->numa_faults_locality[1];
2166 :
2167 : /*
2168 : * If there were no record hinting faults then either the task is
2169 : * completely idle or all activity is areas that are not of interest
2170 : * to automatic numa balancing. Related to that, if there were failed
2171 : * migration then it implies we are migrating too quickly or the local
2172 : * node is overloaded. In either case, scan slower
2173 : */
2174 : if (local + shared == 0 || p->numa_faults_locality[2]) {
2175 : p->numa_scan_period = min(p->numa_scan_period_max,
2176 : p->numa_scan_period << 1);
2177 :
2178 : p->mm->numa_next_scan = jiffies +
2179 : msecs_to_jiffies(p->numa_scan_period);
2180 :
2181 : return;
2182 : }
2183 :
2184 : /*
2185 : * Prepare to scale scan period relative to the current period.
2186 : * == NUMA_PERIOD_THRESHOLD scan period stays the same
2187 : * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
2188 : * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
2189 : */
2190 : period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
2191 : lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
2192 : ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
2193 :
2194 : if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
2195 : /*
2196 : * Most memory accesses are local. There is no need to
2197 : * do fast NUMA scanning, since memory is already local.
2198 : */
2199 : int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
2200 : if (!slot)
2201 : slot = 1;
2202 : diff = slot * period_slot;
2203 : } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
2204 : /*
2205 : * Most memory accesses are shared with other tasks.
2206 : * There is no point in continuing fast NUMA scanning,
2207 : * since other tasks may just move the memory elsewhere.
2208 : */
2209 : int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
2210 : if (!slot)
2211 : slot = 1;
2212 : diff = slot * period_slot;
2213 : } else {
2214 : /*
2215 : * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
2216 : * yet they are not on the local NUMA node. Speed up
2217 : * NUMA scanning to get the memory moved over.
2218 : */
2219 : int ratio = max(lr_ratio, ps_ratio);
2220 : diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
2221 : }
2222 :
2223 : p->numa_scan_period = clamp(p->numa_scan_period + diff,
2224 : task_scan_min(p), task_scan_max(p));
2225 : memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2226 : }
2227 :
2228 : /*
2229 : * Get the fraction of time the task has been running since the last
2230 : * NUMA placement cycle. The scheduler keeps similar statistics, but
2231 : * decays those on a 32ms period, which is orders of magnitude off
2232 : * from the dozens-of-seconds NUMA balancing period. Use the scheduler
2233 : * stats only if the task is so new there are no NUMA statistics yet.
2234 : */
2235 : static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
2236 : {
2237 : u64 runtime, delta, now;
2238 : /* Use the start of this time slice to avoid calculations. */
2239 : now = p->se.exec_start;
2240 : runtime = p->se.sum_exec_runtime;
2241 :
2242 : if (p->last_task_numa_placement) {
2243 : delta = runtime - p->last_sum_exec_runtime;
2244 : *period = now - p->last_task_numa_placement;
2245 :
2246 : /* Avoid time going backwards, prevent potential divide error: */
2247 : if (unlikely((s64)*period < 0))
2248 : *period = 0;
2249 : } else {
2250 : delta = p->se.avg.load_sum;
2251 : *period = LOAD_AVG_MAX;
2252 : }
2253 :
2254 : p->last_sum_exec_runtime = runtime;
2255 : p->last_task_numa_placement = now;
2256 :
2257 : return delta;
2258 : }
2259 :
2260 : /*
2261 : * Determine the preferred nid for a task in a numa_group. This needs to
2262 : * be done in a way that produces consistent results with group_weight,
2263 : * otherwise workloads might not converge.
2264 : */
2265 : static int preferred_group_nid(struct task_struct *p, int nid)
2266 : {
2267 : nodemask_t nodes;
2268 : int dist;
2269 :
2270 : /* Direct connections between all NUMA nodes. */
2271 : if (sched_numa_topology_type == NUMA_DIRECT)
2272 : return nid;
2273 :
2274 : /*
2275 : * On a system with glueless mesh NUMA topology, group_weight
2276 : * scores nodes according to the number of NUMA hinting faults on
2277 : * both the node itself, and on nearby nodes.
2278 : */
2279 : if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
2280 : unsigned long score, max_score = 0;
2281 : int node, max_node = nid;
2282 :
2283 : dist = sched_max_numa_distance;
2284 :
2285 : for_each_online_node(node) {
2286 : score = group_weight(p, node, dist);
2287 : if (score > max_score) {
2288 : max_score = score;
2289 : max_node = node;
2290 : }
2291 : }
2292 : return max_node;
2293 : }
2294 :
2295 : /*
2296 : * Finding the preferred nid in a system with NUMA backplane
2297 : * interconnect topology is more involved. The goal is to locate
2298 : * tasks from numa_groups near each other in the system, and
2299 : * untangle workloads from different sides of the system. This requires
2300 : * searching down the hierarchy of node groups, recursively searching
2301 : * inside the highest scoring group of nodes. The nodemask tricks
2302 : * keep the complexity of the search down.
2303 : */
2304 : nodes = node_online_map;
2305 : for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
2306 : unsigned long max_faults = 0;
2307 : nodemask_t max_group = NODE_MASK_NONE;
2308 : int a, b;
2309 :
2310 : /* Are there nodes at this distance from each other? */
2311 : if (!find_numa_distance(dist))
2312 : continue;
2313 :
2314 : for_each_node_mask(a, nodes) {
2315 : unsigned long faults = 0;
2316 : nodemask_t this_group;
2317 : nodes_clear(this_group);
2318 :
2319 : /* Sum group's NUMA faults; includes a==b case. */
2320 : for_each_node_mask(b, nodes) {
2321 : if (node_distance(a, b) < dist) {
2322 : faults += group_faults(p, b);
2323 : node_set(b, this_group);
2324 : node_clear(b, nodes);
2325 : }
2326 : }
2327 :
2328 : /* Remember the top group. */
2329 : if (faults > max_faults) {
2330 : max_faults = faults;
2331 : max_group = this_group;
2332 : /*
2333 : * subtle: at the smallest distance there is
2334 : * just one node left in each "group", the
2335 : * winner is the preferred nid.
2336 : */
2337 : nid = a;
2338 : }
2339 : }
2340 : /* Next round, evaluate the nodes within max_group. */
2341 : if (!max_faults)
2342 : break;
2343 : nodes = max_group;
2344 : }
2345 : return nid;
2346 : }
2347 :
2348 : static void task_numa_placement(struct task_struct *p)
2349 : {
2350 : int seq, nid, max_nid = NUMA_NO_NODE;
2351 : unsigned long max_faults = 0;
2352 : unsigned long fault_types[2] = { 0, 0 };
2353 : unsigned long total_faults;
2354 : u64 runtime, period;
2355 : spinlock_t *group_lock = NULL;
2356 : struct numa_group *ng;
2357 :
2358 : /*
2359 : * The p->mm->numa_scan_seq field gets updated without
2360 : * exclusive access. Use READ_ONCE() here to ensure
2361 : * that the field is read in a single access:
2362 : */
2363 : seq = READ_ONCE(p->mm->numa_scan_seq);
2364 : if (p->numa_scan_seq == seq)
2365 : return;
2366 : p->numa_scan_seq = seq;
2367 : p->numa_scan_period_max = task_scan_max(p);
2368 :
2369 : total_faults = p->numa_faults_locality[0] +
2370 : p->numa_faults_locality[1];
2371 : runtime = numa_get_avg_runtime(p, &period);
2372 :
2373 : /* If the task is part of a group prevent parallel updates to group stats */
2374 : ng = deref_curr_numa_group(p);
2375 : if (ng) {
2376 : group_lock = &ng->lock;
2377 : spin_lock_irq(group_lock);
2378 : }
2379 :
2380 : /* Find the node with the highest number of faults */
2381 : for_each_online_node(nid) {
2382 : /* Keep track of the offsets in numa_faults array */
2383 : int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
2384 : unsigned long faults = 0, group_faults = 0;
2385 : int priv;
2386 :
2387 : for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
2388 : long diff, f_diff, f_weight;
2389 :
2390 : mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2391 : membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2392 : cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2393 : cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
2394 :
2395 : /* Decay existing window, copy faults since last scan */
2396 : diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2397 : fault_types[priv] += p->numa_faults[membuf_idx];
2398 : p->numa_faults[membuf_idx] = 0;
2399 :
2400 : /*
2401 : * Normalize the faults_from, so all tasks in a group
2402 : * count according to CPU use, instead of by the raw
2403 : * number of faults. Tasks with little runtime have
2404 : * little over-all impact on throughput, and thus their
2405 : * faults are less important.
2406 : */
2407 : f_weight = div64_u64(runtime << 16, period + 1);
2408 : f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
2409 : (total_faults + 1);
2410 : f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2411 : p->numa_faults[cpubuf_idx] = 0;
2412 :
2413 : p->numa_faults[mem_idx] += diff;
2414 : p->numa_faults[cpu_idx] += f_diff;
2415 : faults += p->numa_faults[mem_idx];
2416 : p->total_numa_faults += diff;
2417 : if (ng) {
2418 : /*
2419 : * safe because we can only change our own group
2420 : *
2421 : * mem_idx represents the offset for a given
2422 : * nid and priv in a specific region because it
2423 : * is at the beginning of the numa_faults array.
2424 : */
2425 : ng->faults[mem_idx] += diff;
2426 : ng->faults_cpu[mem_idx] += f_diff;
2427 : ng->total_faults += diff;
2428 : group_faults += ng->faults[mem_idx];
2429 : }
2430 : }
2431 :
2432 : if (!ng) {
2433 : if (faults > max_faults) {
2434 : max_faults = faults;
2435 : max_nid = nid;
2436 : }
2437 : } else if (group_faults > max_faults) {
2438 : max_faults = group_faults;
2439 : max_nid = nid;
2440 : }
2441 : }
2442 :
2443 : if (ng) {
2444 : numa_group_count_active_nodes(ng);
2445 : spin_unlock_irq(group_lock);
2446 : max_nid = preferred_group_nid(p, max_nid);
2447 : }
2448 :
2449 : if (max_faults) {
2450 : /* Set the new preferred node */
2451 : if (max_nid != p->numa_preferred_nid)
2452 : sched_setnuma(p, max_nid);
2453 : }
2454 :
2455 : update_task_scan_period(p, fault_types[0], fault_types[1]);
2456 : }
2457 :
2458 : static inline int get_numa_group(struct numa_group *grp)
2459 : {
2460 : return refcount_inc_not_zero(&grp->refcount);
2461 : }
2462 :
2463 : static inline void put_numa_group(struct numa_group *grp)
2464 : {
2465 : if (refcount_dec_and_test(&grp->refcount))
2466 : kfree_rcu(grp, rcu);
2467 : }
2468 :
2469 : static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2470 : int *priv)
2471 : {
2472 : struct numa_group *grp, *my_grp;
2473 : struct task_struct *tsk;
2474 : bool join = false;
2475 : int cpu = cpupid_to_cpu(cpupid);
2476 : int i;
2477 :
2478 : if (unlikely(!deref_curr_numa_group(p))) {
2479 : unsigned int size = sizeof(struct numa_group) +
2480 : 4*nr_node_ids*sizeof(unsigned long);
2481 :
2482 : grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2483 : if (!grp)
2484 : return;
2485 :
2486 : refcount_set(&grp->refcount, 1);
2487 : grp->active_nodes = 1;
2488 : grp->max_faults_cpu = 0;
2489 : spin_lock_init(&grp->lock);
2490 : grp->gid = p->pid;
2491 : /* Second half of the array tracks nids where faults happen */
2492 : grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
2493 : nr_node_ids;
2494 :
2495 : for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2496 : grp->faults[i] = p->numa_faults[i];
2497 :
2498 : grp->total_faults = p->total_numa_faults;
2499 :
2500 : grp->nr_tasks++;
2501 : rcu_assign_pointer(p->numa_group, grp);
2502 : }
2503 :
2504 : rcu_read_lock();
2505 : tsk = READ_ONCE(cpu_rq(cpu)->curr);
2506 :
2507 : if (!cpupid_match_pid(tsk, cpupid))
2508 : goto no_join;
2509 :
2510 : grp = rcu_dereference(tsk->numa_group);
2511 : if (!grp)
2512 : goto no_join;
2513 :
2514 : my_grp = deref_curr_numa_group(p);
2515 : if (grp == my_grp)
2516 : goto no_join;
2517 :
2518 : /*
2519 : * Only join the other group if its bigger; if we're the bigger group,
2520 : * the other task will join us.
2521 : */
2522 : if (my_grp->nr_tasks > grp->nr_tasks)
2523 : goto no_join;
2524 :
2525 : /*
2526 : * Tie-break on the grp address.
2527 : */
2528 : if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2529 : goto no_join;
2530 :
2531 : /* Always join threads in the same process. */
2532 : if (tsk->mm == current->mm)
2533 : join = true;
2534 :
2535 : /* Simple filter to avoid false positives due to PID collisions */
2536 : if (flags & TNF_SHARED)
2537 : join = true;
2538 :
2539 : /* Update priv based on whether false sharing was detected */
2540 : *priv = !join;
2541 :
2542 : if (join && !get_numa_group(grp))
2543 : goto no_join;
2544 :
2545 : rcu_read_unlock();
2546 :
2547 : if (!join)
2548 : return;
2549 :
2550 : BUG_ON(irqs_disabled());
2551 : double_lock_irq(&my_grp->lock, &grp->lock);
2552 :
2553 : for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2554 : my_grp->faults[i] -= p->numa_faults[i];
2555 : grp->faults[i] += p->numa_faults[i];
2556 : }
2557 : my_grp->total_faults -= p->total_numa_faults;
2558 : grp->total_faults += p->total_numa_faults;
2559 :
2560 : my_grp->nr_tasks--;
2561 : grp->nr_tasks++;
2562 :
2563 : spin_unlock(&my_grp->lock);
2564 : spin_unlock_irq(&grp->lock);
2565 :
2566 : rcu_assign_pointer(p->numa_group, grp);
2567 :
2568 : put_numa_group(my_grp);
2569 : return;
2570 :
2571 : no_join:
2572 : rcu_read_unlock();
2573 : return;
2574 : }
2575 :
2576 : /*
2577 : * Get rid of NUMA staticstics associated with a task (either current or dead).
2578 : * If @final is set, the task is dead and has reached refcount zero, so we can
2579 : * safely free all relevant data structures. Otherwise, there might be
2580 : * concurrent reads from places like load balancing and procfs, and we should
2581 : * reset the data back to default state without freeing ->numa_faults.
2582 : */
2583 : void task_numa_free(struct task_struct *p, bool final)
2584 : {
2585 : /* safe: p either is current or is being freed by current */
2586 : struct numa_group *grp = rcu_dereference_raw(p->numa_group);
2587 : unsigned long *numa_faults = p->numa_faults;
2588 : unsigned long flags;
2589 : int i;
2590 :
2591 : if (!numa_faults)
2592 : return;
2593 :
2594 : if (grp) {
2595 : spin_lock_irqsave(&grp->lock, flags);
2596 : for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2597 : grp->faults[i] -= p->numa_faults[i];
2598 : grp->total_faults -= p->total_numa_faults;
2599 :
2600 : grp->nr_tasks--;
2601 : spin_unlock_irqrestore(&grp->lock, flags);
2602 : RCU_INIT_POINTER(p->numa_group, NULL);
2603 : put_numa_group(grp);
2604 : }
2605 :
2606 : if (final) {
2607 : p->numa_faults = NULL;
2608 : kfree(numa_faults);
2609 : } else {
2610 : p->total_numa_faults = 0;
2611 : for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2612 : numa_faults[i] = 0;
2613 : }
2614 : }
2615 :
2616 : /*
2617 : * Got a PROT_NONE fault for a page on @node.
2618 : */
2619 : void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2620 : {
2621 : struct task_struct *p = current;
2622 : bool migrated = flags & TNF_MIGRATED;
2623 : int cpu_node = task_node(current);
2624 : int local = !!(flags & TNF_FAULT_LOCAL);
2625 : struct numa_group *ng;
2626 : int priv;
2627 :
2628 : if (!static_branch_likely(&sched_numa_balancing))
2629 : return;
2630 :
2631 : /* for example, ksmd faulting in a user's mm */
2632 : if (!p->mm)
2633 : return;
2634 :
2635 : /* Allocate buffer to track faults on a per-node basis */
2636 : if (unlikely(!p->numa_faults)) {
2637 : int size = sizeof(*p->numa_faults) *
2638 : NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2639 :
2640 : p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2641 : if (!p->numa_faults)
2642 : return;
2643 :
2644 : p->total_numa_faults = 0;
2645 : memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2646 : }
2647 :
2648 : /*
2649 : * First accesses are treated as private, otherwise consider accesses
2650 : * to be private if the accessing pid has not changed
2651 : */
2652 : if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2653 : priv = 1;
2654 : } else {
2655 : priv = cpupid_match_pid(p, last_cpupid);
2656 : if (!priv && !(flags & TNF_NO_GROUP))
2657 : task_numa_group(p, last_cpupid, flags, &priv);
2658 : }
2659 :
2660 : /*
2661 : * If a workload spans multiple NUMA nodes, a shared fault that
2662 : * occurs wholly within the set of nodes that the workload is
2663 : * actively using should be counted as local. This allows the
2664 : * scan rate to slow down when a workload has settled down.
2665 : */
2666 : ng = deref_curr_numa_group(p);
2667 : if (!priv && !local && ng && ng->active_nodes > 1 &&
2668 : numa_is_active_node(cpu_node, ng) &&
2669 : numa_is_active_node(mem_node, ng))
2670 : local = 1;
2671 :
2672 : /*
2673 : * Retry to migrate task to preferred node periodically, in case it
2674 : * previously failed, or the scheduler moved us.
2675 : */
2676 : if (time_after(jiffies, p->numa_migrate_retry)) {
2677 : task_numa_placement(p);
2678 : numa_migrate_preferred(p);
2679 : }
2680 :
2681 : if (migrated)
2682 : p->numa_pages_migrated += pages;
2683 : if (flags & TNF_MIGRATE_FAIL)
2684 : p->numa_faults_locality[2] += pages;
2685 :
2686 : p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2687 : p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2688 : p->numa_faults_locality[local] += pages;
2689 : }
2690 :
2691 : static void reset_ptenuma_scan(struct task_struct *p)
2692 : {
2693 : /*
2694 : * We only did a read acquisition of the mmap sem, so
2695 : * p->mm->numa_scan_seq is written to without exclusive access
2696 : * and the update is not guaranteed to be atomic. That's not
2697 : * much of an issue though, since this is just used for
2698 : * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2699 : * expensive, to avoid any form of compiler optimizations:
2700 : */
2701 : WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2702 : p->mm->numa_scan_offset = 0;
2703 : }
2704 :
2705 : /*
2706 : * The expensive part of numa migration is done from task_work context.
2707 : * Triggered from task_tick_numa().
2708 : */
2709 : static void task_numa_work(struct callback_head *work)
2710 : {
2711 : unsigned long migrate, next_scan, now = jiffies;
2712 : struct task_struct *p = current;
2713 : struct mm_struct *mm = p->mm;
2714 : u64 runtime = p->se.sum_exec_runtime;
2715 : struct vm_area_struct *vma;
2716 : unsigned long start, end;
2717 : unsigned long nr_pte_updates = 0;
2718 : long pages, virtpages;
2719 :
2720 : SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
2721 :
2722 : work->next = work;
2723 : /*
2724 : * Who cares about NUMA placement when they're dying.
2725 : *
2726 : * NOTE: make sure not to dereference p->mm before this check,
2727 : * exit_task_work() happens _after_ exit_mm() so we could be called
2728 : * without p->mm even though we still had it when we enqueued this
2729 : * work.
2730 : */
2731 : if (p->flags & PF_EXITING)
2732 : return;
2733 :
2734 : if (!mm->numa_next_scan) {
2735 : mm->numa_next_scan = now +
2736 : msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2737 : }
2738 :
2739 : /*
2740 : * Enforce maximal scan/migration frequency..
2741 : */
2742 : migrate = mm->numa_next_scan;
2743 : if (time_before(now, migrate))
2744 : return;
2745 :
2746 : if (p->numa_scan_period == 0) {
2747 : p->numa_scan_period_max = task_scan_max(p);
2748 : p->numa_scan_period = task_scan_start(p);
2749 : }
2750 :
2751 : next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2752 : if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2753 : return;
2754 :
2755 : /*
2756 : * Delay this task enough that another task of this mm will likely win
2757 : * the next time around.
2758 : */
2759 : p->node_stamp += 2 * TICK_NSEC;
2760 :
2761 : start = mm->numa_scan_offset;
2762 : pages = sysctl_numa_balancing_scan_size;
2763 : pages <<= 20 - PAGE_SHIFT; /* MB in pages */
2764 : virtpages = pages * 8; /* Scan up to this much virtual space */
2765 : if (!pages)
2766 : return;
2767 :
2768 :
2769 : if (!mmap_read_trylock(mm))
2770 : return;
2771 : vma = find_vma(mm, start);
2772 : if (!vma) {
2773 : reset_ptenuma_scan(p);
2774 : start = 0;
2775 : vma = mm->mmap;
2776 : }
2777 : for (; vma; vma = vma->vm_next) {
2778 : if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2779 : is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2780 : continue;
2781 : }
2782 :
2783 : /*
2784 : * Shared library pages mapped by multiple processes are not
2785 : * migrated as it is expected they are cache replicated. Avoid
2786 : * hinting faults in read-only file-backed mappings or the vdso
2787 : * as migrating the pages will be of marginal benefit.
2788 : */
2789 : if (!vma->vm_mm ||
2790 : (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2791 : continue;
2792 :
2793 : /*
2794 : * Skip inaccessible VMAs to avoid any confusion between
2795 : * PROT_NONE and NUMA hinting ptes
2796 : */
2797 : if (!vma_is_accessible(vma))
2798 : continue;
2799 :
2800 : do {
2801 : start = max(start, vma->vm_start);
2802 : end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2803 : end = min(end, vma->vm_end);
2804 : nr_pte_updates = change_prot_numa(vma, start, end);
2805 :
2806 : /*
2807 : * Try to scan sysctl_numa_balancing_size worth of
2808 : * hpages that have at least one present PTE that
2809 : * is not already pte-numa. If the VMA contains
2810 : * areas that are unused or already full of prot_numa
2811 : * PTEs, scan up to virtpages, to skip through those
2812 : * areas faster.
2813 : */
2814 : if (nr_pte_updates)
2815 : pages -= (end - start) >> PAGE_SHIFT;
2816 : virtpages -= (end - start) >> PAGE_SHIFT;
2817 :
2818 : start = end;
2819 : if (pages <= 0 || virtpages <= 0)
2820 : goto out;
2821 :
2822 : cond_resched();
2823 : } while (end != vma->vm_end);
2824 : }
2825 :
2826 : out:
2827 : /*
2828 : * It is possible to reach the end of the VMA list but the last few
2829 : * VMAs are not guaranteed to the vma_migratable. If they are not, we
2830 : * would find the !migratable VMA on the next scan but not reset the
2831 : * scanner to the start so check it now.
2832 : */
2833 : if (vma)
2834 : mm->numa_scan_offset = start;
2835 : else
2836 : reset_ptenuma_scan(p);
2837 : mmap_read_unlock(mm);
2838 :
2839 : /*
2840 : * Make sure tasks use at least 32x as much time to run other code
2841 : * than they used here, to limit NUMA PTE scanning overhead to 3% max.
2842 : * Usually update_task_scan_period slows down scanning enough; on an
2843 : * overloaded system we need to limit overhead on a per task basis.
2844 : */
2845 : if (unlikely(p->se.sum_exec_runtime != runtime)) {
2846 : u64 diff = p->se.sum_exec_runtime - runtime;
2847 : p->node_stamp += 32 * diff;
2848 : }
2849 : }
2850 :
2851 : void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
2852 : {
2853 : int mm_users = 0;
2854 : struct mm_struct *mm = p->mm;
2855 :
2856 : if (mm) {
2857 : mm_users = atomic_read(&mm->mm_users);
2858 : if (mm_users == 1) {
2859 : mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2860 : mm->numa_scan_seq = 0;
2861 : }
2862 : }
2863 : p->node_stamp = 0;
2864 : p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
2865 : p->numa_scan_period = sysctl_numa_balancing_scan_delay;
2866 : /* Protect against double add, see task_tick_numa and task_numa_work */
2867 : p->numa_work.next = &p->numa_work;
2868 : p->numa_faults = NULL;
2869 : RCU_INIT_POINTER(p->numa_group, NULL);
2870 : p->last_task_numa_placement = 0;
2871 : p->last_sum_exec_runtime = 0;
2872 :
2873 : init_task_work(&p->numa_work, task_numa_work);
2874 :
2875 : /* New address space, reset the preferred nid */
2876 : if (!(clone_flags & CLONE_VM)) {
2877 : p->numa_preferred_nid = NUMA_NO_NODE;
2878 : return;
2879 : }
2880 :
2881 : /*
2882 : * New thread, keep existing numa_preferred_nid which should be copied
2883 : * already by arch_dup_task_struct but stagger when scans start.
2884 : */
2885 : if (mm) {
2886 : unsigned int delay;
2887 :
2888 : delay = min_t(unsigned int, task_scan_max(current),
2889 : current->numa_scan_period * mm_users * NSEC_PER_MSEC);
2890 : delay += 2 * TICK_NSEC;
2891 : p->node_stamp = delay;
2892 : }
2893 : }
2894 :
2895 : /*
2896 : * Drive the periodic memory faults..
2897 : */
2898 : static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2899 : {
2900 : struct callback_head *work = &curr->numa_work;
2901 : u64 period, now;
2902 :
2903 : /*
2904 : * We don't care about NUMA placement if we don't have memory.
2905 : */
2906 : if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
2907 : return;
2908 :
2909 : /*
2910 : * Using runtime rather than walltime has the dual advantage that
2911 : * we (mostly) drive the selection from busy threads and that the
2912 : * task needs to have done some actual work before we bother with
2913 : * NUMA placement.
2914 : */
2915 : now = curr->se.sum_exec_runtime;
2916 : period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2917 :
2918 : if (now > curr->node_stamp + period) {
2919 : if (!curr->node_stamp)
2920 : curr->numa_scan_period = task_scan_start(curr);
2921 : curr->node_stamp += period;
2922 :
2923 : if (!time_before(jiffies, curr->mm->numa_next_scan))
2924 : task_work_add(curr, work, TWA_RESUME);
2925 : }
2926 : }
2927 :
2928 : static void update_scan_period(struct task_struct *p, int new_cpu)
2929 : {
2930 : int src_nid = cpu_to_node(task_cpu(p));
2931 : int dst_nid = cpu_to_node(new_cpu);
2932 :
2933 : if (!static_branch_likely(&sched_numa_balancing))
2934 : return;
2935 :
2936 : if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
2937 : return;
2938 :
2939 : if (src_nid == dst_nid)
2940 : return;
2941 :
2942 : /*
2943 : * Allow resets if faults have been trapped before one scan
2944 : * has completed. This is most likely due to a new task that
2945 : * is pulled cross-node due to wakeups or load balancing.
2946 : */
2947 : if (p->numa_scan_seq) {
2948 : /*
2949 : * Avoid scan adjustments if moving to the preferred
2950 : * node or if the task was not previously running on
2951 : * the preferred node.
2952 : */
2953 : if (dst_nid == p->numa_preferred_nid ||
2954 : (p->numa_preferred_nid != NUMA_NO_NODE &&
2955 : src_nid != p->numa_preferred_nid))
2956 : return;
2957 : }
2958 :
2959 : p->numa_scan_period = task_scan_start(p);
2960 : }
2961 :
2962 : #else
2963 : static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2964 : {
2965 : }
2966 :
2967 15764 : static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2968 : {
2969 15764 : }
2970 :
2971 15767 : static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2972 : {
2973 15767 : }
2974 :
2975 994 : static inline void update_scan_period(struct task_struct *p, int new_cpu)
2976 : {
2977 994 : }
2978 :
2979 : #endif /* CONFIG_NUMA_BALANCING */
2980 :
2981 : static void
2982 15764 : account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2983 : {
2984 15764 : update_load_add(&cfs_rq->load, se->load.weight);
2985 : #ifdef CONFIG_SMP
2986 15764 : if (entity_is_task(se)) {
2987 15764 : struct rq *rq = rq_of(cfs_rq);
2988 :
2989 15764 : account_numa_enqueue(rq, task_of(se));
2990 15764 : list_add(&se->group_node, &rq->cfs_tasks);
2991 : }
2992 : #endif
2993 15764 : cfs_rq->nr_running++;
2994 : }
2995 :
2996 : static void
2997 15767 : account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2998 : {
2999 15767 : update_load_sub(&cfs_rq->load, se->load.weight);
3000 : #ifdef CONFIG_SMP
3001 15767 : if (entity_is_task(se)) {
3002 15767 : account_numa_dequeue(rq_of(cfs_rq), task_of(se));
3003 15767 : list_del_init(&se->group_node);
3004 : }
3005 : #endif
3006 15767 : cfs_rq->nr_running--;
3007 : }
3008 :
3009 : /*
3010 : * Signed add and clamp on underflow.
3011 : *
3012 : * Explicitly do a load-store to ensure the intermediate value never hits
3013 : * memory. This allows lockless observations without ever seeing the negative
3014 : * values.
3015 : */
3016 : #define add_positive(_ptr, _val) do { \
3017 : typeof(_ptr) ptr = (_ptr); \
3018 : typeof(_val) val = (_val); \
3019 : typeof(*ptr) res, var = READ_ONCE(*ptr); \
3020 : \
3021 : res = var + val; \
3022 : \
3023 : if (val < 0 && res > var) \
3024 : res = 0; \
3025 : \
3026 : WRITE_ONCE(*ptr, res); \
3027 : } while (0)
3028 :
3029 : /*
3030 : * Unsigned subtract and clamp on underflow.
3031 : *
3032 : * Explicitly do a load-store to ensure the intermediate value never hits
3033 : * memory. This allows lockless observations without ever seeing the negative
3034 : * values.
3035 : */
3036 : #define sub_positive(_ptr, _val) do { \
3037 : typeof(_ptr) ptr = (_ptr); \
3038 : typeof(*ptr) val = (_val); \
3039 : typeof(*ptr) res, var = READ_ONCE(*ptr); \
3040 : res = var - val; \
3041 : if (res > var) \
3042 : res = 0; \
3043 : WRITE_ONCE(*ptr, res); \
3044 : } while (0)
3045 :
3046 : /*
3047 : * Remove and clamp on negative, from a local variable.
3048 : *
3049 : * A variant of sub_positive(), which does not use explicit load-store
3050 : * and is thus optimized for local variable updates.
3051 : */
3052 : #define lsub_positive(_ptr, _val) do { \
3053 : typeof(_ptr) ptr = (_ptr); \
3054 : *ptr -= min_t(typeof(*ptr), *ptr, _val); \
3055 : } while (0)
3056 :
3057 : #ifdef CONFIG_SMP
3058 : static inline void
3059 2007 : enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3060 : {
3061 2007 : cfs_rq->avg.load_avg += se->avg.load_avg;
3062 2007 : cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
3063 2007 : }
3064 :
3065 : static inline void
3066 916 : dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3067 : {
3068 916 : sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
3069 1833 : sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
3070 916 : }
3071 : #else
3072 : static inline void
3073 : enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
3074 : static inline void
3075 : dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
3076 : #endif
3077 :
3078 25 : static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
3079 : unsigned long weight)
3080 : {
3081 25 : if (se->on_rq) {
3082 : /* commit outstanding execution time */
3083 0 : if (cfs_rq->curr == se)
3084 0 : update_curr(cfs_rq);
3085 0 : update_load_sub(&cfs_rq->load, se->load.weight);
3086 : }
3087 25 : dequeue_load_avg(cfs_rq, se);
3088 :
3089 25 : update_load_set(&se->load, weight);
3090 :
3091 : #ifdef CONFIG_SMP
3092 25 : do {
3093 25 : u32 divider = get_pelt_divider(&se->avg);
3094 :
3095 50 : se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
3096 25 : } while (0);
3097 : #endif
3098 :
3099 25 : enqueue_load_avg(cfs_rq, se);
3100 25 : if (se->on_rq)
3101 0 : update_load_add(&cfs_rq->load, se->load.weight);
3102 :
3103 25 : }
3104 :
3105 25 : void reweight_task(struct task_struct *p, int prio)
3106 : {
3107 25 : struct sched_entity *se = &p->se;
3108 25 : struct cfs_rq *cfs_rq = cfs_rq_of(se);
3109 25 : struct load_weight *load = &se->load;
3110 25 : unsigned long weight = scale_load(sched_prio_to_weight[prio]);
3111 :
3112 25 : reweight_entity(cfs_rq, se, weight);
3113 25 : load->inv_weight = sched_prio_to_wmult[prio];
3114 25 : }
3115 :
3116 : #ifdef CONFIG_FAIR_GROUP_SCHED
3117 : #ifdef CONFIG_SMP
3118 : /*
3119 : * All this does is approximate the hierarchical proportion which includes that
3120 : * global sum we all love to hate.
3121 : *
3122 : * That is, the weight of a group entity, is the proportional share of the
3123 : * group weight based on the group runqueue weights. That is:
3124 : *
3125 : * tg->weight * grq->load.weight
3126 : * ge->load.weight = ----------------------------- (1)
3127 : * \Sum grq->load.weight
3128 : *
3129 : * Now, because computing that sum is prohibitively expensive to compute (been
3130 : * there, done that) we approximate it with this average stuff. The average
3131 : * moves slower and therefore the approximation is cheaper and more stable.
3132 : *
3133 : * So instead of the above, we substitute:
3134 : *
3135 : * grq->load.weight -> grq->avg.load_avg (2)
3136 : *
3137 : * which yields the following:
3138 : *
3139 : * tg->weight * grq->avg.load_avg
3140 : * ge->load.weight = ------------------------------ (3)
3141 : * tg->load_avg
3142 : *
3143 : * Where: tg->load_avg ~= \Sum grq->avg.load_avg
3144 : *
3145 : * That is shares_avg, and it is right (given the approximation (2)).
3146 : *
3147 : * The problem with it is that because the average is slow -- it was designed
3148 : * to be exactly that of course -- this leads to transients in boundary
3149 : * conditions. In specific, the case where the group was idle and we start the
3150 : * one task. It takes time for our CPU's grq->avg.load_avg to build up,
3151 : * yielding bad latency etc..
3152 : *
3153 : * Now, in that special case (1) reduces to:
3154 : *
3155 : * tg->weight * grq->load.weight
3156 : * ge->load.weight = ----------------------------- = tg->weight (4)
3157 : * grp->load.weight
3158 : *
3159 : * That is, the sum collapses because all other CPUs are idle; the UP scenario.
3160 : *
3161 : * So what we do is modify our approximation (3) to approach (4) in the (near)
3162 : * UP case, like:
3163 : *
3164 : * ge->load.weight =
3165 : *
3166 : * tg->weight * grq->load.weight
3167 : * --------------------------------------------------- (5)
3168 : * tg->load_avg - grq->avg.load_avg + grq->load.weight
3169 : *
3170 : * But because grq->load.weight can drop to 0, resulting in a divide by zero,
3171 : * we need to use grq->avg.load_avg as its lower bound, which then gives:
3172 : *
3173 : *
3174 : * tg->weight * grq->load.weight
3175 : * ge->load.weight = ----------------------------- (6)
3176 : * tg_load_avg'
3177 : *
3178 : * Where:
3179 : *
3180 : * tg_load_avg' = tg->load_avg - grq->avg.load_avg +
3181 : * max(grq->load.weight, grq->avg.load_avg)
3182 : *
3183 : * And that is shares_weight and is icky. In the (near) UP case it approaches
3184 : * (4) while in the normal case it approaches (3). It consistently
3185 : * overestimates the ge->load.weight and therefore:
3186 : *
3187 : * \Sum ge->load.weight >= tg->weight
3188 : *
3189 : * hence icky!
3190 : */
3191 : static long calc_group_shares(struct cfs_rq *cfs_rq)
3192 : {
3193 : long tg_weight, tg_shares, load, shares;
3194 : struct task_group *tg = cfs_rq->tg;
3195 :
3196 : tg_shares = READ_ONCE(tg->shares);
3197 :
3198 : load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
3199 :
3200 : tg_weight = atomic_long_read(&tg->load_avg);
3201 :
3202 : /* Ensure tg_weight >= load */
3203 : tg_weight -= cfs_rq->tg_load_avg_contrib;
3204 : tg_weight += load;
3205 :
3206 : shares = (tg_shares * load);
3207 : if (tg_weight)
3208 : shares /= tg_weight;
3209 :
3210 : /*
3211 : * MIN_SHARES has to be unscaled here to support per-CPU partitioning
3212 : * of a group with small tg->shares value. It is a floor value which is
3213 : * assigned as a minimum load.weight to the sched_entity representing
3214 : * the group on a CPU.
3215 : *
3216 : * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
3217 : * on an 8-core system with 8 tasks each runnable on one CPU shares has
3218 : * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
3219 : * case no task is runnable on a CPU MIN_SHARES=2 should be returned
3220 : * instead of 0.
3221 : */
3222 : return clamp_t(long, shares, MIN_SHARES, tg_shares);
3223 : }
3224 : #endif /* CONFIG_SMP */
3225 :
3226 : static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
3227 :
3228 : /*
3229 : * Recomputes the group entity based on the current state of its group
3230 : * runqueue.
3231 : */
3232 : static void update_cfs_group(struct sched_entity *se)
3233 : {
3234 : struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3235 : long shares;
3236 :
3237 : if (!gcfs_rq)
3238 : return;
3239 :
3240 : if (throttled_hierarchy(gcfs_rq))
3241 : return;
3242 :
3243 : #ifndef CONFIG_SMP
3244 : shares = READ_ONCE(gcfs_rq->tg->shares);
3245 :
3246 : if (likely(se->load.weight == shares))
3247 : return;
3248 : #else
3249 : shares = calc_group_shares(gcfs_rq);
3250 : #endif
3251 :
3252 : reweight_entity(cfs_rq_of(se), se, shares);
3253 : }
3254 :
3255 : #else /* CONFIG_FAIR_GROUP_SCHED */
3256 46286 : static inline void update_cfs_group(struct sched_entity *se)
3257 : {
3258 46286 : }
3259 : #endif /* CONFIG_FAIR_GROUP_SCHED */
3260 :
3261 2874 : static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
3262 : {
3263 2874 : struct rq *rq = rq_of(cfs_rq);
3264 :
3265 2874 : if (&rq->cfs == cfs_rq) {
3266 : /*
3267 : * There are a few boundary cases this might miss but it should
3268 : * get called often enough that that should (hopefully) not be
3269 : * a real problem.
3270 : *
3271 : * It will not get called when we go idle, because the idle
3272 : * thread is a different class (!fair), nor will the utilization
3273 : * number include things like RT tasks.
3274 : *
3275 : * As is, the util number is not freq-invariant (we'd have to
3276 : * implement arch_scale_freq_capacity() for that).
3277 : *
3278 : * See cpu_util().
3279 : */
3280 2874 : cpufreq_update_util(rq, flags);
3281 : }
3282 : }
3283 :
3284 : #ifdef CONFIG_SMP
3285 : #ifdef CONFIG_FAIR_GROUP_SCHED
3286 : /**
3287 : * update_tg_load_avg - update the tg's load avg
3288 : * @cfs_rq: the cfs_rq whose avg changed
3289 : *
3290 : * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
3291 : * However, because tg->load_avg is a global value there are performance
3292 : * considerations.
3293 : *
3294 : * In order to avoid having to look at the other cfs_rq's, we use a
3295 : * differential update where we store the last value we propagated. This in
3296 : * turn allows skipping updates if the differential is 'small'.
3297 : *
3298 : * Updating tg's load_avg is necessary before update_cfs_share().
3299 : */
3300 : static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
3301 : {
3302 : long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
3303 :
3304 : /*
3305 : * No need to update load_avg for root_task_group as it is not used.
3306 : */
3307 : if (cfs_rq->tg == &root_task_group)
3308 : return;
3309 :
3310 : if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
3311 : atomic_long_add(delta, &cfs_rq->tg->load_avg);
3312 : cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
3313 : }
3314 : }
3315 :
3316 : /*
3317 : * Called within set_task_rq() right before setting a task's CPU. The
3318 : * caller only guarantees p->pi_lock is held; no other assumptions,
3319 : * including the state of rq->lock, should be made.
3320 : */
3321 : void set_task_rq_fair(struct sched_entity *se,
3322 : struct cfs_rq *prev, struct cfs_rq *next)
3323 : {
3324 : u64 p_last_update_time;
3325 : u64 n_last_update_time;
3326 :
3327 : if (!sched_feat(ATTACH_AGE_LOAD))
3328 : return;
3329 :
3330 : /*
3331 : * We are supposed to update the task to "current" time, then its up to
3332 : * date and ready to go to new CPU/cfs_rq. But we have difficulty in
3333 : * getting what current time is, so simply throw away the out-of-date
3334 : * time. This will result in the wakee task is less decayed, but giving
3335 : * the wakee more load sounds not bad.
3336 : */
3337 : if (!(se->avg.last_update_time && prev))
3338 : return;
3339 :
3340 : #ifndef CONFIG_64BIT
3341 : {
3342 : u64 p_last_update_time_copy;
3343 : u64 n_last_update_time_copy;
3344 :
3345 : do {
3346 : p_last_update_time_copy = prev->load_last_update_time_copy;
3347 : n_last_update_time_copy = next->load_last_update_time_copy;
3348 :
3349 : smp_rmb();
3350 :
3351 : p_last_update_time = prev->avg.last_update_time;
3352 : n_last_update_time = next->avg.last_update_time;
3353 :
3354 : } while (p_last_update_time != p_last_update_time_copy ||
3355 : n_last_update_time != n_last_update_time_copy);
3356 : }
3357 : #else
3358 : p_last_update_time = prev->avg.last_update_time;
3359 : n_last_update_time = next->avg.last_update_time;
3360 : #endif
3361 : __update_load_avg_blocked_se(p_last_update_time, se);
3362 : se->avg.last_update_time = n_last_update_time;
3363 : }
3364 :
3365 :
3366 : /*
3367 : * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
3368 : * propagate its contribution. The key to this propagation is the invariant
3369 : * that for each group:
3370 : *
3371 : * ge->avg == grq->avg (1)
3372 : *
3373 : * _IFF_ we look at the pure running and runnable sums. Because they
3374 : * represent the very same entity, just at different points in the hierarchy.
3375 : *
3376 : * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial
3377 : * and simply copies the running/runnable sum over (but still wrong, because
3378 : * the group entity and group rq do not have their PELT windows aligned).
3379 : *
3380 : * However, update_tg_cfs_load() is more complex. So we have:
3381 : *
3382 : * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
3383 : *
3384 : * And since, like util, the runnable part should be directly transferable,
3385 : * the following would _appear_ to be the straight forward approach:
3386 : *
3387 : * grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3)
3388 : *
3389 : * And per (1) we have:
3390 : *
3391 : * ge->avg.runnable_avg == grq->avg.runnable_avg
3392 : *
3393 : * Which gives:
3394 : *
3395 : * ge->load.weight * grq->avg.load_avg
3396 : * ge->avg.load_avg = ----------------------------------- (4)
3397 : * grq->load.weight
3398 : *
3399 : * Except that is wrong!
3400 : *
3401 : * Because while for entities historical weight is not important and we
3402 : * really only care about our future and therefore can consider a pure
3403 : * runnable sum, runqueues can NOT do this.
3404 : *
3405 : * We specifically want runqueues to have a load_avg that includes
3406 : * historical weights. Those represent the blocked load, the load we expect
3407 : * to (shortly) return to us. This only works by keeping the weights as
3408 : * integral part of the sum. We therefore cannot decompose as per (3).
3409 : *
3410 : * Another reason this doesn't work is that runnable isn't a 0-sum entity.
3411 : * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
3412 : * rq itself is runnable anywhere between 2/3 and 1 depending on how the
3413 : * runnable section of these tasks overlap (or not). If they were to perfectly
3414 : * align the rq as a whole would be runnable 2/3 of the time. If however we
3415 : * always have at least 1 runnable task, the rq as a whole is always runnable.
3416 : *
3417 : * So we'll have to approximate.. :/
3418 : *
3419 : * Given the constraint:
3420 : *
3421 : * ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
3422 : *
3423 : * We can construct a rule that adds runnable to a rq by assuming minimal
3424 : * overlap.
3425 : *
3426 : * On removal, we'll assume each task is equally runnable; which yields:
3427 : *
3428 : * grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
3429 : *
3430 : * XXX: only do this for the part of runnable > running ?
3431 : *
3432 : */
3433 :
3434 : static inline void
3435 : update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3436 : {
3437 : long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
3438 : u32 divider;
3439 :
3440 : /* Nothing to update */
3441 : if (!delta)
3442 : return;
3443 :
3444 : /*
3445 : * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3446 : * See ___update_load_avg() for details.
3447 : */
3448 : divider = get_pelt_divider(&cfs_rq->avg);
3449 :
3450 : /* Set new sched_entity's utilization */
3451 : se->avg.util_avg = gcfs_rq->avg.util_avg;
3452 : se->avg.util_sum = se->avg.util_avg * divider;
3453 :
3454 : /* Update parent cfs_rq utilization */
3455 : add_positive(&cfs_rq->avg.util_avg, delta);
3456 : cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
3457 : }
3458 :
3459 : static inline void
3460 : update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3461 : {
3462 : long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
3463 : u32 divider;
3464 :
3465 : /* Nothing to update */
3466 : if (!delta)
3467 : return;
3468 :
3469 : /*
3470 : * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3471 : * See ___update_load_avg() for details.
3472 : */
3473 : divider = get_pelt_divider(&cfs_rq->avg);
3474 :
3475 : /* Set new sched_entity's runnable */
3476 : se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
3477 : se->avg.runnable_sum = se->avg.runnable_avg * divider;
3478 :
3479 : /* Update parent cfs_rq runnable */
3480 : add_positive(&cfs_rq->avg.runnable_avg, delta);
3481 : cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
3482 : }
3483 :
3484 : static inline void
3485 : update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3486 : {
3487 : long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
3488 : unsigned long load_avg;
3489 : u64 load_sum = 0;
3490 : s64 delta_sum;
3491 : u32 divider;
3492 :
3493 : if (!runnable_sum)
3494 : return;
3495 :
3496 : gcfs_rq->prop_runnable_sum = 0;
3497 :
3498 : /*
3499 : * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3500 : * See ___update_load_avg() for details.
3501 : */
3502 : divider = get_pelt_divider(&cfs_rq->avg);
3503 :
3504 : if (runnable_sum >= 0) {
3505 : /*
3506 : * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
3507 : * the CPU is saturated running == runnable.
3508 : */
3509 : runnable_sum += se->avg.load_sum;
3510 : runnable_sum = min_t(long, runnable_sum, divider);
3511 : } else {
3512 : /*
3513 : * Estimate the new unweighted runnable_sum of the gcfs_rq by
3514 : * assuming all tasks are equally runnable.
3515 : */
3516 : if (scale_load_down(gcfs_rq->load.weight)) {
3517 : load_sum = div_s64(gcfs_rq->avg.load_sum,
3518 : scale_load_down(gcfs_rq->load.weight));
3519 : }
3520 :
3521 : /* But make sure to not inflate se's runnable */
3522 : runnable_sum = min(se->avg.load_sum, load_sum);
3523 : }
3524 :
3525 : /*
3526 : * runnable_sum can't be lower than running_sum
3527 : * Rescale running sum to be in the same range as runnable sum
3528 : * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT]
3529 : * runnable_sum is in [0 : LOAD_AVG_MAX]
3530 : */
3531 : running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
3532 : runnable_sum = max(runnable_sum, running_sum);
3533 :
3534 : load_sum = (s64)se_weight(se) * runnable_sum;
3535 : load_avg = div_s64(load_sum, divider);
3536 :
3537 : delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
3538 : delta_avg = load_avg - se->avg.load_avg;
3539 :
3540 : se->avg.load_sum = runnable_sum;
3541 : se->avg.load_avg = load_avg;
3542 : add_positive(&cfs_rq->avg.load_avg, delta_avg);
3543 : add_positive(&cfs_rq->avg.load_sum, delta_sum);
3544 : }
3545 :
3546 : static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
3547 : {
3548 : cfs_rq->propagate = 1;
3549 : cfs_rq->prop_runnable_sum += runnable_sum;
3550 : }
3551 :
3552 : /* Update task and its cfs_rq load average */
3553 : static inline int propagate_entity_load_avg(struct sched_entity *se)
3554 : {
3555 : struct cfs_rq *cfs_rq, *gcfs_rq;
3556 :
3557 : if (entity_is_task(se))
3558 : return 0;
3559 :
3560 : gcfs_rq = group_cfs_rq(se);
3561 : if (!gcfs_rq->propagate)
3562 : return 0;
3563 :
3564 : gcfs_rq->propagate = 0;
3565 :
3566 : cfs_rq = cfs_rq_of(se);
3567 :
3568 : add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
3569 :
3570 : update_tg_cfs_util(cfs_rq, se, gcfs_rq);
3571 : update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
3572 : update_tg_cfs_load(cfs_rq, se, gcfs_rq);
3573 :
3574 : trace_pelt_cfs_tp(cfs_rq);
3575 : trace_pelt_se_tp(se);
3576 :
3577 : return 1;
3578 : }
3579 :
3580 : /*
3581 : * Check if we need to update the load and the utilization of a blocked
3582 : * group_entity:
3583 : */
3584 : static inline bool skip_blocked_update(struct sched_entity *se)
3585 : {
3586 : struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3587 :
3588 : /*
3589 : * If sched_entity still have not zero load or utilization, we have to
3590 : * decay it:
3591 : */
3592 : if (se->avg.load_avg || se->avg.util_avg)
3593 : return false;
3594 :
3595 : /*
3596 : * If there is a pending propagation, we have to update the load and
3597 : * the utilization of the sched_entity:
3598 : */
3599 : if (gcfs_rq->propagate)
3600 : return false;
3601 :
3602 : /*
3603 : * Otherwise, the load and the utilization of the sched_entity is
3604 : * already zero and there is no pending propagation, so it will be a
3605 : * waste of time to try to decay it:
3606 : */
3607 : return true;
3608 : }
3609 :
3610 : #else /* CONFIG_FAIR_GROUP_SCHED */
3611 :
3612 2872 : static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
3613 :
3614 75448 : static inline int propagate_entity_load_avg(struct sched_entity *se)
3615 : {
3616 75448 : return 0;
3617 : }
3618 :
3619 3879 : static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
3620 :
3621 : #endif /* CONFIG_FAIR_GROUP_SCHED */
3622 :
3623 : /**
3624 : * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
3625 : * @now: current time, as per cfs_rq_clock_pelt()
3626 : * @cfs_rq: cfs_rq to update
3627 : *
3628 : * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
3629 : * avg. The immediate corollary is that all (fair) tasks must be attached, see
3630 : * post_init_entity_util_avg().
3631 : *
3632 : * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
3633 : *
3634 : * Returns true if the load decayed or we removed load.
3635 : *
3636 : * Since both these conditions indicate a changed cfs_rq->avg.load we should
3637 : * call update_tg_load_avg() when this function returns true.
3638 : */
3639 : static inline int
3640 87683 : update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3641 : {
3642 87683 : unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
3643 87683 : struct sched_avg *sa = &cfs_rq->avg;
3644 87683 : int decayed = 0;
3645 :
3646 87683 : if (cfs_rq->removed.nr) {
3647 1005 : unsigned long r;
3648 1005 : u32 divider = get_pelt_divider(&cfs_rq->avg);
3649 :
3650 1005 : raw_spin_lock(&cfs_rq->removed.lock);
3651 1006 : swap(cfs_rq->removed.util_avg, removed_util);
3652 1006 : swap(cfs_rq->removed.load_avg, removed_load);
3653 1006 : swap(cfs_rq->removed.runnable_avg, removed_runnable);
3654 1006 : cfs_rq->removed.nr = 0;
3655 1006 : raw_spin_unlock(&cfs_rq->removed.lock);
3656 :
3657 1005 : r = removed_load;
3658 1005 : sub_positive(&sa->load_avg, r);
3659 1005 : sub_positive(&sa->load_sum, r * divider);
3660 :
3661 1005 : r = removed_util;
3662 1005 : sub_positive(&sa->util_avg, r);
3663 1005 : sub_positive(&sa->util_sum, r * divider);
3664 :
3665 1005 : r = removed_runnable;
3666 1005 : sub_positive(&sa->runnable_avg, r);
3667 1005 : sub_positive(&sa->runnable_sum, r * divider);
3668 :
3669 : /*
3670 : * removed_runnable is the unweighted version of removed_load so we
3671 : * can use it to estimate removed_load_sum.
3672 : */
3673 1005 : add_tg_cfs_propagate(cfs_rq,
3674 1005 : -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);
3675 :
3676 1005 : decayed = 1;
3677 : }
3678 :
3679 87683 : decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
3680 :
3681 : #ifndef CONFIG_64BIT
3682 : smp_wmb();
3683 : cfs_rq->load_last_update_time_copy = sa->last_update_time;
3684 : #endif
3685 :
3686 87651 : return decayed;
3687 : }
3688 :
3689 : /**
3690 : * attach_entity_load_avg - attach this entity to its cfs_rq load avg
3691 : * @cfs_rq: cfs_rq to attach to
3692 : * @se: sched_entity to attach
3693 : *
3694 : * Must call update_cfs_rq_load_avg() before this, since we rely on
3695 : * cfs_rq->avg.last_update_time being current.
3696 : */
3697 1982 : static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3698 : {
3699 : /*
3700 : * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3701 : * See ___update_load_avg() for details.
3702 : */
3703 1982 : u32 divider = get_pelt_divider(&cfs_rq->avg);
3704 :
3705 : /*
3706 : * When we attach the @se to the @cfs_rq, we must align the decay
3707 : * window because without that, really weird and wonderful things can
3708 : * happen.
3709 : *
3710 : * XXX illustrate
3711 : */
3712 1982 : se->avg.last_update_time = cfs_rq->avg.last_update_time;
3713 1982 : se->avg.period_contrib = cfs_rq->avg.period_contrib;
3714 :
3715 : /*
3716 : * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
3717 : * period_contrib. This isn't strictly correct, but since we're
3718 : * entirely outside of the PELT hierarchy, nobody cares if we truncate
3719 : * _sum a little.
3720 : */
3721 1982 : se->avg.util_sum = se->avg.util_avg * divider;
3722 :
3723 1982 : se->avg.runnable_sum = se->avg.runnable_avg * divider;
3724 :
3725 1982 : se->avg.load_sum = divider;
3726 3964 : if (se_weight(se)) {
3727 1982 : se->avg.load_sum =
3728 1982 : div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
3729 : }
3730 :
3731 1982 : enqueue_load_avg(cfs_rq, se);
3732 1982 : cfs_rq->avg.util_avg += se->avg.util_avg;
3733 1982 : cfs_rq->avg.util_sum += se->avg.util_sum;
3734 1982 : cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
3735 1982 : cfs_rq->avg.runnable_sum += se->avg.runnable_sum;
3736 :
3737 1982 : add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
3738 :
3739 1982 : cfs_rq_util_change(cfs_rq, 0);
3740 :
3741 1982 : trace_pelt_cfs_tp(cfs_rq);
3742 1981 : }
3743 :
3744 : /**
3745 : * detach_entity_load_avg - detach this entity from its cfs_rq load avg
3746 : * @cfs_rq: cfs_rq to detach from
3747 : * @se: sched_entity to detach
3748 : *
3749 : * Must call update_cfs_rq_load_avg() before this, since we rely on
3750 : * cfs_rq->avg.last_update_time being current.
3751 : */
3752 892 : static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3753 : {
3754 892 : dequeue_load_avg(cfs_rq, se);
3755 892 : sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3756 892 : sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3757 892 : sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
3758 892 : sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
3759 :
3760 892 : add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
3761 :
3762 892 : cfs_rq_util_change(cfs_rq, 0);
3763 :
3764 892 : trace_pelt_cfs_tp(cfs_rq);
3765 892 : }
3766 :
3767 : /*
3768 : * Optional action to be done while updating the load average
3769 : */
3770 : #define UPDATE_TG 0x1
3771 : #define SKIP_AGE_LOAD 0x2
3772 : #define DO_ATTACH 0x4
3773 :
3774 : /* Update task and its cfs_rq load average */
3775 75517 : static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3776 : {
3777 75517 : u64 now = cfs_rq_clock_pelt(cfs_rq);
3778 75605 : int decayed;
3779 :
3780 : /*
3781 : * Track task load average for carrying it to new CPU after migrated, and
3782 : * track group sched_entity load average for task_h_load calc in migration
3783 : */
3784 75605 : if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
3785 73647 : __update_load_avg_se(now, cfs_rq, se);
3786 :
3787 75365 : decayed = update_cfs_rq_load_avg(now, cfs_rq);
3788 75448 : decayed |= propagate_entity_load_avg(se);
3789 :
3790 75448 : if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
3791 :
3792 : /*
3793 : * DO_ATTACH means we're here from enqueue_entity().
3794 : * !last_update_time means we've passed through
3795 : * migrate_task_rq_fair() indicating we migrated.
3796 : *
3797 : * IOW we're enqueueing a task on a new CPU.
3798 : */
3799 995 : attach_entity_load_avg(cfs_rq, se);
3800 995 : update_tg_load_avg(cfs_rq);
3801 :
3802 : } else if (decayed) {
3803 : cfs_rq_util_change(cfs_rq, 0);
3804 :
3805 : if (flags & UPDATE_TG)
3806 75447 : update_tg_load_avg(cfs_rq);
3807 : }
3808 75447 : }
3809 :
3810 : #ifndef CONFIG_64BIT
3811 : static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3812 : {
3813 : u64 last_update_time_copy;
3814 : u64 last_update_time;
3815 :
3816 : do {
3817 : last_update_time_copy = cfs_rq->load_last_update_time_copy;
3818 : smp_rmb();
3819 : last_update_time = cfs_rq->avg.last_update_time;
3820 : } while (last_update_time != last_update_time_copy);
3821 :
3822 : return last_update_time;
3823 : }
3824 : #else
3825 1642 : static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3826 : {
3827 1642 : return cfs_rq->avg.last_update_time;
3828 : }
3829 : #endif
3830 :
3831 : /*
3832 : * Synchronize entity load avg of dequeued entity without locking
3833 : * the previous rq.
3834 : */
3835 1642 : static void sync_entity_load_avg(struct sched_entity *se)
3836 : {
3837 1642 : struct cfs_rq *cfs_rq = cfs_rq_of(se);
3838 1642 : u64 last_update_time;
3839 :
3840 1642 : last_update_time = cfs_rq_last_update_time(cfs_rq);
3841 1642 : __update_load_avg_blocked_se(last_update_time, se);
3842 1642 : }
3843 :
3844 : /*
3845 : * Task first catches up with cfs_rq, and then subtract
3846 : * itself from the cfs_rq (task must be off the queue now).
3847 : */
3848 1017 : static void remove_entity_load_avg(struct sched_entity *se)
3849 : {
3850 1017 : struct cfs_rq *cfs_rq = cfs_rq_of(se);
3851 1017 : unsigned long flags;
3852 :
3853 : /*
3854 : * tasks cannot exit without having gone through wake_up_new_task() ->
3855 : * post_init_entity_util_avg() which will have added things to the
3856 : * cfs_rq, so we can remove unconditionally.
3857 : */
3858 :
3859 1017 : sync_entity_load_avg(se);
3860 :
3861 1017 : raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
3862 1017 : ++cfs_rq->removed.nr;
3863 1017 : cfs_rq->removed.util_avg += se->avg.util_avg;
3864 1017 : cfs_rq->removed.load_avg += se->avg.load_avg;
3865 1017 : cfs_rq->removed.runnable_avg += se->avg.runnable_avg;
3866 1017 : raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
3867 1017 : }
3868 :
3869 43262 : static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
3870 : {
3871 43262 : return cfs_rq->avg.runnable_avg;
3872 : }
3873 :
3874 43863 : static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3875 : {
3876 43863 : return cfs_rq->avg.load_avg;
3877 : }
3878 :
3879 : static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
3880 :
3881 22274 : static inline unsigned long task_util(struct task_struct *p)
3882 : {
3883 22274 : return READ_ONCE(p->se.avg.util_avg);
3884 : }
3885 :
3886 33622 : static inline unsigned long _task_util_est(struct task_struct *p)
3887 : {
3888 33622 : struct util_est ue = READ_ONCE(p->se.avg.util_est);
3889 :
3890 33622 : return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
3891 : }
3892 :
3893 1465 : static inline unsigned long task_util_est(struct task_struct *p)
3894 : {
3895 1465 : return max(task_util(p), _task_util_est(p));
3896 : }
3897 :
3898 : #ifdef CONFIG_UCLAMP_TASK
3899 : static inline unsigned long uclamp_task_util(struct task_struct *p)
3900 : {
3901 : return clamp(task_util_est(p),
3902 : uclamp_eff_value(p, UCLAMP_MIN),
3903 : uclamp_eff_value(p, UCLAMP_MAX));
3904 : }
3905 : #else
3906 0 : static inline unsigned long uclamp_task_util(struct task_struct *p)
3907 : {
3908 0 : return task_util_est(p);
3909 : }
3910 : #endif
3911 :
3912 15763 : static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
3913 : struct task_struct *p)
3914 : {
3915 15763 : unsigned int enqueued;
3916 :
3917 15763 : if (!sched_feat(UTIL_EST))
3918 : return;
3919 :
3920 : /* Update root cfs_rq's estimated utilization */
3921 15763 : enqueued = cfs_rq->avg.util_est.enqueued;
3922 15763 : enqueued += _task_util_est(p);
3923 15763 : WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
3924 :
3925 15763 : trace_sched_util_est_cfs_tp(cfs_rq);
3926 : }
3927 :
3928 15769 : static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
3929 : struct task_struct *p)
3930 : {
3931 15769 : unsigned int enqueued;
3932 :
3933 15769 : if (!sched_feat(UTIL_EST))
3934 : return;
3935 :
3936 : /* Update root cfs_rq's estimated utilization */
3937 15769 : enqueued = cfs_rq->avg.util_est.enqueued;
3938 15769 : enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
3939 15769 : WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
3940 :
3941 15769 : trace_sched_util_est_cfs_tp(cfs_rq);
3942 : }
3943 :
3944 : /*
3945 : * Check if a (signed) value is within a specified (unsigned) margin,
3946 : * based on the observation that:
3947 : *
3948 : * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
3949 : *
3950 : * NOTE: this only works when value + maring < INT_MAX.
3951 : */
3952 10516 : static inline bool within_margin(int value, int margin)
3953 : {
3954 10516 : return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
3955 : }
3956 :
3957 15768 : static inline void util_est_update(struct cfs_rq *cfs_rq,
3958 : struct task_struct *p,
3959 : bool task_sleep)
3960 : {
3961 15768 : long last_ewma_diff;
3962 15768 : struct util_est ue;
3963 :
3964 15768 : if (!sched_feat(UTIL_EST))
3965 15767 : return;
3966 :
3967 : /*
3968 : * Skip update of task's estimated utilization when the task has not
3969 : * yet completed an activation, e.g. being migrated.
3970 : */
3971 15768 : if (!task_sleep)
3972 : return;
3973 :
3974 : /*
3975 : * If the PELT values haven't changed since enqueue time,
3976 : * skip the util_est update.
3977 : */
3978 14866 : ue = p->se.avg.util_est;
3979 14866 : if (ue.enqueued & UTIL_AVG_UNCHANGED)
3980 : return;
3981 :
3982 : /*
3983 : * Reset EWMA on utilization increases, the moving average is used only
3984 : * to smooth utilization decreases.
3985 : */
3986 14160 : ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
3987 14160 : if (sched_feat(UTIL_EST_FASTUP)) {
3988 14160 : if (ue.ewma < ue.enqueued) {
3989 3644 : ue.ewma = ue.enqueued;
3990 3644 : goto done;
3991 : }
3992 : }
3993 :
3994 : /*
3995 : * Skip update of task's estimated utilization when its EWMA is
3996 : * already ~1% close to its last activation value.
3997 : */
3998 10516 : last_ewma_diff = ue.enqueued - ue.ewma;
3999 10516 : if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
4000 : return;
4001 :
4002 : /*
4003 : * To avoid overestimation of actual task utilization, skip updates if
4004 : * we cannot grant there is idle time in this CPU.
4005 : */
4006 6024 : if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
4007 : return;
4008 :
4009 : /*
4010 : * Update Task's estimated utilization
4011 : *
4012 : * When *p completes an activation we can consolidate another sample
4013 : * of the task size. This is done by storing the current PELT value
4014 : * as ue.enqueued and by using this value to update the Exponential
4015 : * Weighted Moving Average (EWMA):
4016 : *
4017 : * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
4018 : * = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
4019 : * = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
4020 : * = w * ( last_ewma_diff ) + ewma(t-1)
4021 : * = w * (last_ewma_diff + ewma(t-1) / w)
4022 : *
4023 : * Where 'w' is the weight of new samples, which is configured to be
4024 : * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
4025 : */
4026 6024 : ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
4027 6024 : ue.ewma += last_ewma_diff;
4028 6024 : ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
4029 9668 : done:
4030 9668 : WRITE_ONCE(p->se.avg.util_est, ue);
4031 :
4032 9668 : trace_sched_util_est_se_tp(&p->se);
4033 : }
4034 :
4035 0 : static inline int task_fits_capacity(struct task_struct *p, long capacity)
4036 : {
4037 0 : return fits_capacity(uclamp_task_util(p), capacity);
4038 : }
4039 :
4040 43208 : static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
4041 : {
4042 43208 : if (!static_branch_unlikely(&sched_asym_cpucapacity))
4043 : return;
4044 :
4045 0 : if (!p || p->nr_cpus_allowed == 1) {
4046 0 : rq->misfit_task_load = 0;
4047 0 : return;
4048 : }
4049 :
4050 0 : if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
4051 0 : rq->misfit_task_load = 0;
4052 0 : return;
4053 : }
4054 :
4055 : /*
4056 : * Make sure that misfit_task_load will not be null even if
4057 : * task_h_load() returns 0.
4058 : */
4059 0 : rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
4060 : }
4061 :
4062 : #else /* CONFIG_SMP */
4063 :
4064 : #define UPDATE_TG 0x0
4065 : #define SKIP_AGE_LOAD 0x0
4066 : #define DO_ATTACH 0x0
4067 :
4068 : static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
4069 : {
4070 : cfs_rq_util_change(cfs_rq, 0);
4071 : }
4072 :
4073 : static inline void remove_entity_load_avg(struct sched_entity *se) {}
4074 :
4075 : static inline void
4076 : attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4077 : static inline void
4078 : detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4079 :
4080 : static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
4081 : {
4082 : return 0;
4083 : }
4084 :
4085 : static inline void
4086 : util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
4087 :
4088 : static inline void
4089 : util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
4090 :
4091 : static inline void
4092 : util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
4093 : bool task_sleep) {}
4094 : static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
4095 :
4096 : #endif /* CONFIG_SMP */
4097 :
4098 36854 : static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
4099 : {
4100 : #ifdef CONFIG_SCHED_DEBUG
4101 : s64 d = se->vruntime - cfs_rq->min_vruntime;
4102 :
4103 : if (d < 0)
4104 : d = -d;
4105 :
4106 : if (d > 3*sysctl_sched_latency)
4107 : schedstat_inc(cfs_rq->nr_spread_over);
4108 : #endif
4109 36854 : }
4110 :
4111 : static void
4112 14871 : place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
4113 : {
4114 14871 : u64 vruntime = cfs_rq->min_vruntime;
4115 :
4116 : /*
4117 : * The 'current' period is already promised to the current tasks,
4118 : * however the extra weight of the new task will slow them down a
4119 : * little, place the new task so that it fits in the slot that
4120 : * stays open at the end.
4121 : */
4122 14871 : if (initial && sched_feat(START_DEBIT))
4123 990 : vruntime += sched_vslice(cfs_rq, se);
4124 :
4125 : /* sleeps up to a single latency don't count. */
4126 14871 : if (!initial) {
4127 13874 : unsigned long thresh = sysctl_sched_latency;
4128 :
4129 : /*
4130 : * Halve their sleep time's effect, to allow
4131 : * for a gentler effect of sleepers:
4132 : */
4133 13874 : if (sched_feat(GENTLE_FAIR_SLEEPERS))
4134 13874 : thresh >>= 1;
4135 :
4136 13874 : vruntime -= thresh;
4137 : }
4138 :
4139 : /* ensure we never gain time by being placed backwards. */
4140 14871 : se->vruntime = max_vruntime(se->vruntime, vruntime);
4141 14871 : }
4142 :
4143 : static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
4144 :
4145 15767 : static inline void check_schedstat_required(void)
4146 : {
4147 : #ifdef CONFIG_SCHEDSTATS
4148 : if (schedstat_enabled())
4149 : return;
4150 :
4151 : /* Force schedstat enabled if a dependent tracepoint is active */
4152 : if (trace_sched_stat_wait_enabled() ||
4153 : trace_sched_stat_sleep_enabled() ||
4154 : trace_sched_stat_iowait_enabled() ||
4155 : trace_sched_stat_blocked_enabled() ||
4156 : trace_sched_stat_runtime_enabled()) {
4157 : printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
4158 : "stat_blocked and stat_runtime require the "
4159 : "kernel parameter schedstats=enable or "
4160 : "kernel.sched_schedstats=1\n");
4161 : }
4162 : #endif
4163 15767 : }
4164 :
4165 : static inline bool cfs_bandwidth_used(void);
4166 :
4167 : /*
4168 : * MIGRATION
4169 : *
4170 : * dequeue
4171 : * update_curr()
4172 : * update_min_vruntime()
4173 : * vruntime -= min_vruntime
4174 : *
4175 : * enqueue
4176 : * update_curr()
4177 : * update_min_vruntime()
4178 : * vruntime += min_vruntime
4179 : *
4180 : * this way the vruntime transition between RQs is done when both
4181 : * min_vruntime are up-to-date.
4182 : *
4183 : * WAKEUP (remote)
4184 : *
4185 : * ->migrate_task_rq_fair() (p->state == TASK_WAKING)
4186 : * vruntime -= min_vruntime
4187 : *
4188 : * enqueue
4189 : * update_curr()
4190 : * update_min_vruntime()
4191 : * vruntime += min_vruntime
4192 : *
4193 : * this way we don't have the most up-to-date min_vruntime on the originating
4194 : * CPU and an up-to-date min_vruntime on the destination CPU.
4195 : */
4196 :
4197 : static void
4198 15764 : enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4199 : {
4200 15764 : bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
4201 15764 : bool curr = cfs_rq->curr == se;
4202 :
4203 : /*
4204 : * If we're the current task, we must renormalise before calling
4205 : * update_curr().
4206 : */
4207 15764 : if (renorm && curr)
4208 0 : se->vruntime += cfs_rq->min_vruntime;
4209 :
4210 15764 : update_curr(cfs_rq);
4211 :
4212 : /*
4213 : * Otherwise, renormalise after, such that we're placed at the current
4214 : * moment in time, instead of some random moment in the past. Being
4215 : * placed in the past could significantly boost this task to the
4216 : * fairness detriment of existing tasks.
4217 : */
4218 15769 : if (renorm && !curr)
4219 1997 : se->vruntime += cfs_rq->min_vruntime;
4220 :
4221 : /*
4222 : * When enqueuing a sched_entity, we must:
4223 : * - Update loads to have both entity and cfs_rq synced with now.
4224 : * - Add its load to cfs_rq->runnable_avg
4225 : * - For group_entity, update its weight to reflect the new share of
4226 : * its group cfs_rq
4227 : * - Add its new weight to cfs_rq->load.weight
4228 : */
4229 15769 : update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
4230 15764 : se_update_runnable(se);
4231 15764 : update_cfs_group(se);
4232 15764 : account_entity_enqueue(cfs_rq, se);
4233 :
4234 15764 : if (flags & ENQUEUE_WAKEUP)
4235 13875 : place_entity(cfs_rq, se, 0);
4236 :
4237 15767 : check_schedstat_required();
4238 15767 : update_stats_enqueue(cfs_rq, se, flags);
4239 15767 : check_spread(cfs_rq, se);
4240 15767 : if (!curr)
4241 15767 : __enqueue_entity(cfs_rq, se);
4242 15762 : se->on_rq = 1;
4243 :
4244 : /*
4245 : * When bandwidth control is enabled, cfs might have been removed
4246 : * because of a parent been throttled but cfs->nr_running > 1. Try to
4247 : * add it unconditionnally.
4248 : */
4249 15762 : if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
4250 15762 : list_add_leaf_cfs_rq(cfs_rq);
4251 :
4252 15762 : if (cfs_rq->nr_running == 1)
4253 15762 : check_enqueue_throttle(cfs_rq);
4254 15762 : }
4255 :
4256 0 : static void __clear_buddies_last(struct sched_entity *se)
4257 : {
4258 0 : for_each_sched_entity(se) {
4259 0 : struct cfs_rq *cfs_rq = cfs_rq_of(se);
4260 0 : if (cfs_rq->last != se)
4261 : break;
4262 :
4263 0 : cfs_rq->last = NULL;
4264 : }
4265 0 : }
4266 :
4267 5098 : static void __clear_buddies_next(struct sched_entity *se)
4268 : {
4269 5098 : for_each_sched_entity(se) {
4270 5098 : struct cfs_rq *cfs_rq = cfs_rq_of(se);
4271 5098 : if (cfs_rq->next != se)
4272 : break;
4273 :
4274 5098 : cfs_rq->next = NULL;
4275 : }
4276 5098 : }
4277 :
4278 0 : static void __clear_buddies_skip(struct sched_entity *se)
4279 : {
4280 0 : for_each_sched_entity(se) {
4281 0 : struct cfs_rq *cfs_rq = cfs_rq_of(se);
4282 0 : if (cfs_rq->skip != se)
4283 : break;
4284 :
4285 0 : cfs_rq->skip = NULL;
4286 : }
4287 0 : }
4288 :
4289 37719 : static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
4290 : {
4291 37719 : if (cfs_rq->last == se)
4292 0 : __clear_buddies_last(se);
4293 :
4294 37719 : if (cfs_rq->next == se)
4295 5098 : __clear_buddies_next(se);
4296 :
4297 37719 : if (cfs_rq->skip == se)
4298 0 : __clear_buddies_skip(se);
4299 37719 : }
4300 :
4301 : static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4302 :
4303 : static void
4304 15767 : dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4305 : {
4306 : /*
4307 : * Update run-time statistics of the 'current'.
4308 : */
4309 15767 : update_curr(cfs_rq);
4310 :
4311 : /*
4312 : * When dequeuing a sched_entity, we must:
4313 : * - Update loads to have both entity and cfs_rq synced with now.
4314 : * - Subtract its load from the cfs_rq->runnable_avg.
4315 : * - Subtract its previous weight from cfs_rq->load.weight.
4316 : * - For group entity, update its weight to reflect the new share
4317 : * of its group cfs_rq.
4318 : */
4319 15768 : update_load_avg(cfs_rq, se, UPDATE_TG);
4320 15768 : se_update_runnable(se);
4321 :
4322 15768 : update_stats_dequeue(cfs_rq, se, flags);
4323 :
4324 15768 : clear_buddies(cfs_rq, se);
4325 :
4326 15768 : if (se != cfs_rq->curr)
4327 888 : __dequeue_entity(cfs_rq, se);
4328 15767 : se->on_rq = 0;
4329 15767 : account_entity_dequeue(cfs_rq, se);
4330 :
4331 : /*
4332 : * Normalize after update_curr(); which will also have moved
4333 : * min_vruntime if @se is the one holding it back. But before doing
4334 : * update_min_vruntime() again, which will discount @se's position and
4335 : * can move min_vruntime forward still more.
4336 : */
4337 15767 : if (!(flags & DEQUEUE_SLEEP))
4338 902 : se->vruntime -= cfs_rq->min_vruntime;
4339 :
4340 : /* return excess runtime on last dequeue */
4341 15767 : return_cfs_rq_runtime(cfs_rq);
4342 :
4343 15767 : update_cfs_group(se);
4344 :
4345 : /*
4346 : * Now advance min_vruntime if @se was the entity holding it back,
4347 : * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
4348 : * put back on, and if we advance min_vruntime, we'll be placed back
4349 : * further than we started -- ie. we'll be penalized.
4350 : */
4351 15767 : if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
4352 15752 : update_min_vruntime(cfs_rq);
4353 15768 : }
4354 :
4355 : /*
4356 : * Preempt the current task with a newly woken task if needed:
4357 : */
4358 : static void
4359 4128 : check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4360 : {
4361 4128 : unsigned long ideal_runtime, delta_exec;
4362 4128 : struct sched_entity *se;
4363 4128 : s64 delta;
4364 :
4365 4128 : ideal_runtime = sched_slice(cfs_rq, curr);
4366 4169 : delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
4367 4169 : if (delta_exec > ideal_runtime) {
4368 868 : resched_curr(rq_of(cfs_rq));
4369 : /*
4370 : * The current task ran long enough, ensure it doesn't get
4371 : * re-elected due to buddy favours.
4372 : */
4373 874 : clear_buddies(cfs_rq, curr);
4374 874 : return;
4375 : }
4376 :
4377 : /*
4378 : * Ensure that a task that missed wakeup preemption by a
4379 : * narrow margin doesn't have to wait for a full slice.
4380 : * This also mitigates buddy induced latencies under load.
4381 : */
4382 3301 : if (delta_exec < sysctl_sched_min_granularity)
4383 : return;
4384 :
4385 1942 : se = __pick_first_entity(cfs_rq);
4386 1942 : delta = curr->vruntime - se->vruntime;
4387 :
4388 1942 : if (delta < 0)
4389 : return;
4390 :
4391 1069 : if (delta > ideal_runtime)
4392 92 : resched_curr(rq_of(cfs_rq));
4393 : }
4394 :
4395 : static void
4396 21088 : set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4397 : {
4398 : /* 'current' is not kept within the tree. */
4399 21088 : if (se->on_rq) {
4400 : /*
4401 : * Any task has to be enqueued before it get to execute on
4402 : * a CPU. So account for the time it spent waiting on the
4403 : * runqueue.
4404 : */
4405 21093 : update_stats_wait_end(cfs_rq, se);
4406 21093 : __dequeue_entity(cfs_rq, se);
4407 21090 : update_load_avg(cfs_rq, se, UPDATE_TG);
4408 : }
4409 :
4410 21087 : update_stats_curr_start(cfs_rq, se);
4411 21092 : cfs_rq->curr = se;
4412 :
4413 : /*
4414 : * Track our maximum slice length, if the CPU's load is at
4415 : * least twice that of our own weight (i.e. dont track it
4416 : * when there are only lesser-weight tasks around):
4417 : */
4418 21092 : if (schedstat_enabled() &&
4419 : rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
4420 21092 : schedstat_set(se->statistics.slice_max,
4421 : max((u64)schedstat_val(se->statistics.slice_max),
4422 : se->sum_exec_runtime - se->prev_sum_exec_runtime));
4423 : }
4424 :
4425 21092 : se->prev_sum_exec_runtime = se->sum_exec_runtime;
4426 21092 : }
4427 :
4428 : static int
4429 : wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
4430 :
4431 : /*
4432 : * Pick the next process, keeping these things in mind, in this order:
4433 : * 1) keep things fair between processes/task groups
4434 : * 2) pick the "next" process, since someone really wants that to run
4435 : * 3) pick the "last" process, for cache locality
4436 : * 4) do not run the "skip" process, if something else is available
4437 : */
4438 : static struct sched_entity *
4439 21075 : pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4440 : {
4441 21075 : struct sched_entity *left = __pick_first_entity(cfs_rq);
4442 21075 : struct sched_entity *se;
4443 :
4444 : /*
4445 : * If curr is set we have to see if its left of the leftmost entity
4446 : * still in the tree, provided there was anything in the tree at all.
4447 : */
4448 21075 : if (!left || (curr && entity_before(curr, left)))
4449 : left = curr;
4450 :
4451 21075 : se = left; /* ideally we run the leftmost entity */
4452 :
4453 : /*
4454 : * Avoid running the skip buddy, if running something else can
4455 : * be done without getting too unfair.
4456 : */
4457 21075 : if (cfs_rq->skip == se) {
4458 0 : struct sched_entity *second;
4459 :
4460 0 : if (se == curr) {
4461 0 : second = __pick_first_entity(cfs_rq);
4462 : } else {
4463 0 : second = __pick_next_entity(se);
4464 0 : if (!second || (curr && entity_before(curr, second)))
4465 : second = curr;
4466 : }
4467 :
4468 0 : if (second && wakeup_preempt_entity(second, left) < 1)
4469 0 : se = second;
4470 : }
4471 :
4472 21075 : if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
4473 : /*
4474 : * Someone really wants this to run. If it's not unfair, run it.
4475 : */
4476 4936 : se = cfs_rq->next;
4477 16139 : } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
4478 : /*
4479 : * Prefer last buddy, try to return the CPU to a preempted task.
4480 : */
4481 0 : se = cfs_rq->last;
4482 : }
4483 :
4484 21075 : clear_buddies(cfs_rq, se);
4485 :
4486 21077 : return se;
4487 : }
4488 :
4489 : static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4490 :
4491 21087 : static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
4492 : {
4493 : /*
4494 : * If still on the runqueue then deactivate_task()
4495 : * was not called and update_curr() has to be done:
4496 : */
4497 21087 : if (prev->on_rq)
4498 6213 : update_curr(cfs_rq);
4499 :
4500 : /* throttle cfs_rqs exceeding runtime */
4501 21087 : check_cfs_rq_runtime(cfs_rq);
4502 :
4503 21087 : check_spread(cfs_rq, prev);
4504 :
4505 21087 : if (prev->on_rq) {
4506 6213 : update_stats_wait_start(cfs_rq, prev);
4507 : /* Put 'current' back into the tree. */
4508 6213 : __enqueue_entity(cfs_rq, prev);
4509 : /* in !on_rq case, update occurred at dequeue */
4510 6213 : update_load_avg(cfs_rq, prev, 0);
4511 : }
4512 21087 : cfs_rq->curr = NULL;
4513 21087 : }
4514 :
4515 : static void
4516 14657 : entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
4517 : {
4518 : /*
4519 : * Update run-time statistics of the 'current'.
4520 : */
4521 14657 : update_curr(cfs_rq);
4522 :
4523 : /*
4524 : * Ensure that runnable average is periodically updated.
4525 : */
4526 14847 : update_load_avg(cfs_rq, curr, UPDATE_TG);
4527 14755 : update_cfs_group(curr);
4528 :
4529 : #ifdef CONFIG_SCHED_HRTICK
4530 : /*
4531 : * queued ticks are scheduled to match the slice, so don't bother
4532 : * validating it and just reschedule.
4533 : */
4534 : if (queued) {
4535 : resched_curr(rq_of(cfs_rq));
4536 : return;
4537 : }
4538 : /*
4539 : * don't let the period tick interfere with the hrtick preemption
4540 : */
4541 : if (!sched_feat(DOUBLE_TICK) &&
4542 : hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
4543 : return;
4544 : #endif
4545 :
4546 14755 : if (cfs_rq->nr_running > 1)
4547 4142 : check_preempt_tick(cfs_rq, curr);
4548 14763 : }
4549 :
4550 :
4551 : /**************************************************
4552 : * CFS bandwidth control machinery
4553 : */
4554 :
4555 : #ifdef CONFIG_CFS_BANDWIDTH
4556 :
4557 : #ifdef CONFIG_JUMP_LABEL
4558 : static struct static_key __cfs_bandwidth_used;
4559 :
4560 : static inline bool cfs_bandwidth_used(void)
4561 : {
4562 : return static_key_false(&__cfs_bandwidth_used);
4563 : }
4564 :
4565 : void cfs_bandwidth_usage_inc(void)
4566 : {
4567 : static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
4568 : }
4569 :
4570 : void cfs_bandwidth_usage_dec(void)
4571 : {
4572 : static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
4573 : }
4574 : #else /* CONFIG_JUMP_LABEL */
4575 : static bool cfs_bandwidth_used(void)
4576 : {
4577 : return true;
4578 : }
4579 :
4580 : void cfs_bandwidth_usage_inc(void) {}
4581 : void cfs_bandwidth_usage_dec(void) {}
4582 : #endif /* CONFIG_JUMP_LABEL */
4583 :
4584 : /*
4585 : * default period for cfs group bandwidth.
4586 : * default: 0.1s, units: nanoseconds
4587 : */
4588 : static inline u64 default_cfs_period(void)
4589 : {
4590 : return 100000000ULL;
4591 : }
4592 :
4593 : static inline u64 sched_cfs_bandwidth_slice(void)
4594 : {
4595 : return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
4596 : }
4597 :
4598 : /*
4599 : * Replenish runtime according to assigned quota. We use sched_clock_cpu
4600 : * directly instead of rq->clock to avoid adding additional synchronization
4601 : * around rq->lock.
4602 : *
4603 : * requires cfs_b->lock
4604 : */
4605 : void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
4606 : {
4607 : if (cfs_b->quota != RUNTIME_INF)
4608 : cfs_b->runtime = cfs_b->quota;
4609 : }
4610 :
4611 : static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4612 : {
4613 : return &tg->cfs_bandwidth;
4614 : }
4615 :
4616 : /* returns 0 on failure to allocate runtime */
4617 : static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
4618 : struct cfs_rq *cfs_rq, u64 target_runtime)
4619 : {
4620 : u64 min_amount, amount = 0;
4621 :
4622 : lockdep_assert_held(&cfs_b->lock);
4623 :
4624 : /* note: this is a positive sum as runtime_remaining <= 0 */
4625 : min_amount = target_runtime - cfs_rq->runtime_remaining;
4626 :
4627 : if (cfs_b->quota == RUNTIME_INF)
4628 : amount = min_amount;
4629 : else {
4630 : start_cfs_bandwidth(cfs_b);
4631 :
4632 : if (cfs_b->runtime > 0) {
4633 : amount = min(cfs_b->runtime, min_amount);
4634 : cfs_b->runtime -= amount;
4635 : cfs_b->idle = 0;
4636 : }
4637 : }
4638 :
4639 : cfs_rq->runtime_remaining += amount;
4640 :
4641 : return cfs_rq->runtime_remaining > 0;
4642 : }
4643 :
4644 : /* returns 0 on failure to allocate runtime */
4645 : static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4646 : {
4647 : struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4648 : int ret;
4649 :
4650 : raw_spin_lock(&cfs_b->lock);
4651 : ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
4652 : raw_spin_unlock(&cfs_b->lock);
4653 :
4654 : return ret;
4655 : }
4656 :
4657 : static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4658 : {
4659 : /* dock delta_exec before expiring quota (as it could span periods) */
4660 : cfs_rq->runtime_remaining -= delta_exec;
4661 :
4662 : if (likely(cfs_rq->runtime_remaining > 0))
4663 : return;
4664 :
4665 : if (cfs_rq->throttled)
4666 : return;
4667 : /*
4668 : * if we're unable to extend our runtime we resched so that the active
4669 : * hierarchy can be throttled
4670 : */
4671 : if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
4672 : resched_curr(rq_of(cfs_rq));
4673 : }
4674 :
4675 : static __always_inline
4676 : void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4677 : {
4678 : if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
4679 : return;
4680 :
4681 : __account_cfs_rq_runtime(cfs_rq, delta_exec);
4682 : }
4683 :
4684 : static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4685 : {
4686 : return cfs_bandwidth_used() && cfs_rq->throttled;
4687 : }
4688 :
4689 : /* check whether cfs_rq, or any parent, is throttled */
4690 : static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4691 : {
4692 : return cfs_bandwidth_used() && cfs_rq->throttle_count;
4693 : }
4694 :
4695 : /*
4696 : * Ensure that neither of the group entities corresponding to src_cpu or
4697 : * dest_cpu are members of a throttled hierarchy when performing group
4698 : * load-balance operations.
4699 : */
4700 : static inline int throttled_lb_pair(struct task_group *tg,
4701 : int src_cpu, int dest_cpu)
4702 : {
4703 : struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
4704 :
4705 : src_cfs_rq = tg->cfs_rq[src_cpu];
4706 : dest_cfs_rq = tg->cfs_rq[dest_cpu];
4707 :
4708 : return throttled_hierarchy(src_cfs_rq) ||
4709 : throttled_hierarchy(dest_cfs_rq);
4710 : }
4711 :
4712 : static int tg_unthrottle_up(struct task_group *tg, void *data)
4713 : {
4714 : struct rq *rq = data;
4715 : struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4716 :
4717 : cfs_rq->throttle_count--;
4718 : if (!cfs_rq->throttle_count) {
4719 : cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
4720 : cfs_rq->throttled_clock_task;
4721 :
4722 : /* Add cfs_rq with already running entity in the list */
4723 : if (cfs_rq->nr_running >= 1)
4724 : list_add_leaf_cfs_rq(cfs_rq);
4725 : }
4726 :
4727 : return 0;
4728 : }
4729 :
4730 : static int tg_throttle_down(struct task_group *tg, void *data)
4731 : {
4732 : struct rq *rq = data;
4733 : struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4734 :
4735 : /* group is entering throttled state, stop time */
4736 : if (!cfs_rq->throttle_count) {
4737 : cfs_rq->throttled_clock_task = rq_clock_task(rq);
4738 : list_del_leaf_cfs_rq(cfs_rq);
4739 : }
4740 : cfs_rq->throttle_count++;
4741 :
4742 : return 0;
4743 : }
4744 :
4745 : static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
4746 : {
4747 : struct rq *rq = rq_of(cfs_rq);
4748 : struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4749 : struct sched_entity *se;
4750 : long task_delta, idle_task_delta, dequeue = 1;
4751 :
4752 : raw_spin_lock(&cfs_b->lock);
4753 : /* This will start the period timer if necessary */
4754 : if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
4755 : /*
4756 : * We have raced with bandwidth becoming available, and if we
4757 : * actually throttled the timer might not unthrottle us for an
4758 : * entire period. We additionally needed to make sure that any
4759 : * subsequent check_cfs_rq_runtime calls agree not to throttle
4760 : * us, as we may commit to do cfs put_prev+pick_next, so we ask
4761 : * for 1ns of runtime rather than just check cfs_b.
4762 : */
4763 : dequeue = 0;
4764 : } else {
4765 : list_add_tail_rcu(&cfs_rq->throttled_list,
4766 : &cfs_b->throttled_cfs_rq);
4767 : }
4768 : raw_spin_unlock(&cfs_b->lock);
4769 :
4770 : if (!dequeue)
4771 : return false; /* Throttle no longer required. */
4772 :
4773 : se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
4774 :
4775 : /* freeze hierarchy runnable averages while throttled */
4776 : rcu_read_lock();
4777 : walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
4778 : rcu_read_unlock();
4779 :
4780 : task_delta = cfs_rq->h_nr_running;
4781 : idle_task_delta = cfs_rq->idle_h_nr_running;
4782 : for_each_sched_entity(se) {
4783 : struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4784 : /* throttled entity or throttle-on-deactivate */
4785 : if (!se->on_rq)
4786 : goto done;
4787 :
4788 : dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
4789 :
4790 : qcfs_rq->h_nr_running -= task_delta;
4791 : qcfs_rq->idle_h_nr_running -= idle_task_delta;
4792 :
4793 : if (qcfs_rq->load.weight) {
4794 : /* Avoid re-evaluating load for this entity: */
4795 : se = parent_entity(se);
4796 : break;
4797 : }
4798 : }
4799 :
4800 : for_each_sched_entity(se) {
4801 : struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4802 : /* throttled entity or throttle-on-deactivate */
4803 : if (!se->on_rq)
4804 : goto done;
4805 :
4806 : update_load_avg(qcfs_rq, se, 0);
4807 : se_update_runnable(se);
4808 :
4809 : qcfs_rq->h_nr_running -= task_delta;
4810 : qcfs_rq->idle_h_nr_running -= idle_task_delta;
4811 : }
4812 :
4813 : /* At this point se is NULL and we are at root level*/
4814 : sub_nr_running(rq, task_delta);
4815 :
4816 : done:
4817 : /*
4818 : * Note: distribution will already see us throttled via the
4819 : * throttled-list. rq->lock protects completion.
4820 : */
4821 : cfs_rq->throttled = 1;
4822 : cfs_rq->throttled_clock = rq_clock(rq);
4823 : return true;
4824 : }
4825 :
4826 : void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
4827 : {
4828 : struct rq *rq = rq_of(cfs_rq);
4829 : struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4830 : struct sched_entity *se;
4831 : long task_delta, idle_task_delta;
4832 :
4833 : se = cfs_rq->tg->se[cpu_of(rq)];
4834 :
4835 : cfs_rq->throttled = 0;
4836 :
4837 : update_rq_clock(rq);
4838 :
4839 : raw_spin_lock(&cfs_b->lock);
4840 : cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
4841 : list_del_rcu(&cfs_rq->throttled_list);
4842 : raw_spin_unlock(&cfs_b->lock);
4843 :
4844 : /* update hierarchical throttle state */
4845 : walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
4846 :
4847 : if (!cfs_rq->load.weight)
4848 : return;
4849 :
4850 : task_delta = cfs_rq->h_nr_running;
4851 : idle_task_delta = cfs_rq->idle_h_nr_running;
4852 : for_each_sched_entity(se) {
4853 : if (se->on_rq)
4854 : break;
4855 : cfs_rq = cfs_rq_of(se);
4856 : enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
4857 :
4858 : cfs_rq->h_nr_running += task_delta;
4859 : cfs_rq->idle_h_nr_running += idle_task_delta;
4860 :
4861 : /* end evaluation on encountering a throttled cfs_rq */
4862 : if (cfs_rq_throttled(cfs_rq))
4863 : goto unthrottle_throttle;
4864 : }
4865 :
4866 : for_each_sched_entity(se) {
4867 : cfs_rq = cfs_rq_of(se);
4868 :
4869 : update_load_avg(cfs_rq, se, UPDATE_TG);
4870 : se_update_runnable(se);
4871 :
4872 : cfs_rq->h_nr_running += task_delta;
4873 : cfs_rq->idle_h_nr_running += idle_task_delta;
4874 :
4875 :
4876 : /* end evaluation on encountering a throttled cfs_rq */
4877 : if (cfs_rq_throttled(cfs_rq))
4878 : goto unthrottle_throttle;
4879 :
4880 : /*
4881 : * One parent has been throttled and cfs_rq removed from the
4882 : * list. Add it back to not break the leaf list.
4883 : */
4884 : if (throttled_hierarchy(cfs_rq))
4885 : list_add_leaf_cfs_rq(cfs_rq);
4886 : }
4887 :
4888 : /* At this point se is NULL and we are at root level*/
4889 : add_nr_running(rq, task_delta);
4890 :
4891 : unthrottle_throttle:
4892 : /*
4893 : * The cfs_rq_throttled() breaks in the above iteration can result in
4894 : * incomplete leaf list maintenance, resulting in triggering the
4895 : * assertion below.
4896 : */
4897 : for_each_sched_entity(se) {
4898 : cfs_rq = cfs_rq_of(se);
4899 :
4900 : if (list_add_leaf_cfs_rq(cfs_rq))
4901 : break;
4902 : }
4903 :
4904 : assert_list_leaf_cfs_rq(rq);
4905 :
4906 : /* Determine whether we need to wake up potentially idle CPU: */
4907 : if (rq->curr == rq->idle && rq->cfs.nr_running)
4908 : resched_curr(rq);
4909 : }
4910 :
4911 : static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
4912 : {
4913 : struct cfs_rq *cfs_rq;
4914 : u64 runtime, remaining = 1;
4915 :
4916 : rcu_read_lock();
4917 : list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
4918 : throttled_list) {
4919 : struct rq *rq = rq_of(cfs_rq);
4920 : struct rq_flags rf;
4921 :
4922 : rq_lock_irqsave(rq, &rf);
4923 : if (!cfs_rq_throttled(cfs_rq))
4924 : goto next;
4925 :
4926 : /* By the above check, this should never be true */
4927 : SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
4928 :
4929 : raw_spin_lock(&cfs_b->lock);
4930 : runtime = -cfs_rq->runtime_remaining + 1;
4931 : if (runtime > cfs_b->runtime)
4932 : runtime = cfs_b->runtime;
4933 : cfs_b->runtime -= runtime;
4934 : remaining = cfs_b->runtime;
4935 : raw_spin_unlock(&cfs_b->lock);
4936 :
4937 : cfs_rq->runtime_remaining += runtime;
4938 :
4939 : /* we check whether we're throttled above */
4940 : if (cfs_rq->runtime_remaining > 0)
4941 : unthrottle_cfs_rq(cfs_rq);
4942 :
4943 : next:
4944 : rq_unlock_irqrestore(rq, &rf);
4945 :
4946 : if (!remaining)
4947 : break;
4948 : }
4949 : rcu_read_unlock();
4950 : }
4951 :
4952 : /*
4953 : * Responsible for refilling a task_group's bandwidth and unthrottling its
4954 : * cfs_rqs as appropriate. If there has been no activity within the last
4955 : * period the timer is deactivated until scheduling resumes; cfs_b->idle is
4956 : * used to track this state.
4957 : */
4958 : static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
4959 : {
4960 : int throttled;
4961 :
4962 : /* no need to continue the timer with no bandwidth constraint */
4963 : if (cfs_b->quota == RUNTIME_INF)
4964 : goto out_deactivate;
4965 :
4966 : throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4967 : cfs_b->nr_periods += overrun;
4968 :
4969 : /*
4970 : * idle depends on !throttled (for the case of a large deficit), and if
4971 : * we're going inactive then everything else can be deferred
4972 : */
4973 : if (cfs_b->idle && !throttled)
4974 : goto out_deactivate;
4975 :
4976 : __refill_cfs_bandwidth_runtime(cfs_b);
4977 :
4978 : if (!throttled) {
4979 : /* mark as potentially idle for the upcoming period */
4980 : cfs_b->idle = 1;
4981 : return 0;
4982 : }
4983 :
4984 : /* account preceding periods in which throttling occurred */
4985 : cfs_b->nr_throttled += overrun;
4986 :
4987 : /*
4988 : * This check is repeated as we release cfs_b->lock while we unthrottle.
4989 : */
4990 : while (throttled && cfs_b->runtime > 0) {
4991 : raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4992 : /* we can't nest cfs_b->lock while distributing bandwidth */
4993 : distribute_cfs_runtime(cfs_b);
4994 : raw_spin_lock_irqsave(&cfs_b->lock, flags);
4995 :
4996 : throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4997 : }
4998 :
4999 : /*
5000 : * While we are ensured activity in the period following an
5001 : * unthrottle, this also covers the case in which the new bandwidth is
5002 : * insufficient to cover the existing bandwidth deficit. (Forcing the
5003 : * timer to remain active while there are any throttled entities.)
5004 : */
5005 : cfs_b->idle = 0;
5006 :
5007 : return 0;
5008 :
5009 : out_deactivate:
5010 : return 1;
5011 : }
5012 :
5013 : /* a cfs_rq won't donate quota below this amount */
5014 : static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
5015 : /* minimum remaining period time to redistribute slack quota */
5016 : static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
5017 : /* how long we wait to gather additional slack before distributing */
5018 : static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
5019 :
5020 : /*
5021 : * Are we near the end of the current quota period?
5022 : *
5023 : * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
5024 : * hrtimer base being cleared by hrtimer_start. In the case of
5025 : * migrate_hrtimers, base is never cleared, so we are fine.
5026 : */
5027 : static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
5028 : {
5029 : struct hrtimer *refresh_timer = &cfs_b->period_timer;
5030 : u64 remaining;
5031 :
5032 : /* if the call-back is running a quota refresh is already occurring */
5033 : if (hrtimer_callback_running(refresh_timer))
5034 : return 1;
5035 :
5036 : /* is a quota refresh about to occur? */
5037 : remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
5038 : if (remaining < min_expire)
5039 : return 1;
5040 :
5041 : return 0;
5042 : }
5043 :
5044 : static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
5045 : {
5046 : u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
5047 :
5048 : /* if there's a quota refresh soon don't bother with slack */
5049 : if (runtime_refresh_within(cfs_b, min_left))
5050 : return;
5051 :
5052 : /* don't push forwards an existing deferred unthrottle */
5053 : if (cfs_b->slack_started)
5054 : return;
5055 : cfs_b->slack_started = true;
5056 :
5057 : hrtimer_start(&cfs_b->slack_timer,
5058 : ns_to_ktime(cfs_bandwidth_slack_period),
5059 : HRTIMER_MODE_REL);
5060 : }
5061 :
5062 : /* we know any runtime found here is valid as update_curr() precedes return */
5063 : static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5064 : {
5065 : struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5066 : s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
5067 :
5068 : if (slack_runtime <= 0)
5069 : return;
5070 :
5071 : raw_spin_lock(&cfs_b->lock);
5072 : if (cfs_b->quota != RUNTIME_INF) {
5073 : cfs_b->runtime += slack_runtime;
5074 :
5075 : /* we are under rq->lock, defer unthrottling using a timer */
5076 : if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
5077 : !list_empty(&cfs_b->throttled_cfs_rq))
5078 : start_cfs_slack_bandwidth(cfs_b);
5079 : }
5080 : raw_spin_unlock(&cfs_b->lock);
5081 :
5082 : /* even if it's not valid for return we don't want to try again */
5083 : cfs_rq->runtime_remaining -= slack_runtime;
5084 : }
5085 :
5086 : static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5087 : {
5088 : if (!cfs_bandwidth_used())
5089 : return;
5090 :
5091 : if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
5092 : return;
5093 :
5094 : __return_cfs_rq_runtime(cfs_rq);
5095 : }
5096 :
5097 : /*
5098 : * This is done with a timer (instead of inline with bandwidth return) since
5099 : * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
5100 : */
5101 : static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
5102 : {
5103 : u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
5104 : unsigned long flags;
5105 :
5106 : /* confirm we're still not at a refresh boundary */
5107 : raw_spin_lock_irqsave(&cfs_b->lock, flags);
5108 : cfs_b->slack_started = false;
5109 :
5110 : if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
5111 : raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
5112 : return;
5113 : }
5114 :
5115 : if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
5116 : runtime = cfs_b->runtime;
5117 :
5118 : raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
5119 :
5120 : if (!runtime)
5121 : return;
5122 :
5123 : distribute_cfs_runtime(cfs_b);
5124 : }
5125 :
5126 : /*
5127 : * When a group wakes up we want to make sure that its quota is not already
5128 : * expired/exceeded, otherwise it may be allowed to steal additional ticks of
5129 : * runtime as update_curr() throttling can not trigger until it's on-rq.
5130 : */
5131 : static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
5132 : {
5133 : if (!cfs_bandwidth_used())
5134 : return;
5135 :
5136 : /* an active group must be handled by the update_curr()->put() path */
5137 : if (!cfs_rq->runtime_enabled || cfs_rq->curr)
5138 : return;
5139 :
5140 : /* ensure the group is not already throttled */
5141 : if (cfs_rq_throttled(cfs_rq))
5142 : return;
5143 :
5144 : /* update runtime allocation */
5145 : account_cfs_rq_runtime(cfs_rq, 0);
5146 : if (cfs_rq->runtime_remaining <= 0)
5147 : throttle_cfs_rq(cfs_rq);
5148 : }
5149 :
5150 : static void sync_throttle(struct task_group *tg, int cpu)
5151 : {
5152 : struct cfs_rq *pcfs_rq, *cfs_rq;
5153 :
5154 : if (!cfs_bandwidth_used())
5155 : return;
5156 :
5157 : if (!tg->parent)
5158 : return;
5159 :
5160 : cfs_rq = tg->cfs_rq[cpu];
5161 : pcfs_rq = tg->parent->cfs_rq[cpu];
5162 :
5163 : cfs_rq->throttle_count = pcfs_rq->throttle_count;
5164 : cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
5165 : }
5166 :
5167 : /* conditionally throttle active cfs_rq's from put_prev_entity() */
5168 : static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5169 : {
5170 : if (!cfs_bandwidth_used())
5171 : return false;
5172 :
5173 : if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
5174 : return false;
5175 :
5176 : /*
5177 : * it's possible for a throttled entity to be forced into a running
5178 : * state (e.g. set_curr_task), in this case we're finished.
5179 : */
5180 : if (cfs_rq_throttled(cfs_rq))
5181 : return true;
5182 :
5183 : return throttle_cfs_rq(cfs_rq);
5184 : }
5185 :
5186 : static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
5187 : {
5188 : struct cfs_bandwidth *cfs_b =
5189 : container_of(timer, struct cfs_bandwidth, slack_timer);
5190 :
5191 : do_sched_cfs_slack_timer(cfs_b);
5192 :
5193 : return HRTIMER_NORESTART;
5194 : }
5195 :
5196 : extern const u64 max_cfs_quota_period;
5197 :
5198 : static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
5199 : {
5200 : struct cfs_bandwidth *cfs_b =
5201 : container_of(timer, struct cfs_bandwidth, period_timer);
5202 : unsigned long flags;
5203 : int overrun;
5204 : int idle = 0;
5205 : int count = 0;
5206 :
5207 : raw_spin_lock_irqsave(&cfs_b->lock, flags);
5208 : for (;;) {
5209 : overrun = hrtimer_forward_now(timer, cfs_b->period);
5210 : if (!overrun)
5211 : break;
5212 :
5213 : idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
5214 :
5215 : if (++count > 3) {
5216 : u64 new, old = ktime_to_ns(cfs_b->period);
5217 :
5218 : /*
5219 : * Grow period by a factor of 2 to avoid losing precision.
5220 : * Precision loss in the quota/period ratio can cause __cfs_schedulable
5221 : * to fail.
5222 : */
5223 : new = old * 2;
5224 : if (new < max_cfs_quota_period) {
5225 : cfs_b->period = ns_to_ktime(new);
5226 : cfs_b->quota *= 2;
5227 :
5228 : pr_warn_ratelimited(
5229 : "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
5230 : smp_processor_id(),
5231 : div_u64(new, NSEC_PER_USEC),
5232 : div_u64(cfs_b->quota, NSEC_PER_USEC));
5233 : } else {
5234 : pr_warn_ratelimited(
5235 : "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
5236 : smp_processor_id(),
5237 : div_u64(old, NSEC_PER_USEC),
5238 : div_u64(cfs_b->quota, NSEC_PER_USEC));
5239 : }
5240 :
5241 : /* reset count so we don't come right back in here */
5242 : count = 0;
5243 : }
5244 : }
5245 : if (idle)
5246 : cfs_b->period_active = 0;
5247 : raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
5248 :
5249 : return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
5250 : }
5251 :
5252 : void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5253 : {
5254 : raw_spin_lock_init(&cfs_b->lock);
5255 : cfs_b->runtime = 0;
5256 : cfs_b->quota = RUNTIME_INF;
5257 : cfs_b->period = ns_to_ktime(default_cfs_period());
5258 :
5259 : INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
5260 : hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
5261 : cfs_b->period_timer.function = sched_cfs_period_timer;
5262 : hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5263 : cfs_b->slack_timer.function = sched_cfs_slack_timer;
5264 : cfs_b->slack_started = false;
5265 : }
5266 :
5267 : static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5268 : {
5269 : cfs_rq->runtime_enabled = 0;
5270 : INIT_LIST_HEAD(&cfs_rq->throttled_list);
5271 : }
5272 :
5273 : void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5274 : {
5275 : lockdep_assert_held(&cfs_b->lock);
5276 :
5277 : if (cfs_b->period_active)
5278 : return;
5279 :
5280 : cfs_b->period_active = 1;
5281 : hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
5282 : hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
5283 : }
5284 :
5285 : static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5286 : {
5287 : /* init_cfs_bandwidth() was not called */
5288 : if (!cfs_b->throttled_cfs_rq.next)
5289 : return;
5290 :
5291 : hrtimer_cancel(&cfs_b->period_timer);
5292 : hrtimer_cancel(&cfs_b->slack_timer);
5293 : }
5294 :
5295 : /*
5296 : * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
5297 : *
5298 : * The race is harmless, since modifying bandwidth settings of unhooked group
5299 : * bits doesn't do much.
5300 : */
5301 :
5302 : /* cpu online calback */
5303 : static void __maybe_unused update_runtime_enabled(struct rq *rq)
5304 : {
5305 : struct task_group *tg;
5306 :
5307 : lockdep_assert_held(&rq->lock);
5308 :
5309 : rcu_read_lock();
5310 : list_for_each_entry_rcu(tg, &task_groups, list) {
5311 : struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
5312 : struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5313 :
5314 : raw_spin_lock(&cfs_b->lock);
5315 : cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
5316 : raw_spin_unlock(&cfs_b->lock);
5317 : }
5318 : rcu_read_unlock();
5319 : }
5320 :
5321 : /* cpu offline callback */
5322 : static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
5323 : {
5324 : struct task_group *tg;
5325 :
5326 : lockdep_assert_held(&rq->lock);
5327 :
5328 : rcu_read_lock();
5329 : list_for_each_entry_rcu(tg, &task_groups, list) {
5330 : struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5331 :
5332 : if (!cfs_rq->runtime_enabled)
5333 : continue;
5334 :
5335 : /*
5336 : * clock_task is not advancing so we just need to make sure
5337 : * there's some valid quota amount
5338 : */
5339 : cfs_rq->runtime_remaining = 1;
5340 : /*
5341 : * Offline rq is schedulable till CPU is completely disabled
5342 : * in take_cpu_down(), so we prevent new cfs throttling here.
5343 : */
5344 : cfs_rq->runtime_enabled = 0;
5345 :
5346 : if (cfs_rq_throttled(cfs_rq))
5347 : unthrottle_cfs_rq(cfs_rq);
5348 : }
5349 : rcu_read_unlock();
5350 : }
5351 :
5352 : #else /* CONFIG_CFS_BANDWIDTH */
5353 :
5354 15764 : static inline bool cfs_bandwidth_used(void)
5355 : {
5356 15764 : return false;
5357 : }
5358 :
5359 41013 : static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
5360 21087 : static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
5361 : static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
5362 : static inline void sync_throttle(struct task_group *tg, int cpu) {}
5363 15767 : static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5364 :
5365 31526 : static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5366 : {
5367 31526 : return 0;
5368 : }
5369 :
5370 8210 : static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5371 : {
5372 8210 : return 0;
5373 : }
5374 :
5375 5145 : static inline int throttled_lb_pair(struct task_group *tg,
5376 : int src_cpu, int dest_cpu)
5377 : {
5378 5145 : return 0;
5379 : }
5380 :
5381 0 : void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5382 :
5383 : #ifdef CONFIG_FAIR_GROUP_SCHED
5384 : static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5385 : #endif
5386 :
5387 : static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5388 : {
5389 : return NULL;
5390 : }
5391 : static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5392 8 : static inline void update_runtime_enabled(struct rq *rq) {}
5393 4 : static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
5394 :
5395 : #endif /* CONFIG_CFS_BANDWIDTH */
5396 :
5397 : /**************************************************
5398 : * CFS operations on tasks:
5399 : */
5400 :
5401 : #ifdef CONFIG_SCHED_HRTICK
5402 : static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
5403 : {
5404 : struct sched_entity *se = &p->se;
5405 : struct cfs_rq *cfs_rq = cfs_rq_of(se);
5406 :
5407 : SCHED_WARN_ON(task_rq(p) != rq);
5408 :
5409 : if (rq->cfs.h_nr_running > 1) {
5410 : u64 slice = sched_slice(cfs_rq, se);
5411 : u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
5412 : s64 delta = slice - ran;
5413 :
5414 : if (delta < 0) {
5415 : if (task_current(rq, p))
5416 : resched_curr(rq);
5417 : return;
5418 : }
5419 : hrtick_start(rq, delta);
5420 : }
5421 : }
5422 :
5423 : /*
5424 : * called from enqueue/dequeue and updates the hrtick when the
5425 : * current task is from our class and nr_running is low enough
5426 : * to matter.
5427 : */
5428 : static void hrtick_update(struct rq *rq)
5429 : {
5430 : struct task_struct *curr = rq->curr;
5431 :
5432 : if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class)
5433 : return;
5434 :
5435 : if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
5436 : hrtick_start_fair(rq, curr);
5437 : }
5438 : #else /* !CONFIG_SCHED_HRTICK */
5439 : static inline void
5440 : hrtick_start_fair(struct rq *rq, struct task_struct *p)
5441 : {
5442 : }
5443 :
5444 31528 : static inline void hrtick_update(struct rq *rq)
5445 : {
5446 31528 : }
5447 : #endif
5448 :
5449 : #ifdef CONFIG_SMP
5450 : static inline unsigned long cpu_util(int cpu);
5451 :
5452 44200 : static inline bool cpu_overutilized(int cpu)
5453 : {
5454 44200 : return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
5455 : }
5456 :
5457 28605 : static inline void update_overutilized_status(struct rq *rq)
5458 : {
5459 28605 : if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
5460 168 : WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
5461 168 : trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
5462 : }
5463 28608 : }
5464 : #else
5465 : static inline void update_overutilized_status(struct rq *rq) { }
5466 : #endif
5467 :
5468 : /* Runqueue only has SCHED_IDLE tasks enqueued */
5469 41440 : static int sched_idle_rq(struct rq *rq)
5470 : {
5471 17721 : return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
5472 : rq->nr_running);
5473 : }
5474 :
5475 : #ifdef CONFIG_SMP
5476 9908 : static int sched_idle_cpu(int cpu)
5477 : {
5478 9908 : return sched_idle_rq(cpu_rq(cpu));
5479 : }
5480 : #endif
5481 :
5482 : /*
5483 : * The enqueue_task method is called before nr_running is
5484 : * increased. Here we update the fair scheduling stats and
5485 : * then put the task into the rbtree:
5486 : */
5487 : static void
5488 15759 : enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5489 : {
5490 15759 : struct cfs_rq *cfs_rq;
5491 15759 : struct sched_entity *se = &p->se;
5492 15759 : int idle_h_nr_running = task_has_idle_policy(p);
5493 15759 : int task_new = !(flags & ENQUEUE_WAKEUP);
5494 :
5495 : /*
5496 : * The code below (indirectly) updates schedutil which looks at
5497 : * the cfs_rq utilization to select a frequency.
5498 : * Let's add the task's estimated utilization to the cfs_rq's
5499 : * estimated utilization, before we update schedutil.
5500 : */
5501 15759 : util_est_enqueue(&rq->cfs, p);
5502 :
5503 : /*
5504 : * If in_iowait is set, the code below may not trigger any cpufreq
5505 : * utilization updates, so do it here explicitly with the IOWAIT flag
5506 : * passed.
5507 : */
5508 15762 : if (p->in_iowait)
5509 15762 : cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
5510 :
5511 15762 : for_each_sched_entity(se) {
5512 15762 : if (se->on_rq)
5513 : break;
5514 15765 : cfs_rq = cfs_rq_of(se);
5515 15765 : enqueue_entity(cfs_rq, se, flags);
5516 :
5517 15760 : cfs_rq->h_nr_running++;
5518 15760 : cfs_rq->idle_h_nr_running += idle_h_nr_running;
5519 :
5520 : /* end evaluation on encountering a throttled cfs_rq */
5521 15760 : if (cfs_rq_throttled(cfs_rq))
5522 : goto enqueue_throttle;
5523 :
5524 15760 : flags = ENQUEUE_WAKEUP;
5525 : }
5526 :
5527 15757 : for_each_sched_entity(se) {
5528 0 : cfs_rq = cfs_rq_of(se);
5529 :
5530 0 : update_load_avg(cfs_rq, se, UPDATE_TG);
5531 0 : se_update_runnable(se);
5532 0 : update_cfs_group(se);
5533 :
5534 0 : cfs_rq->h_nr_running++;
5535 0 : cfs_rq->idle_h_nr_running += idle_h_nr_running;
5536 :
5537 : /* end evaluation on encountering a throttled cfs_rq */
5538 0 : if (cfs_rq_throttled(cfs_rq))
5539 : goto enqueue_throttle;
5540 :
5541 : /*
5542 : * One parent has been throttled and cfs_rq removed from the
5543 : * list. Add it back to not break the leaf list.
5544 : */
5545 0 : if (throttled_hierarchy(cfs_rq))
5546 0 : list_add_leaf_cfs_rq(cfs_rq);
5547 : }
5548 :
5549 : /* At this point se is NULL and we are at root level*/
5550 15757 : add_nr_running(rq, 1);
5551 :
5552 : /*
5553 : * Since new tasks are assigned an initial util_avg equal to
5554 : * half of the spare capacity of their CPU, tiny tasks have the
5555 : * ability to cross the overutilized threshold, which will
5556 : * result in the load balancer ruining all the task placement
5557 : * done by EAS. As a way to mitigate that effect, do not account
5558 : * for the first enqueue operation of new tasks during the
5559 : * overutilized flag detection.
5560 : *
5561 : * A better way of solving this problem would be to wait for
5562 : * the PELT signals of tasks to converge before taking them
5563 : * into account, but that is not straightforward to implement,
5564 : * and the following generally works well enough in practice.
5565 : */
5566 15765 : if (!task_new)
5567 13877 : update_overutilized_status(rq);
5568 :
5569 1888 : enqueue_throttle:
5570 15764 : if (cfs_bandwidth_used()) {
5571 : /*
5572 : * When bandwidth control is enabled; the cfs_rq_throttled()
5573 : * breaks in the above iteration can result in incomplete
5574 : * leaf list maintenance, resulting in triggering the assertion
5575 : * below.
5576 : */
5577 : for_each_sched_entity(se) {
5578 : cfs_rq = cfs_rq_of(se);
5579 :
5580 : if (list_add_leaf_cfs_rq(cfs_rq))
5581 : break;
5582 : }
5583 : }
5584 :
5585 15764 : assert_list_leaf_cfs_rq(rq);
5586 :
5587 15764 : hrtick_update(rq);
5588 15764 : }
5589 :
5590 : static void set_next_buddy(struct sched_entity *se);
5591 :
5592 : /*
5593 : * The dequeue_task method is called before nr_running is
5594 : * decreased. We remove the task from the rbtree and
5595 : * update the fair scheduling stats:
5596 : */
5597 15766 : static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5598 : {
5599 15766 : struct cfs_rq *cfs_rq;
5600 15766 : struct sched_entity *se = &p->se;
5601 15766 : int task_sleep = flags & DEQUEUE_SLEEP;
5602 15766 : int idle_h_nr_running = task_has_idle_policy(p);
5603 15766 : bool was_sched_idle = sched_idle_rq(rq);
5604 :
5605 15766 : util_est_dequeue(&rq->cfs, p);
5606 :
5607 15766 : for_each_sched_entity(se) {
5608 15766 : cfs_rq = cfs_rq_of(se);
5609 15766 : dequeue_entity(cfs_rq, se, flags);
5610 :
5611 15766 : cfs_rq->h_nr_running--;
5612 15766 : cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5613 :
5614 : /* end evaluation on encountering a throttled cfs_rq */
5615 15766 : if (cfs_rq_throttled(cfs_rq))
5616 : goto dequeue_throttle;
5617 :
5618 : /* Don't dequeue parent if it has other entities besides us */
5619 15766 : if (cfs_rq->load.weight) {
5620 : /* Avoid re-evaluating load for this entity: */
5621 15766 : se = parent_entity(se);
5622 : /*
5623 : * Bias pick_next to pick a task from this cfs_rq, as
5624 : * p is sleeping when it is within its sched_slice.
5625 : */
5626 : if (task_sleep && se && !throttled_hierarchy(cfs_rq))
5627 : set_next_buddy(se);
5628 : break;
5629 : }
5630 15766 : flags |= DEQUEUE_SLEEP;
5631 : }
5632 :
5633 15766 : for_each_sched_entity(se) {
5634 0 : cfs_rq = cfs_rq_of(se);
5635 :
5636 0 : update_load_avg(cfs_rq, se, UPDATE_TG);
5637 0 : se_update_runnable(se);
5638 0 : update_cfs_group(se);
5639 :
5640 0 : cfs_rq->h_nr_running--;
5641 0 : cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5642 :
5643 : /* end evaluation on encountering a throttled cfs_rq */
5644 0 : if (cfs_rq_throttled(cfs_rq))
5645 : goto dequeue_throttle;
5646 :
5647 : }
5648 :
5649 : /* At this point se is NULL and we are at root level*/
5650 15766 : sub_nr_running(rq, 1);
5651 :
5652 : /* balance early to pull high priority tasks */
5653 31532 : if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
5654 0 : rq->next_balance = jiffies;
5655 :
5656 15766 : dequeue_throttle:
5657 15766 : util_est_update(&rq->cfs, p, task_sleep);
5658 15764 : hrtick_update(rq);
5659 15764 : }
5660 :
5661 : #ifdef CONFIG_SMP
5662 :
5663 : /* Working cpumask for: load_balance, load_balance_newidle. */
5664 : DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
5665 : DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
5666 :
5667 : #ifdef CONFIG_NO_HZ_COMMON
5668 :
5669 : static struct {
5670 : cpumask_var_t idle_cpus_mask;
5671 : atomic_t nr_cpus;
5672 : int has_blocked; /* Idle CPUS has blocked load */
5673 : unsigned long next_balance; /* in jiffy units */
5674 : unsigned long next_blocked; /* Next update of blocked load in jiffies */
5675 : } nohz ____cacheline_aligned;
5676 :
5677 : #endif /* CONFIG_NO_HZ_COMMON */
5678 :
5679 43863 : static unsigned long cpu_load(struct rq *rq)
5680 : {
5681 43863 : return cfs_rq_load_avg(&rq->cfs);
5682 : }
5683 :
5684 : /*
5685 : * cpu_load_without - compute CPU load without any contributions from *p
5686 : * @cpu: the CPU which load is requested
5687 : * @p: the task which load should be discounted
5688 : *
5689 : * The load of a CPU is defined by the load of tasks currently enqueued on that
5690 : * CPU as well as tasks which are currently sleeping after an execution on that
5691 : * CPU.
5692 : *
5693 : * This method returns the load of the specified CPU by discounting the load of
5694 : * the specified task, whenever the task is currently contributing to the CPU
5695 : * load.
5696 : */
5697 6336 : static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
5698 : {
5699 6336 : struct cfs_rq *cfs_rq;
5700 6336 : unsigned int load;
5701 :
5702 : /* Task has no contribution or is new */
5703 1584 : if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
5704 5711 : return cpu_load(rq);
5705 :
5706 625 : cfs_rq = &rq->cfs;
5707 625 : load = READ_ONCE(cfs_rq->avg.load_avg);
5708 :
5709 : /* Discount task's util from CPU's util */
5710 625 : lsub_positive(&load, task_h_load(p));
5711 :
5712 625 : return load;
5713 : }
5714 :
5715 43262 : static unsigned long cpu_runnable(struct rq *rq)
5716 : {
5717 43262 : return cfs_rq_runnable_avg(&rq->cfs);
5718 : }
5719 :
5720 6336 : static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p)
5721 : {
5722 6336 : struct cfs_rq *cfs_rq;
5723 6336 : unsigned int runnable;
5724 :
5725 : /* Task has no contribution or is new */
5726 1584 : if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
5727 5711 : return cpu_runnable(rq);
5728 :
5729 625 : cfs_rq = &rq->cfs;
5730 625 : runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
5731 :
5732 : /* Discount task's runnable from CPU's runnable */
5733 625 : lsub_positive(&runnable, p->se.avg.runnable_avg);
5734 :
5735 625 : return runnable;
5736 : }
5737 :
5738 47079 : static unsigned long capacity_of(int cpu)
5739 : {
5740 0 : return cpu_rq(cpu)->cpu_capacity;
5741 : }
5742 :
5743 10113 : static void record_wakee(struct task_struct *p)
5744 : {
5745 : /*
5746 : * Only decay a single time; tasks that have less then 1 wakeup per
5747 : * jiffy will not have built up many flips.
5748 : */
5749 10113 : if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
5750 643 : current->wakee_flips >>= 1;
5751 643 : current->wakee_flip_decay_ts = jiffies;
5752 : }
5753 :
5754 10113 : if (current->last_wakee != p) {
5755 3732 : current->last_wakee = p;
5756 3732 : current->wakee_flips++;
5757 : }
5758 10113 : }
5759 :
5760 : /*
5761 : * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
5762 : *
5763 : * A waker of many should wake a different task than the one last awakened
5764 : * at a frequency roughly N times higher than one of its wakees.
5765 : *
5766 : * In order to determine whether we should let the load spread vs consolidating
5767 : * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
5768 : * partner, and a factor of lls_size higher frequency in the other.
5769 : *
5770 : * With both conditions met, we can be relatively sure that the relationship is
5771 : * non-monogamous, with partner count exceeding socket size.
5772 : *
5773 : * Waker/wakee being client/server, worker/dispatcher, interrupt source or
5774 : * whatever is irrelevant, spread criteria is apparent partner count exceeds
5775 : * socket size.
5776 : */
5777 10111 : static int wake_wide(struct task_struct *p)
5778 : {
5779 10111 : unsigned int master = current->wakee_flips;
5780 10111 : unsigned int slave = p->wakee_flips;
5781 10111 : int factor = __this_cpu_read(sd_llc_size);
5782 :
5783 10111 : if (master < slave)
5784 3740 : swap(master, slave);
5785 10111 : if (slave < factor || master < slave * factor)
5786 937 : return 0;
5787 : return 1;
5788 : }
5789 :
5790 : /*
5791 : * The purpose of wake_affine() is to quickly determine on which CPU we can run
5792 : * soonest. For the purpose of speed we only consider the waking and previous
5793 : * CPU.
5794 : *
5795 : * wake_affine_idle() - only considers 'now', it check if the waking CPU is
5796 : * cache-affine and is (or will be) idle.
5797 : *
5798 : * wake_affine_weight() - considers the weight to reflect the average
5799 : * scheduling latency of the CPUs. This seems to work
5800 : * for the overloaded case.
5801 : */
5802 : static int
5803 272 : wake_affine_idle(int this_cpu, int prev_cpu, int sync)
5804 : {
5805 : /*
5806 : * If this_cpu is idle, it implies the wakeup is from interrupt
5807 : * context. Only allow the move if cache is shared. Otherwise an
5808 : * interrupt intensive workload could force all tasks onto one
5809 : * node depending on the IO topology or IRQ affinity settings.
5810 : *
5811 : * If the prev_cpu is idle and cache affine then avoid a migration.
5812 : * There is no guarantee that the cache hot data from an interrupt
5813 : * is more important than cache hot data on the prev_cpu and from
5814 : * a cpufreq perspective, it's better to have higher utilisation
5815 : * on one CPU.
5816 : */
5817 272 : if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
5818 0 : return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
5819 :
5820 272 : if (sync && cpu_rq(this_cpu)->nr_running == 1)
5821 : return this_cpu;
5822 :
5823 264 : if (available_idle_cpu(prev_cpu))
5824 15 : return prev_cpu;
5825 :
5826 : return nr_cpumask_bits;
5827 : }
5828 :
5829 : static int
5830 249 : wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5831 : int this_cpu, int prev_cpu, int sync)
5832 : {
5833 249 : s64 this_eff_load, prev_eff_load;
5834 249 : unsigned long task_load;
5835 :
5836 249 : this_eff_load = cpu_load(cpu_rq(this_cpu));
5837 :
5838 249 : if (sync) {
5839 7 : unsigned long current_load = task_h_load(current);
5840 :
5841 7 : if (current_load > this_eff_load)
5842 : return this_cpu;
5843 :
5844 7 : this_eff_load -= current_load;
5845 : }
5846 :
5847 249 : task_load = task_h_load(p);
5848 :
5849 249 : this_eff_load += task_load;
5850 249 : if (sched_feat(WA_BIAS))
5851 249 : this_eff_load *= 100;
5852 249 : this_eff_load *= capacity_of(prev_cpu);
5853 :
5854 249 : prev_eff_load = cpu_load(cpu_rq(prev_cpu));
5855 249 : prev_eff_load -= task_load;
5856 249 : if (sched_feat(WA_BIAS))
5857 249 : prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
5858 249 : prev_eff_load *= capacity_of(this_cpu);
5859 :
5860 : /*
5861 : * If sync, adjust the weight of prev_eff_load such that if
5862 : * prev_eff == this_eff that select_idle_sibling() will consider
5863 : * stacking the wakee on top of the waker if no other CPU is
5864 : * idle.
5865 : */
5866 249 : if (sync)
5867 7 : prev_eff_load += 1;
5868 :
5869 249 : return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
5870 : }
5871 :
5872 272 : static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5873 : int this_cpu, int prev_cpu, int sync)
5874 : {
5875 272 : int target = nr_cpumask_bits;
5876 :
5877 272 : if (sched_feat(WA_IDLE))
5878 272 : target = wake_affine_idle(this_cpu, prev_cpu, sync);
5879 :
5880 272 : if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
5881 249 : target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
5882 :
5883 272 : schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
5884 272 : if (target == nr_cpumask_bits)
5885 170 : return prev_cpu;
5886 :
5887 : schedstat_inc(sd->ttwu_move_affine);
5888 : schedstat_inc(p->se.statistics.nr_wakeups_affine);
5889 : return target;
5890 : }
5891 :
5892 : static struct sched_group *
5893 : find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
5894 :
5895 : /*
5896 : * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
5897 : */
5898 : static int
5899 831 : find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
5900 : {
5901 831 : unsigned long load, min_load = ULONG_MAX;
5902 831 : unsigned int min_exit_latency = UINT_MAX;
5903 831 : u64 latest_idle_timestamp = 0;
5904 831 : int least_loaded_cpu = this_cpu;
5905 831 : int shallowest_idle_cpu = -1;
5906 831 : int i;
5907 :
5908 : /* Check if we have any choice: */
5909 831 : if (group->group_weight == 1)
5910 831 : return cpumask_first(sched_group_span(group));
5911 :
5912 : /* Traverse only the allowed CPUs */
5913 0 : for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
5914 0 : if (sched_idle_cpu(i))
5915 0 : return i;
5916 :
5917 0 : if (available_idle_cpu(i)) {
5918 0 : struct rq *rq = cpu_rq(i);
5919 0 : struct cpuidle_state *idle = idle_get_state(rq);
5920 0 : if (idle && idle->exit_latency < min_exit_latency) {
5921 : /*
5922 : * We give priority to a CPU whose idle state
5923 : * has the smallest exit latency irrespective
5924 : * of any idle timestamp.
5925 : */
5926 : min_exit_latency = idle->exit_latency;
5927 : latest_idle_timestamp = rq->idle_stamp;
5928 : shallowest_idle_cpu = i;
5929 0 : } else if ((!idle || idle->exit_latency == min_exit_latency) &&
5930 0 : rq->idle_stamp > latest_idle_timestamp) {
5931 : /*
5932 : * If equal or no active idle state, then
5933 : * the most recently idled CPU might have
5934 : * a warmer cache.
5935 : */
5936 0 : latest_idle_timestamp = rq->idle_stamp;
5937 0 : shallowest_idle_cpu = i;
5938 : }
5939 0 : } else if (shallowest_idle_cpu == -1) {
5940 0 : load = cpu_load(cpu_rq(i));
5941 0 : if (load < min_load) {
5942 0 : min_load = load;
5943 0 : least_loaded_cpu = i;
5944 : }
5945 : }
5946 : }
5947 :
5948 0 : return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
5949 : }
5950 :
5951 1584 : static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
5952 : int cpu, int prev_cpu, int sd_flag)
5953 : {
5954 1584 : int new_cpu = cpu;
5955 :
5956 1584 : if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
5957 : return prev_cpu;
5958 :
5959 : /*
5960 : * We need task's util for cpu_util_without, sync it up to
5961 : * prev_cpu's last_update_time.
5962 : */
5963 1584 : if (!(sd_flag & SD_BALANCE_FORK))
5964 625 : sync_entity_load_avg(&p->se);
5965 :
5966 3168 : while (sd) {
5967 1584 : struct sched_group *group;
5968 1584 : struct sched_domain *tmp;
5969 1584 : int weight;
5970 :
5971 1584 : if (!(sd->flags & sd_flag)) {
5972 0 : sd = sd->child;
5973 0 : continue;
5974 : }
5975 :
5976 1584 : group = find_idlest_group(sd, p, cpu);
5977 1584 : if (!group) {
5978 753 : sd = sd->child;
5979 753 : continue;
5980 : }
5981 :
5982 831 : new_cpu = find_idlest_group_cpu(group, p, cpu);
5983 831 : if (new_cpu == cpu) {
5984 : /* Now try balancing at a lower domain level of 'cpu': */
5985 0 : sd = sd->child;
5986 0 : continue;
5987 : }
5988 :
5989 : /* Now try balancing at a lower domain level of 'new_cpu': */
5990 831 : cpu = new_cpu;
5991 831 : weight = sd->span_weight;
5992 831 : sd = NULL;
5993 1662 : for_each_domain(cpu, tmp) {
5994 831 : if (weight <= tmp->span_weight)
5995 : break;
5996 0 : if (tmp->flags & sd_flag)
5997 0 : sd = tmp;
5998 : }
5999 : }
6000 :
6001 : return new_cpu;
6002 : }
6003 :
6004 0 : static inline int __select_idle_cpu(int cpu)
6005 : {
6006 0 : if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
6007 0 : return cpu;
6008 :
6009 : return -1;
6010 : }
6011 :
6012 : #ifdef CONFIG_SCHED_SMT
6013 : DEFINE_STATIC_KEY_FALSE(sched_smt_present);
6014 : EXPORT_SYMBOL_GPL(sched_smt_present);
6015 :
6016 0 : static inline void set_idle_cores(int cpu, int val)
6017 : {
6018 0 : struct sched_domain_shared *sds;
6019 :
6020 0 : sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
6021 0 : if (sds)
6022 0 : WRITE_ONCE(sds->has_idle_cores, val);
6023 0 : }
6024 :
6025 0 : static inline bool test_idle_cores(int cpu, bool def)
6026 : {
6027 0 : struct sched_domain_shared *sds;
6028 :
6029 0 : sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
6030 0 : if (sds)
6031 0 : return READ_ONCE(sds->has_idle_cores);
6032 :
6033 : return def;
6034 : }
6035 :
6036 : /*
6037 : * Scans the local SMT mask to see if the entire core is idle, and records this
6038 : * information in sd_llc_shared->has_idle_cores.
6039 : *
6040 : * Since SMT siblings share all cache levels, inspecting this limited remote
6041 : * state should be fairly cheap.
6042 : */
6043 0 : void __update_idle_core(struct rq *rq)
6044 : {
6045 0 : int core = cpu_of(rq);
6046 0 : int cpu;
6047 :
6048 0 : rcu_read_lock();
6049 0 : if (test_idle_cores(core, true))
6050 0 : goto unlock;
6051 :
6052 0 : for_each_cpu(cpu, cpu_smt_mask(core)) {
6053 0 : if (cpu == core)
6054 0 : continue;
6055 :
6056 0 : if (!available_idle_cpu(cpu))
6057 0 : goto unlock;
6058 : }
6059 :
6060 0 : set_idle_cores(core, 1);
6061 0 : unlock:
6062 0 : rcu_read_unlock();
6063 0 : }
6064 :
6065 : /*
6066 : * Scan the entire LLC domain for idle cores; this dynamically switches off if
6067 : * there are no idle cores left in the system; tracked through
6068 : * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
6069 : */
6070 0 : static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
6071 : {
6072 0 : bool idle = true;
6073 0 : int cpu;
6074 :
6075 0 : if (!static_branch_likely(&sched_smt_present))
6076 0 : return __select_idle_cpu(core);
6077 :
6078 0 : for_each_cpu(cpu, cpu_smt_mask(core)) {
6079 0 : if (!available_idle_cpu(cpu)) {
6080 0 : idle = false;
6081 0 : if (*idle_cpu == -1) {
6082 0 : if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) {
6083 0 : *idle_cpu = cpu;
6084 0 : break;
6085 : }
6086 0 : continue;
6087 : }
6088 : break;
6089 : }
6090 0 : if (*idle_cpu == -1 && cpumask_test_cpu(cpu, p->cpus_ptr))
6091 0 : *idle_cpu = cpu;
6092 : }
6093 :
6094 0 : if (idle)
6095 : return core;
6096 :
6097 0 : cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
6098 0 : return -1;
6099 : }
6100 :
6101 : #else /* CONFIG_SCHED_SMT */
6102 :
6103 : static inline void set_idle_cores(int cpu, int val)
6104 : {
6105 : }
6106 :
6107 : static inline bool test_idle_cores(int cpu, bool def)
6108 : {
6109 : return def;
6110 : }
6111 :
6112 : static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
6113 : {
6114 : return __select_idle_cpu(core);
6115 : }
6116 :
6117 : #endif /* CONFIG_SCHED_SMT */
6118 :
6119 : /*
6120 : * Scan the LLC domain for idle CPUs; this is dynamically regulated by
6121 : * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
6122 : * average idle time for this rq (as found in rq->avg_idle).
6123 : */
6124 0 : static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
6125 : {
6126 0 : struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
6127 0 : int i, cpu, idle_cpu = -1, nr = INT_MAX;
6128 0 : bool smt = test_idle_cores(target, false);
6129 0 : int this = smp_processor_id();
6130 0 : struct sched_domain *this_sd;
6131 0 : u64 time;
6132 :
6133 0 : this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
6134 0 : if (!this_sd)
6135 : return -1;
6136 :
6137 0 : cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
6138 :
6139 0 : if (sched_feat(SIS_PROP) && !smt) {
6140 0 : u64 avg_cost, avg_idle, span_avg;
6141 :
6142 : /*
6143 : * Due to large variance we need a large fuzz factor;
6144 : * hackbench in particularly is sensitive here.
6145 : */
6146 0 : avg_idle = this_rq()->avg_idle / 512;
6147 0 : avg_cost = this_sd->avg_scan_cost + 1;
6148 :
6149 0 : span_avg = sd->span_weight * avg_idle;
6150 0 : if (span_avg > 4*avg_cost)
6151 0 : nr = div_u64(span_avg, avg_cost);
6152 : else
6153 : nr = 4;
6154 :
6155 0 : time = cpu_clock(this);
6156 : }
6157 :
6158 0 : for_each_cpu_wrap(cpu, cpus, target) {
6159 0 : if (smt) {
6160 0 : i = select_idle_core(p, cpu, cpus, &idle_cpu);
6161 0 : if ((unsigned int)i < nr_cpumask_bits)
6162 0 : return i;
6163 :
6164 : } else {
6165 0 : if (!--nr)
6166 : return -1;
6167 0 : idle_cpu = __select_idle_cpu(cpu);
6168 0 : if ((unsigned int)idle_cpu < nr_cpumask_bits)
6169 : break;
6170 : }
6171 : }
6172 :
6173 0 : if (smt)
6174 0 : set_idle_cores(this, false);
6175 :
6176 0 : if (sched_feat(SIS_PROP) && !smt) {
6177 0 : time = cpu_clock(this) - time;
6178 0 : update_avg(&this_sd->avg_scan_cost, time);
6179 : }
6180 :
6181 0 : return idle_cpu;
6182 : }
6183 :
6184 : /*
6185 : * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
6186 : * the task fits. If no CPU is big enough, but there are idle ones, try to
6187 : * maximize capacity.
6188 : */
6189 : static int
6190 0 : select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
6191 : {
6192 0 : unsigned long task_util, best_cap = 0;
6193 0 : int cpu, best_cpu = -1;
6194 0 : struct cpumask *cpus;
6195 :
6196 0 : cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
6197 0 : cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
6198 :
6199 0 : task_util = uclamp_task_util(p);
6200 :
6201 0 : for_each_cpu_wrap(cpu, cpus, target) {
6202 0 : unsigned long cpu_cap = capacity_of(cpu);
6203 :
6204 0 : if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
6205 0 : continue;
6206 0 : if (fits_capacity(task_util, cpu_cap))
6207 0 : return cpu;
6208 :
6209 0 : if (cpu_cap > best_cap) {
6210 0 : best_cap = cpu_cap;
6211 0 : best_cpu = cpu;
6212 : }
6213 : }
6214 :
6215 : return best_cpu;
6216 : }
6217 :
6218 3321 : static inline bool asym_fits_capacity(int task_util, int cpu)
6219 : {
6220 3321 : if (static_branch_unlikely(&sched_asym_cpucapacity))
6221 0 : return fits_capacity(task_util, capacity_of(cpu));
6222 :
6223 : return true;
6224 : }
6225 :
6226 : /*
6227 : * Try and locate an idle core/thread in the LLC cache domain.
6228 : */
6229 10108 : static int select_idle_sibling(struct task_struct *p, int prev, int target)
6230 : {
6231 10108 : struct sched_domain *sd;
6232 10108 : unsigned long task_util;
6233 10108 : int i, recent_used_cpu;
6234 :
6235 : /*
6236 : * On asymmetric system, update task utilization because we will check
6237 : * that the task fits with cpu's capacity.
6238 : */
6239 10108 : if (static_branch_unlikely(&sched_asym_cpucapacity)) {
6240 0 : sync_entity_load_avg(&p->se);
6241 0 : task_util = uclamp_task_util(p);
6242 : }
6243 :
6244 13436 : if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
6245 3324 : asym_fits_capacity(task_util, target))
6246 : return target;
6247 :
6248 : /*
6249 : * If the previous CPU is cache affine and idle, don't be stupid:
6250 : */
6251 6787 : if (prev != target && cpus_share_cache(prev, target) &&
6252 0 : (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
6253 0 : asym_fits_capacity(task_util, prev))
6254 : return prev;
6255 :
6256 : /*
6257 : * Allow a per-cpu kthread to stack with the wakee if the
6258 : * kworker thread and the tasks previous CPUs are the same.
6259 : * The assumption is that the wakee queued work for the
6260 : * per-cpu kthread that is now complete and the wakeup is
6261 : * essentially a sync wakeup. An obvious example of this
6262 : * pattern is IO completions.
6263 : */
6264 6787 : if (is_per_cpu_kthread(current) &&
6265 1392 : prev == smp_processor_id() &&
6266 657 : this_rq()->nr_running <= 1) {
6267 : return prev;
6268 : }
6269 :
6270 : /* Check a recently used CPU as a potential idle candidate: */
6271 6408 : recent_used_cpu = p->recent_used_cpu;
6272 6408 : if (recent_used_cpu != prev &&
6273 10985 : recent_used_cpu != target &&
6274 4577 : cpus_share_cache(recent_used_cpu, target) &&
6275 0 : (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
6276 0 : cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
6277 0 : asym_fits_capacity(task_util, recent_used_cpu)) {
6278 : /*
6279 : * Replace recent_used_cpu with prev as it is a potential
6280 : * candidate for the next wake:
6281 : */
6282 0 : p->recent_used_cpu = prev;
6283 0 : return recent_used_cpu;
6284 : }
6285 :
6286 : /*
6287 : * For asymmetric CPU capacity systems, our domain of interest is
6288 : * sd_asym_cpucapacity rather than sd_llc.
6289 : */
6290 6408 : if (static_branch_unlikely(&sched_asym_cpucapacity)) {
6291 0 : sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
6292 : /*
6293 : * On an asymmetric CPU capacity system where an exclusive
6294 : * cpuset defines a symmetric island (i.e. one unique
6295 : * capacity_orig value through the cpuset), the key will be set
6296 : * but the CPUs within that cpuset will not have a domain with
6297 : * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
6298 : * capacity path.
6299 : */
6300 0 : if (sd) {
6301 0 : i = select_idle_capacity(p, sd, target);
6302 0 : return ((unsigned)i < nr_cpumask_bits) ? i : target;
6303 : }
6304 : }
6305 :
6306 6408 : sd = rcu_dereference(per_cpu(sd_llc, target));
6307 6408 : if (!sd)
6308 : return target;
6309 :
6310 0 : i = select_idle_cpu(p, sd, target);
6311 0 : if ((unsigned)i < nr_cpumask_bits)
6312 0 : return i;
6313 :
6314 : return target;
6315 : }
6316 :
6317 : /**
6318 : * cpu_util - Estimates the amount of capacity of a CPU used by CFS tasks.
6319 : * @cpu: the CPU to get the utilization of
6320 : *
6321 : * The unit of the return value must be the one of capacity so we can compare
6322 : * the utilization with the capacity of the CPU that is available for CFS task
6323 : * (ie cpu_capacity).
6324 : *
6325 : * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
6326 : * recent utilization of currently non-runnable tasks on a CPU. It represents
6327 : * the amount of utilization of a CPU in the range [0..capacity_orig] where
6328 : * capacity_orig is the cpu_capacity available at the highest frequency
6329 : * (arch_scale_freq_capacity()).
6330 : * The utilization of a CPU converges towards a sum equal to or less than the
6331 : * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
6332 : * the running time on this CPU scaled by capacity_curr.
6333 : *
6334 : * The estimated utilization of a CPU is defined to be the maximum between its
6335 : * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
6336 : * currently RUNNABLE on that CPU.
6337 : * This allows to properly represent the expected utilization of a CPU which
6338 : * has just got a big task running since a long sleep period. At the same time
6339 : * however it preserves the benefits of the "blocked utilization" in
6340 : * describing the potential for other tasks waking up on the same CPU.
6341 : *
6342 : * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
6343 : * higher than capacity_orig because of unfortunate rounding in
6344 : * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
6345 : * the average stabilizes with the new running time. We need to check that the
6346 : * utilization stays within the range of [0..capacity_orig] and cap it if
6347 : * necessary. Without utilization capping, a group could be seen as overloaded
6348 : * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
6349 : * available capacity. We allow utilization to overshoot capacity_curr (but not
6350 : * capacity_orig) as it useful for predicting the capacity required after task
6351 : * migrations (scheduler-driven DVFS).
6352 : *
6353 : * Return: the (estimated) utilization for the specified CPU
6354 : */
6355 89113 : static inline unsigned long cpu_util(int cpu)
6356 : {
6357 89113 : struct cfs_rq *cfs_rq;
6358 89113 : unsigned int util;
6359 :
6360 89113 : cfs_rq = &cpu_rq(cpu)->cfs;
6361 89113 : util = READ_ONCE(cfs_rq->avg.util_avg);
6362 :
6363 89113 : if (sched_feat(UTIL_EST))
6364 89113 : util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
6365 :
6366 89113 : return min_t(unsigned long, util, capacity_orig_of(cpu));
6367 : }
6368 :
6369 : /*
6370 : * cpu_util_without: compute cpu utilization without any contributions from *p
6371 : * @cpu: the CPU which utilization is requested
6372 : * @p: the task which utilization should be discounted
6373 : *
6374 : * The utilization of a CPU is defined by the utilization of tasks currently
6375 : * enqueued on that CPU as well as tasks which are currently sleeping after an
6376 : * execution on that CPU.
6377 : *
6378 : * This method returns the utilization of the specified CPU by discounting the
6379 : * utilization of the specified task, whenever the task is currently
6380 : * contributing to the CPU utilization.
6381 : */
6382 6336 : static unsigned long cpu_util_without(int cpu, struct task_struct *p)
6383 : {
6384 6336 : struct cfs_rq *cfs_rq;
6385 6336 : unsigned int util;
6386 :
6387 : /* Task has no contribution or is new */
6388 6336 : if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
6389 5711 : return cpu_util(cpu);
6390 :
6391 625 : cfs_rq = &cpu_rq(cpu)->cfs;
6392 625 : util = READ_ONCE(cfs_rq->avg.util_avg);
6393 :
6394 : /* Discount task's util from CPU's util */
6395 625 : lsub_positive(&util, task_util(p));
6396 :
6397 : /*
6398 : * Covered cases:
6399 : *
6400 : * a) if *p is the only task sleeping on this CPU, then:
6401 : * cpu_util (== task_util) > util_est (== 0)
6402 : * and thus we return:
6403 : * cpu_util_without = (cpu_util - task_util) = 0
6404 : *
6405 : * b) if other tasks are SLEEPING on this CPU, which is now exiting
6406 : * IDLE, then:
6407 : * cpu_util >= task_util
6408 : * cpu_util > util_est (== 0)
6409 : * and thus we discount *p's blocked utilization to return:
6410 : * cpu_util_without = (cpu_util - task_util) >= 0
6411 : *
6412 : * c) if other tasks are RUNNABLE on that CPU and
6413 : * util_est > cpu_util
6414 : * then we use util_est since it returns a more restrictive
6415 : * estimation of the spare capacity on that CPU, by just
6416 : * considering the expected utilization of tasks already
6417 : * runnable on that CPU.
6418 : *
6419 : * Cases a) and b) are covered by the above code, while case c) is
6420 : * covered by the following code when estimated utilization is
6421 : * enabled.
6422 : */
6423 625 : if (sched_feat(UTIL_EST)) {
6424 625 : unsigned int estimated =
6425 625 : READ_ONCE(cfs_rq->avg.util_est.enqueued);
6426 :
6427 : /*
6428 : * Despite the following checks we still have a small window
6429 : * for a possible race, when an execl's select_task_rq_fair()
6430 : * races with LB's detach_task():
6431 : *
6432 : * detach_task()
6433 : * p->on_rq = TASK_ON_RQ_MIGRATING;
6434 : * ---------------------------------- A
6435 : * deactivate_task() \
6436 : * dequeue_task() + RaceTime
6437 : * util_est_dequeue() /
6438 : * ---------------------------------- B
6439 : *
6440 : * The additional check on "current == p" it's required to
6441 : * properly fix the execl regression and it helps in further
6442 : * reducing the chances for the above race.
6443 : */
6444 625 : if (unlikely(task_on_rq_queued(p) || current == p))
6445 625 : lsub_positive(&estimated, _task_util_est(p));
6446 :
6447 625 : util = max(util, estimated);
6448 : }
6449 :
6450 : /*
6451 : * Utilization (estimated) can exceed the CPU capacity, thus let's
6452 : * clamp to the maximum CPU capacity to ensure consistency with
6453 : * the cpu_util call.
6454 : */
6455 625 : return min_t(unsigned long, util, capacity_orig_of(cpu));
6456 : }
6457 :
6458 : /*
6459 : * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
6460 : * to @dst_cpu.
6461 : */
6462 : static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
6463 : {
6464 : struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
6465 : unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
6466 :
6467 : /*
6468 : * If @p migrates from @cpu to another, remove its contribution. Or,
6469 : * if @p migrates from another CPU to @cpu, add its contribution. In
6470 : * the other cases, @cpu is not impacted by the migration, so the
6471 : * util_avg should already be correct.
6472 : */
6473 : if (task_cpu(p) == cpu && dst_cpu != cpu)
6474 : sub_positive(&util, task_util(p));
6475 : else if (task_cpu(p) != cpu && dst_cpu == cpu)
6476 : util += task_util(p);
6477 :
6478 : if (sched_feat(UTIL_EST)) {
6479 : util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
6480 :
6481 : /*
6482 : * During wake-up, the task isn't enqueued yet and doesn't
6483 : * appear in the cfs_rq->avg.util_est.enqueued of any rq,
6484 : * so just add it (if needed) to "simulate" what will be
6485 : * cpu_util() after the task has been enqueued.
6486 : */
6487 : if (dst_cpu == cpu)
6488 : util_est += _task_util_est(p);
6489 :
6490 : util = max(util, util_est);
6491 : }
6492 :
6493 : return min(util, capacity_orig_of(cpu));
6494 : }
6495 :
6496 : /*
6497 : * compute_energy(): Estimates the energy that @pd would consume if @p was
6498 : * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
6499 : * landscape of @pd's CPUs after the task migration, and uses the Energy Model
6500 : * to compute what would be the energy if we decided to actually migrate that
6501 : * task.
6502 : */
6503 : static long
6504 : compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
6505 : {
6506 : struct cpumask *pd_mask = perf_domain_span(pd);
6507 : unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
6508 : unsigned long max_util = 0, sum_util = 0;
6509 : int cpu;
6510 :
6511 : /*
6512 : * The capacity state of CPUs of the current rd can be driven by CPUs
6513 : * of another rd if they belong to the same pd. So, account for the
6514 : * utilization of these CPUs too by masking pd with cpu_online_mask
6515 : * instead of the rd span.
6516 : *
6517 : * If an entire pd is outside of the current rd, it will not appear in
6518 : * its pd list and will not be accounted by compute_energy().
6519 : */
6520 : for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
6521 : unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
6522 : struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
6523 :
6524 : /*
6525 : * Busy time computation: utilization clamping is not
6526 : * required since the ratio (sum_util / cpu_capacity)
6527 : * is already enough to scale the EM reported power
6528 : * consumption at the (eventually clamped) cpu_capacity.
6529 : */
6530 : sum_util += effective_cpu_util(cpu, util_cfs, cpu_cap,
6531 : ENERGY_UTIL, NULL);
6532 :
6533 : /*
6534 : * Performance domain frequency: utilization clamping
6535 : * must be considered since it affects the selection
6536 : * of the performance domain frequency.
6537 : * NOTE: in case RT tasks are running, by default the
6538 : * FREQUENCY_UTIL's utilization can be max OPP.
6539 : */
6540 : cpu_util = effective_cpu_util(cpu, util_cfs, cpu_cap,
6541 : FREQUENCY_UTIL, tsk);
6542 : max_util = max(max_util, cpu_util);
6543 : }
6544 :
6545 : return em_cpu_energy(pd->em_pd, max_util, sum_util);
6546 : }
6547 :
6548 : /*
6549 : * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
6550 : * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
6551 : * spare capacity in each performance domain and uses it as a potential
6552 : * candidate to execute the task. Then, it uses the Energy Model to figure
6553 : * out which of the CPU candidates is the most energy-efficient.
6554 : *
6555 : * The rationale for this heuristic is as follows. In a performance domain,
6556 : * all the most energy efficient CPU candidates (according to the Energy
6557 : * Model) are those for which we'll request a low frequency. When there are
6558 : * several CPUs for which the frequency request will be the same, we don't
6559 : * have enough data to break the tie between them, because the Energy Model
6560 : * only includes active power costs. With this model, if we assume that
6561 : * frequency requests follow utilization (e.g. using schedutil), the CPU with
6562 : * the maximum spare capacity in a performance domain is guaranteed to be among
6563 : * the best candidates of the performance domain.
6564 : *
6565 : * In practice, it could be preferable from an energy standpoint to pack
6566 : * small tasks on a CPU in order to let other CPUs go in deeper idle states,
6567 : * but that could also hurt our chances to go cluster idle, and we have no
6568 : * ways to tell with the current Energy Model if this is actually a good
6569 : * idea or not. So, find_energy_efficient_cpu() basically favors
6570 : * cluster-packing, and spreading inside a cluster. That should at least be
6571 : * a good thing for latency, and this is consistent with the idea that most
6572 : * of the energy savings of EAS come from the asymmetry of the system, and
6573 : * not so much from breaking the tie between identical CPUs. That's also the
6574 : * reason why EAS is enabled in the topology code only for systems where
6575 : * SD_ASYM_CPUCAPACITY is set.
6576 : *
6577 : * NOTE: Forkees are not accepted in the energy-aware wake-up path because
6578 : * they don't have any useful utilization data yet and it's not possible to
6579 : * forecast their impact on energy consumption. Consequently, they will be
6580 : * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
6581 : * to be energy-inefficient in some use-cases. The alternative would be to
6582 : * bias new tasks towards specific types of CPUs first, or to try to infer
6583 : * their util_avg from the parent task, but those heuristics could hurt
6584 : * other use-cases too. So, until someone finds a better way to solve this,
6585 : * let's keep things simple by re-using the existing slow path.
6586 : */
6587 : static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
6588 : {
6589 : unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
6590 : struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
6591 : unsigned long cpu_cap, util, base_energy = 0;
6592 : int cpu, best_energy_cpu = prev_cpu;
6593 : struct sched_domain *sd;
6594 : struct perf_domain *pd;
6595 :
6596 : rcu_read_lock();
6597 : pd = rcu_dereference(rd->pd);
6598 : if (!pd || READ_ONCE(rd->overutilized))
6599 : goto fail;
6600 :
6601 : /*
6602 : * Energy-aware wake-up happens on the lowest sched_domain starting
6603 : * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
6604 : */
6605 : sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
6606 : while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
6607 : sd = sd->parent;
6608 : if (!sd)
6609 : goto fail;
6610 :
6611 : sync_entity_load_avg(&p->se);
6612 : if (!task_util_est(p))
6613 : goto unlock;
6614 :
6615 : for (; pd; pd = pd->next) {
6616 : unsigned long cur_delta, spare_cap, max_spare_cap = 0;
6617 : unsigned long base_energy_pd;
6618 : int max_spare_cap_cpu = -1;
6619 :
6620 : /* Compute the 'base' energy of the pd, without @p */
6621 : base_energy_pd = compute_energy(p, -1, pd);
6622 : base_energy += base_energy_pd;
6623 :
6624 : for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
6625 : if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6626 : continue;
6627 :
6628 : util = cpu_util_next(cpu, p, cpu);
6629 : cpu_cap = capacity_of(cpu);
6630 : spare_cap = cpu_cap;
6631 : lsub_positive(&spare_cap, util);
6632 :
6633 : /*
6634 : * Skip CPUs that cannot satisfy the capacity request.
6635 : * IOW, placing the task there would make the CPU
6636 : * overutilized. Take uclamp into account to see how
6637 : * much capacity we can get out of the CPU; this is
6638 : * aligned with sched_cpu_util().
6639 : */
6640 : util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
6641 : if (!fits_capacity(util, cpu_cap))
6642 : continue;
6643 :
6644 : /* Always use prev_cpu as a candidate. */
6645 : if (cpu == prev_cpu) {
6646 : prev_delta = compute_energy(p, prev_cpu, pd);
6647 : prev_delta -= base_energy_pd;
6648 : best_delta = min(best_delta, prev_delta);
6649 : }
6650 :
6651 : /*
6652 : * Find the CPU with the maximum spare capacity in
6653 : * the performance domain
6654 : */
6655 : if (spare_cap > max_spare_cap) {
6656 : max_spare_cap = spare_cap;
6657 : max_spare_cap_cpu = cpu;
6658 : }
6659 : }
6660 :
6661 : /* Evaluate the energy impact of using this CPU. */
6662 : if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) {
6663 : cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
6664 : cur_delta -= base_energy_pd;
6665 : if (cur_delta < best_delta) {
6666 : best_delta = cur_delta;
6667 : best_energy_cpu = max_spare_cap_cpu;
6668 : }
6669 : }
6670 : }
6671 : unlock:
6672 : rcu_read_unlock();
6673 :
6674 : /*
6675 : * Pick the best CPU if prev_cpu cannot be used, or if it saves at
6676 : * least 6% of the energy used by prev_cpu.
6677 : */
6678 : if (prev_delta == ULONG_MAX)
6679 : return best_energy_cpu;
6680 :
6681 : if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
6682 : return best_energy_cpu;
6683 :
6684 : return prev_cpu;
6685 :
6686 : fail:
6687 : rcu_read_unlock();
6688 :
6689 : return -1;
6690 : }
6691 :
6692 : /*
6693 : * select_task_rq_fair: Select target runqueue for the waking task in domains
6694 : * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
6695 : * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
6696 : *
6697 : * Balances load by selecting the idlest CPU in the idlest group, or under
6698 : * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
6699 : *
6700 : * Returns the target CPU number.
6701 : *
6702 : * preempt must be disabled.
6703 : */
6704 : static int
6705 11719 : select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
6706 : {
6707 11719 : int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
6708 11719 : struct sched_domain *tmp, *sd = NULL;
6709 11719 : int cpu = smp_processor_id();
6710 11719 : int new_cpu = prev_cpu;
6711 11719 : int want_affine = 0;
6712 : /* SD_flags and WF_flags share the first nibble */
6713 11719 : int sd_flag = wake_flags & 0xF;
6714 :
6715 11719 : if (wake_flags & WF_TTWU) {
6716 10109 : record_wakee(p);
6717 :
6718 10110 : if (sched_energy_enabled()) {
6719 : new_cpu = find_energy_efficient_cpu(p, prev_cpu);
6720 : if (new_cpu >= 0)
6721 : return new_cpu;
6722 : new_cpu = prev_cpu;
6723 : }
6724 :
6725 10110 : want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
6726 : }
6727 :
6728 11720 : rcu_read_lock();
6729 36754 : for_each_domain(cpu, tmp) {
6730 : /*
6731 : * If both 'cpu' and 'prev_cpu' are part of this domain,
6732 : * cpu is a valid SD_WAKE_AFFINE target.
6733 : */
6734 11662 : if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
6735 937 : cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
6736 937 : if (cpu != prev_cpu)
6737 272 : new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
6738 :
6739 : sd = NULL; /* Prefer wake_affine over balance flags */
6740 : break;
6741 : }
6742 :
6743 10722 : if (tmp->flags & sd_flag)
6744 : sd = tmp;
6745 9138 : else if (!want_affine)
6746 : break;
6747 : }
6748 :
6749 11720 : if (unlikely(sd)) {
6750 : /* Slow path */
6751 1584 : new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
6752 10136 : } else if (wake_flags & WF_TTWU) { /* XXX always ? */
6753 : /* Fast path */
6754 10111 : new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
6755 :
6756 10112 : if (want_affine)
6757 937 : current->recent_used_cpu = cpu;
6758 : }
6759 11721 : rcu_read_unlock();
6760 :
6761 11723 : return new_cpu;
6762 : }
6763 :
6764 : static void detach_entity_cfs_rq(struct sched_entity *se);
6765 :
6766 : /*
6767 : * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
6768 : * cfs_rq_of(p) references at time of call are still valid and identify the
6769 : * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
6770 : */
6771 993 : static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
6772 : {
6773 : /*
6774 : * As blocked tasks retain absolute vruntime the migration needs to
6775 : * deal with this by subtracting the old and adding the new
6776 : * min_vruntime -- the latter is done by enqueue_entity() when placing
6777 : * the task on the new runqueue.
6778 : */
6779 993 : if (p->state == TASK_WAKING) {
6780 107 : struct sched_entity *se = &p->se;
6781 107 : struct cfs_rq *cfs_rq = cfs_rq_of(se);
6782 107 : u64 min_vruntime;
6783 :
6784 : #ifndef CONFIG_64BIT
6785 : u64 min_vruntime_copy;
6786 :
6787 : do {
6788 : min_vruntime_copy = cfs_rq->min_vruntime_copy;
6789 : smp_rmb();
6790 : min_vruntime = cfs_rq->min_vruntime;
6791 : } while (min_vruntime != min_vruntime_copy);
6792 : #else
6793 107 : min_vruntime = cfs_rq->min_vruntime;
6794 : #endif
6795 :
6796 107 : se->vruntime -= min_vruntime;
6797 : }
6798 :
6799 993 : if (p->on_rq == TASK_ON_RQ_MIGRATING) {
6800 : /*
6801 : * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
6802 : * rq->lock and can modify state directly.
6803 : */
6804 1773 : lockdep_assert_held(&task_rq(p)->lock);
6805 886 : detach_entity_cfs_rq(&p->se);
6806 :
6807 : } else {
6808 : /*
6809 : * We are supposed to update the task to "current" time, then
6810 : * its up to date and ready to go to new CPU/cfs_rq. But we
6811 : * have difficulty in getting what current time is, so simply
6812 : * throw away the out-of-date time. This will result in the
6813 : * wakee task is less decayed, but giving the wakee more load
6814 : * sounds not bad.
6815 : */
6816 107 : remove_entity_load_avg(&p->se);
6817 : }
6818 :
6819 : /* Tell new CPU we are migrated */
6820 994 : p->se.avg.last_update_time = 0;
6821 :
6822 : /* We have migrated, no longer consider this task hot */
6823 994 : p->se.exec_start = 0;
6824 :
6825 994 : update_scan_period(p, new_cpu);
6826 994 : }
6827 :
6828 910 : static void task_dead_fair(struct task_struct *p)
6829 : {
6830 910 : remove_entity_load_avg(&p->se);
6831 910 : }
6832 :
6833 : static int
6834 73 : balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
6835 : {
6836 73 : if (rq->nr_running)
6837 : return 1;
6838 :
6839 7 : return newidle_balance(rq, rf) != 0;
6840 : }
6841 : #endif /* CONFIG_SMP */
6842 :
6843 6402 : static unsigned long wakeup_gran(struct sched_entity *se)
6844 : {
6845 6402 : unsigned long gran = sysctl_sched_wakeup_granularity;
6846 :
6847 : /*
6848 : * Since its curr running now, convert the gran from real-time
6849 : * to virtual-time in his units.
6850 : *
6851 : * By using 'se' instead of 'curr' we penalize light tasks, so
6852 : * they get preempted easier. That is, if 'se' < 'curr' then
6853 : * the resulting gran will be larger, therefore penalizing the
6854 : * lighter, if otoh 'se' > 'curr' then the resulting gran will
6855 : * be smaller, again penalizing the lighter task.
6856 : *
6857 : * This is especially important for buddies when the leftmost
6858 : * task is higher priority than the buddy.
6859 : */
6860 6402 : return calc_delta_fair(gran, se);
6861 : }
6862 :
6863 : /*
6864 : * Should 'se' preempt 'curr'.
6865 : *
6866 : * |s1
6867 : * |s2
6868 : * |s3
6869 : * g
6870 : * |<--->|c
6871 : *
6872 : * w(c, s1) = -1
6873 : * w(c, s2) = 0
6874 : * w(c, s3) = 1
6875 : *
6876 : */
6877 : static int
6878 12654 : wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
6879 : {
6880 12654 : s64 gran, vdiff = curr->vruntime - se->vruntime;
6881 :
6882 12654 : if (vdiff <= 0)
6883 : return -1;
6884 :
6885 6402 : gran = wakeup_gran(se);
6886 6402 : if (vdiff > gran)
6887 5103 : return 1;
6888 :
6889 : return 0;
6890 : }
6891 :
6892 0 : static void set_last_buddy(struct sched_entity *se)
6893 : {
6894 0 : if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
6895 : return;
6896 :
6897 0 : for_each_sched_entity(se) {
6898 0 : if (SCHED_WARN_ON(!se->on_rq))
6899 : return;
6900 0 : cfs_rq_of(se)->last = se;
6901 : }
6902 : }
6903 :
6904 5097 : static void set_next_buddy(struct sched_entity *se)
6905 : {
6906 5097 : if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
6907 : return;
6908 :
6909 5097 : for_each_sched_entity(se) {
6910 5097 : if (SCHED_WARN_ON(!se->on_rq))
6911 : return;
6912 5097 : cfs_rq_of(se)->next = se;
6913 : }
6914 : }
6915 :
6916 0 : static void set_skip_buddy(struct sched_entity *se)
6917 : {
6918 0 : for_each_sched_entity(se)
6919 0 : cfs_rq_of(se)->skip = se;
6920 : }
6921 :
6922 : /*
6923 : * Preempt the current task with a newly woken task if needed:
6924 : */
6925 8372 : static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
6926 : {
6927 8372 : struct task_struct *curr = rq->curr;
6928 8372 : struct sched_entity *se = &curr->se, *pse = &p->se;
6929 8372 : struct cfs_rq *cfs_rq = task_cfs_rq(curr);
6930 8372 : int scale = cfs_rq->nr_running >= sched_nr_latency;
6931 8372 : int next_buddy_marked = 0;
6932 :
6933 8372 : if (unlikely(se == pse))
6934 8373 : return;
6935 :
6936 : /*
6937 : * This is possible from callers such as attach_tasks(), in which we
6938 : * unconditionally check_prempt_curr() after an enqueue (which may have
6939 : * lead to a throttle). This both saves work and prevents false
6940 : * next-buddy nomination below.
6941 : */
6942 8210 : if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
6943 : return;
6944 :
6945 8210 : if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
6946 : set_next_buddy(pse);
6947 : next_buddy_marked = 1;
6948 : }
6949 :
6950 : /*
6951 : * We can come here with TIF_NEED_RESCHED already set from new task
6952 : * wake up path.
6953 : *
6954 : * Note: this also catches the edge-case of curr being in a throttled
6955 : * group (e.g. via set_curr_task), since update_curr() (in the
6956 : * enqueue of curr) will have resulted in resched being set. This
6957 : * prevents us from potentially nominating it as a false LAST_BUDDY
6958 : * below.
6959 : */
6960 8210 : if (test_tsk_need_resched(curr))
6961 : return;
6962 :
6963 : /* Idle tasks are by definition preempted by non-idle tasks. */
6964 7714 : if (unlikely(task_has_idle_policy(curr)) &&
6965 0 : likely(!task_has_idle_policy(p)))
6966 0 : goto preempt;
6967 :
6968 : /*
6969 : * Batch and idle tasks do not preempt non-idle tasks (their preemption
6970 : * is driven by the tick):
6971 : */
6972 7714 : if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
6973 : return;
6974 :
6975 7714 : find_matching_se(&se, &pse);
6976 7714 : update_curr(cfs_rq_of(se));
6977 7714 : BUG_ON(!pse);
6978 7714 : if (wakeup_preempt_entity(se, pse) == 1) {
6979 : /*
6980 : * Bias pick_next to pick the sched entity that is
6981 : * triggering this preemption.
6982 : */
6983 5097 : if (!next_buddy_marked)
6984 5097 : set_next_buddy(pse);
6985 5097 : goto preempt;
6986 : }
6987 :
6988 : return;
6989 :
6990 5097 : preempt:
6991 5097 : resched_curr(rq);
6992 : /*
6993 : * Only set the backward buddy when the current task is still
6994 : * on the rq. This can happen when a wakeup gets interleaved
6995 : * with schedule on the ->pre_schedule() or idle_balance()
6996 : * point, either of which can * drop the rq lock.
6997 : *
6998 : * Also, during early boot the idle thread is in the fair class,
6999 : * for obvious reasons its a bad idea to schedule back to it.
7000 : */
7001 5097 : if (unlikely(!se->on_rq || curr == rq->idle))
7002 : return;
7003 :
7004 4986 : if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
7005 0 : set_last_buddy(se);
7006 : }
7007 :
7008 : struct task_struct *
7009 28048 : pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
7010 : {
7011 28048 : struct cfs_rq *cfs_rq = &rq->cfs;
7012 28495 : struct sched_entity *se;
7013 28495 : struct task_struct *p;
7014 28495 : int new_tasks;
7015 :
7016 28495 : again:
7017 28495 : if (!sched_fair_runnable(rq))
7018 7420 : goto idle;
7019 :
7020 : #ifdef CONFIG_FAIR_GROUP_SCHED
7021 : if (!prev || prev->sched_class != &fair_sched_class)
7022 : goto simple;
7023 :
7024 : /*
7025 : * Because of the set_next_buddy() in dequeue_task_fair() it is rather
7026 : * likely that a next task is from the same cgroup as the current.
7027 : *
7028 : * Therefore attempt to avoid putting and setting the entire cgroup
7029 : * hierarchy, only change the part that actually changes.
7030 : */
7031 :
7032 : do {
7033 : struct sched_entity *curr = cfs_rq->curr;
7034 :
7035 : /*
7036 : * Since we got here without doing put_prev_entity() we also
7037 : * have to consider cfs_rq->curr. If it is still a runnable
7038 : * entity, update_curr() will update its vruntime, otherwise
7039 : * forget we've ever seen it.
7040 : */
7041 : if (curr) {
7042 : if (curr->on_rq)
7043 : update_curr(cfs_rq);
7044 : else
7045 : curr = NULL;
7046 :
7047 : /*
7048 : * This call to check_cfs_rq_runtime() will do the
7049 : * throttle and dequeue its entity in the parent(s).
7050 : * Therefore the nr_running test will indeed
7051 : * be correct.
7052 : */
7053 : if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
7054 : cfs_rq = &rq->cfs;
7055 :
7056 : if (!cfs_rq->nr_running)
7057 : goto idle;
7058 :
7059 : goto simple;
7060 : }
7061 : }
7062 :
7063 : se = pick_next_entity(cfs_rq, curr);
7064 : cfs_rq = group_cfs_rq(se);
7065 : } while (cfs_rq);
7066 :
7067 : p = task_of(se);
7068 :
7069 : /*
7070 : * Since we haven't yet done put_prev_entity and if the selected task
7071 : * is a different task than we started out with, try and touch the
7072 : * least amount of cfs_rqs.
7073 : */
7074 : if (prev != p) {
7075 : struct sched_entity *pse = &prev->se;
7076 :
7077 : while (!(cfs_rq = is_same_group(se, pse))) {
7078 : int se_depth = se->depth;
7079 : int pse_depth = pse->depth;
7080 :
7081 : if (se_depth <= pse_depth) {
7082 : put_prev_entity(cfs_rq_of(pse), pse);
7083 : pse = parent_entity(pse);
7084 : }
7085 : if (se_depth >= pse_depth) {
7086 : set_next_entity(cfs_rq_of(se), se);
7087 : se = parent_entity(se);
7088 : }
7089 : }
7090 :
7091 : put_prev_entity(cfs_rq, pse);
7092 : set_next_entity(cfs_rq, se);
7093 : }
7094 :
7095 : goto done;
7096 : simple:
7097 : #endif
7098 21075 : if (prev)
7099 21042 : put_prev_task(rq, prev);
7100 :
7101 21074 : do {
7102 21074 : se = pick_next_entity(cfs_rq, NULL);
7103 21075 : set_next_entity(cfs_rq, se);
7104 21076 : cfs_rq = group_cfs_rq(se);
7105 21076 : } while (cfs_rq);
7106 :
7107 21076 : p = task_of(se);
7108 :
7109 21076 : done: __maybe_unused;
7110 : #ifdef CONFIG_SMP
7111 : /*
7112 : * Move the next running task to the front of
7113 : * the list, so our cfs_tasks list becomes MRU
7114 : * one.
7115 : */
7116 21076 : list_move(&p->se.group_node, &rq->cfs_tasks);
7117 : #endif
7118 :
7119 21076 : if (hrtick_enabled_fair(rq))
7120 21076 : hrtick_start_fair(rq, p);
7121 :
7122 21076 : update_misfit_status(p, rq);
7123 :
7124 21076 : return p;
7125 :
7126 7420 : idle:
7127 7420 : if (!rf)
7128 : return NULL;
7129 :
7130 7413 : new_tasks = newidle_balance(rq, rf);
7131 :
7132 : /*
7133 : * Because newidle_balance() releases (and re-acquires) rq->lock, it is
7134 : * possible for any higher priority task to appear. In that case we
7135 : * must re-start the pick_next_entity() loop.
7136 : */
7137 7417 : if (new_tasks < 0)
7138 : return RETRY_TASK;
7139 :
7140 7417 : if (new_tasks > 0)
7141 447 : goto again;
7142 :
7143 : /*
7144 : * rq is about to be idle, check if we need to update the
7145 : * lost_idle_time of clock_pelt
7146 : */
7147 6970 : update_idle_rq_clock_pelt(rq);
7148 :
7149 6970 : return NULL;
7150 : }
7151 :
7152 39 : static struct task_struct *__pick_next_task_fair(struct rq *rq)
7153 : {
7154 39 : return pick_next_task_fair(rq, NULL, NULL);
7155 : }
7156 :
7157 : /*
7158 : * Account for a descheduled task:
7159 : */
7160 21087 : static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
7161 : {
7162 21087 : struct sched_entity *se = &prev->se;
7163 21087 : struct cfs_rq *cfs_rq;
7164 :
7165 21087 : for_each_sched_entity(se) {
7166 21087 : cfs_rq = cfs_rq_of(se);
7167 21087 : put_prev_entity(cfs_rq, se);
7168 : }
7169 21088 : }
7170 :
7171 : /*
7172 : * sched_yield() is very simple
7173 : *
7174 : * The magic of dealing with the ->skip buddy is in pick_next_entity.
7175 : */
7176 0 : static void yield_task_fair(struct rq *rq)
7177 : {
7178 0 : struct task_struct *curr = rq->curr;
7179 0 : struct cfs_rq *cfs_rq = task_cfs_rq(curr);
7180 0 : struct sched_entity *se = &curr->se;
7181 :
7182 : /*
7183 : * Are we the only task in the tree?
7184 : */
7185 0 : if (unlikely(rq->nr_running == 1))
7186 : return;
7187 :
7188 0 : clear_buddies(cfs_rq, se);
7189 :
7190 0 : if (curr->policy != SCHED_BATCH) {
7191 0 : update_rq_clock(rq);
7192 : /*
7193 : * Update run-time statistics of the 'current'.
7194 : */
7195 0 : update_curr(cfs_rq);
7196 : /*
7197 : * Tell update_rq_clock() that we've just updated,
7198 : * so we don't do microscopic update in schedule()
7199 : * and double the fastpath cost.
7200 : */
7201 0 : rq_clock_skip_update(rq);
7202 : }
7203 :
7204 0 : set_skip_buddy(se);
7205 : }
7206 :
7207 0 : static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
7208 : {
7209 0 : struct sched_entity *se = &p->se;
7210 :
7211 : /* throttled hierarchies are not runnable */
7212 0 : if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
7213 : return false;
7214 :
7215 : /* Tell the scheduler that we'd really like pse to run next. */
7216 0 : set_next_buddy(se);
7217 :
7218 0 : yield_task_fair(rq);
7219 :
7220 0 : return true;
7221 : }
7222 :
7223 : #ifdef CONFIG_SMP
7224 : /**************************************************
7225 : * Fair scheduling class load-balancing methods.
7226 : *
7227 : * BASICS
7228 : *
7229 : * The purpose of load-balancing is to achieve the same basic fairness the
7230 : * per-CPU scheduler provides, namely provide a proportional amount of compute
7231 : * time to each task. This is expressed in the following equation:
7232 : *
7233 : * W_i,n/P_i == W_j,n/P_j for all i,j (1)
7234 : *
7235 : * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
7236 : * W_i,0 is defined as:
7237 : *
7238 : * W_i,0 = \Sum_j w_i,j (2)
7239 : *
7240 : * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
7241 : * is derived from the nice value as per sched_prio_to_weight[].
7242 : *
7243 : * The weight average is an exponential decay average of the instantaneous
7244 : * weight:
7245 : *
7246 : * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
7247 : *
7248 : * C_i is the compute capacity of CPU i, typically it is the
7249 : * fraction of 'recent' time available for SCHED_OTHER task execution. But it
7250 : * can also include other factors [XXX].
7251 : *
7252 : * To achieve this balance we define a measure of imbalance which follows
7253 : * directly from (1):
7254 : *
7255 : * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
7256 : *
7257 : * We them move tasks around to minimize the imbalance. In the continuous
7258 : * function space it is obvious this converges, in the discrete case we get
7259 : * a few fun cases generally called infeasible weight scenarios.
7260 : *
7261 : * [XXX expand on:
7262 : * - infeasible weights;
7263 : * - local vs global optima in the discrete case. ]
7264 : *
7265 : *
7266 : * SCHED DOMAINS
7267 : *
7268 : * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
7269 : * for all i,j solution, we create a tree of CPUs that follows the hardware
7270 : * topology where each level pairs two lower groups (or better). This results
7271 : * in O(log n) layers. Furthermore we reduce the number of CPUs going up the
7272 : * tree to only the first of the previous level and we decrease the frequency
7273 : * of load-balance at each level inv. proportional to the number of CPUs in
7274 : * the groups.
7275 : *
7276 : * This yields:
7277 : *
7278 : * log_2 n 1 n
7279 : * \Sum { --- * --- * 2^i } = O(n) (5)
7280 : * i = 0 2^i 2^i
7281 : * `- size of each group
7282 : * | | `- number of CPUs doing load-balance
7283 : * | `- freq
7284 : * `- sum over all levels
7285 : *
7286 : * Coupled with a limit on how many tasks we can migrate every balance pass,
7287 : * this makes (5) the runtime complexity of the balancer.
7288 : *
7289 : * An important property here is that each CPU is still (indirectly) connected
7290 : * to every other CPU in at most O(log n) steps:
7291 : *
7292 : * The adjacency matrix of the resulting graph is given by:
7293 : *
7294 : * log_2 n
7295 : * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
7296 : * k = 0
7297 : *
7298 : * And you'll find that:
7299 : *
7300 : * A^(log_2 n)_i,j != 0 for all i,j (7)
7301 : *
7302 : * Showing there's indeed a path between every CPU in at most O(log n) steps.
7303 : * The task movement gives a factor of O(m), giving a convergence complexity
7304 : * of:
7305 : *
7306 : * O(nm log n), n := nr_cpus, m := nr_tasks (8)
7307 : *
7308 : *
7309 : * WORK CONSERVING
7310 : *
7311 : * In order to avoid CPUs going idle while there's still work to do, new idle
7312 : * balancing is more aggressive and has the newly idle CPU iterate up the domain
7313 : * tree itself instead of relying on other CPUs to bring it work.
7314 : *
7315 : * This adds some complexity to both (5) and (8) but it reduces the total idle
7316 : * time.
7317 : *
7318 : * [XXX more?]
7319 : *
7320 : *
7321 : * CGROUPS
7322 : *
7323 : * Cgroups make a horror show out of (2), instead of a simple sum we get:
7324 : *
7325 : * s_k,i
7326 : * W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
7327 : * S_k
7328 : *
7329 : * Where
7330 : *
7331 : * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
7332 : *
7333 : * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
7334 : *
7335 : * The big problem is S_k, its a global sum needed to compute a local (W_i)
7336 : * property.
7337 : *
7338 : * [XXX write more on how we solve this.. _after_ merging pjt's patches that
7339 : * rewrite all of this once again.]
7340 : */
7341 :
7342 : static unsigned long __read_mostly max_load_balance_interval = HZ/10;
7343 :
7344 : enum fbq_type { regular, remote, all };
7345 :
7346 : /*
7347 : * 'group_type' describes the group of CPUs at the moment of load balancing.
7348 : *
7349 : * The enum is ordered by pulling priority, with the group with lowest priority
7350 : * first so the group_type can simply be compared when selecting the busiest
7351 : * group. See update_sd_pick_busiest().
7352 : */
7353 : enum group_type {
7354 : /* The group has spare capacity that can be used to run more tasks. */
7355 : group_has_spare = 0,
7356 : /*
7357 : * The group is fully used and the tasks don't compete for more CPU
7358 : * cycles. Nevertheless, some tasks might wait before running.
7359 : */
7360 : group_fully_busy,
7361 : /*
7362 : * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
7363 : * and must be migrated to a more powerful CPU.
7364 : */
7365 : group_misfit_task,
7366 : /*
7367 : * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
7368 : * and the task should be migrated to it instead of running on the
7369 : * current CPU.
7370 : */
7371 : group_asym_packing,
7372 : /*
7373 : * The tasks' affinity constraints previously prevented the scheduler
7374 : * from balancing the load across the system.
7375 : */
7376 : group_imbalanced,
7377 : /*
7378 : * The CPU is overloaded and can't provide expected CPU cycles to all
7379 : * tasks.
7380 : */
7381 : group_overloaded
7382 : };
7383 :
7384 : enum migration_type {
7385 : migrate_load = 0,
7386 : migrate_util,
7387 : migrate_task,
7388 : migrate_misfit
7389 : };
7390 :
7391 : #define LBF_ALL_PINNED 0x01
7392 : #define LBF_NEED_BREAK 0x02
7393 : #define LBF_DST_PINNED 0x04
7394 : #define LBF_SOME_PINNED 0x08
7395 : #define LBF_NOHZ_STATS 0x10
7396 : #define LBF_NOHZ_AGAIN 0x20
7397 :
7398 : struct lb_env {
7399 : struct sched_domain *sd;
7400 :
7401 : struct rq *src_rq;
7402 : int src_cpu;
7403 :
7404 : int dst_cpu;
7405 : struct rq *dst_rq;
7406 :
7407 : struct cpumask *dst_grpmask;
7408 : int new_dst_cpu;
7409 : enum cpu_idle_type idle;
7410 : long imbalance;
7411 : /* The set of CPUs under consideration for load-balancing */
7412 : struct cpumask *cpus;
7413 :
7414 : unsigned int flags;
7415 :
7416 : unsigned int loop;
7417 : unsigned int loop_break;
7418 : unsigned int loop_max;
7419 :
7420 : enum fbq_type fbq_type;
7421 : enum migration_type migration_type;
7422 : struct list_head tasks;
7423 : };
7424 :
7425 : /*
7426 : * Is this task likely cache-hot:
7427 : */
7428 2300 : static int task_hot(struct task_struct *p, struct lb_env *env)
7429 : {
7430 2300 : s64 delta;
7431 :
7432 4600 : lockdep_assert_held(&env->src_rq->lock);
7433 :
7434 2300 : if (p->sched_class != &fair_sched_class)
7435 : return 0;
7436 :
7437 2300 : if (unlikely(task_has_idle_policy(p)))
7438 : return 0;
7439 :
7440 : /* SMT siblings share cache */
7441 2300 : if (env->sd->flags & SD_SHARE_CPUCAPACITY)
7442 : return 0;
7443 :
7444 : /*
7445 : * Buddy candidates are cache hot:
7446 : */
7447 2300 : if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
7448 287 : (&p->se == cfs_rq_of(&p->se)->next ||
7449 274 : &p->se == cfs_rq_of(&p->se)->last))
7450 : return 1;
7451 :
7452 2287 : if (sysctl_sched_migration_cost == -1)
7453 : return 1;
7454 2287 : if (sysctl_sched_migration_cost == 0)
7455 : return 0;
7456 :
7457 2287 : delta = rq_clock_task(env->src_rq) - p->se.exec_start;
7458 :
7459 2287 : return delta < (s64)sysctl_sched_migration_cost;
7460 : }
7461 :
7462 : #ifdef CONFIG_NUMA_BALANCING
7463 : /*
7464 : * Returns 1, if task migration degrades locality
7465 : * Returns 0, if task migration improves locality i.e migration preferred.
7466 : * Returns -1, if task migration is not affected by locality.
7467 : */
7468 : static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
7469 : {
7470 : struct numa_group *numa_group = rcu_dereference(p->numa_group);
7471 : unsigned long src_weight, dst_weight;
7472 : int src_nid, dst_nid, dist;
7473 :
7474 : if (!static_branch_likely(&sched_numa_balancing))
7475 : return -1;
7476 :
7477 : if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
7478 : return -1;
7479 :
7480 : src_nid = cpu_to_node(env->src_cpu);
7481 : dst_nid = cpu_to_node(env->dst_cpu);
7482 :
7483 : if (src_nid == dst_nid)
7484 : return -1;
7485 :
7486 : /* Migrating away from the preferred node is always bad. */
7487 : if (src_nid == p->numa_preferred_nid) {
7488 : if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
7489 : return 1;
7490 : else
7491 : return -1;
7492 : }
7493 :
7494 : /* Encourage migration to the preferred node. */
7495 : if (dst_nid == p->numa_preferred_nid)
7496 : return 0;
7497 :
7498 : /* Leaving a core idle is often worse than degrading locality. */
7499 : if (env->idle == CPU_IDLE)
7500 : return -1;
7501 :
7502 : dist = node_distance(src_nid, dst_nid);
7503 : if (numa_group) {
7504 : src_weight = group_weight(p, src_nid, dist);
7505 : dst_weight = group_weight(p, dst_nid, dist);
7506 : } else {
7507 : src_weight = task_weight(p, src_nid, dist);
7508 : dst_weight = task_weight(p, dst_nid, dist);
7509 : }
7510 :
7511 : return dst_weight < src_weight;
7512 : }
7513 :
7514 : #else
7515 2300 : static inline int migrate_degrades_locality(struct task_struct *p,
7516 : struct lb_env *env)
7517 : {
7518 2300 : return -1;
7519 : }
7520 : #endif
7521 :
7522 : /*
7523 : * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
7524 : */
7525 : static
7526 5144 : int can_migrate_task(struct task_struct *p, struct lb_env *env)
7527 : {
7528 5144 : int tsk_cache_hot;
7529 :
7530 10289 : lockdep_assert_held(&env->src_rq->lock);
7531 :
7532 : /*
7533 : * We do not migrate tasks that are:
7534 : * 1) throttled_lb_pair, or
7535 : * 2) cannot be migrated to this CPU due to cpus_ptr, or
7536 : * 3) running (obviously), or
7537 : * 4) are cache-hot on their current CPU.
7538 : */
7539 5145 : if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
7540 : return 0;
7541 :
7542 5145 : if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
7543 1341 : int cpu;
7544 :
7545 1341 : schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
7546 :
7547 1341 : env->flags |= LBF_SOME_PINNED;
7548 :
7549 : /*
7550 : * Remember if this task can be migrated to any other CPU in
7551 : * our sched_group. We may want to revisit it if we couldn't
7552 : * meet load balance goals by pulling other tasks on src_cpu.
7553 : *
7554 : * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
7555 : * already computed one in current iteration.
7556 : */
7557 1341 : if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
7558 : return 0;
7559 :
7560 : /* Prevent to re-select dst_cpu via env's CPUs: */
7561 1382 : for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
7562 691 : if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
7563 0 : env->flags |= LBF_DST_PINNED;
7564 0 : env->new_dst_cpu = cpu;
7565 0 : break;
7566 : }
7567 : }
7568 :
7569 691 : return 0;
7570 : }
7571 :
7572 : /* Record that we found atleast one task that could run on dst_cpu */
7573 3804 : env->flags &= ~LBF_ALL_PINNED;
7574 :
7575 3804 : if (task_running(env->src_rq, p)) {
7576 : schedstat_inc(p->se.statistics.nr_failed_migrations_running);
7577 : return 0;
7578 : }
7579 :
7580 : /*
7581 : * Aggressive migration if:
7582 : * 1) destination numa is preferred
7583 : * 2) task is cache cold, or
7584 : * 3) too many balance attempts have failed.
7585 : */
7586 2300 : tsk_cache_hot = migrate_degrades_locality(p, env);
7587 2300 : if (tsk_cache_hot == -1)
7588 2300 : tsk_cache_hot = task_hot(p, env);
7589 :
7590 2300 : if (tsk_cache_hot <= 0 ||
7591 316 : env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
7592 2015 : if (tsk_cache_hot == 1) {
7593 2015 : schedstat_inc(env->sd->lb_hot_gained[env->idle]);
7594 2015 : schedstat_inc(p->se.statistics.nr_forced_migrations);
7595 : }
7596 2015 : return 1;
7597 : }
7598 :
7599 : schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
7600 : return 0;
7601 : }
7602 :
7603 : /*
7604 : * detach_task() -- detach the task for the migration specified in env
7605 : */
7606 855 : static void detach_task(struct task_struct *p, struct lb_env *env)
7607 : {
7608 1711 : lockdep_assert_held(&env->src_rq->lock);
7609 :
7610 856 : deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
7611 856 : set_task_cpu(p, env->dst_cpu);
7612 856 : }
7613 :
7614 : /*
7615 : * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
7616 : * part of active balancing operations within "domain".
7617 : *
7618 : * Returns a task if successful and NULL otherwise.
7619 : */
7620 1 : static struct task_struct *detach_one_task(struct lb_env *env)
7621 : {
7622 1 : struct task_struct *p;
7623 :
7624 2 : lockdep_assert_held(&env->src_rq->lock);
7625 :
7626 1 : list_for_each_entry_reverse(p,
7627 : &env->src_rq->cfs_tasks, se.group_node) {
7628 1 : if (!can_migrate_task(p, env))
7629 0 : continue;
7630 :
7631 1 : detach_task(p, env);
7632 :
7633 : /*
7634 : * Right now, this is only the second place where
7635 : * lb_gained[env->idle] is updated (other is detach_tasks)
7636 : * so we can safely collect stats here rather than
7637 : * inside detach_tasks().
7638 : */
7639 1 : schedstat_inc(env->sd->lb_gained[env->idle]);
7640 1 : return p;
7641 : }
7642 : return NULL;
7643 : }
7644 :
7645 : static const unsigned int sched_nr_migrate_break = 32;
7646 :
7647 : /*
7648 : * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
7649 : * busiest_rq, as part of a balancing operation within domain "sd".
7650 : *
7651 : * Returns number of detached tasks if successful and 0 otherwise.
7652 : */
7653 2607 : static int detach_tasks(struct lb_env *env)
7654 : {
7655 2607 : struct list_head *tasks = &env->src_rq->cfs_tasks;
7656 2607 : unsigned long util, load;
7657 2607 : struct task_struct *p;
7658 2607 : int detached = 0;
7659 :
7660 5215 : lockdep_assert_held(&env->src_rq->lock);
7661 :
7662 2607 : if (env->imbalance <= 0)
7663 : return 0;
7664 :
7665 7332 : while (!list_empty(tasks)) {
7666 : /*
7667 : * We don't want to steal all, otherwise we may be treated likewise,
7668 : * which could at worst lead to a livelock crash.
7669 : */
7670 7332 : if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
7671 : break;
7672 :
7673 7006 : p = list_last_entry(tasks, struct task_struct, se.group_node);
7674 :
7675 7006 : env->loop++;
7676 : /* We've more or less seen every task there is, call it quits */
7677 7006 : if (env->loop > env->loop_max)
7678 : break;
7679 :
7680 : /* take a breather every nr_migrate tasks */
7681 5141 : if (env->loop > env->loop_break) {
7682 0 : env->loop_break += sched_nr_migrate_break;
7683 0 : env->flags |= LBF_NEED_BREAK;
7684 0 : break;
7685 : }
7686 :
7687 5141 : if (!can_migrate_task(p, env))
7688 3129 : goto next;
7689 :
7690 2013 : switch (env->migration_type) {
7691 : case migrate_load:
7692 : /*
7693 : * Depending of the number of CPUs and tasks and the
7694 : * cgroup hierarchy, task_h_load() can return a null
7695 : * value. Make sure that env->imbalance decreases
7696 : * otherwise detach_tasks() will stop only after
7697 : * detaching up to loop_max tasks.
7698 : */
7699 144 : load = max_t(unsigned long, task_h_load(p), 1);
7700 :
7701 144 : if (sched_feat(LB_MIN) &&
7702 : load < 16 && !env->sd->nr_balance_failed)
7703 : goto next;
7704 :
7705 : /*
7706 : * Make sure that we don't migrate too much load.
7707 : * Nevertheless, let relax the constraint if
7708 : * scheduler fails to find a good waiting task to
7709 : * migrate.
7710 : */
7711 :
7712 144 : if ((load >> env->sd->nr_balance_failed) > env->imbalance)
7713 100 : goto next;
7714 :
7715 44 : env->imbalance -= load;
7716 44 : break;
7717 :
7718 : case migrate_util:
7719 1465 : util = task_util_est(p);
7720 :
7721 1465 : if (util > env->imbalance)
7722 1059 : goto next;
7723 :
7724 406 : env->imbalance -= util;
7725 406 : break;
7726 :
7727 404 : case migrate_task:
7728 404 : env->imbalance--;
7729 404 : break;
7730 :
7731 0 : case migrate_misfit:
7732 : /* This is not a misfit task */
7733 0 : if (task_fits_capacity(p, capacity_of(env->src_cpu)))
7734 0 : goto next;
7735 :
7736 0 : env->imbalance = 0;
7737 0 : break;
7738 : }
7739 :
7740 854 : detach_task(p, env);
7741 853 : list_add(&p->se.group_node, &env->tasks);
7742 :
7743 853 : detached++;
7744 :
7745 : #ifdef CONFIG_PREEMPTION
7746 : /*
7747 : * NEWIDLE balancing is a source of latency, so preemptible
7748 : * kernels will stop after the first task is detached to minimize
7749 : * the critical section.
7750 : */
7751 : if (env->idle == CPU_NEWLY_IDLE)
7752 : break;
7753 : #endif
7754 :
7755 : /*
7756 : * We only want to steal up to the prescribed amount of
7757 : * load/util/tasks.
7758 : */
7759 853 : if (env->imbalance <= 0)
7760 : break;
7761 :
7762 437 : continue;
7763 4288 : next:
7764 11620 : list_move(&p->se.group_node, tasks);
7765 : }
7766 :
7767 : /*
7768 : * Right now, this is one of only two places we collect this stat
7769 : * so we can safely collect detach_one_task() stats here rather
7770 : * than inside detach_one_task().
7771 : */
7772 : schedstat_add(env->sd->lb_gained[env->idle], detached);
7773 :
7774 : return detached;
7775 : }
7776 :
7777 : /*
7778 : * attach_task() -- attach the task detached by detach_task() to its new rq.
7779 : */
7780 855 : static void attach_task(struct rq *rq, struct task_struct *p)
7781 : {
7782 1710 : lockdep_assert_held(&rq->lock);
7783 :
7784 855 : BUG_ON(task_rq(p) != rq);
7785 855 : activate_task(rq, p, ENQUEUE_NOCLOCK);
7786 854 : check_preempt_curr(rq, p, 0);
7787 855 : }
7788 :
7789 : /*
7790 : * attach_one_task() -- attaches the task returned from detach_one_task() to
7791 : * its new rq.
7792 : */
7793 1 : static void attach_one_task(struct rq *rq, struct task_struct *p)
7794 : {
7795 1 : struct rq_flags rf;
7796 :
7797 1 : rq_lock(rq, &rf);
7798 1 : update_rq_clock(rq);
7799 1 : attach_task(rq, p);
7800 1 : rq_unlock(rq, &rf);
7801 1 : }
7802 :
7803 : /*
7804 : * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
7805 : * new rq.
7806 : */
7807 826 : static void attach_tasks(struct lb_env *env)
7808 : {
7809 826 : struct list_head *tasks = &env->tasks;
7810 826 : struct task_struct *p;
7811 826 : struct rq_flags rf;
7812 :
7813 826 : rq_lock(env->dst_rq, &rf);
7814 826 : update_rq_clock(env->dst_rq);
7815 :
7816 1680 : while (!list_empty(tasks)) {
7817 854 : p = list_first_entry(tasks, struct task_struct, se.group_node);
7818 854 : list_del_init(&p->se.group_node);
7819 :
7820 854 : attach_task(env->dst_rq, p);
7821 : }
7822 :
7823 826 : rq_unlock(env->dst_rq, &rf);
7824 826 : }
7825 :
7826 : #ifdef CONFIG_NO_HZ_COMMON
7827 12202 : static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
7828 : {
7829 12202 : if (cfs_rq->avg.load_avg)
7830 : return true;
7831 :
7832 39 : if (cfs_rq->avg.util_avg)
7833 : return true;
7834 :
7835 : return false;
7836 : }
7837 :
7838 12211 : static inline bool others_have_blocked(struct rq *rq)
7839 : {
7840 12211 : if (READ_ONCE(rq->avg_rt.util_avg))
7841 : return true;
7842 :
7843 12211 : if (READ_ONCE(rq->avg_dl.util_avg))
7844 : return true;
7845 :
7846 12211 : if (thermal_load_avg(rq))
7847 : return true;
7848 :
7849 : #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
7850 12211 : if (READ_ONCE(rq->avg_irq.util_avg))
7851 : return true;
7852 : #endif
7853 :
7854 : return false;
7855 : }
7856 :
7857 12193 : static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
7858 : {
7859 12193 : rq->last_blocked_load_update_tick = jiffies;
7860 :
7861 12193 : if (!has_blocked)
7862 38 : rq->has_blocked_load = 0;
7863 : }
7864 : #else
7865 : static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
7866 : static inline bool others_have_blocked(struct rq *rq) { return false; }
7867 : static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
7868 : #endif
7869 :
7870 12179 : static bool __update_blocked_others(struct rq *rq, bool *done)
7871 : {
7872 12179 : const struct sched_class *curr_class;
7873 12179 : u64 now = rq_clock_pelt(rq);
7874 12247 : unsigned long thermal_pressure;
7875 12247 : bool decayed;
7876 :
7877 : /*
7878 : * update_load_avg() can call cpufreq_update_util(). Make sure that RT,
7879 : * DL and IRQ signals have been updated before updating CFS.
7880 : */
7881 12247 : curr_class = rq->curr->sched_class;
7882 :
7883 12247 : thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
7884 :
7885 24460 : decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
7886 12163 : update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
7887 12222 : update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
7888 12222 : update_irq_load_avg(rq, 0);
7889 :
7890 12211 : if (others_have_blocked(rq))
7891 8229 : *done = false;
7892 :
7893 12211 : return decayed;
7894 : }
7895 :
7896 : #ifdef CONFIG_FAIR_GROUP_SCHED
7897 :
7898 : static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
7899 : {
7900 : if (cfs_rq->load.weight)
7901 : return false;
7902 :
7903 : if (cfs_rq->avg.load_sum)
7904 : return false;
7905 :
7906 : if (cfs_rq->avg.util_sum)
7907 : return false;
7908 :
7909 : if (cfs_rq->avg.runnable_sum)
7910 : return false;
7911 :
7912 : return true;
7913 : }
7914 :
7915 : static bool __update_blocked_fair(struct rq *rq, bool *done)
7916 : {
7917 : struct cfs_rq *cfs_rq, *pos;
7918 : bool decayed = false;
7919 : int cpu = cpu_of(rq);
7920 :
7921 : /*
7922 : * Iterates the task_group tree in a bottom up fashion, see
7923 : * list_add_leaf_cfs_rq() for details.
7924 : */
7925 : for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
7926 : struct sched_entity *se;
7927 :
7928 : if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
7929 : update_tg_load_avg(cfs_rq);
7930 :
7931 : if (cfs_rq == &rq->cfs)
7932 : decayed = true;
7933 : }
7934 :
7935 : /* Propagate pending load changes to the parent, if any: */
7936 : se = cfs_rq->tg->se[cpu];
7937 : if (se && !skip_blocked_update(se))
7938 : update_load_avg(cfs_rq_of(se), se, 0);
7939 :
7940 : /*
7941 : * There can be a lot of idle CPU cgroups. Don't let fully
7942 : * decayed cfs_rqs linger on the list.
7943 : */
7944 : if (cfs_rq_is_decayed(cfs_rq))
7945 : list_del_leaf_cfs_rq(cfs_rq);
7946 :
7947 : /* Don't need periodic decay once load/util_avg are null */
7948 : if (cfs_rq_has_blocked(cfs_rq))
7949 : *done = false;
7950 : }
7951 :
7952 : return decayed;
7953 : }
7954 :
7955 : /*
7956 : * Compute the hierarchical load factor for cfs_rq and all its ascendants.
7957 : * This needs to be done in a top-down fashion because the load of a child
7958 : * group is a fraction of its parents load.
7959 : */
7960 : static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
7961 : {
7962 : struct rq *rq = rq_of(cfs_rq);
7963 : struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
7964 : unsigned long now = jiffies;
7965 : unsigned long load;
7966 :
7967 : if (cfs_rq->last_h_load_update == now)
7968 : return;
7969 :
7970 : WRITE_ONCE(cfs_rq->h_load_next, NULL);
7971 : for_each_sched_entity(se) {
7972 : cfs_rq = cfs_rq_of(se);
7973 : WRITE_ONCE(cfs_rq->h_load_next, se);
7974 : if (cfs_rq->last_h_load_update == now)
7975 : break;
7976 : }
7977 :
7978 : if (!se) {
7979 : cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
7980 : cfs_rq->last_h_load_update = now;
7981 : }
7982 :
7983 : while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
7984 : load = cfs_rq->h_load;
7985 : load = div64_ul(load * se->avg.load_avg,
7986 : cfs_rq_load_avg(cfs_rq) + 1);
7987 : cfs_rq = group_cfs_rq(se);
7988 : cfs_rq->h_load = load;
7989 : cfs_rq->last_h_load_update = now;
7990 : }
7991 : }
7992 :
7993 : static unsigned long task_h_load(struct task_struct *p)
7994 : {
7995 : struct cfs_rq *cfs_rq = task_cfs_rq(p);
7996 :
7997 : update_cfs_rq_h_load(cfs_rq);
7998 : return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
7999 : cfs_rq_load_avg(cfs_rq) + 1);
8000 : }
8001 : #else
8002 12171 : static bool __update_blocked_fair(struct rq *rq, bool *done)
8003 : {
8004 12171 : struct cfs_rq *cfs_rq = &rq->cfs;
8005 12171 : bool decayed;
8006 :
8007 12171 : decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
8008 12202 : if (cfs_rq_has_blocked(cfs_rq))
8009 12163 : *done = false;
8010 :
8011 12202 : return decayed;
8012 : }
8013 :
8014 1025 : static unsigned long task_h_load(struct task_struct *p)
8015 : {
8016 1025 : return p->se.avg.load_avg;
8017 : }
8018 : #endif
8019 :
8020 12154 : static void update_blocked_averages(int cpu)
8021 : {
8022 12154 : bool decayed = false, done = true;
8023 12154 : struct rq *rq = cpu_rq(cpu);
8024 12154 : struct rq_flags rf;
8025 :
8026 12154 : rq_lock_irqsave(rq, &rf);
8027 12223 : update_rq_clock(rq);
8028 :
8029 12223 : decayed |= __update_blocked_others(rq, &done);
8030 12209 : decayed |= __update_blocked_fair(rq, &done);
8031 :
8032 12193 : update_blocked_load_status(rq, !done);
8033 12193 : if (decayed)
8034 12193 : cpufreq_update_util(rq, 0);
8035 12193 : rq_unlock_irqrestore(rq, &rf);
8036 12217 : }
8037 :
8038 : /********** Helpers for find_busiest_group ************************/
8039 :
8040 : /*
8041 : * sg_lb_stats - stats of a sched_group required for load_balancing
8042 : */
8043 : struct sg_lb_stats {
8044 : unsigned long avg_load; /*Avg load across the CPUs of the group */
8045 : unsigned long group_load; /* Total load over the CPUs of the group */
8046 : unsigned long group_capacity;
8047 : unsigned long group_util; /* Total utilization over the CPUs of the group */
8048 : unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
8049 : unsigned int sum_nr_running; /* Nr of tasks running in the group */
8050 : unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
8051 : unsigned int idle_cpus;
8052 : unsigned int group_weight;
8053 : enum group_type group_type;
8054 : unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
8055 : unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
8056 : #ifdef CONFIG_NUMA_BALANCING
8057 : unsigned int nr_numa_running;
8058 : unsigned int nr_preferred_running;
8059 : #endif
8060 : };
8061 :
8062 : /*
8063 : * sd_lb_stats - Structure to store the statistics of a sched_domain
8064 : * during load balancing.
8065 : */
8066 : struct sd_lb_stats {
8067 : struct sched_group *busiest; /* Busiest group in this sd */
8068 : struct sched_group *local; /* Local group in this sd */
8069 : unsigned long total_load; /* Total load of all groups in sd */
8070 : unsigned long total_capacity; /* Total capacity of all groups in sd */
8071 : unsigned long avg_load; /* Average load across all groups in sd */
8072 : unsigned int prefer_sibling; /* tasks should go to sibling first */
8073 :
8074 : struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
8075 : struct sg_lb_stats local_stat; /* Statistics of the local group */
8076 : };
8077 :
8078 9304 : static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
8079 : {
8080 : /*
8081 : * Skimp on the clearing to avoid duplicate work. We can avoid clearing
8082 : * local_stat because update_sg_lb_stats() does a full clear/assignment.
8083 : * We must however set busiest_stat::group_type and
8084 : * busiest_stat::idle_cpus to the worst busiest group because
8085 : * update_sd_pick_busiest() reads these before assignment.
8086 : */
8087 9304 : *sds = (struct sd_lb_stats){
8088 : .busiest = NULL,
8089 : .local = NULL,
8090 : .total_load = 0UL,
8091 : .total_capacity = 0UL,
8092 : .busiest_stat = {
8093 : .idle_cpus = UINT_MAX,
8094 : .group_type = group_has_spare,
8095 : },
8096 : };
8097 : }
8098 :
8099 7892 : static unsigned long scale_rt_capacity(int cpu)
8100 : {
8101 7892 : struct rq *rq = cpu_rq(cpu);
8102 7892 : unsigned long max = arch_scale_cpu_capacity(cpu);
8103 7892 : unsigned long used, free;
8104 7892 : unsigned long irq;
8105 :
8106 7892 : irq = cpu_util_irq(rq);
8107 :
8108 7892 : if (unlikely(irq >= max))
8109 : return 1;
8110 :
8111 : /*
8112 : * avg_rt.util_avg and avg_dl.util_avg track binary signals
8113 : * (running and not running) with weights 0 and 1024 respectively.
8114 : * avg_thermal.load_avg tracks thermal pressure and the weighted
8115 : * average uses the actual delta max capacity(load).
8116 : */
8117 7892 : used = READ_ONCE(rq->avg_rt.util_avg);
8118 7892 : used += READ_ONCE(rq->avg_dl.util_avg);
8119 7892 : used += thermal_load_avg(rq);
8120 :
8121 7892 : if (unlikely(used >= max))
8122 : return 1;
8123 :
8124 7892 : free = max - used;
8125 :
8126 7892 : return scale_irq_capacity(free, irq, max);
8127 : }
8128 :
8129 7877 : static void update_cpu_capacity(struct sched_domain *sd, int cpu)
8130 : {
8131 7877 : unsigned long capacity = scale_rt_capacity(cpu);
8132 7904 : struct sched_group *sdg = sd->groups;
8133 :
8134 7904 : cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
8135 :
8136 7904 : if (!capacity)
8137 : capacity = 1;
8138 :
8139 7904 : cpu_rq(cpu)->cpu_capacity = capacity;
8140 7904 : trace_sched_cpu_capacity_tp(cpu_rq(cpu));
8141 :
8142 7920 : sdg->sgc->capacity = capacity;
8143 7920 : sdg->sgc->min_capacity = capacity;
8144 7920 : sdg->sgc->max_capacity = capacity;
8145 7920 : }
8146 :
8147 7911 : void update_group_capacity(struct sched_domain *sd, int cpu)
8148 : {
8149 7911 : struct sched_domain *child = sd->child;
8150 7911 : struct sched_group *group, *sdg = sd->groups;
8151 7911 : unsigned long capacity, min_capacity, max_capacity;
8152 7911 : unsigned long interval;
8153 :
8154 7911 : interval = msecs_to_jiffies(sd->balance_interval);
8155 7926 : interval = clamp(interval, 1UL, max_load_balance_interval);
8156 7926 : sdg->sgc->next_update = jiffies + interval;
8157 :
8158 7926 : if (!child) {
8159 7882 : update_cpu_capacity(sd, cpu);
8160 7882 : return;
8161 : }
8162 :
8163 44 : capacity = 0;
8164 44 : min_capacity = ULONG_MAX;
8165 44 : max_capacity = 0;
8166 :
8167 44 : if (child->flags & SD_OVERLAP) {
8168 : /*
8169 : * SD_OVERLAP domains cannot assume that child groups
8170 : * span the current group.
8171 : */
8172 :
8173 36 : for_each_cpu(cpu, sched_group_span(sdg)) {
8174 0 : unsigned long cpu_cap = capacity_of(cpu);
8175 :
8176 0 : capacity += cpu_cap;
8177 0 : min_capacity = min(cpu_cap, min_capacity);
8178 0 : max_capacity = max(cpu_cap, max_capacity);
8179 : }
8180 : } else {
8181 : /*
8182 : * !SD_OVERLAP domains can assume that child groups
8183 : * span the current group.
8184 : */
8185 :
8186 8 : group = child->groups;
8187 8 : do {
8188 8 : struct sched_group_capacity *sgc = group->sgc;
8189 :
8190 8 : capacity += sgc->capacity;
8191 8 : min_capacity = min(sgc->min_capacity, min_capacity);
8192 8 : max_capacity = max(sgc->max_capacity, max_capacity);
8193 8 : group = group->next;
8194 8 : } while (group != child->groups);
8195 : }
8196 :
8197 8 : sdg->sgc->capacity = capacity;
8198 8 : sdg->sgc->min_capacity = min_capacity;
8199 8 : sdg->sgc->max_capacity = max_capacity;
8200 : }
8201 :
8202 : /*
8203 : * Check whether the capacity of the rq has been noticeably reduced by side
8204 : * activity. The imbalance_pct is used for the threshold.
8205 : * Return true is the capacity is reduced
8206 : */
8207 : static inline int
8208 2223 : check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
8209 : {
8210 2223 : return ((rq->cpu_capacity * sd->imbalance_pct) <
8211 2223 : (rq->cpu_capacity_orig * 100));
8212 : }
8213 :
8214 : /*
8215 : * Check whether a rq has a misfit task and if it looks like we can actually
8216 : * help that task: we can migrate the task to a CPU of higher capacity, or
8217 : * the task's current CPU is heavily pressured.
8218 : */
8219 0 : static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
8220 : {
8221 0 : return rq->misfit_task_load &&
8222 0 : (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
8223 0 : check_cpu_capacity(rq, sd));
8224 : }
8225 :
8226 : /*
8227 : * Group imbalance indicates (and tries to solve) the problem where balancing
8228 : * groups is inadequate due to ->cpus_ptr constraints.
8229 : *
8230 : * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
8231 : * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
8232 : * Something like:
8233 : *
8234 : * { 0 1 2 3 } { 4 5 6 7 }
8235 : * * * * *
8236 : *
8237 : * If we were to balance group-wise we'd place two tasks in the first group and
8238 : * two tasks in the second group. Clearly this is undesired as it will overload
8239 : * cpu 3 and leave one of the CPUs in the second group unused.
8240 : *
8241 : * The current solution to this issue is detecting the skew in the first group
8242 : * by noticing the lower domain failed to reach balance and had difficulty
8243 : * moving tasks due to affinity constraints.
8244 : *
8245 : * When this is so detected; this group becomes a candidate for busiest; see
8246 : * update_sd_pick_busiest(). And calculate_imbalance() and
8247 : * find_busiest_group() avoid some of the usual balance conditions to allow it
8248 : * to create an effective group imbalance.
8249 : *
8250 : * This is a somewhat tricky proposition since the next run might not find the
8251 : * group imbalance and decide the groups need to be balanced again. A most
8252 : * subtle and fragile situation.
8253 : */
8254 :
8255 40406 : static inline int sg_imbalanced(struct sched_group *group)
8256 : {
8257 40406 : return group->sgc->imbalance;
8258 : }
8259 :
8260 : /*
8261 : * group_has_capacity returns true if the group has spare capacity that could
8262 : * be used by some tasks.
8263 : * We consider that a group has spare capacity if the * number of task is
8264 : * smaller than the number of CPUs or if the utilization is lower than the
8265 : * available capacity for CFS tasks.
8266 : * For the latter, we use a threshold to stabilize the state, to take into
8267 : * account the variance of the tasks' load and to return true if the available
8268 : * capacity in meaningful for the load balancer.
8269 : * As an example, an available capacity of 1% can appear but it doesn't make
8270 : * any benefit for the load balance.
8271 : */
8272 : static inline bool
8273 40407 : group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
8274 : {
8275 40407 : if (sgs->sum_nr_running < sgs->group_weight)
8276 : return true;
8277 :
8278 17899 : if ((sgs->group_capacity * imbalance_pct) <
8279 17899 : (sgs->group_runnable * 100))
8280 : return false;
8281 :
8282 15403 : if ((sgs->group_capacity * 100) >
8283 15403 : (sgs->group_util * imbalance_pct))
8284 12289 : return true;
8285 :
8286 : return false;
8287 : }
8288 :
8289 : /*
8290 : * group_is_overloaded returns true if the group has more tasks than it can
8291 : * handle.
8292 : * group_is_overloaded is not equals to !group_has_capacity because a group
8293 : * with the exact right number of tasks, has no more spare capacity but is not
8294 : * overloaded so both group_has_capacity and group_is_overloaded return
8295 : * false.
8296 : */
8297 : static inline bool
8298 43869 : group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
8299 : {
8300 43869 : if (sgs->sum_nr_running <= sgs->group_weight)
8301 : return false;
8302 :
8303 5368 : if ((sgs->group_capacity * 100) <
8304 5368 : (sgs->group_util * imbalance_pct))
8305 : return true;
8306 :
8307 2165 : if ((sgs->group_capacity * imbalance_pct) <
8308 2165 : (sgs->group_runnable * 100))
8309 284 : return true;
8310 :
8311 : return false;
8312 : }
8313 :
8314 : /*
8315 : * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller
8316 : * per-CPU capacity than sched_group ref.
8317 : */
8318 : static inline bool
8319 0 : group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
8320 : {
8321 0 : return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
8322 : }
8323 :
8324 : /*
8325 : * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller
8326 : * per-CPU capacity_orig than sched_group ref.
8327 : */
8328 : static inline bool
8329 0 : group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
8330 : {
8331 0 : return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);
8332 : }
8333 :
8334 : static inline enum
8335 43892 : group_type group_classify(unsigned int imbalance_pct,
8336 : struct sched_group *group,
8337 : struct sg_lb_stats *sgs)
8338 : {
8339 43892 : if (group_is_overloaded(imbalance_pct, sgs))
8340 : return group_overloaded;
8341 :
8342 40406 : if (sg_imbalanced(group))
8343 : return group_imbalanced;
8344 :
8345 40406 : if (sgs->group_asym_packing)
8346 : return group_asym_packing;
8347 :
8348 40406 : if (sgs->group_misfit_task_load)
8349 : return group_misfit_task;
8350 :
8351 40406 : if (!group_has_capacity(imbalance_pct, sgs))
8352 5613 : return group_fully_busy;
8353 :
8354 : return group_has_spare;
8355 : }
8356 :
8357 3039 : static bool update_nohz_stats(struct rq *rq, bool force)
8358 : {
8359 : #ifdef CONFIG_NO_HZ_COMMON
8360 3039 : unsigned int cpu = rq->cpu;
8361 :
8362 3039 : if (!rq->has_blocked_load)
8363 : return false;
8364 :
8365 2821 : if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
8366 : return false;
8367 :
8368 764 : if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
8369 : return true;
8370 :
8371 463 : update_blocked_averages(cpu);
8372 :
8373 463 : return rq->has_blocked_load;
8374 : #else
8375 : return false;
8376 : #endif
8377 : }
8378 :
8379 : /**
8380 : * update_sg_lb_stats - Update sched_group's statistics for load balancing.
8381 : * @env: The load balancing environment.
8382 : * @group: sched_group whose statistics are to be updated.
8383 : * @sgs: variable to hold the statistics for this group.
8384 : * @sg_status: Holds flag indicating the status of the sched_group
8385 : */
8386 37514 : static inline void update_sg_lb_stats(struct lb_env *env,
8387 : struct sched_group *group,
8388 : struct sg_lb_stats *sgs,
8389 : int *sg_status)
8390 : {
8391 37514 : int i, nr_running, local_group;
8392 :
8393 37514 : memset(sgs, 0, sizeof(*sgs));
8394 :
8395 37514 : local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
8396 :
8397 112772 : for_each_cpu_and(i, sched_group_span(group), env->cpus) {
8398 37533 : struct rq *rq = cpu_rq(i);
8399 :
8400 37533 : if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
8401 580 : env->flags |= LBF_NOHZ_AGAIN;
8402 :
8403 37532 : sgs->group_load += cpu_load(rq);
8404 37532 : sgs->group_util += cpu_util(i);
8405 37551 : sgs->group_runnable += cpu_runnable(rq);
8406 37551 : sgs->sum_h_nr_running += rq->cfs.h_nr_running;
8407 :
8408 37551 : nr_running = rq->nr_running;
8409 37551 : sgs->sum_nr_running += nr_running;
8410 :
8411 37551 : if (nr_running > 1)
8412 4621 : *sg_status |= SG_OVERLOAD;
8413 :
8414 37551 : if (cpu_overutilized(i))
8415 9703 : *sg_status |= SG_OVERUTILIZED;
8416 :
8417 : #ifdef CONFIG_NUMA_BALANCING
8418 : sgs->nr_numa_running += rq->nr_numa_running;
8419 : sgs->nr_preferred_running += rq->nr_preferred_running;
8420 : #endif
8421 : /*
8422 : * No need to call idle_cpu() if nr_running is not 0
8423 : */
8424 37578 : if (!nr_running && idle_cpu(i)) {
8425 16356 : sgs->idle_cpus++;
8426 : /* Idle cpu can't have misfit task */
8427 16356 : continue;
8428 : }
8429 :
8430 21257 : if (local_group)
8431 4559 : continue;
8432 :
8433 : /* Check for a misfit task on the cpu */
8434 16698 : if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
8435 0 : sgs->group_misfit_task_load < rq->misfit_task_load) {
8436 0 : sgs->group_misfit_task_load = rq->misfit_task_load;
8437 0 : *sg_status |= SG_OVERLOAD;
8438 : }
8439 : }
8440 :
8441 : /* Check if dst CPU is idle and preferred to this group */
8442 37683 : if (env->sd->flags & SD_ASYM_PACKING &&
8443 0 : env->idle != CPU_NOT_IDLE &&
8444 0 : sgs->sum_h_nr_running &&
8445 0 : sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
8446 0 : sgs->group_asym_packing = 1;
8447 : }
8448 :
8449 37683 : sgs->group_capacity = group->sgc->capacity;
8450 :
8451 37683 : sgs->group_weight = group->group_weight;
8452 :
8453 37683 : sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
8454 :
8455 : /* Computing avg_load makes sense only when group is overloaded */
8456 37683 : if (sgs->group_type == group_overloaded)
8457 2959 : sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
8458 : sgs->group_capacity;
8459 37683 : }
8460 :
8461 : /**
8462 : * update_sd_pick_busiest - return 1 on busiest group
8463 : * @env: The load balancing environment.
8464 : * @sds: sched_domain statistics
8465 : * @sg: sched_group candidate to be checked for being the busiest
8466 : * @sgs: sched_group statistics
8467 : *
8468 : * Determine if @sg is a busier group than the previously selected
8469 : * busiest group.
8470 : *
8471 : * Return: %true if @sg is a busier group than the previously selected
8472 : * busiest group. %false otherwise.
8473 : */
8474 28257 : static bool update_sd_pick_busiest(struct lb_env *env,
8475 : struct sd_lb_stats *sds,
8476 : struct sched_group *sg,
8477 : struct sg_lb_stats *sgs)
8478 : {
8479 28257 : struct sg_lb_stats *busiest = &sds->busiest_stat;
8480 :
8481 : /* Make sure that there is at least one task to pull */
8482 28257 : if (!sgs->sum_h_nr_running)
8483 : return false;
8484 :
8485 : /*
8486 : * Don't try to pull misfit tasks we can't help.
8487 : * We can use max_capacity here as reduction in capacity on some
8488 : * CPUs in the group should either be possible to resolve
8489 : * internally or be covered by avg_load imbalance (eventually).
8490 : */
8491 16554 : if (sgs->group_type == group_misfit_task &&
8492 0 : (!group_smaller_max_cpu_capacity(sg, sds->local) ||
8493 0 : sds->local_stat.group_type != group_has_spare))
8494 : return false;
8495 :
8496 16554 : if (sgs->group_type > busiest->group_type)
8497 : return true;
8498 :
8499 11657 : if (sgs->group_type < busiest->group_type)
8500 : return false;
8501 :
8502 : /*
8503 : * The candidate and the current busiest group are the same type of
8504 : * group. Let check which one is the busiest according to the type.
8505 : */
8506 :
8507 9184 : switch (sgs->group_type) {
8508 666 : case group_overloaded:
8509 : /* Select the overloaded group with highest avg_load. */
8510 666 : if (sgs->avg_load <= busiest->avg_load)
8511 : return false;
8512 : break;
8513 :
8514 : case group_imbalanced:
8515 : /*
8516 : * Select the 1st imbalanced group as we don't have any way to
8517 : * choose one more than another.
8518 : */
8519 : return false;
8520 :
8521 0 : case group_asym_packing:
8522 : /* Prefer to move from lowest priority CPU's work */
8523 0 : if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
8524 : return false;
8525 : break;
8526 :
8527 0 : case group_misfit_task:
8528 : /*
8529 : * If we have more than one misfit sg go with the biggest
8530 : * misfit.
8531 : */
8532 0 : if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
8533 : return false;
8534 : break;
8535 :
8536 909 : case group_fully_busy:
8537 : /*
8538 : * Select the fully busy group with highest avg_load. In
8539 : * theory, there is no need to pull task from such kind of
8540 : * group because tasks have all compute capacity that they need
8541 : * but we can still improve the overall throughput by reducing
8542 : * contention when accessing shared HW resources.
8543 : *
8544 : * XXX for now avg_load is not computed and always 0 so we
8545 : * select the 1st one.
8546 : */
8547 909 : if (sgs->avg_load <= busiest->avg_load)
8548 : return false;
8549 : break;
8550 :
8551 7609 : case group_has_spare:
8552 : /*
8553 : * Select not overloaded group with lowest number of idle cpus
8554 : * and highest number of running tasks. We could also compare
8555 : * the spare capacity which is more stable but it can end up
8556 : * that the group has less spare capacity but finally more idle
8557 : * CPUs which means less opportunity to pull tasks.
8558 : */
8559 7609 : if (sgs->idle_cpus > busiest->idle_cpus)
8560 : return false;
8561 7609 : else if ((sgs->idle_cpus == busiest->idle_cpus) &&
8562 1630 : (sgs->sum_nr_running <= busiest->sum_nr_running))
8563 : return false;
8564 :
8565 : break;
8566 : }
8567 :
8568 : /*
8569 : * Candidate sg has no more than one task per CPU and has higher
8570 : * per-CPU capacity. Migrating tasks to less capable CPUs may harm
8571 : * throughput. Maximize throughput, power/energy consequences are not
8572 : * considered.
8573 : */
8574 6535 : if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
8575 0 : (sgs->group_type <= group_fully_busy) &&
8576 0 : (group_smaller_min_cpu_capacity(sds->local, sg)))
8577 0 : return false;
8578 :
8579 : return true;
8580 : }
8581 :
8582 : #ifdef CONFIG_NUMA_BALANCING
8583 : static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8584 : {
8585 : if (sgs->sum_h_nr_running > sgs->nr_numa_running)
8586 : return regular;
8587 : if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
8588 : return remote;
8589 : return all;
8590 : }
8591 :
8592 : static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8593 : {
8594 : if (rq->nr_running > rq->nr_numa_running)
8595 : return regular;
8596 : if (rq->nr_running > rq->nr_preferred_running)
8597 : return remote;
8598 : return all;
8599 : }
8600 : #else
8601 0 : static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8602 : {
8603 0 : return all;
8604 : }
8605 :
8606 2601 : static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8607 : {
8608 2601 : return regular;
8609 : }
8610 : #endif /* CONFIG_NUMA_BALANCING */
8611 :
8612 :
8613 : struct sg_lb_stats;
8614 :
8615 : /*
8616 : * task_running_on_cpu - return 1 if @p is running on @cpu.
8617 : */
8618 :
8619 6336 : static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
8620 : {
8621 : /* Task has no contribution or is new */
8622 1584 : if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
8623 : return 0;
8624 :
8625 625 : if (task_on_rq_queued(p))
8626 625 : return 1;
8627 :
8628 : return 0;
8629 : }
8630 :
8631 : /**
8632 : * idle_cpu_without - would a given CPU be idle without p ?
8633 : * @cpu: the processor on which idleness is tested.
8634 : * @p: task which should be ignored.
8635 : *
8636 : * Return: 1 if the CPU would be idle. 0 otherwise.
8637 : */
8638 3165 : static int idle_cpu_without(int cpu, struct task_struct *p)
8639 : {
8640 3165 : struct rq *rq = cpu_rq(cpu);
8641 :
8642 3165 : if (rq->curr != rq->idle && rq->curr != p)
8643 : return 0;
8644 :
8645 : /*
8646 : * rq->nr_running can't be used but an updated version without the
8647 : * impact of p on cpu must be used instead. The updated nr_running
8648 : * be computed and tested before calling idle_cpu_without().
8649 : */
8650 :
8651 : #ifdef CONFIG_SMP
8652 3165 : if (rq->ttwu_pending)
8653 5 : return 0;
8654 : #endif
8655 :
8656 : return 1;
8657 : }
8658 :
8659 : /*
8660 : * update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
8661 : * @sd: The sched_domain level to look for idlest group.
8662 : * @group: sched_group whose statistics are to be updated.
8663 : * @sgs: variable to hold the statistics for this group.
8664 : * @p: The task for which we look for the idlest group/CPU.
8665 : */
8666 6336 : static inline void update_sg_wakeup_stats(struct sched_domain *sd,
8667 : struct sched_group *group,
8668 : struct sg_lb_stats *sgs,
8669 : struct task_struct *p)
8670 : {
8671 6336 : int i, nr_running;
8672 :
8673 6336 : memset(sgs, 0, sizeof(*sgs));
8674 :
8675 12672 : for_each_cpu(i, sched_group_span(group)) {
8676 6336 : struct rq *rq = cpu_rq(i);
8677 6336 : unsigned int local;
8678 :
8679 6336 : sgs->group_load += cpu_load_without(rq, p);
8680 6336 : sgs->group_util += cpu_util_without(i, p);
8681 6336 : sgs->group_runnable += cpu_runnable_without(rq, p);
8682 6336 : local = task_running_on_cpu(i, p);
8683 6336 : sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
8684 :
8685 6336 : nr_running = rq->nr_running - local;
8686 6336 : sgs->sum_nr_running += nr_running;
8687 :
8688 : /*
8689 : * No need to call idle_cpu_without() if nr_running is not 0
8690 : */
8691 6336 : if (!nr_running && idle_cpu_without(i, p))
8692 3160 : sgs->idle_cpus++;
8693 :
8694 : }
8695 :
8696 : /* Check if task fits in the group */
8697 6336 : if (sd->flags & SD_ASYM_CPUCAPACITY &&
8698 0 : !task_fits_capacity(p, group->sgc->max_capacity)) {
8699 0 : sgs->group_misfit_task_load = 1;
8700 : }
8701 :
8702 6336 : sgs->group_capacity = group->sgc->capacity;
8703 :
8704 6336 : sgs->group_weight = group->group_weight;
8705 :
8706 6336 : sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
8707 :
8708 : /*
8709 : * Computing avg_load makes sense only when group is fully busy or
8710 : * overloaded
8711 : */
8712 6336 : if (sgs->group_type == group_fully_busy ||
8713 : sgs->group_type == group_overloaded)
8714 1622 : sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
8715 : sgs->group_capacity;
8716 6336 : }
8717 :
8718 4752 : static bool update_pick_idlest(struct sched_group *idlest,
8719 : struct sg_lb_stats *idlest_sgs,
8720 : struct sched_group *group,
8721 : struct sg_lb_stats *sgs)
8722 : {
8723 4752 : if (sgs->group_type < idlest_sgs->group_type)
8724 : return true;
8725 :
8726 2970 : if (sgs->group_type > idlest_sgs->group_type)
8727 : return false;
8728 :
8729 : /*
8730 : * The candidate and the current idlest group are the same type of
8731 : * group. Let check which one is the idlest according to the type.
8732 : */
8733 :
8734 2316 : switch (sgs->group_type) {
8735 348 : case group_overloaded:
8736 : case group_fully_busy:
8737 : /* Select the group with lowest avg_load. */
8738 348 : if (idlest_sgs->avg_load <= sgs->avg_load)
8739 : return false;
8740 : break;
8741 :
8742 : case group_imbalanced:
8743 : case group_asym_packing:
8744 : /* Those types are not used in the slow wakeup path */
8745 : return false;
8746 :
8747 0 : case group_misfit_task:
8748 : /* Select group with the highest max capacity */
8749 0 : if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
8750 : return false;
8751 : break;
8752 :
8753 1968 : case group_has_spare:
8754 : /* Select group with most idle CPUs */
8755 1968 : if (idlest_sgs->idle_cpus > sgs->idle_cpus)
8756 : return false;
8757 :
8758 : /* Select group with lowest group_util */
8759 1726 : if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
8760 1562 : idlest_sgs->group_util <= sgs->group_util)
8761 : return false;
8762 :
8763 : break;
8764 : }
8765 :
8766 1035 : return true;
8767 : }
8768 :
8769 : /*
8770 : * Allow a NUMA imbalance if busy CPUs is less than 25% of the domain.
8771 : * This is an approximation as the number of running tasks may not be
8772 : * related to the number of busy CPUs due to sched_setaffinity.
8773 : */
8774 0 : static inline bool allow_numa_imbalance(int dst_running, int dst_weight)
8775 : {
8776 0 : return (dst_running < (dst_weight >> 2));
8777 : }
8778 :
8779 : /*
8780 : * find_idlest_group() finds and returns the least busy CPU group within the
8781 : * domain.
8782 : *
8783 : * Assumes p is allowed on at least one CPU in sd.
8784 : */
8785 : static struct sched_group *
8786 1584 : find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
8787 : {
8788 1584 : struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
8789 1584 : struct sg_lb_stats local_sgs, tmp_sgs;
8790 1584 : struct sg_lb_stats *sgs;
8791 1584 : unsigned long imbalance;
8792 1584 : struct sg_lb_stats idlest_sgs = {
8793 : .avg_load = UINT_MAX,
8794 : .group_type = group_overloaded,
8795 : };
8796 :
8797 6336 : do {
8798 6336 : int local_group;
8799 :
8800 : /* Skip over this group if it has no CPUs allowed */
8801 6336 : if (!cpumask_intersects(sched_group_span(group),
8802 6336 : p->cpus_ptr))
8803 0 : continue;
8804 :
8805 6336 : local_group = cpumask_test_cpu(this_cpu,
8806 6336 : sched_group_span(group));
8807 :
8808 6336 : if (local_group) {
8809 1584 : sgs = &local_sgs;
8810 1584 : local = group;
8811 : } else {
8812 : sgs = &tmp_sgs;
8813 : }
8814 :
8815 6336 : update_sg_wakeup_stats(sd, group, sgs, p);
8816 :
8817 6336 : if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
8818 2817 : idlest = group;
8819 2817 : idlest_sgs = *sgs;
8820 : }
8821 :
8822 6336 : } while (group = group->next, group != sd->groups);
8823 :
8824 :
8825 : /* There is no idlest group to push tasks to */
8826 1584 : if (!idlest)
8827 : return NULL;
8828 :
8829 : /* The local group has been skipped because of CPU affinity */
8830 1584 : if (!local)
8831 : return idlest;
8832 :
8833 : /*
8834 : * If the local group is idler than the selected idlest group
8835 : * don't try and push the task.
8836 : */
8837 1584 : if (local_sgs.group_type < idlest_sgs.group_type)
8838 : return NULL;
8839 :
8840 : /*
8841 : * If the local group is busier than the selected idlest group
8842 : * try and push the task.
8843 : */
8844 1506 : if (local_sgs.group_type > idlest_sgs.group_type)
8845 : return idlest;
8846 :
8847 1302 : switch (local_sgs.group_type) {
8848 : case group_overloaded:
8849 : case group_fully_busy:
8850 :
8851 : /* Calculate allowed imbalance based on load */
8852 54 : imbalance = scale_load_down(NICE_0_LOAD) *
8853 54 : (sd->imbalance_pct-100) / 100;
8854 :
8855 : /*
8856 : * When comparing groups across NUMA domains, it's possible for
8857 : * the local domain to be very lightly loaded relative to the
8858 : * remote domains but "imbalance" skews the comparison making
8859 : * remote CPUs look much more favourable. When considering
8860 : * cross-domain, add imbalance to the load on the remote node
8861 : * and consider staying local.
8862 : */
8863 :
8864 54 : if ((sd->flags & SD_NUMA) &&
8865 0 : ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
8866 : return NULL;
8867 :
8868 : /*
8869 : * If the local group is less loaded than the selected
8870 : * idlest group don't try and push any tasks.
8871 : */
8872 54 : if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
8873 : return NULL;
8874 :
8875 29 : if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
8876 13 : return NULL;
8877 : break;
8878 :
8879 : case group_imbalanced:
8880 : case group_asym_packing:
8881 : /* Those type are not used in the slow wakeup path */
8882 : return NULL;
8883 :
8884 0 : case group_misfit_task:
8885 : /* Select group with the highest max capacity */
8886 0 : if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
8887 0 : return NULL;
8888 : break;
8889 :
8890 1248 : case group_has_spare:
8891 1248 : if (sd->flags & SD_NUMA) {
8892 : #ifdef CONFIG_NUMA_BALANCING
8893 : int idlest_cpu;
8894 : /*
8895 : * If there is spare capacity at NUMA, try to select
8896 : * the preferred node
8897 : */
8898 : if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
8899 : return NULL;
8900 :
8901 : idlest_cpu = cpumask_first(sched_group_span(idlest));
8902 : if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
8903 : return idlest;
8904 : #endif
8905 : /*
8906 : * Otherwise, keep the task on this node to stay close
8907 : * its wakeup source and improve locality. If there is
8908 : * a real need of migration, periodic load balance will
8909 : * take care of it.
8910 : */
8911 0 : if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight))
8912 : return NULL;
8913 : }
8914 :
8915 : /*
8916 : * Select group with highest number of idle CPUs. We could also
8917 : * compare the utilization which is more stable but it can end
8918 : * up that the group has less spare capacity but finally more
8919 : * idle CPUs which means more opportunity to run task.
8920 : */
8921 1248 : if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
8922 637 : return NULL;
8923 : break;
8924 : }
8925 :
8926 : return idlest;
8927 : }
8928 :
8929 : /**
8930 : * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
8931 : * @env: The load balancing environment.
8932 : * @sds: variable to hold the statistics for this sched_domain.
8933 : */
8934 :
8935 9297 : static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
8936 : {
8937 9297 : struct sched_domain *child = env->sd->child;
8938 9297 : struct sched_group *sg = env->sd->groups;
8939 9297 : struct sg_lb_stats *local = &sds->local_stat;
8940 9297 : struct sg_lb_stats tmp_sgs;
8941 9297 : int sg_status = 0;
8942 :
8943 : #ifdef CONFIG_NO_HZ_COMMON
8944 9297 : if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
8945 711 : env->flags |= LBF_NOHZ_STATS;
8946 : #endif
8947 :
8948 37410 : do {
8949 37410 : struct sg_lb_stats *sgs = &tmp_sgs;
8950 37410 : int local_group;
8951 :
8952 37410 : local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
8953 37536 : if (local_group) {
8954 9346 : sds->local = sg;
8955 9346 : sgs = local;
8956 :
8957 9346 : if (env->idle != CPU_NEWLY_IDLE ||
8958 2901 : time_after_eq(jiffies, sg->sgc->next_update))
8959 7913 : update_group_capacity(env->sd, env->dst_cpu);
8960 : }
8961 :
8962 37527 : update_sg_lb_stats(env, sg, sgs, &sg_status);
8963 :
8964 37554 : if (local_group)
8965 9286 : goto next_group;
8966 :
8967 :
8968 28268 : if (update_sd_pick_busiest(env, sds, sg, sgs)) {
8969 11432 : sds->busiest = sg;
8970 11432 : sds->busiest_stat = *sgs;
8971 : }
8972 :
8973 16826 : next_group:
8974 : /* Now, start updating sd_lb_stats */
8975 37544 : sds->total_load += sgs->group_load;
8976 37544 : sds->total_capacity += sgs->group_capacity;
8977 :
8978 37544 : sg = sg->next;
8979 37544 : } while (sg != env->sd->groups);
8980 :
8981 : /* Tag domain that child domain prefers tasks go to siblings first */
8982 9431 : sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
8983 :
8984 : #ifdef CONFIG_NO_HZ_COMMON
8985 9431 : if ((env->flags & LBF_NOHZ_AGAIN) &&
8986 425 : cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
8987 :
8988 425 : WRITE_ONCE(nohz.next_blocked,
8989 : jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
8990 : }
8991 : #endif
8992 :
8993 9431 : if (env->sd->flags & SD_NUMA)
8994 0 : env->fbq_type = fbq_classify_group(&sds->busiest_stat);
8995 :
8996 9431 : if (!env->sd->parent) {
8997 9418 : struct root_domain *rd = env->dst_rq->rd;
8998 :
8999 : /* update overload indicator if we are at root domain */
9000 9418 : WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
9001 :
9002 : /* Update over-utilization (tipping point, U >= 0) indicator */
9003 9418 : WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
9004 9418 : trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
9005 13 : } else if (sg_status & SG_OVERUTILIZED) {
9006 0 : struct root_domain *rd = env->dst_rq->rd;
9007 :
9008 0 : WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
9009 0 : trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
9010 : }
9011 9421 : }
9012 :
9013 : #define NUMA_IMBALANCE_MIN 2
9014 :
9015 0 : static inline long adjust_numa_imbalance(int imbalance,
9016 : int dst_running, int dst_weight)
9017 : {
9018 0 : if (!allow_numa_imbalance(dst_running, dst_weight))
9019 0 : return imbalance;
9020 :
9021 : /*
9022 : * Allow a small imbalance based on a simple pair of communicating
9023 : * tasks that remain local when the destination is lightly loaded.
9024 : */
9025 0 : if (imbalance <= NUMA_IMBALANCE_MIN)
9026 : return 0;
9027 :
9028 0 : return imbalance;
9029 : }
9030 :
9031 : /**
9032 : * calculate_imbalance - Calculate the amount of imbalance present within the
9033 : * groups of a given sched_domain during load balance.
9034 : * @env: load balance environment
9035 : * @sds: statistics of the sched_domain whose imbalance is to be calculated.
9036 : */
9037 2670 : static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
9038 : {
9039 2670 : struct sg_lb_stats *local, *busiest;
9040 :
9041 2670 : local = &sds->local_stat;
9042 2670 : busiest = &sds->busiest_stat;
9043 :
9044 2670 : if (busiest->group_type == group_misfit_task) {
9045 : /* Set imbalance to allow misfit tasks to be balanced. */
9046 0 : env->migration_type = migrate_misfit;
9047 0 : env->imbalance = 1;
9048 0 : return;
9049 : }
9050 :
9051 2670 : if (busiest->group_type == group_asym_packing) {
9052 : /*
9053 : * In case of asym capacity, we will try to migrate all load to
9054 : * the preferred CPU.
9055 : */
9056 0 : env->migration_type = migrate_task;
9057 0 : env->imbalance = busiest->sum_h_nr_running;
9058 0 : return;
9059 : }
9060 :
9061 2670 : if (busiest->group_type == group_imbalanced) {
9062 : /*
9063 : * In the group_imb case we cannot rely on group-wide averages
9064 : * to ensure CPU-load equilibrium, try to move any task to fix
9065 : * the imbalance. The next load balance will take care of
9066 : * balancing back the system.
9067 : */
9068 0 : env->migration_type = migrate_task;
9069 0 : env->imbalance = 1;
9070 0 : return;
9071 : }
9072 :
9073 : /*
9074 : * Try to use spare capacity of local group without overloading it or
9075 : * emptying busiest.
9076 : */
9077 2670 : if (local->group_type == group_has_spare) {
9078 2535 : if ((busiest->group_type > group_fully_busy) &&
9079 1903 : !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) {
9080 : /*
9081 : * If busiest is overloaded, try to fill spare
9082 : * capacity. This might end up creating spare capacity
9083 : * in busiest or busiest still being overloaded but
9084 : * there is no simple way to directly compute the
9085 : * amount of load to migrate in order to balance the
9086 : * system.
9087 : */
9088 1903 : env->migration_type = migrate_util;
9089 1903 : env->imbalance = max(local->group_capacity, local->group_util) -
9090 : local->group_util;
9091 :
9092 : /*
9093 : * In some cases, the group's utilization is max or even
9094 : * higher than capacity because of migrations but the
9095 : * local CPU is (newly) idle. There is at least one
9096 : * waiting task in this overloaded busiest group. Let's
9097 : * try to pull it.
9098 : */
9099 1903 : if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
9100 70 : env->migration_type = migrate_task;
9101 70 : env->imbalance = 1;
9102 : }
9103 :
9104 1903 : return;
9105 : }
9106 :
9107 632 : if (busiest->group_weight == 1 || sds->prefer_sibling) {
9108 632 : unsigned int nr_diff = busiest->sum_nr_running;
9109 : /*
9110 : * When prefer sibling, evenly spread running tasks on
9111 : * groups.
9112 : */
9113 632 : env->migration_type = migrate_task;
9114 632 : lsub_positive(&nr_diff, local->sum_nr_running);
9115 632 : env->imbalance = nr_diff >> 1;
9116 : } else {
9117 :
9118 : /*
9119 : * If there is no overload, we just want to even the number of
9120 : * idle cpus.
9121 : */
9122 0 : env->migration_type = migrate_task;
9123 0 : env->imbalance = max_t(long, 0, (local->idle_cpus -
9124 : busiest->idle_cpus) >> 1);
9125 : }
9126 :
9127 : /* Consider allowing a small imbalance between NUMA groups */
9128 632 : if (env->sd->flags & SD_NUMA) {
9129 0 : env->imbalance = adjust_numa_imbalance(env->imbalance,
9130 0 : busiest->sum_nr_running, busiest->group_weight);
9131 : }
9132 :
9133 632 : return;
9134 : }
9135 :
9136 : /*
9137 : * Local is fully busy but has to take more load to relieve the
9138 : * busiest group
9139 : */
9140 135 : if (local->group_type < group_overloaded) {
9141 : /*
9142 : * Local will become overloaded so the avg_load metrics are
9143 : * finally needed.
9144 : */
9145 :
9146 88 : local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
9147 88 : local->group_capacity;
9148 :
9149 88 : sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
9150 88 : sds->total_capacity;
9151 : /*
9152 : * If the local group is more loaded than the selected
9153 : * busiest group don't try to pull any tasks.
9154 : */
9155 88 : if (local->avg_load >= busiest->avg_load) {
9156 13 : env->imbalance = 0;
9157 13 : return;
9158 : }
9159 : }
9160 :
9161 : /*
9162 : * Both group are or will become overloaded and we're trying to get all
9163 : * the CPUs to the average_load, so we don't want to push ourselves
9164 : * above the average load, nor do we wish to reduce the max loaded CPU
9165 : * below the average load. At the same time, we also don't want to
9166 : * reduce the group load below the group capacity. Thus we look for
9167 : * the minimum possible imbalance.
9168 : */
9169 122 : env->migration_type = migrate_load;
9170 122 : env->imbalance = min(
9171 : (busiest->avg_load - sds->avg_load) * busiest->group_capacity,
9172 : (sds->avg_load - local->avg_load) * local->group_capacity
9173 122 : ) / SCHED_CAPACITY_SCALE;
9174 : }
9175 :
9176 : /******* find_busiest_group() helpers end here *********************/
9177 :
9178 : /*
9179 : * Decision matrix according to the local and busiest group type:
9180 : *
9181 : * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
9182 : * has_spare nr_idle balanced N/A N/A balanced balanced
9183 : * fully_busy nr_idle nr_idle N/A N/A balanced balanced
9184 : * misfit_task force N/A N/A N/A force force
9185 : * asym_packing force force N/A N/A force force
9186 : * imbalanced force force N/A N/A force force
9187 : * overloaded force force N/A N/A force avg_load
9188 : *
9189 : * N/A : Not Applicable because already filtered while updating
9190 : * statistics.
9191 : * balanced : The system is balanced for these 2 groups.
9192 : * force : Calculate the imbalance as load migration is probably needed.
9193 : * avg_load : Only if imbalance is significant enough.
9194 : * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite
9195 : * different in groups.
9196 : */
9197 :
9198 : /**
9199 : * find_busiest_group - Returns the busiest group within the sched_domain
9200 : * if there is an imbalance.
9201 : *
9202 : * Also calculates the amount of runnable load which should be moved
9203 : * to restore balance.
9204 : *
9205 : * @env: The load balancing environment.
9206 : *
9207 : * Return: - The busiest group if imbalance exists.
9208 : */
9209 9304 : static struct sched_group *find_busiest_group(struct lb_env *env)
9210 : {
9211 9304 : struct sg_lb_stats *local, *busiest;
9212 9304 : struct sd_lb_stats sds;
9213 :
9214 9304 : init_sd_lb_stats(&sds);
9215 :
9216 : /*
9217 : * Compute the various statistics relevant for load balancing at
9218 : * this level.
9219 : */
9220 9304 : update_sd_lb_stats(env, &sds);
9221 :
9222 9399 : if (sched_energy_enabled()) {
9223 : struct root_domain *rd = env->dst_rq->rd;
9224 :
9225 : if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
9226 : goto out_balanced;
9227 : }
9228 :
9229 9399 : local = &sds.local_stat;
9230 9399 : busiest = &sds.busiest_stat;
9231 :
9232 : /* There is no busy sibling group to pull tasks from */
9233 9399 : if (!sds.busiest)
9234 282 : goto out_balanced;
9235 :
9236 : /* Misfit tasks should be dealt with regardless of the avg load */
9237 9117 : if (busiest->group_type == group_misfit_task)
9238 0 : goto force_balance;
9239 :
9240 : /* ASYM feature bypasses nice load balance check */
9241 9117 : if (busiest->group_type == group_asym_packing)
9242 0 : goto force_balance;
9243 :
9244 : /*
9245 : * If the busiest group is imbalanced the below checks don't
9246 : * work because they assume all things are equal, which typically
9247 : * isn't true due to cpus_ptr constraints and the like.
9248 : */
9249 9117 : if (busiest->group_type == group_imbalanced)
9250 0 : goto force_balance;
9251 :
9252 : /*
9253 : * If the local group is busier than the selected busiest group
9254 : * don't try and pull any tasks.
9255 : */
9256 9117 : if (local->group_type > busiest->group_type)
9257 74 : goto out_balanced;
9258 :
9259 : /*
9260 : * When groups are overloaded, use the avg_load to ensure fairness
9261 : * between tasks.
9262 : */
9263 9043 : if (local->group_type == group_overloaded) {
9264 : /*
9265 : * If the local group is more loaded than the selected
9266 : * busiest group don't try to pull any tasks.
9267 : */
9268 122 : if (local->avg_load >= busiest->avg_load)
9269 41 : goto out_balanced;
9270 :
9271 : /* XXX broken for overlapping NUMA groups */
9272 81 : sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
9273 81 : sds.total_capacity;
9274 :
9275 : /*
9276 : * Don't pull any tasks if this group is already above the
9277 : * domain average load.
9278 : */
9279 81 : if (local->avg_load >= sds.avg_load)
9280 28 : goto out_balanced;
9281 :
9282 : /*
9283 : * If the busiest group is more loaded, use imbalance_pct to be
9284 : * conservative.
9285 : */
9286 53 : if (100 * busiest->avg_load <=
9287 53 : env->sd->imbalance_pct * local->avg_load)
9288 6 : goto out_balanced;
9289 : }
9290 :
9291 : /* Try to move all excess tasks to child's sibling domain */
9292 8968 : if (sds.prefer_sibling && local->group_type == group_has_spare &&
9293 0 : busiest->sum_nr_running > local->sum_nr_running + 1)
9294 0 : goto force_balance;
9295 :
9296 8968 : if (busiest->group_type != group_overloaded) {
9297 6928 : if (env->idle == CPU_NOT_IDLE)
9298 : /*
9299 : * If the busiest group is not overloaded (and as a
9300 : * result the local one too) but this CPU is already
9301 : * busy, let another idle CPU try to pull task.
9302 : */
9303 78 : goto out_balanced;
9304 :
9305 6850 : if (busiest->group_weight > 1 &&
9306 0 : local->idle_cpus <= (busiest->idle_cpus + 1))
9307 : /*
9308 : * If the busiest group is not overloaded
9309 : * and there is no imbalance between this and busiest
9310 : * group wrt idle CPUs, it is balanced. The imbalance
9311 : * becomes significant if the diff is greater than 1
9312 : * otherwise we might end up to just move the imbalance
9313 : * on another group. Of course this applies only if
9314 : * there is more than 1 CPU per group.
9315 : */
9316 0 : goto out_balanced;
9317 :
9318 6850 : if (busiest->sum_h_nr_running == 1)
9319 : /*
9320 : * busiest doesn't have any tasks waiting to run
9321 : */
9322 6214 : goto out_balanced;
9323 : }
9324 :
9325 2676 : force_balance:
9326 : /* Looks like there is an imbalance. Compute it */
9327 2676 : calculate_imbalance(env, &sds);
9328 2672 : return env->imbalance ? sds.busiest : NULL;
9329 :
9330 6723 : out_balanced:
9331 6723 : env->imbalance = 0;
9332 6723 : return NULL;
9333 : }
9334 :
9335 : /*
9336 : * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
9337 : */
9338 2606 : static struct rq *find_busiest_queue(struct lb_env *env,
9339 : struct sched_group *group)
9340 : {
9341 2606 : struct rq *busiest = NULL, *rq;
9342 2606 : unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
9343 2606 : unsigned int busiest_nr = 0;
9344 2606 : int i;
9345 :
9346 5207 : for_each_cpu_and(i, sched_group_span(group), env->cpus) {
9347 2601 : unsigned long capacity, load, util;
9348 2601 : unsigned int nr_running;
9349 2601 : enum fbq_type rt;
9350 :
9351 2601 : rq = cpu_rq(i);
9352 2601 : rt = fbq_classify_rq(rq);
9353 :
9354 : /*
9355 : * We classify groups/runqueues into three groups:
9356 : * - regular: there are !numa tasks
9357 : * - remote: there are numa tasks that run on the 'wrong' node
9358 : * - all: there is no distinction
9359 : *
9360 : * In order to avoid migrating ideally placed numa tasks,
9361 : * ignore those when there's better options.
9362 : *
9363 : * If we ignore the actual busiest queue to migrate another
9364 : * task, the next balance pass can still reduce the busiest
9365 : * queue by moving tasks around inside the node.
9366 : *
9367 : * If we cannot move enough load due to this classification
9368 : * the next pass will adjust the group classification and
9369 : * allow migration of more tasks.
9370 : *
9371 : * Both cases only affect the total convergence complexity.
9372 : */
9373 2601 : if (rt > env->fbq_type)
9374 : continue;
9375 :
9376 2601 : nr_running = rq->cfs.h_nr_running;
9377 2601 : if (!nr_running)
9378 0 : continue;
9379 :
9380 2601 : capacity = capacity_of(i);
9381 :
9382 : /*
9383 : * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
9384 : * eventually lead to active_balancing high->low capacity.
9385 : * Higher per-CPU capacity is considered better than balancing
9386 : * average load.
9387 : */
9388 2601 : if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
9389 0 : capacity_of(env->dst_cpu) < capacity &&
9390 : nr_running == 1)
9391 0 : continue;
9392 :
9393 2601 : switch (env->migration_type) {
9394 : case migrate_load:
9395 : /*
9396 : * When comparing with load imbalance, use cpu_load()
9397 : * which is not scaled with the CPU capacity.
9398 : */
9399 122 : load = cpu_load(rq);
9400 :
9401 122 : if (nr_running == 1 && load > env->imbalance &&
9402 0 : !check_cpu_capacity(rq, env->sd))
9403 : break;
9404 :
9405 : /*
9406 : * For the load comparisons with the other CPUs,
9407 : * consider the cpu_load() scaled with the CPU
9408 : * capacity, so that the load can be moved away
9409 : * from the CPU that is potentially running at a
9410 : * lower capacity.
9411 : *
9412 : * Thus we're looking for max(load_i / capacity_i),
9413 : * crosswise multiplication to rid ourselves of the
9414 : * division works out to:
9415 : * load_i * capacity_j > load_j * capacity_i;
9416 : * where j is our previous maximum.
9417 : */
9418 122 : if (load * busiest_capacity > busiest_load * capacity) {
9419 122 : busiest_load = load;
9420 122 : busiest_capacity = capacity;
9421 122 : busiest = rq;
9422 : }
9423 : break;
9424 :
9425 : case migrate_util:
9426 1832 : util = cpu_util(cpu_of(rq));
9427 :
9428 : /*
9429 : * Don't try to pull utilization from a CPU with one
9430 : * running task. Whatever its utilization, we will fail
9431 : * detach the task.
9432 : */
9433 1832 : if (nr_running <= 1)
9434 5 : continue;
9435 :
9436 1827 : if (busiest_util < util) {
9437 1829 : busiest_util = util;
9438 1829 : busiest = rq;
9439 : }
9440 : break;
9441 :
9442 647 : case migrate_task:
9443 647 : if (busiest_nr < nr_running) {
9444 648 : busiest_nr = nr_running;
9445 648 : busiest = rq;
9446 : }
9447 : break;
9448 :
9449 0 : case migrate_misfit:
9450 : /*
9451 : * For ASYM_CPUCAPACITY domains with misfit tasks we
9452 : * simply seek the "biggest" misfit task.
9453 : */
9454 0 : if (rq->misfit_task_load > busiest_load) {
9455 0 : busiest_load = rq->misfit_task_load;
9456 0 : busiest = rq;
9457 : }
9458 :
9459 : break;
9460 :
9461 : }
9462 0 : }
9463 :
9464 2613 : return busiest;
9465 : }
9466 :
9467 : /*
9468 : * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
9469 : * so long as it is large enough.
9470 : */
9471 : #define MAX_PINNED_INTERVAL 512
9472 :
9473 : static inline bool
9474 1713 : asym_active_balance(struct lb_env *env)
9475 : {
9476 : /*
9477 : * ASYM_PACKING needs to force migrate tasks from busy but
9478 : * lower priority CPUs in order to pack all tasks in the
9479 : * highest priority CPUs.
9480 : */
9481 1713 : return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
9482 0 : sched_asym_prefer(env->dst_cpu, env->src_cpu);
9483 : }
9484 :
9485 : static inline bool
9486 1713 : imbalanced_active_balance(struct lb_env *env)
9487 : {
9488 1713 : struct sched_domain *sd = env->sd;
9489 :
9490 : /*
9491 : * The imbalanced case includes the case of pinned tasks preventing a fair
9492 : * distribution of the load on the system but also the even distribution of the
9493 : * threads on a system with spare capacity
9494 : */
9495 1929 : if ((env->migration_type == migrate_task) &&
9496 216 : (sd->nr_balance_failed > sd->cache_nice_tries+2))
9497 : return 1;
9498 :
9499 : return 0;
9500 : }
9501 :
9502 1713 : static int need_active_balance(struct lb_env *env)
9503 : {
9504 1713 : struct sched_domain *sd = env->sd;
9505 :
9506 1713 : if (asym_active_balance(env))
9507 : return 1;
9508 :
9509 1713 : if (imbalanced_active_balance(env))
9510 : return 1;
9511 :
9512 : /*
9513 : * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
9514 : * It's worth migrating the task if the src_cpu's capacity is reduced
9515 : * because of other sched_class or IRQs if more capacity stays
9516 : * available on dst_cpu.
9517 : */
9518 1712 : if ((env->idle != CPU_NOT_IDLE) &&
9519 1620 : (env->src_rq->cfs.h_nr_running == 1)) {
9520 6 : if ((check_cpu_capacity(env->src_rq, sd)) &&
9521 0 : (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
9522 : return 1;
9523 : }
9524 :
9525 1712 : if (env->migration_type == migrate_misfit)
9526 0 : return 1;
9527 :
9528 : return 0;
9529 : }
9530 :
9531 : static int active_load_balance_cpu_stop(void *data);
9532 :
9533 9357 : static int should_we_balance(struct lb_env *env)
9534 : {
9535 9357 : struct sched_group *sg = env->sd->groups;
9536 9357 : int cpu;
9537 :
9538 : /*
9539 : * Ensure the balancing environment is consistent; can happen
9540 : * when the softirq triggers 'during' hotplug.
9541 : */
9542 9357 : if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
9543 : return 0;
9544 :
9545 : /*
9546 : * In the newly idle case, we will allow all the CPUs
9547 : * to do the newly idle load balance.
9548 : */
9549 9359 : if (env->idle == CPU_NEWLY_IDLE)
9550 : return 1;
9551 :
9552 : /* Try to find first idle CPU */
9553 8111 : for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
9554 6444 : if (!idle_cpu(cpu))
9555 1654 : continue;
9556 :
9557 : /* Are we the first idle CPU? */
9558 4840 : return cpu == env->dst_cpu;
9559 : }
9560 :
9561 : /* Are we the first CPU of this group ? */
9562 1654 : return group_balance_cpu(sg) == env->dst_cpu;
9563 : }
9564 :
9565 : /*
9566 : * Check this_cpu to ensure it is balanced within domain. Attempt to move
9567 : * tasks if there is an imbalance.
9568 : */
9569 9247 : static int load_balance(int this_cpu, struct rq *this_rq,
9570 : struct sched_domain *sd, enum cpu_idle_type idle,
9571 : int *continue_balancing)
9572 : {
9573 9247 : int ld_moved, cur_ld_moved, active_balance = 0;
9574 9247 : struct sched_domain *sd_parent = sd->parent;
9575 9247 : struct sched_group *group;
9576 9247 : struct rq *busiest;
9577 9247 : struct rq_flags rf;
9578 9247 : struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
9579 :
9580 9279 : struct lb_env env = {
9581 : .sd = sd,
9582 : .dst_cpu = this_cpu,
9583 : .dst_rq = this_rq,
9584 9279 : .dst_grpmask = sched_group_span(sd->groups),
9585 : .idle = idle,
9586 : .loop_break = sched_nr_migrate_break,
9587 : .cpus = cpus,
9588 : .fbq_type = all,
9589 : .tasks = LIST_HEAD_INIT(env.tasks),
9590 : };
9591 :
9592 9279 : cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
9593 :
9594 9355 : schedstat_inc(sd->lb_count[idle]);
9595 :
9596 9355 : redo:
9597 9355 : if (!should_we_balance(&env)) {
9598 0 : *continue_balancing = 0;
9599 0 : goto out_balanced;
9600 : }
9601 :
9602 9404 : group = find_busiest_group(&env);
9603 9375 : if (!group) {
9604 6767 : schedstat_inc(sd->lb_nobusyg[idle]);
9605 6767 : goto out_balanced;
9606 : }
9607 :
9608 2608 : busiest = find_busiest_queue(&env, group);
9609 2607 : if (!busiest) {
9610 6 : schedstat_inc(sd->lb_nobusyq[idle]);
9611 6 : goto out_balanced;
9612 : }
9613 :
9614 2601 : BUG_ON(busiest == env.dst_rq);
9615 :
9616 2601 : schedstat_add(sd->lb_imbalance[idle], env.imbalance);
9617 :
9618 2601 : env.src_cpu = busiest->cpu;
9619 2601 : env.src_rq = busiest;
9620 :
9621 2601 : ld_moved = 0;
9622 : /* Clear this flag as soon as we find a pullable task */
9623 2601 : env.flags |= LBF_ALL_PINNED;
9624 2601 : if (busiest->nr_running > 1) {
9625 : /*
9626 : * Attempt to move tasks. If find_busiest_group has found
9627 : * an imbalance but busiest->nr_running <= 1, the group is
9628 : * still unbalanced. ld_moved simply stays zero, so it is
9629 : * correctly treated as an imbalance.
9630 : */
9631 2595 : env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
9632 :
9633 : more_balance:
9634 2595 : rq_lock_irqsave(busiest, &rf);
9635 2608 : update_rq_clock(busiest);
9636 :
9637 : /*
9638 : * cur_ld_moved - load moved in current iteration
9639 : * ld_moved - cumulative load moved across iterations
9640 : */
9641 2608 : cur_ld_moved = detach_tasks(&env);
9642 :
9643 : /*
9644 : * We've detached some tasks from busiest_rq. Every
9645 : * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
9646 : * unlock busiest->lock, and we are able to be sure
9647 : * that nobody can manipulate the tasks in parallel.
9648 : * See task_rq_lock() family for the details.
9649 : */
9650 :
9651 2607 : rq_unlock(busiest, &rf);
9652 :
9653 2608 : if (cur_ld_moved) {
9654 826 : attach_tasks(&env);
9655 826 : ld_moved += cur_ld_moved;
9656 : }
9657 :
9658 2608 : local_irq_restore(rf.flags);
9659 :
9660 2608 : if (env.flags & LBF_NEED_BREAK) {
9661 0 : env.flags &= ~LBF_NEED_BREAK;
9662 0 : goto more_balance;
9663 : }
9664 :
9665 : /*
9666 : * Revisit (affine) tasks on src_cpu that couldn't be moved to
9667 : * us and move them to an alternate dst_cpu in our sched_group
9668 : * where they can run. The upper limit on how many times we
9669 : * iterate on same src_cpu is dependent on number of CPUs in our
9670 : * sched_group.
9671 : *
9672 : * This changes load balance semantics a bit on who can move
9673 : * load to a given_cpu. In addition to the given_cpu itself
9674 : * (or a ilb_cpu acting on its behalf where given_cpu is
9675 : * nohz-idle), we now have balance_cpu in a position to move
9676 : * load to given_cpu. In rare situations, this may cause
9677 : * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
9678 : * _independently_ and at _same_ time to move some load to
9679 : * given_cpu) causing exceess load to be moved to given_cpu.
9680 : * This however should not happen so much in practice and
9681 : * moreover subsequent load balance cycles should correct the
9682 : * excess load moved.
9683 : */
9684 2608 : if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
9685 :
9686 : /* Prevent to re-select dst_cpu via env's CPUs */
9687 0 : __cpumask_clear_cpu(env.dst_cpu, env.cpus);
9688 :
9689 0 : env.dst_rq = cpu_rq(env.new_dst_cpu);
9690 0 : env.dst_cpu = env.new_dst_cpu;
9691 0 : env.flags &= ~LBF_DST_PINNED;
9692 0 : env.loop = 0;
9693 0 : env.loop_break = sched_nr_migrate_break;
9694 :
9695 : /*
9696 : * Go back to "more_balance" rather than "redo" since we
9697 : * need to continue with same src_cpu.
9698 : */
9699 0 : goto more_balance;
9700 : }
9701 :
9702 : /*
9703 : * We failed to reach balance because of affinity.
9704 : */
9705 2608 : if (sd_parent) {
9706 0 : int *group_imbalance = &sd_parent->groups->sgc->imbalance;
9707 :
9708 0 : if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
9709 0 : *group_imbalance = 1;
9710 : }
9711 :
9712 : /* All tasks on this runqueue were pinned by CPU affinity */
9713 2608 : if (unlikely(env.flags & LBF_ALL_PINNED)) {
9714 76 : __cpumask_clear_cpu(cpu_of(busiest), cpus);
9715 : /*
9716 : * Attempting to continue load balancing at the current
9717 : * sched_domain level only makes sense if there are
9718 : * active CPUs remaining as possible busiest CPUs to
9719 : * pull load from which are not contained within the
9720 : * destination group that is receiving any migrated
9721 : * load.
9722 : */
9723 76 : if (!cpumask_subset(cpus, env.dst_grpmask)) {
9724 76 : env.loop = 0;
9725 76 : env.loop_break = sched_nr_migrate_break;
9726 76 : goto redo;
9727 : }
9728 0 : goto out_all_pinned;
9729 : }
9730 : }
9731 :
9732 2538 : if (!ld_moved) {
9733 1712 : schedstat_inc(sd->lb_failed[idle]);
9734 : /*
9735 : * Increment the failure counter only on periodic balance.
9736 : * We do not want newidle balance, which can be very
9737 : * frequent, pollute the failure counter causing
9738 : * excessive cache_hot migrations and active balances.
9739 : */
9740 1712 : if (idle != CPU_NEWLY_IDLE)
9741 829 : sd->nr_balance_failed++;
9742 :
9743 1712 : if (need_active_balance(&env)) {
9744 1 : unsigned long flags;
9745 :
9746 1 : raw_spin_lock_irqsave(&busiest->lock, flags);
9747 :
9748 : /*
9749 : * Don't kick the active_load_balance_cpu_stop,
9750 : * if the curr task on busiest CPU can't be
9751 : * moved to this_cpu:
9752 : */
9753 1 : if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
9754 0 : raw_spin_unlock_irqrestore(&busiest->lock,
9755 : flags);
9756 0 : goto out_one_pinned;
9757 : }
9758 :
9759 : /* Record that we found at least one task that could run on this_cpu */
9760 1 : env.flags &= ~LBF_ALL_PINNED;
9761 :
9762 : /*
9763 : * ->active_balance synchronizes accesses to
9764 : * ->active_balance_work. Once set, it's cleared
9765 : * only after active load balance is finished.
9766 : */
9767 1 : if (!busiest->active_balance) {
9768 1 : busiest->active_balance = 1;
9769 1 : busiest->push_cpu = this_cpu;
9770 1 : active_balance = 1;
9771 : }
9772 1 : raw_spin_unlock_irqrestore(&busiest->lock, flags);
9773 :
9774 1 : if (active_balance) {
9775 1 : stop_one_cpu_nowait(cpu_of(busiest),
9776 : active_load_balance_cpu_stop, busiest,
9777 : &busiest->active_balance_work);
9778 : }
9779 :
9780 : /* We've kicked active balancing, force task migration. */
9781 1 : sd->nr_balance_failed = sd->cache_nice_tries+1;
9782 : }
9783 : } else {
9784 826 : sd->nr_balance_failed = 0;
9785 : }
9786 :
9787 2538 : if (likely(!active_balance) || need_active_balance(&env)) {
9788 : /* We were unbalanced, so reset the balancing interval */
9789 2537 : sd->balance_interval = sd->min_interval;
9790 : }
9791 :
9792 2538 : goto out;
9793 :
9794 6773 : out_balanced:
9795 : /*
9796 : * We reach balance although we may have faced some affinity
9797 : * constraints. Clear the imbalance flag only if other tasks got
9798 : * a chance to move and fix the imbalance.
9799 : */
9800 6773 : if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
9801 0 : int *group_imbalance = &sd_parent->groups->sgc->imbalance;
9802 :
9803 0 : if (*group_imbalance)
9804 0 : *group_imbalance = 0;
9805 : }
9806 :
9807 6773 : out_all_pinned:
9808 : /*
9809 : * We reach balance because all tasks are pinned at this level so
9810 : * we can't migrate them. Let the imbalance flag set so parent level
9811 : * can try to migrate them.
9812 : */
9813 6773 : schedstat_inc(sd->lb_balanced[idle]);
9814 :
9815 6773 : sd->nr_balance_failed = 0;
9816 :
9817 6753 : out_one_pinned:
9818 6753 : ld_moved = 0;
9819 :
9820 : /*
9821 : * newidle_balance() disregards balance intervals, so we could
9822 : * repeatedly reach this code, which would lead to balance_interval
9823 : * skyrocketting in a short amount of time. Skip the balance_interval
9824 : * increase logic to avoid that.
9825 : */
9826 6753 : if (env.idle == CPU_NEWLY_IDLE)
9827 1549 : goto out;
9828 :
9829 : /* tune up the balancing interval */
9830 5204 : if ((env.flags & LBF_ALL_PINNED &&
9831 53 : sd->balance_interval < MAX_PINNED_INTERVAL) ||
9832 5151 : sd->balance_interval < sd->max_interval)
9833 972 : sd->balance_interval *= 2;
9834 4232 : out:
9835 9291 : return ld_moved;
9836 : }
9837 :
9838 : static inline unsigned long
9839 22589 : get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
9840 : {
9841 22589 : unsigned long interval = sd->balance_interval;
9842 :
9843 22589 : if (cpu_busy)
9844 3448 : interval *= sd->busy_factor;
9845 :
9846 : /* scale ms to jiffies */
9847 22589 : interval = msecs_to_jiffies(interval);
9848 :
9849 : /*
9850 : * Reduce likelihood of busy balancing at higher domains racing with
9851 : * balancing at lower domains by preventing their balancing periods
9852 : * from being multiples of each other.
9853 : */
9854 22605 : if (cpu_busy)
9855 3456 : interval -= 1;
9856 :
9857 22605 : interval = clamp(interval, 1UL, max_load_balance_interval);
9858 :
9859 22605 : return interval;
9860 : }
9861 :
9862 : static inline void
9863 7416 : update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
9864 : {
9865 7416 : unsigned long interval, next;
9866 :
9867 : /* used by idle balance, so cpu_busy = 0 */
9868 7416 : interval = get_sd_balance_interval(sd, 0);
9869 7416 : next = sd->last_balance + interval;
9870 :
9871 7416 : if (time_after(*next_balance, next))
9872 7416 : *next_balance = next;
9873 7416 : }
9874 :
9875 : /*
9876 : * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
9877 : * running tasks off the busiest CPU onto idle CPUs. It requires at
9878 : * least 1 task to be running on each physical CPU where possible, and
9879 : * avoids physical / logical imbalances.
9880 : */
9881 1 : static int active_load_balance_cpu_stop(void *data)
9882 : {
9883 1 : struct rq *busiest_rq = data;
9884 1 : int busiest_cpu = cpu_of(busiest_rq);
9885 1 : int target_cpu = busiest_rq->push_cpu;
9886 1 : struct rq *target_rq = cpu_rq(target_cpu);
9887 1 : struct sched_domain *sd;
9888 1 : struct task_struct *p = NULL;
9889 1 : struct rq_flags rf;
9890 :
9891 1 : rq_lock_irq(busiest_rq, &rf);
9892 : /*
9893 : * Between queueing the stop-work and running it is a hole in which
9894 : * CPUs can become inactive. We should not move tasks from or to
9895 : * inactive CPUs.
9896 : */
9897 2 : if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
9898 0 : goto out_unlock;
9899 :
9900 : /* Make sure the requested CPU hasn't gone down in the meantime: */
9901 1 : if (unlikely(busiest_cpu != smp_processor_id() ||
9902 : !busiest_rq->active_balance))
9903 0 : goto out_unlock;
9904 :
9905 : /* Is there any task to move? */
9906 1 : if (busiest_rq->nr_running <= 1)
9907 0 : goto out_unlock;
9908 :
9909 : /*
9910 : * This condition is "impossible", if it occurs
9911 : * we need to fix it. Originally reported by
9912 : * Bjorn Helgaas on a 128-CPU setup.
9913 : */
9914 1 : BUG_ON(busiest_rq == target_rq);
9915 :
9916 : /* Search for an sd spanning us and the target CPU. */
9917 1 : rcu_read_lock();
9918 3 : for_each_domain(target_cpu, sd) {
9919 1 : if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
9920 : break;
9921 : }
9922 :
9923 1 : if (likely(sd)) {
9924 1 : struct lb_env env = {
9925 : .sd = sd,
9926 : .dst_cpu = target_cpu,
9927 : .dst_rq = target_rq,
9928 1 : .src_cpu = busiest_rq->cpu,
9929 : .src_rq = busiest_rq,
9930 : .idle = CPU_IDLE,
9931 : /*
9932 : * can_migrate_task() doesn't need to compute new_dst_cpu
9933 : * for active balancing. Since we have CPU_IDLE, but no
9934 : * @dst_grpmask we need to make that test go away with lying
9935 : * about DST_PINNED.
9936 : */
9937 : .flags = LBF_DST_PINNED,
9938 : };
9939 :
9940 1 : schedstat_inc(sd->alb_count);
9941 1 : update_rq_clock(busiest_rq);
9942 :
9943 1 : p = detach_one_task(&env);
9944 1 : if (p) {
9945 1 : schedstat_inc(sd->alb_pushed);
9946 : /* Active balancing done, reset the failure counter. */
9947 1 : sd->nr_balance_failed = 0;
9948 : } else {
9949 1 : schedstat_inc(sd->alb_failed);
9950 : }
9951 : }
9952 1 : rcu_read_unlock();
9953 1 : out_unlock:
9954 1 : busiest_rq->active_balance = 0;
9955 1 : rq_unlock(busiest_rq, &rf);
9956 :
9957 1 : if (p)
9958 1 : attach_one_task(target_rq, p);
9959 :
9960 1 : local_irq_enable();
9961 :
9962 1 : return 0;
9963 : }
9964 :
9965 : static DEFINE_SPINLOCK(balancing);
9966 :
9967 : /*
9968 : * Scale the max load_balance interval with the number of CPUs in the system.
9969 : * This trades load-balance latency on larger machines for less cross talk.
9970 : */
9971 4 : void update_max_interval(void)
9972 : {
9973 4 : max_load_balance_interval = HZ*num_online_cpus()/10;
9974 4 : }
9975 :
9976 : /*
9977 : * It checks each scheduling domain to see if it is due to be balanced,
9978 : * and initiates a balancing operation if so.
9979 : *
9980 : * Balancing parameters are set up in init_sched_domains.
9981 : */
9982 8806 : static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
9983 : {
9984 8806 : int continue_balancing = 1;
9985 8806 : int cpu = rq->cpu;
9986 8806 : int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
9987 8806 : unsigned long interval;
9988 8806 : struct sched_domain *sd;
9989 : /* Earliest time when we have to do rebalance again */
9990 8806 : unsigned long next_balance = jiffies + 60*HZ;
9991 8806 : int update_next_balance = 0;
9992 8806 : int need_serialize, need_decay = 0;
9993 8806 : u64 max_cost = 0;
9994 :
9995 8806 : rcu_read_lock();
9996 35487 : for_each_domain(cpu, sd) {
9997 : /*
9998 : * Decay the newidle max times here because this is a regular
9999 : * visit to all the domains. Decay ~1% per second.
10000 : */
10001 8921 : if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
10002 121 : sd->max_newidle_lb_cost =
10003 121 : (sd->max_newidle_lb_cost * 253) / 256;
10004 121 : sd->next_decay_max_lb_cost = jiffies + HZ;
10005 121 : need_decay = 1;
10006 : }
10007 8921 : max_cost += sd->max_newidle_lb_cost;
10008 :
10009 : /*
10010 : * Stop the load balance at this level. There is another
10011 : * CPU in our sched group which is doing load balancing more
10012 : * actively.
10013 : */
10014 8921 : if (!continue_balancing) {
10015 96 : if (need_decay)
10016 0 : continue;
10017 : break;
10018 : }
10019 :
10020 8825 : interval = get_sd_balance_interval(sd, busy);
10021 :
10022 8812 : need_serialize = sd->flags & SD_SERIALIZE;
10023 8812 : if (need_serialize) {
10024 0 : if (!spin_trylock(&balancing))
10025 0 : goto out;
10026 : }
10027 :
10028 8812 : if (time_after_eq(jiffies, sd->last_balance + interval)) {
10029 6411 : if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
10030 : /*
10031 : * The LBF_DST_PINNED logic could have changed
10032 : * env->dst_cpu, so we can't know our idle
10033 : * state even if we migrated tasks. Update it.
10034 : */
10035 379 : idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
10036 379 : busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
10037 : }
10038 6405 : sd->last_balance = jiffies;
10039 6405 : interval = get_sd_balance_interval(sd, busy);
10040 : }
10041 8812 : if (need_serialize)
10042 0 : spin_unlock(&balancing);
10043 8812 : out:
10044 8812 : if (time_after(next_balance, sd->last_balance + interval)) {
10045 8819 : next_balance = sd->last_balance + interval;
10046 8819 : update_next_balance = 1;
10047 : }
10048 : }
10049 8893 : if (need_decay) {
10050 : /*
10051 : * Ensure the rq-wide value also decays but keep it at a
10052 : * reasonable floor to avoid funnies with rq->avg_idle.
10053 : */
10054 121 : rq->max_idle_balance_cost =
10055 121 : max((u64)sysctl_sched_migration_cost, max_cost);
10056 : }
10057 8893 : rcu_read_unlock();
10058 :
10059 : /*
10060 : * next_balance will be updated only when there is a need.
10061 : * When the cpu is attached to null domain for ex, it will not be
10062 : * updated.
10063 : */
10064 8830 : if (likely(update_next_balance)) {
10065 8830 : rq->next_balance = next_balance;
10066 :
10067 : #ifdef CONFIG_NO_HZ_COMMON
10068 : /*
10069 : * If this CPU has been elected to perform the nohz idle
10070 : * balance. Other idle CPUs have already rebalanced with
10071 : * nohz_idle_balance() and nohz.next_balance has been
10072 : * updated accordingly. This CPU is now running the idle load
10073 : * balance for itself and we need to update the
10074 : * nohz.next_balance accordingly.
10075 : */
10076 8830 : if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
10077 9 : nohz.next_balance = rq->next_balance;
10078 : #endif
10079 : }
10080 8830 : }
10081 :
10082 24546 : static inline int on_null_domain(struct rq *rq)
10083 : {
10084 24546 : return unlikely(!rcu_dereference_sched(rq->sd));
10085 : }
10086 :
10087 : #ifdef CONFIG_NO_HZ_COMMON
10088 : /*
10089 : * idle load balancing details
10090 : * - When one of the busy CPUs notice that there may be an idle rebalancing
10091 : * needed, they will kick the idle load balancer, which then does idle
10092 : * load balancing for all the idle CPUs.
10093 : * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set
10094 : * anywhere yet.
10095 : */
10096 :
10097 236 : static inline int find_new_ilb(void)
10098 : {
10099 236 : int ilb;
10100 :
10101 241 : for_each_cpu_and(ilb, nohz.idle_cpus_mask,
10102 : housekeeping_cpumask(HK_FLAG_MISC)) {
10103 :
10104 238 : if (ilb == smp_processor_id())
10105 2 : continue;
10106 :
10107 236 : if (idle_cpu(ilb))
10108 233 : return ilb;
10109 : }
10110 :
10111 3 : return nr_cpu_ids;
10112 : }
10113 :
10114 : /*
10115 : * Kick a CPU to do the nohz balancing, if it is time for it. We pick any
10116 : * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one).
10117 : */
10118 236 : static void kick_ilb(unsigned int flags)
10119 : {
10120 236 : int ilb_cpu;
10121 :
10122 : /*
10123 : * Increase nohz.next_balance only when if full ilb is triggered but
10124 : * not if we only update stats.
10125 : */
10126 236 : if (flags & NOHZ_BALANCE_KICK)
10127 147 : nohz.next_balance = jiffies+1;
10128 :
10129 236 : ilb_cpu = find_new_ilb();
10130 :
10131 236 : if (ilb_cpu >= nr_cpu_ids)
10132 : return;
10133 :
10134 : /*
10135 : * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
10136 : * the first flag owns it; cleared by nohz_csd_func().
10137 : */
10138 234 : flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
10139 234 : if (flags & NOHZ_KICK_MASK)
10140 : return;
10141 :
10142 : /*
10143 : * This way we generate an IPI on the target CPU which
10144 : * is idle. And the softirq performing nohz idle load balance
10145 : * will be run before returning from the IPI.
10146 : */
10147 221 : smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
10148 : }
10149 :
10150 : /*
10151 : * Current decision point for kicking the idle load balancer in the presence
10152 : * of idle CPUs in the system.
10153 : */
10154 25088 : static void nohz_balancer_kick(struct rq *rq)
10155 : {
10156 25088 : unsigned long now = jiffies;
10157 25088 : struct sched_domain_shared *sds;
10158 25088 : struct sched_domain *sd;
10159 25088 : int nr_busy, i, cpu = rq->cpu;
10160 25088 : unsigned int flags = 0;
10161 :
10162 25088 : if (unlikely(rq->idle_balance))
10163 : return;
10164 :
10165 : /*
10166 : * We may be recently in ticked or tickless idle mode. At the first
10167 : * busy tick after returning from idle, we will update the busy stats.
10168 : */
10169 14885 : nohz_balance_exit_idle(rq);
10170 :
10171 : /*
10172 : * None are in tickless mode and hence no need for NOHZ idle load
10173 : * balancing.
10174 : */
10175 15071 : if (likely(!atomic_read(&nohz.nr_cpus)))
10176 : return;
10177 :
10178 2435 : if (READ_ONCE(nohz.has_blocked) &&
10179 2411 : time_after(now, READ_ONCE(nohz.next_blocked)))
10180 96 : flags = NOHZ_STATS_KICK;
10181 :
10182 2435 : if (time_before(now, nohz.next_balance))
10183 57 : goto out;
10184 :
10185 2378 : if (rq->nr_running >= 2) {
10186 147 : flags = NOHZ_KICK_MASK;
10187 147 : goto out;
10188 : }
10189 :
10190 2231 : rcu_read_lock();
10191 :
10192 2233 : sd = rcu_dereference(rq->sd);
10193 2231 : if (sd) {
10194 : /*
10195 : * If there's a CFS task and the current CPU has reduced
10196 : * capacity; kick the ILB to see if there's a better CPU to run
10197 : * on.
10198 : */
10199 2231 : if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
10200 0 : flags = NOHZ_KICK_MASK;
10201 0 : goto unlock;
10202 : }
10203 : }
10204 :
10205 2231 : sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
10206 2232 : if (sd) {
10207 : /*
10208 : * When ASYM_PACKING; see if there's a more preferred CPU
10209 : * currently idle; in which case, kick the ILB to move tasks
10210 : * around.
10211 : */
10212 1 : for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
10213 0 : if (sched_asym_prefer(i, cpu)) {
10214 0 : flags = NOHZ_KICK_MASK;
10215 0 : goto unlock;
10216 : }
10217 : }
10218 : }
10219 :
10220 2231 : sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
10221 2229 : if (sd) {
10222 : /*
10223 : * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
10224 : * to run the misfit task on.
10225 : */
10226 0 : if (check_misfit_status(rq, sd)) {
10227 0 : flags = NOHZ_KICK_MASK;
10228 0 : goto unlock;
10229 : }
10230 :
10231 : /*
10232 : * For asymmetric systems, we do not want to nicely balance
10233 : * cache use, instead we want to embrace asymmetry and only
10234 : * ensure tasks have enough CPU capacity.
10235 : *
10236 : * Skip the LLC logic because it's not relevant in that case.
10237 : */
10238 0 : goto unlock;
10239 : }
10240 :
10241 2229 : sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
10242 2231 : if (sds) {
10243 : /*
10244 : * If there is an imbalance between LLC domains (IOW we could
10245 : * increase the overall cache use), we need some less-loaded LLC
10246 : * domain to pull some load. Likewise, we may need to spread
10247 : * load within the current LLC domain (e.g. packed SMT cores but
10248 : * other CPUs are idle). We can't really know from here how busy
10249 : * the others are - so just get a nohz balance going if it looks
10250 : * like this LLC domain has tasks we could move.
10251 : */
10252 0 : nr_busy = atomic_read(&sds->nr_busy_cpus);
10253 0 : if (nr_busy > 1) {
10254 0 : flags = NOHZ_KICK_MASK;
10255 0 : goto unlock;
10256 : }
10257 : }
10258 2231 : unlock:
10259 2231 : rcu_read_unlock();
10260 2290 : out:
10261 2437 : if (flags)
10262 233 : kick_ilb(flags);
10263 : }
10264 :
10265 207 : static void set_cpu_sd_state_busy(int cpu)
10266 : {
10267 207 : struct sched_domain *sd;
10268 :
10269 207 : rcu_read_lock();
10270 207 : sd = rcu_dereference(per_cpu(sd_llc, cpu));
10271 :
10272 207 : if (!sd || !sd->nohz_idle)
10273 207 : goto unlock;
10274 0 : sd->nohz_idle = 0;
10275 :
10276 0 : atomic_inc(&sd->shared->nr_busy_cpus);
10277 207 : unlock:
10278 207 : rcu_read_unlock();
10279 207 : }
10280 :
10281 14825 : void nohz_balance_exit_idle(struct rq *rq)
10282 : {
10283 14825 : SCHED_WARN_ON(rq != this_rq());
10284 :
10285 14874 : if (likely(!rq->nohz_tick_stopped))
10286 : return;
10287 :
10288 207 : rq->nohz_tick_stopped = 0;
10289 207 : cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
10290 207 : atomic_dec(&nohz.nr_cpus);
10291 :
10292 207 : set_cpu_sd_state_busy(rq->cpu);
10293 : }
10294 :
10295 208 : static void set_cpu_sd_state_idle(int cpu)
10296 : {
10297 208 : struct sched_domain *sd;
10298 :
10299 208 : rcu_read_lock();
10300 208 : sd = rcu_dereference(per_cpu(sd_llc, cpu));
10301 :
10302 208 : if (!sd || sd->nohz_idle)
10303 208 : goto unlock;
10304 0 : sd->nohz_idle = 1;
10305 :
10306 0 : atomic_dec(&sd->shared->nr_busy_cpus);
10307 208 : unlock:
10308 208 : rcu_read_unlock();
10309 208 : }
10310 :
10311 : /*
10312 : * This routine will record that the CPU is going idle with tick stopped.
10313 : * This info will be used in performing idle load balancing in the future.
10314 : */
10315 753 : void nohz_balance_enter_idle(int cpu)
10316 : {
10317 753 : struct rq *rq = cpu_rq(cpu);
10318 :
10319 753 : SCHED_WARN_ON(cpu != smp_processor_id());
10320 :
10321 : /* If this CPU is going down, then nothing needs to be done: */
10322 753 : if (!cpu_active(cpu))
10323 : return;
10324 :
10325 : /* Spare idle load balancing on CPUs that don't want to be disturbed: */
10326 753 : if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
10327 : return;
10328 :
10329 : /*
10330 : * Can be set safely without rq->lock held
10331 : * If a clear happens, it will have evaluated last additions because
10332 : * rq->lock is held during the check and the clear
10333 : */
10334 753 : rq->has_blocked_load = 1;
10335 :
10336 : /*
10337 : * The tick is still stopped but load could have been added in the
10338 : * meantime. We set the nohz.has_blocked flag to trig a check of the
10339 : * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
10340 : * of nohz.has_blocked can only happen after checking the new load
10341 : */
10342 753 : if (rq->nohz_tick_stopped)
10343 545 : goto out;
10344 :
10345 : /* If we're a completely isolated CPU, we don't play: */
10346 208 : if (on_null_domain(rq))
10347 : return;
10348 :
10349 208 : rq->nohz_tick_stopped = 1;
10350 :
10351 208 : cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
10352 208 : atomic_inc(&nohz.nr_cpus);
10353 :
10354 : /*
10355 : * Ensures that if nohz_idle_balance() fails to observe our
10356 : * @idle_cpus_mask store, it must observe the @has_blocked
10357 : * store.
10358 : */
10359 208 : smp_mb__after_atomic();
10360 :
10361 208 : set_cpu_sd_state_idle(cpu);
10362 :
10363 753 : out:
10364 : /*
10365 : * Each time a cpu enter idle, we assume that it has blocked load and
10366 : * enable the periodic update of the load of idle cpus
10367 : */
10368 753 : WRITE_ONCE(nohz.has_blocked, 1);
10369 : }
10370 :
10371 : /*
10372 : * Internal function that runs load balance for all idle cpus. The load balance
10373 : * can be a simple update of blocked load or a complete load balance with
10374 : * tasks movement depending of flags.
10375 : * The function returns false if the loop has stopped before running
10376 : * through all idle CPUs.
10377 : */
10378 395 : static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
10379 : enum cpu_idle_type idle)
10380 : {
10381 : /* Earliest time when we have to do rebalance again */
10382 395 : unsigned long now = jiffies;
10383 395 : unsigned long next_balance = now + 60*HZ;
10384 395 : bool has_blocked_load = false;
10385 395 : int update_next_balance = 0;
10386 395 : int this_cpu = this_rq->cpu;
10387 395 : int balance_cpu;
10388 395 : int ret = false;
10389 395 : struct rq *rq;
10390 :
10391 395 : SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
10392 :
10393 : /*
10394 : * We assume there will be no idle load after this update and clear
10395 : * the has_blocked flag. If a cpu enters idle in the mean time, it will
10396 : * set the has_blocked flag and trig another update of idle load.
10397 : * Because a cpu that becomes idle, is added to idle_cpus_mask before
10398 : * setting the flag, we are sure to not clear the state and not
10399 : * check the load of an idle cpu.
10400 : */
10401 395 : WRITE_ONCE(nohz.has_blocked, 0);
10402 :
10403 : /*
10404 : * Ensures that if we miss the CPU, we must see the has_blocked
10405 : * store from nohz_balance_enter_idle().
10406 : */
10407 395 : smp_mb();
10408 :
10409 1272 : for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
10410 488 : if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
10411 275 : continue;
10412 :
10413 : /*
10414 : * If this CPU gets work to do, stop the load balancing
10415 : * work being done for other CPUs. Next load
10416 : * balancing owner will pick it up.
10417 : */
10418 213 : if (need_resched()) {
10419 6 : has_blocked_load = true;
10420 6 : goto abort;
10421 : }
10422 :
10423 207 : rq = cpu_rq(balance_cpu);
10424 :
10425 207 : has_blocked_load |= update_nohz_stats(rq, true);
10426 :
10427 : /*
10428 : * If time for next balance is due,
10429 : * do the balance.
10430 : */
10431 207 : if (time_after_eq(jiffies, rq->next_balance)) {
10432 174 : struct rq_flags rf;
10433 :
10434 174 : rq_lock_irqsave(rq, &rf);
10435 174 : update_rq_clock(rq);
10436 174 : rq_unlock_irqrestore(rq, &rf);
10437 :
10438 174 : if (flags & NOHZ_BALANCE_KICK)
10439 61 : rebalance_domains(rq, CPU_IDLE);
10440 : }
10441 :
10442 207 : if (time_after(next_balance, rq->next_balance)) {
10443 167 : next_balance = rq->next_balance;
10444 167 : update_next_balance = 1;
10445 : }
10446 : }
10447 :
10448 : /*
10449 : * next_balance will be updated only when there is a need.
10450 : * When the CPU is attached to null domain for ex, it will not be
10451 : * updated.
10452 : */
10453 389 : if (likely(update_next_balance))
10454 156 : nohz.next_balance = next_balance;
10455 :
10456 : /* Newly idle CPU doesn't need an update */
10457 389 : if (idle != CPU_NEWLY_IDLE) {
10458 208 : update_blocked_averages(this_cpu);
10459 208 : has_blocked_load |= this_rq->has_blocked_load;
10460 : }
10461 :
10462 389 : if (flags & NOHZ_BALANCE_KICK)
10463 135 : rebalance_domains(this_rq, CPU_IDLE);
10464 :
10465 389 : WRITE_ONCE(nohz.next_blocked,
10466 : now + msecs_to_jiffies(LOAD_AVG_PERIOD));
10467 :
10468 : /* The full idle balance loop has been done */
10469 389 : ret = true;
10470 :
10471 389 : abort:
10472 : /* There is still blocked load, enable periodic update */
10473 395 : if (has_blocked_load)
10474 278 : WRITE_ONCE(nohz.has_blocked, 1);
10475 :
10476 395 : return ret;
10477 : }
10478 :
10479 : /*
10480 : * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
10481 : * rebalancing for all the cpus for whom scheduler ticks are stopped.
10482 : */
10483 8793 : static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
10484 : {
10485 8793 : unsigned int flags = this_rq->nohz_idle_balance;
10486 :
10487 8793 : if (!flags)
10488 : return false;
10489 :
10490 212 : this_rq->nohz_idle_balance = 0;
10491 :
10492 212 : if (idle != CPU_IDLE)
10493 : return false;
10494 :
10495 211 : _nohz_idle_balance(this_rq, flags, idle);
10496 :
10497 211 : return true;
10498 : }
10499 :
10500 4535 : static void nohz_newidle_balance(struct rq *this_rq)
10501 : {
10502 4535 : int this_cpu = this_rq->cpu;
10503 :
10504 : /*
10505 : * This CPU doesn't want to be disturbed by scheduler
10506 : * housekeeping
10507 : */
10508 4535 : if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
10509 : return;
10510 :
10511 : /* Will wake up very soon. No time for doing anything else*/
10512 4535 : if (this_rq->avg_idle < sysctl_sched_migration_cost)
10513 : return;
10514 :
10515 : /* Don't need to update blocked load of idle CPUs*/
10516 3503 : if (!READ_ONCE(nohz.has_blocked) ||
10517 1977 : time_before(jiffies, READ_ONCE(nohz.next_blocked)))
10518 : return;
10519 :
10520 184 : raw_spin_unlock(&this_rq->lock);
10521 : /*
10522 : * This CPU is going to be idle and blocked load of idle CPUs
10523 : * need to be updated. Run the ilb locally as it is a good
10524 : * candidate for ilb instead of waking up another idle CPU.
10525 : * Kick an normal ilb if we failed to do the update.
10526 : */
10527 184 : if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
10528 3 : kick_ilb(NOHZ_STATS_KICK);
10529 184 : raw_spin_lock(&this_rq->lock);
10530 : }
10531 :
10532 : #else /* !CONFIG_NO_HZ_COMMON */
10533 : static inline void nohz_balancer_kick(struct rq *rq) { }
10534 :
10535 : static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
10536 : {
10537 : return false;
10538 : }
10539 :
10540 : static inline void nohz_newidle_balance(struct rq *this_rq) { }
10541 : #endif /* CONFIG_NO_HZ_COMMON */
10542 :
10543 : /*
10544 : * newidle_balance is called by schedule() if this_cpu is about to become
10545 : * idle. Attempts to pull tasks from other CPUs.
10546 : *
10547 : * Returns:
10548 : * < 0 - we released the lock and there are !fair tasks present
10549 : * 0 - failed, no new tasks
10550 : * > 0 - success, new (fair) tasks present
10551 : */
10552 7424 : static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
10553 : {
10554 7424 : unsigned long next_balance = jiffies + HZ;
10555 7424 : int this_cpu = this_rq->cpu;
10556 7424 : struct sched_domain *sd;
10557 7424 : int pulled_task = 0;
10558 7424 : u64 curr_cost = 0;
10559 :
10560 7424 : update_misfit_status(NULL, this_rq);
10561 : /*
10562 : * We must set idle_stamp _before_ calling idle_balance(), such that we
10563 : * measure the duration of idle_balance() as idle time.
10564 : */
10565 7425 : this_rq->idle_stamp = rq_clock(this_rq);
10566 :
10567 : /*
10568 : * Do not pull tasks towards !active CPUs...
10569 : */
10570 7425 : if (!cpu_active(this_cpu))
10571 : return 0;
10572 :
10573 : /*
10574 : * This is OK, because current is on_cpu, which avoids it being picked
10575 : * for load-balance and preemption/IRQs are still disabled avoiding
10576 : * further scheduler activity on it and we're being very careful to
10577 : * re-start the picking loop.
10578 : */
10579 7422 : rq_unpin_lock(this_rq, rf);
10580 :
10581 7422 : if (this_rq->avg_idle < sysctl_sched_migration_cost ||
10582 6390 : !READ_ONCE(this_rq->rd->overload)) {
10583 :
10584 4535 : rcu_read_lock();
10585 9070 : sd = rcu_dereference_check_sched_domain(this_rq->sd);
10586 4535 : if (sd)
10587 4535 : update_next_balance(sd, &next_balance);
10588 4535 : rcu_read_unlock();
10589 :
10590 4535 : nohz_newidle_balance(this_rq);
10591 :
10592 4535 : goto out;
10593 : }
10594 :
10595 2887 : raw_spin_unlock(&this_rq->lock);
10596 :
10597 2886 : update_blocked_averages(this_cpu);
10598 2887 : rcu_read_lock();
10599 11094 : for_each_domain(this_cpu, sd) {
10600 2882 : int continue_balancing = 1;
10601 2882 : u64 t0, domain_cost;
10602 :
10603 2882 : if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
10604 0 : update_next_balance(sd, &next_balance);
10605 447 : break;
10606 : }
10607 :
10608 2882 : if (sd->flags & SD_BALANCE_NEWIDLE) {
10609 2881 : t0 = sched_clock_cpu(this_cpu);
10610 :
10611 2881 : pulled_task = load_balance(this_cpu, this_rq,
10612 : sd, CPU_NEWLY_IDLE,
10613 : &continue_balancing);
10614 :
10615 2879 : domain_cost = sched_clock_cpu(this_cpu) - t0;
10616 2881 : if (domain_cost > sd->max_newidle_lb_cost)
10617 28 : sd->max_newidle_lb_cost = domain_cost;
10618 :
10619 2881 : curr_cost += domain_cost;
10620 : }
10621 :
10622 2882 : update_next_balance(sd, &next_balance);
10623 :
10624 : /*
10625 : * Stop searching for tasks to pull if there are
10626 : * now runnable tasks on this rq.
10627 : */
10628 2881 : if (pulled_task || this_rq->nr_running > 0)
10629 : break;
10630 : }
10631 2887 : rcu_read_unlock();
10632 :
10633 2885 : raw_spin_lock(&this_rq->lock);
10634 :
10635 2886 : if (curr_cost > this_rq->max_idle_balance_cost)
10636 0 : this_rq->max_idle_balance_cost = curr_cost;
10637 :
10638 2886 : out:
10639 : /*
10640 : * While browsing the domains, we released the rq lock, a task could
10641 : * have been enqueued in the meantime. Since we're not going idle,
10642 : * pretend we pulled a task.
10643 : */
10644 7421 : if (this_rq->cfs.h_nr_running && !pulled_task)
10645 0 : pulled_task = 1;
10646 :
10647 : /* Move the next balance forward */
10648 7421 : if (time_after(this_rq->next_balance, next_balance))
10649 3029 : this_rq->next_balance = next_balance;
10650 :
10651 : /* Is there a task of a high priority class? */
10652 7421 : if (this_rq->nr_running != this_rq->cfs.h_nr_running)
10653 : pulled_task = -1;
10654 :
10655 7420 : if (pulled_task)
10656 448 : this_rq->idle_stamp = 0;
10657 :
10658 7421 : rq_repin_lock(this_rq, rf);
10659 :
10660 7421 : return pulled_task;
10661 : }
10662 :
10663 : /*
10664 : * run_rebalance_domains is triggered when needed from the scheduler tick.
10665 : * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
10666 : */
10667 8796 : static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
10668 : {
10669 8796 : struct rq *this_rq = this_rq();
10670 8816 : enum cpu_idle_type idle = this_rq->idle_balance ?
10671 8816 : CPU_IDLE : CPU_NOT_IDLE;
10672 :
10673 : /*
10674 : * If this CPU has a pending nohz_balance_kick, then do the
10675 : * balancing on behalf of the other idle CPUs whose ticks are
10676 : * stopped. Do nohz_idle_balance *before* rebalance_domains to
10677 : * give the idle CPUs a chance to load balance. Else we may
10678 : * load balance only within the local sched_domain hierarchy
10679 : * and abort nohz_idle_balance altogether if we pull some load.
10680 : */
10681 8816 : if (nohz_idle_balance(this_rq, idle))
10682 : return;
10683 :
10684 : /* normal load balance */
10685 8617 : update_blocked_averages(this_rq->cpu);
10686 8659 : rebalance_domains(this_rq, idle);
10687 : }
10688 :
10689 : /*
10690 : * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
10691 : */
10692 24332 : void trigger_load_balance(struct rq *rq)
10693 : {
10694 : /*
10695 : * Don't need to rebalance while attached to NULL domain or
10696 : * runqueue CPU is not active
10697 : */
10698 24332 : if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq))))
10699 16 : return;
10700 :
10701 24847 : if (time_after_eq(jiffies, rq->next_balance))
10702 8759 : raise_softirq(SCHED_SOFTIRQ);
10703 :
10704 24969 : nohz_balancer_kick(rq);
10705 : }
10706 :
10707 8 : static void rq_online_fair(struct rq *rq)
10708 : {
10709 8 : update_sysctl();
10710 :
10711 8 : update_runtime_enabled(rq);
10712 8 : }
10713 :
10714 4 : static void rq_offline_fair(struct rq *rq)
10715 : {
10716 4 : update_sysctl();
10717 :
10718 : /* Ensure any throttled groups are reachable by pick_next_task */
10719 4 : unthrottle_offline_cfs_rqs(rq);
10720 4 : }
10721 :
10722 : #endif /* CONFIG_SMP */
10723 :
10724 : /*
10725 : * scheduler tick hitting a task of our scheduling class.
10726 : *
10727 : * NOTE: This function can be called remotely by the tick offload that
10728 : * goes along full dynticks. Therefore no local assumption can be made
10729 : * and everything must be accessed through the @rq and @curr passed in
10730 : * parameters.
10731 : */
10732 14617 : static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
10733 : {
10734 14617 : struct cfs_rq *cfs_rq;
10735 14617 : struct sched_entity *se = &curr->se;
10736 :
10737 14617 : for_each_sched_entity(se) {
10738 14617 : cfs_rq = cfs_rq_of(se);
10739 14617 : entity_tick(cfs_rq, se, queued);
10740 : }
10741 :
10742 14786 : if (static_branch_unlikely(&sched_numa_balancing))
10743 14720 : task_tick_numa(rq, curr);
10744 :
10745 14720 : update_misfit_status(curr, rq);
10746 14746 : update_overutilized_status(task_rq(curr));
10747 14794 : }
10748 :
10749 : /*
10750 : * called on fork with the child task as argument from the parent's context
10751 : * - child not yet on the tasklist
10752 : * - preemption disabled
10753 : */
10754 990 : static void task_fork_fair(struct task_struct *p)
10755 : {
10756 990 : struct cfs_rq *cfs_rq;
10757 990 : struct sched_entity *se = &p->se, *curr;
10758 990 : struct rq *rq = this_rq();
10759 990 : struct rq_flags rf;
10760 :
10761 990 : rq_lock(rq, &rf);
10762 990 : update_rq_clock(rq);
10763 :
10764 990 : cfs_rq = task_cfs_rq(current);
10765 990 : curr = cfs_rq->curr;
10766 990 : if (curr) {
10767 988 : update_curr(cfs_rq);
10768 988 : se->vruntime = curr->vruntime;
10769 : }
10770 990 : place_entity(cfs_rq, se, 1);
10771 :
10772 990 : if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
10773 : /*
10774 : * Upon rescheduling, sched_class::put_prev_task() will place
10775 : * 'current' within the tree based on its new key value.
10776 : */
10777 0 : swap(curr->vruntime, se->vruntime);
10778 0 : resched_curr(rq);
10779 : }
10780 :
10781 990 : se->vruntime -= cfs_rq->min_vruntime;
10782 990 : rq_unlock(rq, &rf);
10783 990 : }
10784 :
10785 : /*
10786 : * Priority of the task has changed. Check to see if we preempt
10787 : * the current task.
10788 : */
10789 : static void
10790 21 : prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
10791 : {
10792 21 : if (!task_on_rq_queued(p))
10793 : return;
10794 :
10795 13 : if (rq->cfs.nr_running == 1)
10796 : return;
10797 :
10798 : /*
10799 : * Reschedule if we are currently running on this runqueue and
10800 : * our priority decreased, or if we are not currently running on
10801 : * this runqueue and our priority is higher than the current's
10802 : */
10803 4 : if (task_current(rq, p)) {
10804 4 : if (p->prio > oldprio)
10805 1 : resched_curr(rq);
10806 : } else
10807 0 : check_preempt_curr(rq, p, 0);
10808 : }
10809 :
10810 4 : static inline bool vruntime_normalized(struct task_struct *p)
10811 : {
10812 4 : struct sched_entity *se = &p->se;
10813 :
10814 : /*
10815 : * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
10816 : * the dequeue_entity(.flags=0) will already have normalized the
10817 : * vruntime.
10818 : */
10819 4 : if (p->on_rq)
10820 : return true;
10821 :
10822 : /*
10823 : * When !on_rq, vruntime of the task has usually NOT been normalized.
10824 : * But there are some cases where it has already been normalized:
10825 : *
10826 : * - A forked child which is waiting for being woken up by
10827 : * wake_up_new_task().
10828 : * - A task which has been woken up by try_to_wake_up() and
10829 : * waiting for actually being woken up by sched_ttwu_pending().
10830 : */
10831 4 : if (!se->sum_exec_runtime ||
10832 2 : (p->state == TASK_WAKING && p->sched_remote_wakeup))
10833 2 : return true;
10834 :
10835 : return false;
10836 : }
10837 :
10838 : #ifdef CONFIG_FAIR_GROUP_SCHED
10839 : /*
10840 : * Propagate the changes of the sched_entity across the tg tree to make it
10841 : * visible to the root
10842 : */
10843 : static void propagate_entity_cfs_rq(struct sched_entity *se)
10844 : {
10845 : struct cfs_rq *cfs_rq;
10846 :
10847 : /* Start to propagate at parent */
10848 : se = se->parent;
10849 :
10850 : for_each_sched_entity(se) {
10851 : cfs_rq = cfs_rq_of(se);
10852 :
10853 : if (cfs_rq_throttled(cfs_rq))
10854 : break;
10855 :
10856 : update_load_avg(cfs_rq, se, UPDATE_TG);
10857 : }
10858 : }
10859 : #else
10860 1877 : static void propagate_entity_cfs_rq(struct sched_entity *se) { }
10861 : #endif
10862 :
10863 889 : static void detach_entity_cfs_rq(struct sched_entity *se)
10864 : {
10865 889 : struct cfs_rq *cfs_rq = cfs_rq_of(se);
10866 :
10867 : /* Catch up with the cfs_rq and remove our load when we leave */
10868 889 : update_load_avg(cfs_rq, se, 0);
10869 890 : detach_entity_load_avg(cfs_rq, se);
10870 890 : update_tg_load_avg(cfs_rq);
10871 890 : propagate_entity_cfs_rq(se);
10872 890 : }
10873 :
10874 987 : static void attach_entity_cfs_rq(struct sched_entity *se)
10875 : {
10876 987 : struct cfs_rq *cfs_rq = cfs_rq_of(se);
10877 :
10878 : #ifdef CONFIG_FAIR_GROUP_SCHED
10879 : /*
10880 : * Since the real-depth could have been changed (only FAIR
10881 : * class maintain depth value), reset depth properly.
10882 : */
10883 : se->depth = se->parent ? se->parent->depth + 1 : 0;
10884 : #endif
10885 :
10886 : /* Synchronize entity with its cfs_rq */
10887 987 : update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
10888 987 : attach_entity_load_avg(cfs_rq, se);
10889 987 : update_tg_load_avg(cfs_rq);
10890 987 : propagate_entity_cfs_rq(se);
10891 987 : }
10892 :
10893 4 : static void detach_task_cfs_rq(struct task_struct *p)
10894 : {
10895 4 : struct sched_entity *se = &p->se;
10896 4 : struct cfs_rq *cfs_rq = cfs_rq_of(se);
10897 :
10898 4 : if (!vruntime_normalized(p)) {
10899 : /*
10900 : * Fix up our vruntime so that the current sleep doesn't
10901 : * cause 'unlimited' sleep bonus.
10902 : */
10903 2 : place_entity(cfs_rq, se, 0);
10904 2 : se->vruntime -= cfs_rq->min_vruntime;
10905 : }
10906 :
10907 4 : detach_entity_cfs_rq(se);
10908 4 : }
10909 :
10910 0 : static void attach_task_cfs_rq(struct task_struct *p)
10911 : {
10912 0 : struct sched_entity *se = &p->se;
10913 0 : struct cfs_rq *cfs_rq = cfs_rq_of(se);
10914 :
10915 0 : attach_entity_cfs_rq(se);
10916 :
10917 0 : if (!vruntime_normalized(p))
10918 0 : se->vruntime += cfs_rq->min_vruntime;
10919 0 : }
10920 :
10921 4 : static void switched_from_fair(struct rq *rq, struct task_struct *p)
10922 : {
10923 4 : detach_task_cfs_rq(p);
10924 4 : }
10925 :
10926 0 : static void switched_to_fair(struct rq *rq, struct task_struct *p)
10927 : {
10928 0 : attach_task_cfs_rq(p);
10929 :
10930 0 : if (task_on_rq_queued(p)) {
10931 : /*
10932 : * We were most likely switched from sched_rt, so
10933 : * kick off the schedule if running, otherwise just see
10934 : * if we can still preempt the current task.
10935 : */
10936 0 : if (task_current(rq, p))
10937 0 : resched_curr(rq);
10938 : else
10939 0 : check_preempt_curr(rq, p, 0);
10940 : }
10941 0 : }
10942 :
10943 : /* Account for a task changing its policy or group.
10944 : *
10945 : * This routine is mostly called to set cfs_rq->curr field when a task
10946 : * migrates between groups/classes.
10947 : */
10948 15 : static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
10949 : {
10950 15 : struct sched_entity *se = &p->se;
10951 :
10952 : #ifdef CONFIG_SMP
10953 15 : if (task_on_rq_queued(p)) {
10954 : /*
10955 : * Move the next running task to the front of the list, so our
10956 : * cfs_tasks list becomes MRU one.
10957 : */
10958 15 : list_move(&se->group_node, &rq->cfs_tasks);
10959 : }
10960 : #endif
10961 :
10962 15 : for_each_sched_entity(se) {
10963 15 : struct cfs_rq *cfs_rq = cfs_rq_of(se);
10964 :
10965 15 : set_next_entity(cfs_rq, se);
10966 : /* ensure bandwidth has been allocated on our new cfs_rq */
10967 15 : account_cfs_rq_runtime(cfs_rq, 0);
10968 : }
10969 15 : }
10970 :
10971 4 : void init_cfs_rq(struct cfs_rq *cfs_rq)
10972 : {
10973 4 : cfs_rq->tasks_timeline = RB_ROOT_CACHED;
10974 4 : cfs_rq->min_vruntime = (u64)(-(1LL << 20));
10975 : #ifndef CONFIG_64BIT
10976 : cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
10977 : #endif
10978 : #ifdef CONFIG_SMP
10979 4 : raw_spin_lock_init(&cfs_rq->removed.lock);
10980 : #endif
10981 4 : }
10982 :
10983 : #ifdef CONFIG_FAIR_GROUP_SCHED
10984 : static void task_set_group_fair(struct task_struct *p)
10985 : {
10986 : struct sched_entity *se = &p->se;
10987 :
10988 : set_task_rq(p, task_cpu(p));
10989 : se->depth = se->parent ? se->parent->depth + 1 : 0;
10990 : }
10991 :
10992 : static void task_move_group_fair(struct task_struct *p)
10993 : {
10994 : detach_task_cfs_rq(p);
10995 : set_task_rq(p, task_cpu(p));
10996 :
10997 : #ifdef CONFIG_SMP
10998 : /* Tell se's cfs_rq has been changed -- migrated */
10999 : p->se.avg.last_update_time = 0;
11000 : #endif
11001 : attach_task_cfs_rq(p);
11002 : }
11003 :
11004 : static void task_change_group_fair(struct task_struct *p, int type)
11005 : {
11006 : switch (type) {
11007 : case TASK_SET_GROUP:
11008 : task_set_group_fair(p);
11009 : break;
11010 :
11011 : case TASK_MOVE_GROUP:
11012 : task_move_group_fair(p);
11013 : break;
11014 : }
11015 : }
11016 :
11017 : void free_fair_sched_group(struct task_group *tg)
11018 : {
11019 : int i;
11020 :
11021 : destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
11022 :
11023 : for_each_possible_cpu(i) {
11024 : if (tg->cfs_rq)
11025 : kfree(tg->cfs_rq[i]);
11026 : if (tg->se)
11027 : kfree(tg->se[i]);
11028 : }
11029 :
11030 : kfree(tg->cfs_rq);
11031 : kfree(tg->se);
11032 : }
11033 :
11034 : int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
11035 : {
11036 : struct sched_entity *se;
11037 : struct cfs_rq *cfs_rq;
11038 : int i;
11039 :
11040 : tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
11041 : if (!tg->cfs_rq)
11042 : goto err;
11043 : tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
11044 : if (!tg->se)
11045 : goto err;
11046 :
11047 : tg->shares = NICE_0_LOAD;
11048 :
11049 : init_cfs_bandwidth(tg_cfs_bandwidth(tg));
11050 :
11051 : for_each_possible_cpu(i) {
11052 : cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
11053 : GFP_KERNEL, cpu_to_node(i));
11054 : if (!cfs_rq)
11055 : goto err;
11056 :
11057 : se = kzalloc_node(sizeof(struct sched_entity),
11058 : GFP_KERNEL, cpu_to_node(i));
11059 : if (!se)
11060 : goto err_free_rq;
11061 :
11062 : init_cfs_rq(cfs_rq);
11063 : init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
11064 : init_entity_runnable_average(se);
11065 : }
11066 :
11067 : return 1;
11068 :
11069 : err_free_rq:
11070 : kfree(cfs_rq);
11071 : err:
11072 : return 0;
11073 : }
11074 :
11075 : void online_fair_sched_group(struct task_group *tg)
11076 : {
11077 : struct sched_entity *se;
11078 : struct rq_flags rf;
11079 : struct rq *rq;
11080 : int i;
11081 :
11082 : for_each_possible_cpu(i) {
11083 : rq = cpu_rq(i);
11084 : se = tg->se[i];
11085 : rq_lock_irq(rq, &rf);
11086 : update_rq_clock(rq);
11087 : attach_entity_cfs_rq(se);
11088 : sync_throttle(tg, i);
11089 : rq_unlock_irq(rq, &rf);
11090 : }
11091 : }
11092 :
11093 : void unregister_fair_sched_group(struct task_group *tg)
11094 : {
11095 : unsigned long flags;
11096 : struct rq *rq;
11097 : int cpu;
11098 :
11099 : for_each_possible_cpu(cpu) {
11100 : if (tg->se[cpu])
11101 : remove_entity_load_avg(tg->se[cpu]);
11102 :
11103 : /*
11104 : * Only empty task groups can be destroyed; so we can speculatively
11105 : * check on_list without danger of it being re-added.
11106 : */
11107 : if (!tg->cfs_rq[cpu]->on_list)
11108 : continue;
11109 :
11110 : rq = cpu_rq(cpu);
11111 :
11112 : raw_spin_lock_irqsave(&rq->lock, flags);
11113 : list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
11114 : raw_spin_unlock_irqrestore(&rq->lock, flags);
11115 : }
11116 : }
11117 :
11118 : void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
11119 : struct sched_entity *se, int cpu,
11120 : struct sched_entity *parent)
11121 : {
11122 : struct rq *rq = cpu_rq(cpu);
11123 :
11124 : cfs_rq->tg = tg;
11125 : cfs_rq->rq = rq;
11126 : init_cfs_rq_runtime(cfs_rq);
11127 :
11128 : tg->cfs_rq[cpu] = cfs_rq;
11129 : tg->se[cpu] = se;
11130 :
11131 : /* se could be NULL for root_task_group */
11132 : if (!se)
11133 : return;
11134 :
11135 : if (!parent) {
11136 : se->cfs_rq = &rq->cfs;
11137 : se->depth = 0;
11138 : } else {
11139 : se->cfs_rq = parent->my_q;
11140 : se->depth = parent->depth + 1;
11141 : }
11142 :
11143 : se->my_q = cfs_rq;
11144 : /* guarantee group entities always have weight */
11145 : update_load_set(&se->load, NICE_0_LOAD);
11146 : se->parent = parent;
11147 : }
11148 :
11149 : static DEFINE_MUTEX(shares_mutex);
11150 :
11151 : int sched_group_set_shares(struct task_group *tg, unsigned long shares)
11152 : {
11153 : int i;
11154 :
11155 : /*
11156 : * We can't change the weight of the root cgroup.
11157 : */
11158 : if (!tg->se[0])
11159 : return -EINVAL;
11160 :
11161 : shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
11162 :
11163 : mutex_lock(&shares_mutex);
11164 : if (tg->shares == shares)
11165 : goto done;
11166 :
11167 : tg->shares = shares;
11168 : for_each_possible_cpu(i) {
11169 : struct rq *rq = cpu_rq(i);
11170 : struct sched_entity *se = tg->se[i];
11171 : struct rq_flags rf;
11172 :
11173 : /* Propagate contribution to hierarchy */
11174 : rq_lock_irqsave(rq, &rf);
11175 : update_rq_clock(rq);
11176 : for_each_sched_entity(se) {
11177 : update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
11178 : update_cfs_group(se);
11179 : }
11180 : rq_unlock_irqrestore(rq, &rf);
11181 : }
11182 :
11183 : done:
11184 : mutex_unlock(&shares_mutex);
11185 : return 0;
11186 : }
11187 : #else /* CONFIG_FAIR_GROUP_SCHED */
11188 :
11189 0 : void free_fair_sched_group(struct task_group *tg) { }
11190 :
11191 0 : int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
11192 : {
11193 0 : return 1;
11194 : }
11195 :
11196 0 : void online_fair_sched_group(struct task_group *tg) { }
11197 :
11198 0 : void unregister_fair_sched_group(struct task_group *tg) { }
11199 :
11200 : #endif /* CONFIG_FAIR_GROUP_SCHED */
11201 :
11202 :
11203 0 : static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
11204 : {
11205 0 : struct sched_entity *se = &task->se;
11206 0 : unsigned int rr_interval = 0;
11207 :
11208 : /*
11209 : * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
11210 : * idle runqueue:
11211 : */
11212 0 : if (rq->cfs.load.weight)
11213 0 : rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
11214 :
11215 0 : return rr_interval;
11216 : }
11217 :
11218 : /*
11219 : * All the scheduling class methods:
11220 : */
11221 : DEFINE_SCHED_CLASS(fair) = {
11222 :
11223 : .enqueue_task = enqueue_task_fair,
11224 : .dequeue_task = dequeue_task_fair,
11225 : .yield_task = yield_task_fair,
11226 : .yield_to_task = yield_to_task_fair,
11227 :
11228 : .check_preempt_curr = check_preempt_wakeup,
11229 :
11230 : .pick_next_task = __pick_next_task_fair,
11231 : .put_prev_task = put_prev_task_fair,
11232 : .set_next_task = set_next_task_fair,
11233 :
11234 : #ifdef CONFIG_SMP
11235 : .balance = balance_fair,
11236 : .select_task_rq = select_task_rq_fair,
11237 : .migrate_task_rq = migrate_task_rq_fair,
11238 :
11239 : .rq_online = rq_online_fair,
11240 : .rq_offline = rq_offline_fair,
11241 :
11242 : .task_dead = task_dead_fair,
11243 : .set_cpus_allowed = set_cpus_allowed_common,
11244 : #endif
11245 :
11246 : .task_tick = task_tick_fair,
11247 : .task_fork = task_fork_fair,
11248 :
11249 : .prio_changed = prio_changed_fair,
11250 : .switched_from = switched_from_fair,
11251 : .switched_to = switched_to_fair,
11252 :
11253 : .get_rr_interval = get_rr_interval_fair,
11254 :
11255 : .update_curr = update_curr_fair,
11256 :
11257 : #ifdef CONFIG_FAIR_GROUP_SCHED
11258 : .task_change_group = task_change_group_fair,
11259 : #endif
11260 :
11261 : #ifdef CONFIG_UCLAMP_TASK
11262 : .uclamp_enabled = 1,
11263 : #endif
11264 : };
11265 :
11266 : #ifdef CONFIG_SCHED_DEBUG
11267 : void print_cfs_stats(struct seq_file *m, int cpu)
11268 : {
11269 : struct cfs_rq *cfs_rq, *pos;
11270 :
11271 : rcu_read_lock();
11272 : for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
11273 : print_cfs_rq(m, cpu, cfs_rq);
11274 : rcu_read_unlock();
11275 : }
11276 :
11277 : #ifdef CONFIG_NUMA_BALANCING
11278 : void show_numa_stats(struct task_struct *p, struct seq_file *m)
11279 : {
11280 : int node;
11281 : unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
11282 : struct numa_group *ng;
11283 :
11284 : rcu_read_lock();
11285 : ng = rcu_dereference(p->numa_group);
11286 : for_each_online_node(node) {
11287 : if (p->numa_faults) {
11288 : tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
11289 : tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
11290 : }
11291 : if (ng) {
11292 : gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)],
11293 : gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
11294 : }
11295 : print_numa_stats(m, node, tsf, tpf, gsf, gpf);
11296 : }
11297 : rcu_read_unlock();
11298 : }
11299 : #endif /* CONFIG_NUMA_BALANCING */
11300 : #endif /* CONFIG_SCHED_DEBUG */
11301 :
11302 1 : __init void init_sched_fair_class(void)
11303 : {
11304 : #ifdef CONFIG_SMP
11305 1 : open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
11306 :
11307 : #ifdef CONFIG_NO_HZ_COMMON
11308 1 : nohz.next_balance = jiffies;
11309 1 : nohz.next_blocked = jiffies;
11310 1 : zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
11311 : #endif
11312 : #endif /* SMP */
11313 :
11314 1 : }
11315 :
11316 : /*
11317 : * Helper functions to facilitate extracting info from tracepoints.
11318 : */
11319 :
11320 0 : const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
11321 : {
11322 : #ifdef CONFIG_SMP
11323 0 : return cfs_rq ? &cfs_rq->avg : NULL;
11324 : #else
11325 : return NULL;
11326 : #endif
11327 : }
11328 : EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
11329 :
11330 0 : char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
11331 : {
11332 0 : if (!cfs_rq) {
11333 0 : if (str)
11334 0 : strlcpy(str, "(null)", len);
11335 : else
11336 : return NULL;
11337 : }
11338 :
11339 0 : cfs_rq_tg_path(cfs_rq, str, len);
11340 : return str;
11341 : }
11342 : EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
11343 :
11344 0 : int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
11345 : {
11346 0 : return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
11347 : }
11348 : EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
11349 :
11350 0 : const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
11351 : {
11352 : #ifdef CONFIG_SMP
11353 0 : return rq ? &rq->avg_rt : NULL;
11354 : #else
11355 : return NULL;
11356 : #endif
11357 : }
11358 : EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
11359 :
11360 0 : const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
11361 : {
11362 : #ifdef CONFIG_SMP
11363 0 : return rq ? &rq->avg_dl : NULL;
11364 : #else
11365 : return NULL;
11366 : #endif
11367 : }
11368 : EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
11369 :
11370 0 : const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
11371 : {
11372 : #if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
11373 0 : return rq ? &rq->avg_irq : NULL;
11374 : #else
11375 : return NULL;
11376 : #endif
11377 : }
11378 : EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
11379 :
11380 0 : int sched_trace_rq_cpu(struct rq *rq)
11381 : {
11382 0 : return rq ? cpu_of(rq) : -1;
11383 : }
11384 : EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
11385 :
11386 0 : int sched_trace_rq_cpu_capacity(struct rq *rq)
11387 : {
11388 0 : return rq ?
11389 : #ifdef CONFIG_SMP
11390 0 : rq->cpu_capacity
11391 : #else
11392 : SCHED_CAPACITY_SCALE
11393 : #endif
11394 0 : : -1;
11395 : }
11396 : EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
11397 :
11398 0 : const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
11399 : {
11400 : #ifdef CONFIG_SMP
11401 0 : return rd ? rd->span : NULL;
11402 : #else
11403 : return NULL;
11404 : #endif
11405 : }
11406 : EXPORT_SYMBOL_GPL(sched_trace_rd_span);
11407 :
11408 0 : int sched_trace_rq_nr_running(struct rq *rq)
11409 : {
11410 0 : return rq ? rq->nr_running : -1;
11411 : }
11412 : EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
|