// SPDX-License-Identifier: GPL-2.0
/*
 * Per Entity Load Tracking
 *
 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *
 * Interactivity improvements by Mike Galbraith
 * (C) 2007 Mike Galbraith <efault@gmx.de>
 *
 * Various enhancements by Dmitry Adamushko.
 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
 *
 * Group scheduling enhancements by Srivatsa Vaddagiri
 * Copyright IBM Corporation, 2007
 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
 *
 * Scaled math optimizations by Thomas Gleixner
 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
 *
 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
 *
 * Move PELT related code from fair.c into this pelt.c file
 * Author: Vincent Guittot <vincent.guittot@linaro.org>
 */

#include <linux/sched.h>
#include "sched.h"
#include "pelt.h"

/*
 * Approximate:
 *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
 */
static u64 decay_load(u64 val, u64 n)
{
        unsigned int local_n;

        if (unlikely(n > LOAD_AVG_PERIOD * 63))
                return 0;

        /* after bounds checking we can collapse to 32-bit */
        local_n = n;

        /*
         * As y^PERIOD = 1/2, we can combine
         *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
         * With a look-up table which covers y^n (n < PERIOD)
         *
         * To achieve constant time decay_load.
         */
        if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
                val >>= local_n / LOAD_AVG_PERIOD;
                local_n %= LOAD_AVG_PERIOD;
        }

        val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
        return val;
}
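
/*
 * Worked example of the split above: decay_load(1024, 68) first shifts by
 * 68/32 = 2 whole half-lives (1024 >> 2 == 256) and then scales by
 * y^(68%32) = y^4 ~= 0.917 via runnable_avg_yN_inv[4], giving ~234.
 * The block below is an illustrative userspace cross-check of that split,
 * not part of the kernel build; it swaps the fixed-point table for plain
 * floating point, so the result may differ by a unit or two.
 */
#if 0
#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Same split as decay_load(): y^n = (1/2)^(n/32) * y^(n%32), with y^32 = 0.5 */
static uint64_t decay_load_float(uint64_t val, unsigned int n)
{
        val >>= n / 32;                                 /* whole half-lives */
        return (uint64_t)(val * pow(0.5, (n % 32) / 32.0)); /* partial period */
}

int main(void)
{
        /* expect ~234 for decay_load(1024, 68) */
        printf("%llu\n", (unsigned long long)decay_load_float(1024, 68));
        return 0;
}
#endif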

static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
{
        u32 c1, c2, c3 = d3; /* y^0 == 1 */

        /*
         * c1 = d1 y^p
         */
        c1 = decay_load((u64)d1, periods);

        /*
         *            p-1
         * c2 = 1024 \Sum y^n
         *            n=1
         *
         *              inf        inf
         *    = 1024 ( \Sum y^n - \Sum y^n - y^0 )
         *              n=0        n=p
         */
        c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;

        return c1 + c2 + c3;
}
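
/*
 * Sanity check of the c2 identity above, assuming the usual
 * LOAD_AVG_MAX == 47742 and LOAD_AVG_MAX*y == LOAD_AVG_MAX - 1024:
 *
 *   p == 1: the inner sum is empty, and indeed
 *           47742 - decay_load(47742, 1) - 1024 ~= 47742 - 46718 - 1024 = 0
 *
 *   p == 2: the sum is a single term 1024*y ~= 1002, and
 *           47742 - decay_load(47742, 2) - 1024 ~= 47742 - 45717 - 1024 ~= 1001
 */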

/*
 * Accumulate the three separate parts of the sum; d1 the remainder
 * of the last (incomplete) period, d2 the span of full periods and d3
 * the remainder of the (incomplete) current period.
 *
 *           d1          d2           d3
 *           ^           ^            ^
 *           |           |            |
 *         |<->|<----------------->|<--->|
 * ... |---x---|------| ... |------|-----x (now)
 *
 *                           p-1
 * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
 *                           n=1
 *
 *    = u y^p +                                 (Step 1)
 *
 *                     p-1
 *      d1 y^p + 1024 \Sum y^n + d3 y^0         (Step 2)
 *                     n=1
 */
static __always_inline u32
accumulate_sum(u64 delta, struct sched_avg *sa,
               unsigned long load, unsigned long runnable, int running)
{
        u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
        u64 periods;

        delta += sa->period_contrib;
        periods = delta / 1024; /* A period is 1024us (~1ms) */

        /*
         * Step 1: decay old *_sum if we crossed period boundaries.
         */
        if (periods) {
                sa->load_sum = decay_load(sa->load_sum, periods);
                sa->runnable_sum =
                        decay_load(sa->runnable_sum, periods);
                sa->util_sum = decay_load((u64)(sa->util_sum), periods);

                /*
                 * Step 2
                 */
                delta %= 1024;
                if (load) {
                        /*
                         * This relies on the:
                         *
                         *   if (!load)
                         *           runnable = running = 0;
                         *
                         * clause from ___update_load_sum(); this results in
                         * the below usage of @contrib disappearing entirely,
                         * so there is no point in calculating it.
                         */
                        contrib = __accumulate_pelt_segments(periods,
                                        1024 - sa->period_contrib, delta);
                }
        }
        sa->period_contrib = delta;

        if (load)
                sa->load_sum += load * contrib;
        if (runnable)
                sa->runnable_sum += runnable * contrib << SCHED_CAPACITY_SHIFT;
        if (running)
                sa->util_sum += contrib << SCHED_CAPACITY_SHIFT;

        return periods;
}
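
/*
 * Worked example of the d1/d2/d3 decomposition, with made-up numbers:
 * suppose sa->period_contrib == 300 and delta == 2000us. Then
 * delta + period_contrib == 2300, so periods == 2, and:
 *
 *   d1 = 1024 - 300 = 724       (rest of the old, partially filled period)
 *   d2 = 1024                   (one full period in between)
 *   d3 = 2300 % 1024 = 252      (start of the new current period)
 *
 * contrib = __accumulate_pelt_segments(2, 724, 252)
 *         ~= 724*y^2 + 1024*y + 252 ~= 693 + 1002 + 252 ~= 1947
 *
 * and the new period_contrib becomes 252. Note that 724 + 1024 + 252 == 2000,
 * i.e. all of delta is accounted for, just with the older parts decayed.
 */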

/*
 * We can represent the historical contribution to runnable average as the
 * coefficients of a geometric series.  To do this we sub-divide our runnable
 * history into segments of approximately 1ms (1024us); label the segment that
 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
 *
 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
 *       p0            p1           p2
 *      (now)       (~1ms ago)  (~2ms ago)
 *
 * Let u_i denote the fraction of p_i that the entity was runnable.
 *
 * We then designate the fractions u_i as our coefficients, yielding the
 * following representation of historical load:
 *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
 *
 * We choose y based on the width of a reasonable scheduling period, fixing:
 *   y^32 = 0.5
 *
 * This means that the contribution to load ~32ms ago (u_32) will be weighted
 * approximately half as much as the contribution to load within the last ms
 * (u_0).
 *
 * When a period "rolls over" and we have new u_0`, multiplying the previous
 * sum again by y is sufficient to update:
 *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
 *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
 */
static __always_inline int
___update_load_sum(u64 now, struct sched_avg *sa,
                   unsigned long load, unsigned long runnable, int running)
{
        u64 delta;

        delta = now - sa->last_update_time;
        /*
         * This should only happen when time goes backwards, which it
         * unfortunately does during sched clock init when we swap over to TSC.
         */
        if ((s64)delta < 0) {
                sa->last_update_time = now;
                return 0;
        }

        /*
         * Use 1024ns as the unit of measurement since it's a reasonable
         * approximation of 1us and fast to compute.
         */
        delta >>= 10;
        if (!delta)
                return 0;

        sa->last_update_time += delta << 10;

        /*
         * running is a subset of runnable (weight) so running can't be set if
         * runnable is clear. But there are some corner cases where the current
         * se has already been dequeued while cfs_rq->curr still points to it.
         * This can leave weight at 0 while running is still set, both for a
         * sched_entity and for a cfs_rq if the latter becomes idle. As an
         * example, this happens during idle_balance() which calls
         * update_blocked_averages().
         *
         * Also see the comment in accumulate_sum().
         */
        if (!load)
                runnable = running = 0;

        /*
         * Now we know we crossed measurement unit boundaries. The *_avg
         * accrues by two steps:
         *
         * Step 1: accumulate *_sum since last_update_time. If we haven't
         * crossed period boundaries, finish.
         */
        if (!accumulate_sum(delta, sa, load, runnable, running))
                return 0;

        return 1;
}
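
/*
 * A note on the fixed point of the series above: if an entity is running
 * for a fraction r of every 1024us period, the decayed sum S satisfies
 * S = r*1024 + y*S, so S converges to r*1024/(1-y), i.e. roughly
 * r * LOAD_AVG_MAX. After dividing by the PELT divider in
 * ___update_load_avg(), util_avg therefore settles at about r * 1024: a task
 * running flat out saturates near 1024, one running half the time settles
 * near 512, and the half-life of any change is ~32ms (y^32 = 0.5).
 */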

/*
 * When syncing *_avg with *_sum, we must take into account the current
 * position in the PELT segment otherwise the remaining part of the segment
 * will be considered as idle time whereas it's not yet elapsed and this will
 * generate unwanted oscillation in the range [1002..1024[.
 *
 * The max value of *_sum varies with the position in the time segment and is
 * equal to:
 *
 *   LOAD_AVG_MAX*y + sa->period_contrib
 *
 * which can be simplified into:
 *
 *   LOAD_AVG_MAX - 1024 + sa->period_contrib
 *
 * because LOAD_AVG_MAX*y == LOAD_AVG_MAX - 1024
 *
 * The same care must be taken when a sched entity is added, updated or
 * removed from a cfs_rq and we need to update sched_avg. Scheduler entities
 * and the cfs rq, to which they are attached, have the same position in the
 * time segment because they use the same clock. This means that we can use
 * the period_contrib of cfs_rq when updating the sched_avg of a sched_entity
 * if it's more convenient.
 */
static __always_inline void
___update_load_avg(struct sched_avg *sa, unsigned long load)
{
        u32 divider = get_pelt_divider(sa);

        /*
         * Step 2: update *_avg.
         */
        sa->load_avg = div_u64(load * sa->load_sum, divider);
        sa->runnable_avg = div_u64(sa->runnable_sum, divider);
        WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
}
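
/*
 * Illustration of why the position-aware divider matters, assuming (as the
 * comment above implies) that get_pelt_divider() returns
 * LOAD_AVG_MAX - 1024 + sa->period_contrib: for an always-running entity,
 * util_sum saturates at roughly that same value shifted by
 * SCHED_CAPACITY_SHIFT, so dividing by the divider keeps util_avg pinned at
 * ~1024 anywhere in the segment. Dividing by a fixed LOAD_AVG_MAX instead
 * would make util_avg dip to ~1002 right after each period rollover
 * (period_contrib ~= 0) and creep back to 1024 just before the next one,
 * which is exactly the [1002..1024[ oscillation described above.
 */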

/*
 * sched_entity:
 *
 *   task:
 *     se_weight()   = se->load.weight
 *     se_runnable() = !!on_rq
 *
 *   group: [ see update_cfs_group() ]
 *     se_weight()   = tg->weight * grq->load_avg / tg->load_avg
 *     se_runnable() = grq->h_nr_running
 *
 *   runnable_sum = se_runnable() * runnable = grq->runnable_sum
 *   runnable_avg = runnable_sum
 *
 *   load_sum := runnable
 *   load_avg = se_weight(se) * load_sum
 *
 * cfs_rq:
 *
 *   runnable_sum = \Sum se->avg.runnable_sum
 *   runnable_avg = \Sum se->avg.runnable_avg
 *
 *   load_sum = \Sum se_weight(se) * se->avg.load_sum
 *   load_avg = \Sum se->avg.load_avg
 */

int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
{
        if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
                ___update_load_avg(&se->avg, se_weight(se));
                trace_pelt_se_tp(se);
                return 1;
        }

        return 0;
}

int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se),
                               cfs_rq->curr == se)) {

                ___update_load_avg(&se->avg, se_weight(se));
                cfs_se_util_change(&se->avg);
                trace_pelt_se_tp(se);
                return 1;
        }

        return 0;
}

int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
{
        if (___update_load_sum(now, &cfs_rq->avg,
                               scale_load_down(cfs_rq->load.weight),
                               cfs_rq->h_nr_running,
                               cfs_rq->curr != NULL)) {

                ___update_load_avg(&cfs_rq->avg, 1);
                trace_pelt_cfs_tp(cfs_rq);
                return 1;
        }

        return 0;
}
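
/*
 * Illustration of the weighting above, with made-up numbers: a nice-0 task
 * (se_weight() == 1024) that is queued on the cfs_rq half of the time ends
 * up with load_sum ~= 0.5 * LOAD_AVG_MAX, which ___update_load_avg() scales
 * into load_avg ~= 1024 * 0.5 ~= 512, while the same task reniced to a
 * weight of 512 would settle around 256. The cfs_rq level signals are then
 * simply the sums of the attached entities' averages, per the table above.
 */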

/*
 * rt_rq:
 *
 *   util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
 *   util_sum = cpu_scale * load_sum
 *   runnable_sum = util_sum
 *
 *   load_avg and runnable_avg are not supported and meaningless.
 *
 */

int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
{
        if (___update_load_sum(now, &rq->avg_rt,
                               running,
                               running,
                               running)) {

                ___update_load_avg(&rq->avg_rt, 1);
                trace_pelt_rt_tp(rq);
                return 1;
        }

        return 0;
}

/*
 * dl_rq:
 *
 *   util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
 *   util_sum = cpu_scale * load_sum
 *   runnable_sum = util_sum
 *
 *   load_avg and runnable_avg are not supported and meaningless.
 *
 */

int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
{
        if (___update_load_sum(now, &rq->avg_dl,
                               running,
                               running,
                               running)) {

                ___update_load_avg(&rq->avg_dl, 1);
                trace_pelt_dl_tp(rq);
                return 1;
        }

        return 0;
}

#ifdef CONFIG_SCHED_THERMAL_PRESSURE
/*
 * thermal:
 *
 *   load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked
 *
 *   util_avg and runnable_load_avg are not supported and meaningless.
 *
 * Unlike rt/dl utilization tracking, which tracks the time spent by a cpu
 * running a rt/dl task through util_avg, the average thermal pressure is
 * tracked through load_avg. This is because the thermal pressure signal is
 * a time-weighted "delta" capacity, unlike util_avg which is binary.
 *
 *   "delta capacity" = actual capacity -
 *                      capped capacity of a cpu due to a thermal event.
 */

int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
{
        if (___update_load_sum(now, &rq->avg_thermal,
                               capacity,
                               capacity,
                               capacity)) {
                ___update_load_avg(&rq->avg_thermal, 1);
                trace_pelt_thermal_tp(rq);
                return 1;
        }

        return 0;
}
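
/*
 * Illustration with hypothetical numbers: if a thermal event caps an
 * otherwise 1024-capacity CPU at 768, the "delta capacity" fed in as
 * @capacity is 256. As long as the cap lasts, rq->avg_thermal.load_avg
 * converges towards 256 with the usual ~32ms half-life, and decays back
 * towards 0 once the cap is lifted, giving a time-weighted view of how
 * much capacity was lost to thermal throttling.
 */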
#endif

#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
/*
 * irq:
 *
 *   util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
 *   util_sum = cpu_scale * load_sum
 *   runnable_sum = util_sum
 *
 *   load_avg and runnable_avg are not supported and meaningless.
 *
 */

int update_irq_load_avg(struct rq *rq, u64 running)
{
        int ret = 0;

        /*
         * We can't use clock_pelt because irq time is not accounted in
         * clock_task. Instead we directly scale the running time to
         * reflect the real amount of computation.
         */
        running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq)));
        running = cap_scale(running, arch_scale_cpu_capacity(cpu_of(rq)));

        /*
         * We know the time that has been used by interrupts since the last
         * update, but we don't know when. Let's be pessimistic and assume
         * the interrupt happened just before this update. This is not far
         * from reality because the interrupt will most probably wake up a
         * task and trigger an update of the rq clock, during which the
         * metric is updated.
         * We start decaying with normal context time and then we add the
         * interrupt context time.
         * We can safely remove running from rq->clock because
         * rq->clock += delta with delta >= running
         */
        ret = ___update_load_sum(rq->clock - running, &rq->avg_irq,
                                 0,
                                 0,
                                 0);
        ret += ___update_load_sum(rq->clock, &rq->avg_irq,
                                  1,
                                  1,
                                  1);

        if (ret) {
                ___update_load_avg(&rq->avg_irq, 1);
                trace_pelt_irq_tp(rq);
        }

        return ret;
}
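
/*
 * Illustration of the two-step update above, with made-up numbers: suppose
 * rq->clock advanced by 3ms since the last update and running is 1ms of irq
 * time within that window. The first ___update_load_sum() call advances the
 * signal to (rq->clock - running) with load/runnable/running all 0, so the
 * 2ms of normal context only decays avg_irq. The second call then accounts
 * the remaining 1ms as fully loaded/running irq time up to rq->clock. Either
 * call may report having crossed a period boundary, hence ret is their sum
 * and *_avg is refreshed if either did.
 */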
#endif