Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : #include "cgroup-internal.h"
3 :
4 : #include <linux/sched/cputime.h>
5 :
6 : static DEFINE_SPINLOCK(cgroup_rstat_lock);
7 : static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
8 :
9 : static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
10 :
11 44741 : static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
12 : {
13 44741 : return per_cpu_ptr(cgrp->rstat_cpu, cpu);
14 : }
15 :
16 : /**
17 : * cgroup_rstat_updated - keep track of updated rstat_cpu
18 : * @cgrp: target cgroup
19 : * @cpu: cpu on which rstat_cpu was updated
20 : *
21 : * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching
22 : * rstat_cpu->updated_children list. See the comment on top of the
23 : * cgroup_rstat_cpu definition for details.
24 : */
25 43613 : void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
26 : {
27 43613 : raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
28 43613 : struct cgroup *parent;
29 43613 : unsigned long flags;
30 :
31 : /* nothing to do for root */
32 43613 : if (!cgroup_parent(cgrp))
33 : return;
34 :
35 : /*
36 : * Speculative already-on-list test. This may race leading to
37 : * temporary inaccuracies, which is fine.
38 : *
39 : * Because @parent's updated_children is terminated with @parent
40 : * instead of NULL, we can tell whether @cgrp is on the list by
41 : * testing the next pointer for NULL.
42 : */
43 43613 : if (cgroup_rstat_cpu(cgrp, cpu)->updated_next)
44 : return;
45 :
46 147 : raw_spin_lock_irqsave(cpu_lock, flags);
47 :
48 : /* put @cgrp and all ancestors on the corresponding updated lists */
49 147 : for (parent = cgroup_parent(cgrp); parent;
50 309 : cgrp = parent, parent = cgroup_parent(cgrp)) {
51 297 : struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
52 297 : struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
53 :
54 : /*
55 : * Both additions and removals are bottom-up. If a cgroup
56 : * is already in the tree, all ancestors are.
57 : */
58 297 : if (rstatc->updated_next)
59 : break;
60 :
61 162 : rstatc->updated_next = prstatc->updated_children;
62 162 : prstatc->updated_children = cgrp;
63 : }
64 :
65 147 : raw_spin_unlock_irqrestore(cpu_lock, flags);
66 : }
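/*
 * Worked example (hypothetical hierarchy): with root A, child B and
 * grandchild C all starting idle (updated_children pointing at themselves,
 * updated_next NULL), reporting an update for C on some cpu links the whole
 * ancestor chain on that cpu:
 *
 *   C: updated_children = C (empty)  updated_next = B   (== parent, list end)
 *   B: updated_children = C          updated_next = A   (== parent, list end)
 *   A: updated_children = B          updated_next = NULL (root is never linked)
 *
 * A later update for C on the same cpu then hits the speculative
 * updated_next test above and returns without taking the per-cpu lock.
 */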
67 :
68 : /**
69 : * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
70 : * @pos: current position
71 : * @root: root of the tree to traverse
72 : * @cpu: target cpu
73 : *
74 : * Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts
75 : * the traversal and %NULL return indicates the end. During traversal,
76 : * each returned cgroup is unlinked from the tree. Must be called with the
77 : * matching cgroup_rstat_cpu_lock held.
78 : *
79 : * The only ordering guarantee is that, for a parent and a child pair
80 : * covered by a given traversal, if a child is visited, its parent is
81 : * guaranteed to be visited afterwards.
82 : */
83 278 : static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
84 : struct cgroup *root, int cpu)
85 : {
86 278 : struct cgroup_rstat_cpu *rstatc;
87 :
88 278 : if (pos == root)
89 : return NULL;
90 :
91 : /*
92 : * We're going to walk down to the first leaf and visit/remove it. We
93 : * can pick any unvisited node as the starting point.
94 : */
95 200 : if (!pos)
96 : pos = root;
97 : else
98 0 : pos = cgroup_parent(pos);
99 :
100 : /* walk down to the first leaf */
101 200 : while (true) {
102 200 : rstatc = cgroup_rstat_cpu(pos, cpu);
103 200 : if (rstatc->updated_children == pos)
104 : break;
105 : pos = rstatc->updated_children;
106 : }
107 :
108 : /*
109 : * Unlink @pos from the tree. As the updated_children list is
110 : * singly linked, we have to walk it to find the removal point.
111 : * However, due to the way we traverse, @pos will be the first
112 : * child in most cases. The only exception is @root.
113 : */
114 200 : if (rstatc->updated_next) {
115 78 : struct cgroup *parent = cgroup_parent(pos);
116 78 : struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
117 78 : struct cgroup_rstat_cpu *nrstatc;
118 78 : struct cgroup **nextp;
119 :
120 78 : nextp = &prstatc->updated_children;
121 272 : while (true) {
122 272 : nrstatc = cgroup_rstat_cpu(*nextp, cpu);
123 175 : if (*nextp == pos)
124 : break;
125 :
126 97 : WARN_ON_ONCE(*nextp == parent);
127 97 : nextp = &nrstatc->updated_next;
128 : }
129 :
130 78 : *nextp = rstatc->updated_next;
131 78 : rstatc->updated_next = NULL;
132 :
133 78 : return pos;
134 : }
135 :
136 : /* only happens for @root */
137 : return NULL;
138 : }
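/*
 * Continuing the hypothetical A -> B -> C example from above, flushing B's
 * subtree on that cpu pops the tree bottom-up:
 *
 *   pop(NULL, B) walks down B -> C, unlinks C and returns it;
 *   pop(C, B)    steps up to B, unlinks it from A's list and returns it;
 *   pop(B, B)    sees pos == root and returns NULL, ending the walk.
 *
 * Children are therefore always returned before their parent, which is the
 * ordering cgroup_base_stat_flush() relies on when propagating deltas
 * upwards in a single pass.
 */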
139 :
140 : /* see cgroup_rstat_flush() */
141 50 : static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
142 : __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
143 : {
144 50 : int cpu;
145 :
146 150 : lockdep_assert_held(&cgroup_rstat_lock);
147 :
148 250 : for_each_possible_cpu(cpu) {
149 200 : raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
150 : cpu);
151 200 : struct cgroup *pos = NULL;
152 :
153 200 : raw_spin_lock(cpu_lock);
154 278 : while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
155 78 : struct cgroup_subsys_state *css;
156 :
157 78 : cgroup_base_stat_flush(pos, cpu);
158 :
159 78 : rcu_read_lock();
160 78 : list_for_each_entry_rcu(css, &pos->rstat_css_list,
161 : rstat_css_node)
162 0 : css->ss->css_rstat_flush(css, cpu);
163 78 : rcu_read_unlock();
164 : }
165 200 : raw_spin_unlock(cpu_lock);
166 :
167 : /* if @may_sleep, play nice and yield if necessary */
168 400 : if (may_sleep && (need_resched() ||
169 250 : spin_needbreak(&cgroup_rstat_lock))) {
170 0 : spin_unlock_irq(&cgroup_rstat_lock);
171 0 : if (!cond_resched())
172 0 : cpu_relax();
173 250 : spin_lock_irq(&cgroup_rstat_lock);
174 : }
175 : }
176 50 : }
177 :
178 : /**
179 : * cgroup_rstat_flush - flush stats in @cgrp's subtree
180 : * @cgrp: target cgroup
181 : *
182 : * Collect all per-cpu stats in @cgrp's subtree into the global counters
183 : * and propagate them upwards. After this function returns, all cgroups in
184 : * the subtree have up-to-date stats.
185 : *
186 : * This also gets all cgroups in the subtree including @cgrp off the
187 : * ->updated_children lists.
188 : *
189 : * This function may block.
190 : */
191 50 : void cgroup_rstat_flush(struct cgroup *cgrp)
192 : {
193 50 : might_sleep();
194 :
195 50 : spin_lock_irq(&cgroup_rstat_lock);
196 50 : cgroup_rstat_flush_locked(cgrp, true);
197 50 : spin_unlock_irq(&cgroup_rstat_lock);
198 50 : }
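/*
 * Hedged usage sketch (not part of this file): a controller that keeps a
 * per-cpu event counter would report changes through cgroup_rstat_updated()
 * on its hot path, fold per-cpu deltas into a global counter from its
 * css_rstat_flush callback, and call cgroup_rstat_flush() before reading
 * the global value. All "foo" names below are made up for illustration;
 * only the rstat calls and the css_rstat_flush hook come from this API.
 */
struct foo_css {
	struct cgroup_subsys_state css;
	u64 __percpu *events;		/* hot-path per-cpu counter */
	u64 __percpu *events_flushed;	/* per-cpu value at the last flush */
	u64 events_total;		/* global counter, only touched during flush */
};

/* assumes process-context updates only, so disabling preemption is enough */
static void foo_account_event(struct foo_css *fcss)
{
	int cpu = get_cpu();		/* pin the cpu, as get_cpu_ptr() does below */

	(*per_cpu_ptr(fcss->events, cpu))++;
	/* usually just the cheap speculative updated_next check */
	cgroup_rstat_updated(fcss->css.cgroup, cpu);
	put_cpu();
}

/* wired up as cgroup_subsys->css_rstat_flush; called from the flush loop above */
static void foo_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
	struct foo_css *fcss = container_of(css, struct foo_css, css);
	u64 cur = *per_cpu_ptr(fcss->events, cpu);

	fcss->events_total += cur - *per_cpu_ptr(fcss->events_flushed, cpu);
	*per_cpu_ptr(fcss->events_flushed, cpu) = cur;
}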
199 :
200 : /**
201 : * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
202 : * @cgrp: target cgroup
203 : *
204 : * This function can be called from any context.
205 : */
206 0 : void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
207 : {
208 0 : unsigned long flags;
209 :
210 0 : spin_lock_irqsave(&cgroup_rstat_lock, flags);
211 0 : cgroup_rstat_flush_locked(cgrp, false);
212 0 : spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
213 0 : }
214 :
215 : /**
216 : * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
217 : * @cgrp: target cgroup
218 : *
219 : * Flush stats in @cgrp's subtree and prevent further flushes. Must be
220 : * paired with cgroup_rstat_flush_release().
221 : *
222 : * This function may block.
223 : */
224 0 : void cgroup_rstat_flush_hold(struct cgroup *cgrp)
225 : __acquires(&cgroup_rstat_lock)
226 : {
227 0 : might_sleep();
228 0 : spin_lock_irq(&cgroup_rstat_lock);
229 0 : cgroup_rstat_flush_locked(cgrp, true);
230 0 : }
231 :
232 : /**
233 : * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
234 : */
235 0 : void cgroup_rstat_flush_release(void)
236 : __releases(&cgroup_rstat_lock)
237 : {
238 0 : spin_unlock_irq(&cgroup_rstat_lock);
239 0 : }
240 :
241 50 : int cgroup_rstat_init(struct cgroup *cgrp)
242 : {
243 50 : int cpu;
244 :
245 : /* the root cgrp has rstat_cpu preallocated */
246 50 : if (!cgrp->rstat_cpu) {
247 49 : cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
248 49 : if (!cgrp->rstat_cpu)
249 : return -ENOMEM;
250 : }
251 :
252 : /* ->updated_children list is self terminated */
253 250 : for_each_possible_cpu(cpu) {
254 200 : struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
255 :
256 200 : rstatc->updated_children = cgrp;
257 250 : u64_stats_init(&rstatc->bsync);
258 : }
259 :
260 : return 0;
261 : }
262 :
263 25 : void cgroup_rstat_exit(struct cgroup *cgrp)
264 : {
265 25 : int cpu;
266 :
267 25 : cgroup_rstat_flush(cgrp);
268 :
269 : /* sanity check */
270 150 : for_each_possible_cpu(cpu) {
271 100 : struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
272 :
273 100 : if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
274 100 : WARN_ON_ONCE(rstatc->updated_next))
275 : return;
276 : }
277 :
278 25 : free_percpu(cgrp->rstat_cpu);
279 25 : cgrp->rstat_cpu = NULL;
280 : }
281 :
282 1 : void __init cgroup_rstat_boot(void)
283 : {
284 1 : int cpu;
285 :
286 6 : for_each_possible_cpu(cpu)
287 5 : raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
288 :
289 1 : BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp));
290 1 : }
291 :
292 : /*
293 : * Functions for cgroup basic resource statistics implemented on top of
294 : * rstat.
295 : */
296 156 : static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
297 : struct cgroup_base_stat *src_bstat)
298 : {
299 156 : dst_bstat->cputime.utime += src_bstat->cputime.utime;
300 156 : dst_bstat->cputime.stime += src_bstat->cputime.stime;
301 156 : dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
302 78 : }
303 :
304 156 : static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
305 : struct cgroup_base_stat *src_bstat)
306 : {
307 156 : dst_bstat->cputime.utime -= src_bstat->cputime.utime;
308 156 : dst_bstat->cputime.stime -= src_bstat->cputime.stime;
309 156 : dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
310 : }
311 :
312 78 : static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
313 : {
314 78 : struct cgroup *parent = cgroup_parent(cgrp);
315 78 : struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
316 78 : struct cgroup_base_stat cur, delta;
317 78 : unsigned seq;
318 :
319 : /* fetch the current per-cpu values */
320 78 : do {
321 78 : seq = __u64_stats_fetch_begin(&rstatc->bsync);
322 78 : cur.cputime = rstatc->bstat.cputime;
323 78 : } while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
324 :
325 : /* propagate percpu delta to global */
326 78 : delta = cur;
327 78 : cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
328 78 : cgroup_base_stat_add(&cgrp->bstat, &delta);
329 78 : cgroup_base_stat_add(&rstatc->last_bstat, &delta);
330 :
331 : /* propagate global delta to parent */
332 78 : if (parent) {
333 78 : delta = cgrp->bstat;
334 78 : cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
335 78 : cgroup_base_stat_add(&parent->bstat, &delta);
336 78 : cgroup_base_stat_add(&cgrp->last_bstat, &delta);
337 : }
338 78 : }
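/*
 * Worked example with made-up numbers: if this cpu's rstatc->bstat stime is
 * 700ns and rstatc->last_bstat recorded 500ns at the previous flush, the
 * 200ns delta is added to both cgrp->bstat and rstatc->last_bstat, so the
 * next flush of this cpu starts from 700ns. The second step then moves
 * however much cgrp->bstat has grown since cgrp->last_bstat into the
 * parent's bstat. Each level only ever forwards deltas, so nothing is
 * double counted even when different cpus and levels are flushed at
 * different times.
 */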
339 :
340 : static struct cgroup_rstat_cpu *
341 43498 : cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp)
342 : {
343 43498 : struct cgroup_rstat_cpu *rstatc;
344 :
345 43532 : rstatc = get_cpu_ptr(cgrp->rstat_cpu);
346 43614 : u64_stats_update_begin(&rstatc->bsync);
347 43614 : return rstatc;
348 : }
349 :
350 43614 : static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
351 : struct cgroup_rstat_cpu *rstatc)
352 : {
353 43614 : u64_stats_update_end(&rstatc->bsync);
354 43614 : cgroup_rstat_updated(cgrp, smp_processor_id());
355 43635 : put_cpu_ptr(rstatc);
356 : }
357 :
358 29590 : void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
359 : {
360 29590 : struct cgroup_rstat_cpu *rstatc;
361 :
362 29590 : rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
363 29636 : rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
364 29636 : cgroup_base_stat_cputime_account_end(cgrp, rstatc);
365 29662 : }
366 :
367 13908 : void __cgroup_account_cputime_field(struct cgroup *cgrp,
368 : enum cpu_usage_stat index, u64 delta_exec)
369 : {
370 13908 : struct cgroup_rstat_cpu *rstatc;
371 :
372 13908 : rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
373 :
374 13978 : switch (index) {
375 630 : case CPUTIME_USER:
376 : case CPUTIME_NICE:
377 630 : rstatc->bstat.cputime.utime += delta_exec;
378 630 : break;
379 13348 : case CPUTIME_SYSTEM:
380 : case CPUTIME_IRQ:
381 : case CPUTIME_SOFTIRQ:
382 13348 : rstatc->bstat.cputime.stime += delta_exec;
383 13348 : break;
384 : default:
385 : break;
386 : }
387 :
388 13978 : cgroup_base_stat_cputime_account_end(cgrp, rstatc);
389 13992 : }
390 :
391 : /*
392 : * Compute the cputime for the root cgroup by fetching the per-cpu data
393 : * at the global level, then categorizing the fields in a manner consistent
394 : * with how __cgroup_account_cputime_field() does it for each bit of
395 : * cpu time attributed to a cgroup.
396 : */
397 0 : static void root_cgroup_cputime(struct task_cputime *cputime)
398 : {
399 0 : int i;
400 :
401 0 : cputime->stime = 0;
402 0 : cputime->utime = 0;
403 0 : cputime->sum_exec_runtime = 0;
404 0 : for_each_possible_cpu(i) {
405 0 : struct kernel_cpustat kcpustat;
406 0 : u64 *cpustat = kcpustat.cpustat;
407 0 : u64 user = 0;
408 0 : u64 sys = 0;
409 :
410 0 : kcpustat_cpu_fetch(&kcpustat, i);
411 :
412 0 : user += cpustat[CPUTIME_USER];
413 0 : user += cpustat[CPUTIME_NICE];
414 0 : cputime->utime += user;
415 :
416 0 : sys += cpustat[CPUTIME_SYSTEM];
417 0 : sys += cpustat[CPUTIME_IRQ];
418 0 : sys += cpustat[CPUTIME_SOFTIRQ];
419 0 : cputime->stime += sys;
420 :
421 0 : cputime->sum_exec_runtime += user;
422 0 : cputime->sum_exec_runtime += sys;
423 0 : cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
424 0 : cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST];
425 0 : cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST_NICE];
426 : }
427 0 : }
428 :
429 0 : void cgroup_base_stat_cputime_show(struct seq_file *seq)
430 : {
431 0 : struct cgroup *cgrp = seq_css(seq)->cgroup;
432 0 : u64 usage, utime, stime;
433 0 : struct task_cputime cputime;
434 :
435 0 : if (cgroup_parent(cgrp)) {
436 0 : cgroup_rstat_flush_hold(cgrp);
437 0 : usage = cgrp->bstat.cputime.sum_exec_runtime;
438 0 : cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
439 : &utime, &stime);
440 0 : cgroup_rstat_flush_release();
441 : } else {
442 0 : root_cgroup_cputime(&cputime);
443 0 : usage = cputime.sum_exec_runtime;
444 0 : utime = cputime.utime;
445 0 : stime = cputime.stime;
446 : }
447 :
448 0 : do_div(usage, NSEC_PER_USEC);
449 0 : do_div(utime, NSEC_PER_USEC);
450 0 : do_div(stime, NSEC_PER_USEC);
451 :
452 0 : seq_printf(seq, "usage_usec %llu\n"
453 : "user_usec %llu\n"
454 : "system_usec %llu\n",
455 : usage, utime, stime);
456 0 : }
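/*
 * The three fields printed above are what appears at the top of a cgroup's
 * cpu.stat file, e.g. (values illustrative only):
 *
 *   usage_usec 153000
 *   user_usec 51000
 *   system_usec 102000
 *
 * do_div() converts the nanosecond counters to microseconds before printing.
 */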