LCOV - code coverage report
Current view: top level - kernel/sched - cputime.c (source / functions)
Test: landlock.info
Date: 2021-04-22 12:43:58

                Hit    Total    Coverage
Lines:          125    146      85.6 %
Functions:      10     13       76.9 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : /*
       3             :  * Simple CPU accounting cgroup controller
       4             :  */
       5             : #include "sched.h"
       6             : 
       7             : #ifdef CONFIG_IRQ_TIME_ACCOUNTING
       8             : 
       9             : /*
      10             :  * There are no locks covering percpu hardirq/softirq time.
      11             :  * They are only modified in vtime_account, on the corresponding CPU
      12             :  * with interrupts disabled, so writes are safe.
      13             :  * They are read and saved off onto struct rq in update_rq_clock().
      14             :  * This may result in another CPU reading this CPU's irq time and racing
      15             :  * with irq/vtime_account on this CPU. We would then get either the old or
      16             :  * the new value, with the side effect of accounting a slice of irq time to
      17             :  * the wrong task when an irq is in progress while we read rq->clock. That is
      18             :  * a worthy compromise compared to taking locks on each irq in account_system_time.
      19             :  */
      20             : DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
      21             : 
      22             : static int sched_clock_irqtime;
      23             : 
      24             : void enable_sched_clock_irqtime(void)
      25             : {
      26             :         sched_clock_irqtime = 1;
      27             : }
      28             : 
      29             : void disable_sched_clock_irqtime(void)
      30             : {
      31             :         sched_clock_irqtime = 0;
      32             : }
      33             : 
      34             : static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
      35             :                                   enum cpu_usage_stat idx)
      36             : {
      37             :         u64 *cpustat = kcpustat_this_cpu->cpustat;
      38             : 
      39             :         u64_stats_update_begin(&irqtime->sync);
      40             :         cpustat[idx] += delta;
      41             :         irqtime->total += delta;
      42             :         irqtime->tick_delta += delta;
      43             :         u64_stats_update_end(&irqtime->sync);
      44             : }
      45             : 
      46             : /*
      47             :  * Called after incrementing preempt_count on {soft,}irq_enter
      48             :  * and before decrementing preempt_count on {soft,}irq_exit.
      49             :  */
      50             : void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
      51             : {
      52             :         struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
      53             :         unsigned int pc;
      54             :         s64 delta;
      55             :         int cpu;
      56             : 
      57             :         if (!sched_clock_irqtime)
      58             :                 return;
      59             : 
      60             :         cpu = smp_processor_id();
      61             :         delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
      62             :         irqtime->irq_start_time += delta;
      63             :         pc = preempt_count() - offset;
      64             : 
      65             :         /*
      66             :          * We do not account for softirq time from ksoftirqd here.
      67             :          * We want to continue accounting softirq time to ksoftirqd thread
      68             :  * in that case, so as not to confuse the scheduler with a special task
      69             :  * that does not consume any time but still wants to run.
      70             :          */
      71             :         if (pc & HARDIRQ_MASK)
      72             :                 irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
      73             :         else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd())
      74             :                 irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
      75             : }
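
A rough userspace sketch of the classification above, illustrative only: the DEMO_* masks are assumptions that mirror the usual preempt_count layout (see include/linux/preempt.h) and are not taken from this file. A hardirq bit in the adjusted preempt count routes the delta to CPUTIME_IRQ; a softirq-serving count routes it to CPUTIME_SOFTIRQ unless the current task is ksoftirqd.

/* Hypothetical userspace demo of the context classification done in
 * irqtime_account_irq(). The mask values are assumptions for the demo. */
#include <stdio.h>

#define DEMO_SOFTIRQ_OFFSET  (1U << 8)     /* "serving a softirq" bit */
#define DEMO_HARDIRQ_MASK    (0xfU << 16)  /* any hardirq nesting level */

static const char *classify(unsigned int pc, int curr_is_ksoftirqd)
{
        if (pc & DEMO_HARDIRQ_MASK)
                return "CPUTIME_IRQ";
        if ((pc & DEMO_SOFTIRQ_OFFSET) && !curr_is_ksoftirqd)
                return "CPUTIME_SOFTIRQ";
        return "not accounted here";
}

int main(void)
{
        printf("%s\n", classify(1U << 16, 0));             /* hardirq context */
        printf("%s\n", classify(DEMO_SOFTIRQ_OFFSET, 0));  /* softirq, ordinary task */
        printf("%s\n", classify(DEMO_SOFTIRQ_OFFSET, 1));  /* softirq run by ksoftirqd */
        return 0;
}
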
      76             : 
      77             : static u64 irqtime_tick_accounted(u64 maxtime)
      78             : {
      79             :         struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
      80             :         u64 delta;
      81             : 
      82             :         delta = min(irqtime->tick_delta, maxtime);
      83             :         irqtime->tick_delta -= delta;
      84             : 
      85             :         return delta;
      86             : }
      87             : 
      88             : #else /* CONFIG_IRQ_TIME_ACCOUNTING */
      89             : 
      90             : #define sched_clock_irqtime     (0)
      91             : 
      92             : static u64 irqtime_tick_accounted(u64 dummy)
      93             : {
      94             :         return 0;
      95             : }
      96             : 
      97             : #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
      98             : 
      99       15235 : static inline void task_group_account_field(struct task_struct *p, int index,
     100             :                                             u64 tmp)
     101             : {
     102             :         /*
     103             :          * Since all updates are sure to touch the root cgroup, we
     104             :  * go ahead and touch it first. If the root cgroup
     105             :          * is the only cgroup, then nothing else should be necessary.
     106             :          *
     107             :          */
     108       15235 :         __this_cpu_add(kernel_cpustat.cpustat[index], tmp);
     109             : 
     110       15235 :         cgroup_account_cputime_field(p, index, tmp);
     111       15421 : }
     112             : 
     113             : /*
     114             :  * Account user CPU time to a process.
     115             :  * @p: the process that the CPU time gets accounted to
     116             :  * @cputime: the CPU time spent in user space since the last update
     117             :  */
     118         604 : void account_user_time(struct task_struct *p, u64 cputime)
     119             : {
     120         604 :         int index;
     121             : 
     122             :         /* Add user time to process. */
     123         604 :         p->utime += cputime;
     124         604 :         account_group_user_time(p, cputime);
     125             : 
     126         605 :         index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
     127             : 
     128             :         /* Add user time to cpustat. */
     129         605 :         task_group_account_field(p, index, cputime);
     130             : 
     131             :         /* Account for user time used */
     132         604 :         acct_account_cputime(p);
     133         606 : }
     134             : 
     135             : /*
     136             :  * Account guest CPU time to a process.
     137             :  * @p: the process that the CPU time gets accounted to
     138             :  * @cputime: the CPU time spent in virtual machine since the last update
     139             :  */
     140           0 : void account_guest_time(struct task_struct *p, u64 cputime)
     141             : {
     142           0 :         u64 *cpustat = kcpustat_this_cpu->cpustat;
     143             : 
     144             :         /* Add guest time to process. */
     145           0 :         p->utime += cputime;
     146           0 :         account_group_user_time(p, cputime);
     147           0 :         p->gtime += cputime;
     148             : 
     149             :         /* Add guest time to cpustat. */
     150           0 :         if (task_nice(p) > 0) {
     151           0 :                 cpustat[CPUTIME_NICE] += cputime;
     152           0 :                 cpustat[CPUTIME_GUEST_NICE] += cputime;
     153             :         } else {
     154           0 :                 cpustat[CPUTIME_USER] += cputime;
     155           0 :                 cpustat[CPUTIME_GUEST] += cputime;
     156             :         }
     157           0 : }
     158             : 
     159             : /*
     160             :  * Account system CPU time to a process and desired cpustat field
     161             :  * @p: the process that the CPU time gets accounted to
     162             :  * @cputime: the CPU time spent in kernel space since the last update
     163             :  * @index: the cpustat field that has to be updated
     164             :  */
     165       14470 : void account_system_index_time(struct task_struct *p,
     166             :                                u64 cputime, enum cpu_usage_stat index)
     167             : {
     168             :         /* Add system time to process. */
     169       14470 :         p->stime += cputime;
     170       14470 :         account_group_system_time(p, cputime);
     171             : 
     172             :         /* Add system time to cpustat. */
     173       14785 :         task_group_account_field(p, index, cputime);
     174             : 
     175             :         /* Account for system time used */
     176       14824 :         acct_account_cputime(p);
     177       14852 : }
     178             : 
     179             : /*
     180             :  * Account system CPU time to a process.
     181             :  * @p: the process that the CPU time gets accounted to
     182             :  * @hardirq_offset: the offset to subtract from hardirq_count()
     183             :  * @cputime: the CPU time spent in kernel space since the last update
     184             :  */
     185       14540 : void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
     186             : {
     187       14540 :         int index;
     188             : 
     189       14540 :         if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
     190           0 :                 account_guest_time(p, cputime);
     191           0 :                 return;
     192             :         }
     193             : 
     194       14540 :         if (hardirq_count() - hardirq_offset)
     195             :                 index = CPUTIME_IRQ;
     196       14651 :         else if (in_serving_softirq())
     197             :                 index = CPUTIME_SOFTIRQ;
     198             :         else
     199       11966 :                 index = CPUTIME_SYSTEM;
     200             : 
     201       14540 :         account_system_index_time(p, cputime, index);
     202             : }
     203             : 
     204             : /*
     205             :  * Account for involuntary wait time.
     206             :  * @cputime: the CPU time spent in involuntary wait
     207             :  */
     208       25652 : void account_steal_time(u64 cputime)
     209             : {
     210           0 :         u64 *cpustat = kcpustat_this_cpu->cpustat;
     211             : 
     212       25770 :         cpustat[CPUTIME_STEAL] += cputime;
     213           0 : }
     214             : 
     215             : /*
     216             :  * Account for idle time.
     217             :  * @cputime: the CPU time spent in idle wait
     218             :  */
     219       10370 : void account_idle_time(u64 cputime)
     220             : {
     221       10370 :         u64 *cpustat = kcpustat_this_cpu->cpustat;
     222       10372 :         struct rq *rq = this_rq();
     223             : 
     224       10418 :         if (atomic_read(&rq->nr_iowait) > 0)
     225          95 :                 cpustat[CPUTIME_IOWAIT] += cputime;
     226             :         else
     227       10333 :                 cpustat[CPUTIME_IDLE] += cputime;
     228       10428 : }
     229             : 
     230             : /*
     231             :  * When a guest is interrupted for a longer amount of time, missed clock
     232             :  * ticks are not redelivered later. Due to that, this function may on
     233             :  * occasion account more time than the calling functions think has elapsed.
     234             :  */
     235       25286 : static __always_inline u64 steal_account_process_time(u64 maxtime)
     236             : {
     237             : #ifdef CONFIG_PARAVIRT
     238      101787 :         if (static_key_false(&paravirt_steal_enabled)) {
     239       25469 :                 u64 steal;
     240             : 
     241       25469 :                 steal = paravirt_steal_clock(smp_processor_id());
     242       25626 :                 steal -= this_rq()->prev_steal_time;
     243       25652 :                 steal = min(steal, maxtime);
     244       25652 :                 account_steal_time(steal);
     245       25770 :                 this_rq()->prev_steal_time += steal;
     246             : 
     247         695 :                 return steal;
     248             :         }
     249             : #endif
     250             :         return 0;
     251             : }
     252             : 
     253             : /*
     254             :  * Account how much elapsed time was spent in steal, irq, or softirq time.
     255             :  */
     256             : static inline u64 account_other_time(u64 max)
     257             : {
     258             :         u64 accounted;
     259             : 
     260             :         lockdep_assert_irqs_disabled();
     261             : 
     262             :         accounted = steal_account_process_time(max);
     263             : 
     264             :         if (accounted < max)
     265             :                 accounted += irqtime_tick_accounted(max - accounted);
     266             : 
     267             :         return accounted;
     268             : }
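
A minimal userspace sketch of the budgeting performed by steal_account_process_time() and account_other_time() above; the pending_* values are made-up numbers, not anything measured in this report. Steal time is consumed first, and whatever budget remains is handed to pending irq/softirq time, so the total can never exceed the caller's limit.

/* Illustrative userspace model: "other" time (steal + irq/softirq) is
 * clamped so it never exceeds the maxtime budget passed by the caller. */
#include <stdio.h>
#include <stdint.h>

static uint64_t pending_steal = 3000000;  /* assumed pending steal time, ns */
static uint64_t pending_irq   = 2500000;  /* assumed pending irq/softirq time, ns */

static uint64_t demo_steal_account(uint64_t maxtime)
{
        uint64_t steal = pending_steal < maxtime ? pending_steal : maxtime;

        pending_steal -= steal;
        return steal;
}

static uint64_t demo_irqtime_tick_accounted(uint64_t maxtime)
{
        uint64_t delta = pending_irq < maxtime ? pending_irq : maxtime;

        pending_irq -= delta;
        return delta;
}

static uint64_t demo_account_other_time(uint64_t max)
{
        uint64_t accounted = demo_steal_account(max);

        if (accounted < max)
                accounted += demo_irqtime_tick_accounted(max - accounted);
        return accounted;
}

int main(void)
{
        uint64_t max = 4000000;  /* e.g. one 4 ms tick worth of nanoseconds */

        printf("other time accounted: %llu ns out of a %llu ns budget\n",
               (unsigned long long)demo_account_other_time(max),
               (unsigned long long)max);
        return 0;
}
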
     269             : 
     270             : #ifdef CONFIG_64BIT
     271        1045 : static inline u64 read_sum_exec_runtime(struct task_struct *t)
     272             : {
     273        1045 :         return t->se.sum_exec_runtime;
     274             : }
     275             : #else
     276             : static u64 read_sum_exec_runtime(struct task_struct *t)
     277             : {
     278             :         u64 ns;
     279             :         struct rq_flags rf;
     280             :         struct rq *rq;
     281             : 
     282             :         rq = task_rq_lock(t, &rf);
     283             :         ns = t->se.sum_exec_runtime;
     284             :         task_rq_unlock(rq, t, &rf);
     285             : 
     286             :         return ns;
     287             : }
     288             : #endif
     289             : 
     290             : /*
     291             :  * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
     292             :  * tasks (sum on group iteration) belonging to @tsk's group.
     293             :  */
     294        1045 : void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
     295             : {
     296        1045 :         struct signal_struct *sig = tsk->signal;
     297        1045 :         u64 utime, stime;
     298        1045 :         struct task_struct *t;
     299        1045 :         unsigned int seq, nextseq;
     300        1045 :         unsigned long flags;
     301             : 
     302             :         /*
     303             :          * Update current task runtime to account pending time since last
     304             :          * scheduler action or thread_group_cputime() call. This thread group
     305             :          * might have other running tasks on different CPUs, but updating
     306             :  * their runtime can affect syscall performance, so we skip accounting
     307             :          * those pending times and rely only on values updated on tick or
     308             :          * other scheduler action.
     309             :          */
     310        1045 :         if (same_thread_group(current, tsk))
     311          49 :                 (void) task_sched_runtime(current);
     312             : 
     313        1045 :         rcu_read_lock();
     314             :         /* Attempt a lockless read on the first round. */
     315             :         nextseq = 0;
     316        1045 :         do {
     317        1045 :                 seq = nextseq;
     318        2090 :                 flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
     319        1045 :                 times->utime = sig->utime;
     320        1045 :                 times->stime = sig->stime;
     321        1045 :                 times->sum_exec_runtime = sig->sum_sched_runtime;
     322             : 
     323        2090 :                 for_each_thread(tsk, t) {
     324        1045 :                         task_cputime(t, &utime, &stime);
     325        1045 :                         times->utime += utime;
     326        1045 :                         times->stime += stime;
     327        1045 :                         times->sum_exec_runtime += read_sum_exec_runtime(t);
     328             :                 }
     329             :                 /* If lockless access failed, take the lock. */
     330        1045 :                 nextseq = 1;
     331        2090 :         } while (need_seqretry(&sig->stats_lock, seq));
     332        1045 :         done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
     333        1045 :         rcu_read_unlock();
     334        1045 : }
     335             : 
     336             : #ifdef CONFIG_IRQ_TIME_ACCOUNTING
     337             : /*
     338             :  * Account a tick to a process and cpustat
     339             :  * @p: the process that the CPU time gets accounted to
     340             :  * @user_tick: whether the tick is from userspace
     341             :  * @ticks: number of ticks to account
     342             :  *
     343             :  * Tick demultiplexing follows the order
     344             :  * - pending hardirq update
     345             :  * - pending softirq update
     346             :  * - user_time
     347             :  * - idle_time
     348             :  * - system time
     349             :  *   - check for guest_time
     350             :  *   - else account as system_time
     351             :  *
     352             :  * The check for hardirq is done both for system and user time as there is
     353             :  * no timer going off while we are in hardirq and hence we may never get an
     354             :  * opportunity to update it solely in system time.
     355             :  * p->stime and friends are only updated on system time and not on irq or
     356             :  * softirq time, as those do not count in task exec_runtime any more.
     357             :  */
     358             : static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
     359             :                                          int ticks)
     360             : {
     361             :         u64 other, cputime = TICK_NSEC * ticks;
     362             : 
     363             :         /*
     364             :          * When returning from idle, many ticks can get accounted at
     365             :          * once, including some ticks of steal, irq, and softirq time.
     366             :          * Subtract those ticks from the amount of time accounted to
     367             :          * idle, or potentially user or system time. Due to rounding,
     368             :          * other time can exceed ticks occasionally.
     369             :          */
     370             :         other = account_other_time(ULONG_MAX);
     371             :         if (other >= cputime)
     372             :                 return;
     373             : 
     374             :         cputime -= other;
     375             : 
     376             :         if (this_cpu_ksoftirqd() == p) {
     377             :                 /*
     378             :  * ksoftirqd time does not get accounted in cpu_softirq_time.
     379             :                  * So, we have to handle it separately here.
     380             :                  * Also, p->stime needs to be updated for ksoftirqd.
     381             :                  */
     382             :                 account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
     383             :         } else if (user_tick) {
     384             :                 account_user_time(p, cputime);
     385             :         } else if (p == this_rq()->idle) {
     386             :                 account_idle_time(cputime);
     387             :         } else if (p->flags & PF_VCPU) { /* System time or guest time */
     388             :                 account_guest_time(p, cputime);
     389             :         } else {
     390             :                 account_system_index_time(p, cputime, CPUTIME_SYSTEM);
     391             :         }
     392             : }
     393             : 
     394             : static void irqtime_account_idle_ticks(int ticks)
     395             : {
     396             :         irqtime_account_process_tick(current, 0, ticks);
     397             : }
     398             : #else /* CONFIG_IRQ_TIME_ACCOUNTING */
     399             : static inline void irqtime_account_idle_ticks(int ticks) { }
     400             : static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
     401             :                                                 int nr_ticks) { }
     402             : #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
     403             : 
     404             : /*
     405             :  * Use precise platform statistics if available:
     406             :  */
     407             : #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
     408             : 
     409             : # ifndef __ARCH_HAS_VTIME_TASK_SWITCH
     410             : void vtime_task_switch(struct task_struct *prev)
     411             : {
     412             :         if (is_idle_task(prev))
     413             :                 vtime_account_idle(prev);
     414             :         else
     415             :                 vtime_account_kernel(prev);
     416             : 
     417             :         vtime_flush(prev);
     418             :         arch_vtime_task_switch(prev);
     419             : }
     420             : # endif
     421             : 
     422             : void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
     423             : {
     424             :         unsigned int pc = preempt_count() - offset;
     425             : 
     426             :         if (pc & HARDIRQ_OFFSET) {
     427             :                 vtime_account_hardirq(tsk);
     428             :         } else if (pc & SOFTIRQ_OFFSET) {
     429             :                 vtime_account_softirq(tsk);
     430             :         } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) &&
     431             :                    is_idle_task(tsk)) {
     432             :                 vtime_account_idle(tsk);
     433             :         } else {
     434             :                 vtime_account_kernel(tsk);
     435             :         }
     436             : }
     437             : 
     438             : void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
     439             :                     u64 *ut, u64 *st)
     440             : {
     441             :         *ut = curr->utime;
     442             :         *st = curr->stime;
     443             : }
     444             : 
     445             : void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
     446             : {
     447             :         *ut = p->utime;
     448             :         *st = p->stime;
     449             : }
     450             : EXPORT_SYMBOL_GPL(task_cputime_adjusted);
     451             : 
     452             : void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
     453             : {
     454             :         struct task_cputime cputime;
     455             : 
     456             :         thread_group_cputime(p, &cputime);
     457             : 
     458             :         *ut = cputime.utime;
     459             :         *st = cputime.stime;
     460             : }
     461             : 
     462             : #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */
     463             : 
     464             : /*
     465             :  * Account a single tick of CPU time.
     466             :  * @p: the process that the CPU time gets accounted to
     467             :  * @user_tick: indicates if the tick is a user or a system tick
     468             :  */
     469       24591 : void account_process_tick(struct task_struct *p, int user_tick)
     470             : {
     471       24591 :         u64 cputime, steal;
     472             : 
     473       24591 :         if (vtime_accounting_enabled_this_cpu())
     474             :                 return;
     475             : 
     476       24591 :         if (sched_clock_irqtime) {
     477             :                 irqtime_account_process_tick(p, user_tick, 1);
     478             :                 return;
     479             :         }
     480             : 
     481       24591 :         cputime = TICK_NSEC;
     482       24591 :         steal = steal_account_process_time(ULONG_MAX);
     483             : 
     484       25191 :         if (steal >= cputime)
     485             :                 return;
     486             : 
     487       25146 :         cputime -= steal;
     488             : 
     489       25146 :         if (user_tick)
     490         606 :                 account_user_time(p, cputime);
     491       24540 :         else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET))
     492       14818 :                 account_system_time(p, HARDIRQ_OFFSET, cputime);
     493             :         else
     494        9736 :                 account_idle_time(cputime);
     495             : }
     496             : 
     497             : /*
     498             :  * Account multiple ticks of idle time.
     499             :  * @ticks: number of idle ticks to account
     500             :  */
     501         695 : void account_idle_ticks(unsigned long ticks)
     502             : {
     503         695 :         u64 cputime, steal;
     504             : 
     505         695 :         if (sched_clock_irqtime) {
     506             :                 irqtime_account_idle_ticks(ticks);
     507             :                 return;
     508             :         }
     509             : 
     510         695 :         cputime = ticks * TICK_NSEC;
     511         695 :         steal = steal_account_process_time(ULONG_MAX);
     512             : 
     513         695 :         if (steal >= cputime)
     514             :                 return;
     515             : 
     516         695 :         cputime -= steal;
     517         695 :         account_idle_time(cputime);
     518             : }
     519             : 
     520             : /*
     521             :  * Adjust tick based cputime random precision against scheduler runtime
     522             :  * accounting.
     523             :  *
     524             :  * Tick based cputime accounting depends on whether a task's random
     525             :  * scheduling timeslices happen to be interrupted by the timer or not.
     526             :  * Depending on these circumstances, the number of these interrupts may
     527             :  * over- or under-estimate the real user and system cputime, matching it
     528             :  * only with variable precision.
     529             :  *
     530             :  * Fix this by scaling these tick based values against the total runtime
     531             :  * accounted by the CFS scheduler.
     532             :  *
     533             :  * This code provides the following guarantees:
     534             :  *
     535             :  *   stime + utime == rtime
     536             :  *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
     537             :  *
     538             :  * Assuming that rtime_i+1 >= rtime_i.
     539             :  */
     540        1045 : void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
     541             :                     u64 *ut, u64 *st)
     542             : {
     543        1045 :         u64 rtime, stime, utime;
     544        1045 :         unsigned long flags;
     545             : 
     546             :         /* Serialize concurrent callers such that we can honour our guarantees */
     547        1045 :         raw_spin_lock_irqsave(&prev->lock, flags);
     548        1045 :         rtime = curr->sum_exec_runtime;
     549             : 
     550             :         /*
     551             :          * This is possible under two circumstances:
     552             :          *  - rtime isn't monotonic after all (a bug);
     553             :          *  - we got reordered by the lock.
     554             :          *
     555             :          * In both cases this acts as a filter such that the rest of the code
     556             :          * can assume it is monotonic regardless of anything else.
     557             :          */
     558        1045 :         if (prev->stime + prev->utime >= rtime)
     559          23 :                 goto out;
     560             : 
     561        1022 :         stime = curr->stime;
     562        1022 :         utime = curr->utime;
     563             : 
     564             :         /*
     565             :          * If either stime or utime are 0, assume all runtime is userspace.
     566             :  * Once a task gets some ticks, the monotonicity code at 'update:'
     567             :          * will ensure things converge to the observed ratio.
     568             :          */
     569        1022 :         if (stime == 0) {
     570          44 :                 utime = rtime;
     571          44 :                 goto update;
     572             :         }
     573             : 
     574         978 :         if (utime == 0) {
     575         773 :                 stime = rtime;
     576         773 :                 goto update;
     577             :         }
     578             : 
     579         205 :         stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
     580             : 
     581        1022 : update:
     582             :         /*
     583             :          * Make sure stime doesn't go backwards; this preserves monotonicity
     584             :          * for utime because rtime is monotonic.
     585             :          *
     586             :          *  utime_i+1 = rtime_i+1 - stime_i
     587             :          *            = rtime_i+1 - (rtime_i - utime_i)
     588             :          *            = (rtime_i+1 - rtime_i) + utime_i
     589             :          *            >= utime_i
     590             :          */
     591        1022 :         if (stime < prev->stime)
     592             :                 stime = prev->stime;
     593        1022 :         utime = rtime - stime;
     594             : 
     595             :         /*
     596             :          * Make sure utime doesn't go backwards; this still preserves
     597             :          * monotonicity for stime, analogous argument to above.
     598             :          */
     599        1022 :         if (utime < prev->utime) {
     600          13 :                 utime = prev->utime;
     601          13 :                 stime = rtime - utime;
     602             :         }
     603             : 
     604        1022 :         prev->stime = stime;
     605        1022 :         prev->utime = utime;
     606        1045 : out:
     607        1045 :         *ut = prev->utime;
     608        1045 :         *st = prev->stime;
     609        1045 :         raw_spin_unlock_irqrestore(&prev->lock, flags);
     610        1045 : }
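
A compact userspace sketch of the adjustment above, using made-up numbers: the tick counts only determine the stime:utime ratio, the precise rtime fixes the total, and the prev values enforce the monotonicity guarantees. The kernel scales with mul_u64_u64_div_u64() to avoid 64-bit overflow; plain arithmetic is used here only because the demo numbers are small.

/* Illustrative userspace model of cputime_adjust(): rescale stime so that
 * stime + utime == rtime, then clamp both against the previous snapshot so
 * neither value ever goes backwards. All numbers are invented. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t rtime = 10000000;  /* precise runtime reported by the scheduler, ns */
        uint64_t stime = 3;         /* system ticks observed so far */
        uint64_t utime = 7;         /* user ticks observed so far */
        uint64_t prev_stime = 2500000, prev_utime = 6500000;

        /* keep the observed 3:7 ratio but match the precise total */
        stime = stime * rtime / (stime + utime);        /* -> 3000000 */

        if (stime < prev_stime)                         /* stime must not go backwards */
                stime = prev_stime;
        utime = rtime - stime;                          /* guarantee stime + utime == rtime */

        if (utime < prev_utime) {                       /* utime must not go backwards */
                utime = prev_utime;
                stime = rtime - utime;
        }

        printf("stime=%llu utime=%llu sum=%llu rtime=%llu\n",
               (unsigned long long)stime, (unsigned long long)utime,
               (unsigned long long)(stime + utime), (unsigned long long)rtime);
        return 0;
}
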
     611             : 
     612           0 : void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
     613             : {
     614           0 :         struct task_cputime cputime = {
     615           0 :                 .sum_exec_runtime = p->se.sum_exec_runtime,
     616             :         };
     617             : 
     618           0 :         task_cputime(p, &cputime.utime, &cputime.stime);
     619           0 :         cputime_adjust(&cputime, &p->prev_cputime, ut, st);
     620           0 : }
     621             : EXPORT_SYMBOL_GPL(task_cputime_adjusted);
     622             : 
     623        1045 : void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
     624             : {
     625        1045 :         struct task_cputime cputime;
     626             : 
     627        1045 :         thread_group_cputime(p, &cputime);
     628        1045 :         cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
     629        1045 : }
     630             : #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
     631             : 
     632             : #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
     633             : static u64 vtime_delta(struct vtime *vtime)
     634             : {
     635             :         unsigned long long clock;
     636             : 
     637             :         clock = sched_clock();
     638             :         if (clock < vtime->starttime)
     639             :                 return 0;
     640             : 
     641             :         return clock - vtime->starttime;
     642             : }
     643             : 
     644             : static u64 get_vtime_delta(struct vtime *vtime)
     645             : {
     646             :         u64 delta = vtime_delta(vtime);
     647             :         u64 other;
     648             : 
     649             :         /*
     650             :          * Unlike tick based timing, vtime based timing never has lost
     651             :  * ticks, and there is no need for steal time accounting to make up for
     652             :          * lost ticks. Vtime accounts a rounded version of actual
     653             :          * elapsed time. Limit account_other_time to prevent rounding
     654             :          * errors from causing elapsed vtime to go negative.
     655             :          */
     656             :         other = account_other_time(delta);
     657             :         WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
     658             :         vtime->starttime += delta;
     659             : 
     660             :         return delta - other;
     661             : }
     662             : 
     663             : static void vtime_account_system(struct task_struct *tsk,
     664             :                                  struct vtime *vtime)
     665             : {
     666             :         vtime->stime += get_vtime_delta(vtime);
     667             :         if (vtime->stime >= TICK_NSEC) {
     668             :                 account_system_time(tsk, irq_count(), vtime->stime);
     669             :                 vtime->stime = 0;
     670             :         }
     671             : }
     672             : 
     673             : static void vtime_account_guest(struct task_struct *tsk,
     674             :                                 struct vtime *vtime)
     675             : {
     676             :         vtime->gtime += get_vtime_delta(vtime);
     677             :         if (vtime->gtime >= TICK_NSEC) {
     678             :                 account_guest_time(tsk, vtime->gtime);
     679             :                 vtime->gtime = 0;
     680             :         }
     681             : }
     682             : 
     683             : static void __vtime_account_kernel(struct task_struct *tsk,
     684             :                                    struct vtime *vtime)
     685             : {
     686             :         /* We might have scheduled out from guest path */
     687             :         if (vtime->state == VTIME_GUEST)
     688             :                 vtime_account_guest(tsk, vtime);
     689             :         else
     690             :                 vtime_account_system(tsk, vtime);
     691             : }
     692             : 
     693             : void vtime_account_kernel(struct task_struct *tsk)
     694             : {
     695             :         struct vtime *vtime = &tsk->vtime;
     696             : 
     697             :         if (!vtime_delta(vtime))
     698             :                 return;
     699             : 
     700             :         write_seqcount_begin(&vtime->seqcount);
     701             :         __vtime_account_kernel(tsk, vtime);
     702             :         write_seqcount_end(&vtime->seqcount);
     703             : }
     704             : 
     705             : void vtime_user_enter(struct task_struct *tsk)
     706             : {
     707             :         struct vtime *vtime = &tsk->vtime;
     708             : 
     709             :         write_seqcount_begin(&vtime->seqcount);
     710             :         vtime_account_system(tsk, vtime);
     711             :         vtime->state = VTIME_USER;
     712             :         write_seqcount_end(&vtime->seqcount);
     713             : }
     714             : 
     715             : void vtime_user_exit(struct task_struct *tsk)
     716             : {
     717             :         struct vtime *vtime = &tsk->vtime;
     718             : 
     719             :         write_seqcount_begin(&vtime->seqcount);
     720             :         vtime->utime += get_vtime_delta(vtime);
     721             :         if (vtime->utime >= TICK_NSEC) {
     722             :                 account_user_time(tsk, vtime->utime);
     723             :                 vtime->utime = 0;
     724             :         }
     725             :         vtime->state = VTIME_SYS;
     726             :         write_seqcount_end(&vtime->seqcount);
     727             : }
     728             : 
     729             : void vtime_guest_enter(struct task_struct *tsk)
     730             : {
     731             :         struct vtime *vtime = &tsk->vtime;
     732             :         /*
     733             :          * The flags must be updated under the lock with
     734             :          * the vtime_starttime flush and update.
     735             :  * That enforces the right ordering and update sequence
     736             :          * synchronization against the reader (task_gtime())
     737             :          * that can thus safely catch up with a tickless delta.
     738             :          */
     739             :         write_seqcount_begin(&vtime->seqcount);
     740             :         vtime_account_system(tsk, vtime);
     741             :         tsk->flags |= PF_VCPU;
     742             :         vtime->state = VTIME_GUEST;
     743             :         write_seqcount_end(&vtime->seqcount);
     744             : }
     745             : EXPORT_SYMBOL_GPL(vtime_guest_enter);
     746             : 
     747             : void vtime_guest_exit(struct task_struct *tsk)
     748             : {
     749             :         struct vtime *vtime = &tsk->vtime;
     750             : 
     751             :         write_seqcount_begin(&vtime->seqcount);
     752             :         vtime_account_guest(tsk, vtime);
     753             :         tsk->flags &= ~PF_VCPU;
     754             :         vtime->state = VTIME_SYS;
     755             :         write_seqcount_end(&vtime->seqcount);
     756             : }
     757             : EXPORT_SYMBOL_GPL(vtime_guest_exit);
     758             : 
     759             : void vtime_account_idle(struct task_struct *tsk)
     760             : {
     761             :         account_idle_time(get_vtime_delta(&tsk->vtime));
     762             : }
     763             : 
     764             : void vtime_task_switch_generic(struct task_struct *prev)
     765             : {
     766             :         struct vtime *vtime = &prev->vtime;
     767             : 
     768             :         write_seqcount_begin(&vtime->seqcount);
     769             :         if (vtime->state == VTIME_IDLE)
     770             :                 vtime_account_idle(prev);
     771             :         else
     772             :                 __vtime_account_kernel(prev, vtime);
     773             :         vtime->state = VTIME_INACTIVE;
     774             :         vtime->cpu = -1;
     775             :         write_seqcount_end(&vtime->seqcount);
     776             : 
     777             :         vtime = &current->vtime;
     778             : 
     779             :         write_seqcount_begin(&vtime->seqcount);
     780             :         if (is_idle_task(current))
     781             :                 vtime->state = VTIME_IDLE;
     782             :         else if (current->flags & PF_VCPU)
     783             :                 vtime->state = VTIME_GUEST;
     784             :         else
     785             :                 vtime->state = VTIME_SYS;
     786             :         vtime->starttime = sched_clock();
     787             :         vtime->cpu = smp_processor_id();
     788             :         write_seqcount_end(&vtime->seqcount);
     789             : }
     790             : 
     791             : void vtime_init_idle(struct task_struct *t, int cpu)
     792             : {
     793             :         struct vtime *vtime = &t->vtime;
     794             :         unsigned long flags;
     795             : 
     796             :         local_irq_save(flags);
     797             :         write_seqcount_begin(&vtime->seqcount);
     798             :         vtime->state = VTIME_IDLE;
     799             :         vtime->starttime = sched_clock();
     800             :         vtime->cpu = cpu;
     801             :         write_seqcount_end(&vtime->seqcount);
     802             :         local_irq_restore(flags);
     803             : }
     804             : 
     805             : u64 task_gtime(struct task_struct *t)
     806             : {
     807             :         struct vtime *vtime = &t->vtime;
     808             :         unsigned int seq;
     809             :         u64 gtime;
     810             : 
     811             :         if (!vtime_accounting_enabled())
     812             :                 return t->gtime;
     813             : 
     814             :         do {
     815             :                 seq = read_seqcount_begin(&vtime->seqcount);
     816             : 
     817             :                 gtime = t->gtime;
     818             :                 if (vtime->state == VTIME_GUEST)
     819             :                         gtime += vtime->gtime + vtime_delta(vtime);
     820             : 
     821             :         } while (read_seqcount_retry(&vtime->seqcount, seq));
     822             : 
     823             :         return gtime;
     824             : }
     825             : 
     826             : /*
     827             :  * Fetch cputime raw values from fields of task_struct and
     828             :  * add up the pending nohz execution time since the last
     829             :  * cputime snapshot.
     830             :  */
     831             : void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
     832             : {
     833             :         struct vtime *vtime = &t->vtime;
     834             :         unsigned int seq;
     835             :         u64 delta;
     836             : 
     837             :         if (!vtime_accounting_enabled()) {
     838             :                 *utime = t->utime;
     839             :                 *stime = t->stime;
     840             :                 return;
     841             :         }
     842             : 
     843             :         do {
     844             :                 seq = read_seqcount_begin(&vtime->seqcount);
     845             : 
     846             :                 *utime = t->utime;
     847             :                 *stime = t->stime;
     848             : 
     849             :                 /* Task is sleeping or idle, nothing to add */
     850             :                 if (vtime->state < VTIME_SYS)
     851             :                         continue;
     852             : 
     853             :                 delta = vtime_delta(vtime);
     854             : 
     855             :                 /*
     856             :                  * Task runs either in user (including guest) or kernel space,
     857             :                  * add pending nohz time to the right place.
     858             :                  */
     859             :                 if (vtime->state == VTIME_SYS)
     860             :                         *stime += vtime->stime + delta;
     861             :                 else
     862             :                         *utime += vtime->utime + delta;
     863             :         } while (read_seqcount_retry(&vtime->seqcount, seq));
     864             : }
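
The retry loop above, like the ones in the kcpustat helpers further down, is the usual seqcount reader pattern. Below is a simplified single-writer userspace sketch of that pattern, illustrative only and not the kernel's seqcount_t implementation.

/* Simplified seqcount sketch: the writer makes the counter odd while an
 * update is in progress and even again afterwards; a reader retries until
 * it sees the same even value before and after copying the data. */
#include <stdio.h>
#include <stdint.h>
#include <stdatomic.h>

static atomic_uint seq;
static uint64_t utime_ns, stime_ns;

static void writer_update(uint64_t u, uint64_t s)
{
        atomic_fetch_add_explicit(&seq, 1, memory_order_release); /* odd: update started */
        utime_ns = u;
        stime_ns = s;
        atomic_fetch_add_explicit(&seq, 1, memory_order_release); /* even: update done */
}

static void reader_snapshot(uint64_t *u, uint64_t *s)
{
        unsigned int start;

        for (;;) {
                start = atomic_load_explicit(&seq, memory_order_acquire);
                if (start & 1)
                        continue;               /* writer in progress, retry */
                *u = utime_ns;
                *s = stime_ns;
                if (atomic_load_explicit(&seq, memory_order_acquire) == start)
                        return;                 /* snapshot was consistent */
        }
}

int main(void)
{
        uint64_t u, s;

        writer_update(1000, 2000);
        reader_snapshot(&u, &s);
        printf("utime=%llu stime=%llu\n",
               (unsigned long long)u, (unsigned long long)s);
        return 0;
}
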
     865             : 
     866             : static int vtime_state_fetch(struct vtime *vtime, int cpu)
     867             : {
     868             :         int state = READ_ONCE(vtime->state);
     869             : 
     870             :         /*
     871             :          * We raced against a context switch, fetch the
     872             :          * kcpustat task again.
     873             :          */
     874             :         if (vtime->cpu != cpu && vtime->cpu != -1)
     875             :                 return -EAGAIN;
     876             : 
     877             :         /*
     878             :          * Two possible things here:
     879             :          * 1) We are seeing the scheduling out task (prev) or any past one.
     880             :          * 2) We are seeing the scheduling in task (next) but it hasn't
     881             :  *    passed through vtime_task_switch() yet, so the pending
     882             :          *    cputime of the prev task may not be flushed yet.
     883             :          *
     884             :          * Case 1) is ok but 2) is not. So wait for a safe VTIME state.
     885             :          */
     886             :         if (state == VTIME_INACTIVE)
     887             :                 return -EAGAIN;
     888             : 
     889             :         return state;
     890             : }
     891             : 
     892             : static u64 kcpustat_user_vtime(struct vtime *vtime)
     893             : {
     894             :         if (vtime->state == VTIME_USER)
     895             :                 return vtime->utime + vtime_delta(vtime);
     896             :         else if (vtime->state == VTIME_GUEST)
     897             :                 return vtime->gtime + vtime_delta(vtime);
     898             :         return 0;
     899             : }
     900             : 
     901             : static int kcpustat_field_vtime(u64 *cpustat,
     902             :                                 struct task_struct *tsk,
     903             :                                 enum cpu_usage_stat usage,
     904             :                                 int cpu, u64 *val)
     905             : {
     906             :         struct vtime *vtime = &tsk->vtime;
     907             :         unsigned int seq;
     908             : 
     909             :         do {
     910             :                 int state;
     911             : 
     912             :                 seq = read_seqcount_begin(&vtime->seqcount);
     913             : 
     914             :                 state = vtime_state_fetch(vtime, cpu);
     915             :                 if (state < 0)
     916             :                         return state;
     917             : 
     918             :                 *val = cpustat[usage];
     919             : 
     920             :                 /*
     921             :  * Nice vs. unnice cputime accounting may be inaccurate if
     922             :  * the nice value has changed since the last vtime update.
     923             :  * But a proper fix would involve interrupting the target on nice
     924             :  * updates, which is a no-go on nohz_full (although the scheduler
     925             :  * may still interrupt the target if rescheduling is needed...)
     926             :                  */
     927             :                 switch (usage) {
     928             :                 case CPUTIME_SYSTEM:
     929             :                         if (state == VTIME_SYS)
     930             :                                 *val += vtime->stime + vtime_delta(vtime);
     931             :                         break;
     932             :                 case CPUTIME_USER:
     933             :                         if (task_nice(tsk) <= 0)
     934             :                                 *val += kcpustat_user_vtime(vtime);
     935             :                         break;
     936             :                 case CPUTIME_NICE:
     937             :                         if (task_nice(tsk) > 0)
     938             :                                 *val += kcpustat_user_vtime(vtime);
     939             :                         break;
     940             :                 case CPUTIME_GUEST:
     941             :                         if (state == VTIME_GUEST && task_nice(tsk) <= 0)
     942             :                                 *val += vtime->gtime + vtime_delta(vtime);
     943             :                         break;
     944             :                 case CPUTIME_GUEST_NICE:
     945             :                         if (state == VTIME_GUEST && task_nice(tsk) > 0)
     946             :                                 *val += vtime->gtime + vtime_delta(vtime);
     947             :                         break;
     948             :                 default:
     949             :                         break;
     950             :                 }
     951             :         } while (read_seqcount_retry(&vtime->seqcount, seq));
     952             : 
     953             :         return 0;
     954             : }
     955             : 
     956             : u64 kcpustat_field(struct kernel_cpustat *kcpustat,
     957             :                    enum cpu_usage_stat usage, int cpu)
     958             : {
     959             :         u64 *cpustat = kcpustat->cpustat;
     960             :         u64 val = cpustat[usage];
     961             :         struct rq *rq;
     962             :         int err;
     963             : 
     964             :         if (!vtime_accounting_enabled_cpu(cpu))
     965             :                 return val;
     966             : 
     967             :         rq = cpu_rq(cpu);
     968             : 
     969             :         for (;;) {
     970             :                 struct task_struct *curr;
     971             : 
     972             :                 rcu_read_lock();
     973             :                 curr = rcu_dereference(rq->curr);
     974             :                 if (WARN_ON_ONCE(!curr)) {
     975             :                         rcu_read_unlock();
     976             :                         return cpustat[usage];
     977             :                 }
     978             : 
     979             :                 err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val);
     980             :                 rcu_read_unlock();
     981             : 
     982             :                 if (!err)
     983             :                         return val;
     984             : 
     985             :                 cpu_relax();
     986             :         }
     987             : }
     988             : EXPORT_SYMBOL_GPL(kcpustat_field);
     989             : 
     990             : static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
     991             :                                     const struct kernel_cpustat *src,
     992             :                                     struct task_struct *tsk, int cpu)
     993             : {
     994             :         struct vtime *vtime = &tsk->vtime;
     995             :         unsigned int seq;
     996             : 
     997             :         do {
     998             :                 u64 *cpustat;
     999             :                 u64 delta;
    1000             :                 int state;
    1001             : 
    1002             :                 seq = read_seqcount_begin(&vtime->seqcount);
    1003             : 
    1004             :                 state = vtime_state_fetch(vtime, cpu);
    1005             :                 if (state < 0)
    1006             :                         return state;
    1007             : 
    1008             :                 *dst = *src;
    1009             :                 cpustat = dst->cpustat;
    1010             : 
    1011             :                 /* Task is sleeping, dead or idle, nothing to add */
    1012             :                 if (state < VTIME_SYS)
    1013             :                         continue;
    1014             : 
    1015             :                 delta = vtime_delta(vtime);
    1016             : 
    1017             :                 /*
    1018             :                  * Task runs either in user (including guest) or kernel space,
    1019             :                  * add pending nohz time to the right place.
    1020             :                  */
    1021             :                 if (state == VTIME_SYS) {
    1022             :                         cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
    1023             :                 } else if (state == VTIME_USER) {
    1024             :                         if (task_nice(tsk) > 0)
    1025             :                                 cpustat[CPUTIME_NICE] += vtime->utime + delta;
    1026             :                         else
    1027             :                                 cpustat[CPUTIME_USER] += vtime->utime + delta;
    1028             :                 } else {
    1029             :                         WARN_ON_ONCE(state != VTIME_GUEST);
    1030             :                         if (task_nice(tsk) > 0) {
    1031             :                                 cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
    1032             :                                 cpustat[CPUTIME_NICE] += vtime->gtime + delta;
    1033             :                         } else {
    1034             :                                 cpustat[CPUTIME_GUEST] += vtime->gtime + delta;
    1035             :                                 cpustat[CPUTIME_USER] += vtime->gtime + delta;
    1036             :                         }
    1037             :                 }
    1038             :         } while (read_seqcount_retry(&vtime->seqcount, seq));
    1039             : 
    1040             :         return 0;
    1041             : }
    1042             : 
    1043             : void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
    1044             : {
    1045             :         const struct kernel_cpustat *src = &kcpustat_cpu(cpu);
    1046             :         struct rq *rq;
    1047             :         int err;
    1048             : 
    1049             :         if (!vtime_accounting_enabled_cpu(cpu)) {
    1050             :                 *dst = *src;
    1051             :                 return;
    1052             :         }
    1053             : 
    1054             :         rq = cpu_rq(cpu);
    1055             : 
    1056             :         for (;;) {
    1057             :                 struct task_struct *curr;
    1058             : 
    1059             :                 rcu_read_lock();
    1060             :                 curr = rcu_dereference(rq->curr);
    1061             :                 if (WARN_ON_ONCE(!curr)) {
    1062             :                         rcu_read_unlock();
    1063             :                         *dst = *src;
    1064             :                         return;
    1065             :                 }
    1066             : 
    1067             :                 err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu);
    1068             :                 rcu_read_unlock();
    1069             : 
    1070             :                 if (!err)
    1071             :                         return;
    1072             : 
    1073             :                 cpu_relax();
    1074             :         }
    1075             : }
    1076             : EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch);
    1077             : 
    1078             : #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */

Generated by: LCOV version 1.14