LCOV - code coverage report
Current view: top level - kernel/time - timer.c (source / functions)
Test: landlock.info
Date: 2021-04-22 12:43:58
                           Hit    Total    Coverage
Lines:                     456      569      80.1 %
Functions:                  47       63      74.6 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  *  Kernel internal timers
       4             :  *
       5             :  *  Copyright (C) 1991, 1992  Linus Torvalds
       6             :  *
       7             :  *  1997-01-28  Modified by Finn Arne Gangstad to make timers scale better.
       8             :  *
       9             :  *  1997-09-10  Updated NTP code according to technical memorandum Jan '96
      10             :  *              "A Kernel Model for Precision Timekeeping" by Dave Mills
      11             :  *  1998-12-24  Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
      12             :  *              serialize accesses to xtime/lost_ticks).
      13             :  *                              Copyright (C) 1998  Andrea Arcangeli
      14             :  *  1999-03-10  Improved NTP compatibility by Ulrich Windl
      15             :  *  2002-05-31  Move sys_sysinfo here and make its locking sane, Robert Love
      16             :  *  2000-10-05  Implemented scalable SMP per-CPU timer handling.
      17             :  *                              Copyright (C) 2000, 2001, 2002  Ingo Molnar
      18             :  *              Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
      19             :  */
      20             : 
      21             : #include <linux/kernel_stat.h>
      22             : #include <linux/export.h>
      23             : #include <linux/interrupt.h>
      24             : #include <linux/percpu.h>
      25             : #include <linux/init.h>
      26             : #include <linux/mm.h>
      27             : #include <linux/swap.h>
      28             : #include <linux/pid_namespace.h>
      29             : #include <linux/notifier.h>
      30             : #include <linux/thread_info.h>
      31             : #include <linux/time.h>
      32             : #include <linux/jiffies.h>
      33             : #include <linux/posix-timers.h>
      34             : #include <linux/cpu.h>
      35             : #include <linux/syscalls.h>
      36             : #include <linux/delay.h>
      37             : #include <linux/tick.h>
      38             : #include <linux/kallsyms.h>
      39             : #include <linux/irq_work.h>
      40             : #include <linux/sched/signal.h>
      41             : #include <linux/sched/sysctl.h>
      42             : #include <linux/sched/nohz.h>
      43             : #include <linux/sched/debug.h>
      44             : #include <linux/slab.h>
      45             : #include <linux/compat.h>
      46             : #include <linux/random.h>
      47             : 
      48             : #include <linux/uaccess.h>
      49             : #include <asm/unistd.h>
      50             : #include <asm/div64.h>
      51             : #include <asm/timex.h>
      52             : #include <asm/io.h>
      53             : 
      54             : #include "tick-internal.h"
      55             : 
      56             : #define CREATE_TRACE_POINTS
      57             : #include <trace/events/timer.h>
      58             : 
      59             : __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
      60             : 
      61             : EXPORT_SYMBOL(jiffies_64);
      62             : 
      63             : /*
      64             :  * The timer wheel has LVL_DEPTH array levels. Each level provides an array of
       65             :  * LVL_SIZE buckets. Each level is driven by its own clock and therefore each
      66             :  * level has a different granularity.
      67             :  *
      68             :  * The level granularity is:            LVL_CLK_DIV ^ lvl
      69             :  * The level clock frequency is:        HZ / (LVL_CLK_DIV ^ level)
      70             :  *
      71             :  * The array level of a newly armed timer depends on the relative expiry
      72             :  * time. The farther the expiry time is away the higher the array level and
       73             :  * therefore the granularity becomes.
      74             :  *
      75             :  * Contrary to the original timer wheel implementation, which aims for 'exact'
      76             :  * expiry of the timers, this implementation removes the need for recascading
      77             :  * the timers into the lower array levels. The previous 'classic' timer wheel
      78             :  * implementation of the kernel already violated the 'exact' expiry by adding
      79             :  * slack to the expiry time to provide batched expiration. The granularity
      80             :  * levels provide implicit batching.
      81             :  *
      82             :  * This is an optimization of the original timer wheel implementation for the
      83             :  * majority of the timer wheel use cases: timeouts. The vast majority of
      84             :  * timeout timers (networking, disk I/O ...) are canceled before expiry. If
      85             :  * the timeout expires it indicates that normal operation is disturbed, so it
      86             :  * does not matter much whether the timeout comes with a slight delay.
      87             :  *
      88             :  * The only exception to this are networking timers with a small expiry
      89             :  * time. They rely on the granularity. Those fit into the first wheel level,
      90             :  * which has HZ granularity.
      91             :  *
       92             :  * We don't have cascading anymore. Timers with an expiry time above the
      93             :  * capacity of the last wheel level are force expired at the maximum timeout
      94             :  * value of the last wheel level. From data sampling we know that the maximum
      95             :  * value observed is 5 days (network connection tracking), so this should not
      96             :  * be an issue.
      97             :  *
       98             :  * The currently chosen array constant values are a good compromise between
      99             :  * array size and granularity.
     100             :  *
     101             :  * This results in the following granularity and range levels:
     102             :  *
     103             :  * HZ 1000 steps
     104             :  * Level Offset  Granularity            Range
     105             :  *  0      0         1 ms                0 ms -         63 ms
     106             :  *  1     64         8 ms               64 ms -        511 ms
     107             :  *  2    128        64 ms              512 ms -       4095 ms (512ms - ~4s)
     108             :  *  3    192       512 ms             4096 ms -      32767 ms (~4s - ~32s)
     109             :  *  4    256      4096 ms (~4s)      32768 ms -     262143 ms (~32s - ~4m)
     110             :  *  5    320     32768 ms (~32s)    262144 ms -    2097151 ms (~4m - ~34m)
     111             :  *  6    384    262144 ms (~4m)    2097152 ms -   16777215 ms (~34m - ~4h)
     112             :  *  7    448   2097152 ms (~34m)  16777216 ms -  134217727 ms (~4h - ~1d)
     113             :  *  8    512  16777216 ms (~4h)  134217728 ms - 1073741822 ms (~1d - ~12d)
     114             :  *
     115             :  * HZ  300
     116             :  * Level Offset  Granularity            Range
     117             :  *  0      0         3 ms                0 ms -        210 ms
     118             :  *  1     64        26 ms              213 ms -       1703 ms (213ms - ~1s)
     119             :  *  2    128       213 ms             1706 ms -      13650 ms (~1s - ~13s)
     120             :  *  3    192      1706 ms (~1s)      13653 ms -     109223 ms (~13s - ~1m)
     121             :  *  4    256     13653 ms (~13s)    109226 ms -     873810 ms (~1m - ~14m)
     122             :  *  5    320    109226 ms (~1m)     873813 ms -    6990503 ms (~14m - ~1h)
     123             :  *  6    384    873813 ms (~14m)   6990506 ms -   55924050 ms (~1h - ~15h)
     124             :  *  7    448   6990506 ms (~1h)   55924053 ms -  447392423 ms (~15h - ~5d)
     125             :  *  8    512  55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d)
     126             :  *
     127             :  * HZ  250
     128             :  * Level Offset  Granularity            Range
     129             :  *  0      0         4 ms                0 ms -        255 ms
     130             :  *  1     64        32 ms              256 ms -       2047 ms (256ms - ~2s)
     131             :  *  2    128       256 ms             2048 ms -      16383 ms (~2s - ~16s)
     132             :  *  3    192      2048 ms (~2s)      16384 ms -     131071 ms (~16s - ~2m)
     133             :  *  4    256     16384 ms (~16s)    131072 ms -    1048575 ms (~2m - ~17m)
     134             :  *  5    320    131072 ms (~2m)    1048576 ms -    8388607 ms (~17m - ~2h)
     135             :  *  6    384   1048576 ms (~17m)   8388608 ms -   67108863 ms (~2h - ~18h)
     136             :  *  7    448   8388608 ms (~2h)   67108864 ms -  536870911 ms (~18h - ~6d)
     137             :  *  8    512  67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d)
     138             :  *
     139             :  * HZ  100
     140             :  * Level Offset  Granularity            Range
     141             :  *  0      0         10 ms               0 ms -        630 ms
     142             :  *  1     64         80 ms             640 ms -       5110 ms (640ms - ~5s)
     143             :  *  2    128        640 ms            5120 ms -      40950 ms (~5s - ~40s)
     144             :  *  3    192       5120 ms (~5s)     40960 ms -     327670 ms (~40s - ~5m)
     145             :  *  4    256      40960 ms (~40s)   327680 ms -    2621430 ms (~5m - ~43m)
     146             :  *  5    320     327680 ms (~5m)   2621440 ms -   20971510 ms (~43m - ~5h)
     147             :  *  6    384    2621440 ms (~43m) 20971520 ms -  167772150 ms (~5h - ~1d)
     148             :  *  7    448   20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d)
     149             :  */
     150             : 
     151             : /* Clock divisor for the next level */
     152             : #define LVL_CLK_SHIFT   3
     153             : #define LVL_CLK_DIV     (1UL << LVL_CLK_SHIFT)
     154             : #define LVL_CLK_MASK    (LVL_CLK_DIV - 1)
     155             : #define LVL_SHIFT(n)    ((n) * LVL_CLK_SHIFT)
     156             : #define LVL_GRAN(n)     (1UL << LVL_SHIFT(n))
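/*
 * Illustrative worked example (not part of timer.c): the "Granularity"
 * column of the tables above is simply LVL_GRAN(lvl) jiffies converted
 * to milliseconds.  Assuming HZ=1000, so that one jiffy is 1 ms:
 *
 *	LVL_GRAN(0) = 1 << 0 =   1 jiffy    (1 ms)
 *	LVL_GRAN(1) = 1 << 3 =   8 jiffies  (8 ms)
 *	LVL_GRAN(2) = 1 << 6 =  64 jiffies  (64 ms)
 *	LVL_GRAN(3) = 1 << 9 = 512 jiffies  (512 ms)
 *
 * and each level's clock runs at HZ / (LVL_CLK_DIV ^ lvl), i.e. 8 times
 * slower than the level below it.
 */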
     157             : 
     158             : /*
     159             :  * The time start value for each level to select the bucket at enqueue
     160             :  * time. We start from the last possible delta of the previous level
     161             :  * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()).
     162             :  */
     163             : #define LVL_START(n)    ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
     164             : 
     165             : /* Size of each clock level */
     166             : #define LVL_BITS        6
     167             : #define LVL_SIZE        (1UL << LVL_BITS)
     168             : #define LVL_MASK        (LVL_SIZE - 1)
     169             : #define LVL_OFFS(n)     ((n) * LVL_SIZE)
     170             : 
     171             : /* Level depth */
     172             : #if HZ > 100
     173             : # define LVL_DEPTH      9
     174             : # else
     175             : # define LVL_DEPTH      8
     176             : #endif
     177             : 
     178             : /* The cutoff (max. capacity of the wheel) */
     179             : #define WHEEL_TIMEOUT_CUTOFF    (LVL_START(LVL_DEPTH))
     180             : #define WHEEL_TIMEOUT_MAX       (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1))
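/*
 * Illustrative worked example (not part of timer.c): with LVL_DEPTH = 9
 * (HZ > 100), the cutoff above works out to
 *
 *	WHEEL_TIMEOUT_CUTOFF = LVL_START(9) = 63 << 24 = 1056964608 jiffies
 *
 * which at HZ=1000 is roughly 12 days, matching the end of the level 8
 * range in the first table above.
 */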
     181             : 
     182             : /*
     183             :  * The resulting wheel size. If NOHZ is configured we allocate two
     184             :  * wheels so we have a separate storage for the deferrable timers.
     185             :  */
     186             : #define WHEEL_SIZE      (LVL_SIZE * LVL_DEPTH)
     187             : 
     188             : #ifdef CONFIG_NO_HZ_COMMON
     189             : # define NR_BASES       2
     190             : # define BASE_STD       0
     191             : # define BASE_DEF       1
     192             : #else
     193             : # define NR_BASES       1
     194             : # define BASE_STD       0
     195             : # define BASE_DEF       0
     196             : #endif
     197             : 
     198             : struct timer_base {
     199             :         raw_spinlock_t          lock;
     200             :         struct timer_list       *running_timer;
     201             : #ifdef CONFIG_PREEMPT_RT
     202             :         spinlock_t              expiry_lock;
     203             :         atomic_t                timer_waiters;
     204             : #endif
     205             :         unsigned long           clk;
     206             :         unsigned long           next_expiry;
     207             :         unsigned int            cpu;
     208             :         bool                    next_expiry_recalc;
     209             :         bool                    is_idle;
     210             :         DECLARE_BITMAP(pending_map, WHEEL_SIZE);
     211             :         struct hlist_head       vectors[WHEEL_SIZE];
     212             : } ____cacheline_aligned;
     213             : 
     214             : static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
     215             : 
     216             : #ifdef CONFIG_NO_HZ_COMMON
     217             : 
     218             : static DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
     219             : static DEFINE_MUTEX(timer_keys_mutex);
     220             : 
     221             : static void timer_update_keys(struct work_struct *work);
     222             : static DECLARE_WORK(timer_update_work, timer_update_keys);
     223             : 
     224             : #ifdef CONFIG_SMP
     225             : unsigned int sysctl_timer_migration = 1;
     226             : 
     227             : DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);
     228             : 
     229           1 : static void timers_update_migration(void)
     230             : {
     231           1 :         if (sysctl_timer_migration && tick_nohz_active)
     232           1 :                 static_branch_enable(&timers_migration_enabled);
     233             :         else
     234           0 :                 static_branch_disable(&timers_migration_enabled);
     235           1 : }
     236             : #else
     237             : static inline void timers_update_migration(void) { }
     238             : #endif /* !CONFIG_SMP */
     239             : 
     240           1 : static void timer_update_keys(struct work_struct *work)
     241             : {
     242           1 :         mutex_lock(&timer_keys_mutex);
     243           1 :         timers_update_migration();
     244           1 :         static_branch_enable(&timers_nohz_active);
     245           1 :         mutex_unlock(&timer_keys_mutex);
     246           1 : }
     247             : 
     248           1 : void timers_update_nohz(void)
     249             : {
     250           1 :         schedule_work(&timer_update_work);
     251           1 : }
     252             : 
     253           0 : int timer_migration_handler(struct ctl_table *table, int write,
     254             :                             void *buffer, size_t *lenp, loff_t *ppos)
     255             : {
     256           0 :         int ret;
     257             : 
     258           0 :         mutex_lock(&timer_keys_mutex);
     259           0 :         ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
     260           0 :         if (!ret && write)
     261           0 :                 timers_update_migration();
     262           0 :         mutex_unlock(&timer_keys_mutex);
     263           0 :         return ret;
     264             : }
     265             : 
     266        3661 : static inline bool is_timers_nohz_active(void)
     267             : {
     268        3661 :         return static_branch_unlikely(&timers_nohz_active);
     269             : }
     270             : #else
     271             : static inline bool is_timers_nohz_active(void) { return false; }
     272             : #endif /* NO_HZ_COMMON */
     273             : 
     274         220 : static unsigned long round_jiffies_common(unsigned long j, int cpu,
     275             :                 bool force_up)
     276             : {
     277         220 :         int rem;
     278         220 :         unsigned long original = j;
     279             : 
     280             :         /*
     281             :          * We don't want all cpus firing their timers at once hitting the
     282             :          * same lock or cachelines, so we skew each extra cpu with an extra
     283             :          * 3 jiffies. This 3 jiffies came originally from the mm/ code which
     284             :          * already did this.
     285             :          * The skew is done by adding 3*cpunr, then round, then subtract this
     286             :          * extra offset again.
     287             :          */
     288         220 :         j += cpu * 3;
     289             : 
     290         220 :         rem = j % HZ;
     291             : 
     292             :         /*
      293             :  * If the target jiffy is just after a whole second (which can happen
      294             :  * due to delays of the timer irq, long irq-off times, etc.) then
      295             :  * we should round down to the whole second, not up. Use 1/4th second
      296             :  * as the cutoff for this rounding, as an extreme upper bound.
     297             :          * But never round down if @force_up is set.
     298             :          */
     299         220 :         if (rem < HZ/4 && !force_up) /* round down */
     300         157 :                 j = j - rem;
     301             :         else /* round up */
     302          63 :                 j = j - rem + HZ;
     303             : 
     304             :         /* now that we have rounded, subtract the extra skew again */
     305         220 :         j -= cpu * 3;
     306             : 
     307             :         /*
     308             :          * Make sure j is still in the future. Otherwise return the
     309             :          * unmodified value.
     310             :          */
     311         220 :         return time_is_after_jiffies(j) ? j : original;
     312             : }
     313             : 
     314             : /**
     315             :  * __round_jiffies - function to round jiffies to a full second
     316             :  * @j: the time in (absolute) jiffies that should be rounded
     317             :  * @cpu: the processor number on which the timeout will happen
     318             :  *
     319             :  * __round_jiffies() rounds an absolute time in the future (in jiffies)
     320             :  * up or down to (approximately) full seconds. This is useful for timers
     321             :  * for which the exact time they fire does not matter too much, as long as
     322             :  * they fire approximately every X seconds.
     323             :  *
     324             :  * By rounding these timers to whole seconds, all such timers will fire
     325             :  * at the same time, rather than at various times spread out. The goal
     326             :  * of this is to have the CPU wake up less, which saves power.
     327             :  *
     328             :  * The exact rounding is skewed for each processor to avoid all
     329             :  * processors firing at the exact same time, which could lead
     330             :  * to lock contention or spurious cache line bouncing.
     331             :  *
     332             :  * The return value is the rounded version of the @j parameter.
     333             :  */
     334           0 : unsigned long __round_jiffies(unsigned long j, int cpu)
     335             : {
     336           0 :         return round_jiffies_common(j, cpu, false);
     337             : }
     338             : EXPORT_SYMBOL_GPL(__round_jiffies);
     339             : 
     340             : /**
     341             :  * __round_jiffies_relative - function to round jiffies to a full second
     342             :  * @j: the time in (relative) jiffies that should be rounded
     343             :  * @cpu: the processor number on which the timeout will happen
     344             :  *
      345             :  * __round_jiffies_relative() rounds a time delta in the future (in jiffies)
     346             :  * up or down to (approximately) full seconds. This is useful for timers
     347             :  * for which the exact time they fire does not matter too much, as long as
     348             :  * they fire approximately every X seconds.
     349             :  *
     350             :  * By rounding these timers to whole seconds, all such timers will fire
     351             :  * at the same time, rather than at various times spread out. The goal
     352             :  * of this is to have the CPU wake up less, which saves power.
     353             :  *
     354             :  * The exact rounding is skewed for each processor to avoid all
     355             :  * processors firing at the exact same time, which could lead
     356             :  * to lock contention or spurious cache line bouncing.
     357             :  *
     358             :  * The return value is the rounded version of the @j parameter.
     359             :  */
     360         158 : unsigned long __round_jiffies_relative(unsigned long j, int cpu)
     361             : {
     362         158 :         unsigned long j0 = jiffies;
     363             : 
     364             :         /* Use j0 because jiffies might change while we run */
     365         158 :         return round_jiffies_common(j + j0, cpu, false) - j0;
     366             : }
     367             : EXPORT_SYMBOL_GPL(__round_jiffies_relative);
     368             : 
     369             : /**
     370             :  * round_jiffies - function to round jiffies to a full second
     371             :  * @j: the time in (absolute) jiffies that should be rounded
     372             :  *
     373             :  * round_jiffies() rounds an absolute time in the future (in jiffies)
     374             :  * up or down to (approximately) full seconds. This is useful for timers
     375             :  * for which the exact time they fire does not matter too much, as long as
     376             :  * they fire approximately every X seconds.
     377             :  *
     378             :  * By rounding these timers to whole seconds, all such timers will fire
     379             :  * at the same time, rather than at various times spread out. The goal
     380             :  * of this is to have the CPU wake up less, which saves power.
     381             :  *
     382             :  * The return value is the rounded version of the @j parameter.
     383             :  */
     384           1 : unsigned long round_jiffies(unsigned long j)
     385             : {
     386           1 :         return round_jiffies_common(j, raw_smp_processor_id(), false);
     387             : }
     388             : EXPORT_SYMBOL_GPL(round_jiffies);
     389             : 
     390             : /**
     391             :  * round_jiffies_relative - function to round jiffies to a full second
     392             :  * @j: the time in (relative) jiffies that should be rounded
     393             :  *
      394             :  * round_jiffies_relative() rounds a time delta in the future (in jiffies)
     395             :  * up or down to (approximately) full seconds. This is useful for timers
     396             :  * for which the exact time they fire does not matter too much, as long as
     397             :  * they fire approximately every X seconds.
     398             :  *
     399             :  * By rounding these timers to whole seconds, all such timers will fire
     400             :  * at the same time, rather than at various times spread out. The goal
     401             :  * of this is to have the CPU wake up less, which saves power.
     402             :  *
     403             :  * The return value is the rounded version of the @j parameter.
     404             :  */
     405         158 : unsigned long round_jiffies_relative(unsigned long j)
     406             : {
     407         158 :         return __round_jiffies_relative(j, raw_smp_processor_id());
     408             : }
     409             : EXPORT_SYMBOL_GPL(round_jiffies_relative);
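/*
 * Illustrative usage sketch (not part of timer.c): a periodic timer that
 * only needs roughly one-second resolution can let its wakeups coalesce
 * on (per-CPU skewed) second boundaries, e.g.
 *
 *	mod_timer(&my_timer, jiffies + round_jiffies_relative(HZ));
 *
 * where my_timer is a hypothetical timer_list owned by the caller.
 */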
     410             : 
     411             : /**
     412             :  * __round_jiffies_up - function to round jiffies up to a full second
     413             :  * @j: the time in (absolute) jiffies that should be rounded
     414             :  * @cpu: the processor number on which the timeout will happen
     415             :  *
     416             :  * This is the same as __round_jiffies() except that it will never
     417             :  * round down.  This is useful for timeouts for which the exact time
     418             :  * of firing does not matter too much, as long as they don't fire too
     419             :  * early.
     420             :  */
     421           0 : unsigned long __round_jiffies_up(unsigned long j, int cpu)
     422             : {
     423           0 :         return round_jiffies_common(j, cpu, true);
     424             : }
     425             : EXPORT_SYMBOL_GPL(__round_jiffies_up);
     426             : 
     427             : /**
     428             :  * __round_jiffies_up_relative - function to round jiffies up to a full second
     429             :  * @j: the time in (relative) jiffies that should be rounded
     430             :  * @cpu: the processor number on which the timeout will happen
     431             :  *
     432             :  * This is the same as __round_jiffies_relative() except that it will never
     433             :  * round down.  This is useful for timeouts for which the exact time
     434             :  * of firing does not matter too much, as long as they don't fire too
     435             :  * early.
     436             :  */
     437           0 : unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
     438             : {
     439           0 :         unsigned long j0 = jiffies;
     440             : 
     441             :         /* Use j0 because jiffies might change while we run */
     442           0 :         return round_jiffies_common(j + j0, cpu, true) - j0;
     443             : }
     444             : EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);
     445             : 
     446             : /**
     447             :  * round_jiffies_up - function to round jiffies up to a full second
     448             :  * @j: the time in (absolute) jiffies that should be rounded
     449             :  *
     450             :  * This is the same as round_jiffies() except that it will never
     451             :  * round down.  This is useful for timeouts for which the exact time
     452             :  * of firing does not matter too much, as long as they don't fire too
     453             :  * early.
     454             :  */
     455          61 : unsigned long round_jiffies_up(unsigned long j)
     456             : {
     457          61 :         return round_jiffies_common(j, raw_smp_processor_id(), true);
     458             : }
     459             : EXPORT_SYMBOL_GPL(round_jiffies_up);
     460             : 
     461             : /**
     462             :  * round_jiffies_up_relative - function to round jiffies up to a full second
     463             :  * @j: the time in (relative) jiffies that should be rounded
     464             :  *
     465             :  * This is the same as round_jiffies_relative() except that it will never
     466             :  * round down.  This is useful for timeouts for which the exact time
     467             :  * of firing does not matter too much, as long as they don't fire too
     468             :  * early.
     469             :  */
     470           0 : unsigned long round_jiffies_up_relative(unsigned long j)
     471             : {
     472           0 :         return __round_jiffies_up_relative(j, raw_smp_processor_id());
     473             : }
     474             : EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
     475             : 
     476             : 
     477       10220 : static inline unsigned int timer_get_idx(struct timer_list *timer)
     478             : {
     479       10220 :         return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT;
     480             : }
     481             : 
     482        5544 : static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
     483             : {
     484        5544 :         timer->flags = (timer->flags & ~TIMER_ARRAYMASK) |
     485        5544 :                         idx << TIMER_ARRAYSHIFT;
     486             : }
     487             : 
     488             : /*
     489             :  * Helper function to calculate the array index for a given expiry
     490             :  * time.
     491             :  */
     492        5547 : static inline unsigned calc_index(unsigned long expires, unsigned lvl,
     493             :                                   unsigned long *bucket_expiry)
     494             : {
     495             : 
     496             :         /*
     497             :          * The timer wheel has to guarantee that a timer does not fire
     498             :          * early. Early expiry can happen due to:
     499             :          * - Timer is armed at the edge of a tick
     500             :          * - Truncation of the expiry time in the outer wheel levels
     501             :          *
     502             :          * Round up with level granularity to prevent this.
     503             :          */
     504        5547 :         expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
     505        5547 :         *bucket_expiry = expires << LVL_SHIFT(lvl);
     506        5547 :         return LVL_OFFS(lvl) + (expires & LVL_MASK);
     507             : }
     508             : 
     509        5547 : static int calc_wheel_index(unsigned long expires, unsigned long clk,
     510             :                             unsigned long *bucket_expiry)
     511             : {
     512        5547 :         unsigned long delta = expires - clk;
     513        5547 :         unsigned int idx;
     514             : 
     515        5547 :         if (delta < LVL_START(1)) {
     516        4833 :                 idx = calc_index(expires, 0, bucket_expiry);
     517         714 :         } else if (delta < LVL_START(2)) {
     518         303 :                 idx = calc_index(expires, 1, bucket_expiry);
     519         411 :         } else if (delta < LVL_START(3)) {
     520          83 :                 idx = calc_index(expires, 2, bucket_expiry);
     521         328 :         } else if (delta < LVL_START(4)) {
     522         320 :                 idx = calc_index(expires, 3, bucket_expiry);
     523           8 :         } else if (delta < LVL_START(5)) {
     524           3 :                 idx = calc_index(expires, 4, bucket_expiry);
     525           5 :         } else if (delta < LVL_START(6)) {
     526           4 :                 idx = calc_index(expires, 5, bucket_expiry);
     527           1 :         } else if (delta < LVL_START(7)) {
     528           1 :                 idx = calc_index(expires, 6, bucket_expiry);
     529           0 :         } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) {
     530           0 :                 idx = calc_index(expires, 7, bucket_expiry);
     531           0 :         } else if ((long) delta < 0) {
     532           0 :                 idx = clk & LVL_MASK;
     533           0 :                 *bucket_expiry = clk;
     534             :         } else {
     535             :                 /*
     536             :                  * Force expire obscene large timeouts to expire at the
     537             :                  * capacity limit of the wheel.
     538             :                  */
     539           0 :                 if (delta >= WHEEL_TIMEOUT_CUTOFF)
     540           0 :                         expires = clk + WHEEL_TIMEOUT_MAX;
     541             : 
     542           0 :                 idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry);
     543             :         }
     544        5547 :         return idx;
     545             : }
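/*
 * Illustrative worked example (not part of timer.c): with base->clk at
 * 1000 jiffies and a timer expiring at 1100, delta = 100 lies between
 * LVL_START(1) = 63 and LVL_START(2) = 504, so level 1 is chosen.
 * calc_index() then rounds up by one level granularity:
 *
 *	expires        = (1100 + LVL_GRAN(1)) >> LVL_SHIFT(1) = 1108 >> 3 = 138
 *	*bucket_expiry = 138 << 3 = 1104
 *	idx            = LVL_OFFS(1) + (138 & LVL_MASK) = 64 + 10 = 74
 *
 * so the timer lands in bucket 74 and cannot fire before jiffy 1104.
 */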
     546             : 
     547             : static void
     548        3660 : trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
     549             : {
     550        3660 :         if (!is_timers_nohz_active())
     551             :                 return;
     552             : 
     553             :         /*
     554             :          * TODO: This wants some optimizing similar to the code below, but we
     555             :          * will do that when we switch from push to pull for deferrable timers.
     556             :          */
     557        3647 :         if (timer->flags & TIMER_DEFERRABLE) {
     558        3660 :                 if (tick_nohz_full_cpu(base->cpu))
     559             :                         wake_up_nohz_cpu(base->cpu);
     560             :                 return;
     561             :         }
     562             : 
     563             :         /*
     564             :          * We might have to IPI the remote CPU if the base is idle and the
     565             :          * timer is not deferrable. If the other CPU is on the way to idle
     566             :          * then it can't set base->is_idle as we hold the base lock:
     567             :          */
     568        3529 :         if (base->is_idle)
     569          10 :                 wake_up_nohz_cpu(base->cpu);
     570             : }
     571             : 
     572             : /*
     573             :  * Enqueue the timer into the hash bucket, mark it pending in
     574             :  * the bitmap, store the index in the timer flags then wake up
     575             :  * the target CPU if needed.
     576             :  */
     577        5544 : static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
     578             :                           unsigned int idx, unsigned long bucket_expiry)
     579             : {
     580             : 
     581        5544 :         hlist_add_head(&timer->entry, base->vectors + idx);
     582        5544 :         __set_bit(idx, base->pending_map);
     583        5544 :         timer_set_idx(timer, idx);
     584             : 
     585        5544 :         trace_timer_start(timer, timer->expires, timer->flags);
     586             : 
     587             :         /*
     588             :          * Check whether this is the new first expiring timer. The
     589             :          * effective expiry time of the timer is required here
     590             :          * (bucket_expiry) instead of timer->expires.
     591             :          */
     592        5544 :         if (time_before(bucket_expiry, base->next_expiry)) {
     593             :                 /*
     594             :                  * Set the next expiry time and kick the CPU so it
     595             :                  * can reevaluate the wheel:
     596             :                  */
     597        3660 :                 base->next_expiry = bucket_expiry;
     598        3660 :                 base->next_expiry_recalc = false;
     599        3660 :                 trigger_dyntick_cpu(base, timer);
     600             :         }
     601        5544 : }
     602             : 
     603        5382 : static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
     604             : {
     605        5382 :         unsigned long bucket_expiry;
     606        5382 :         unsigned int idx;
     607             : 
     608        5382 :         idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry);
     609        5382 :         enqueue_timer(base, timer, idx, bucket_expiry);
     610        5382 : }
     611             : 
     612             : #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
     613             : 
     614             : static const struct debug_obj_descr timer_debug_descr;
     615             : 
     616             : static void *timer_debug_hint(void *addr)
     617             : {
     618             :         return ((struct timer_list *) addr)->function;
     619             : }
     620             : 
     621             : static bool timer_is_static_object(void *addr)
     622             : {
     623             :         struct timer_list *timer = addr;
     624             : 
     625             :         return (timer->entry.pprev == NULL &&
     626             :                 timer->entry.next == TIMER_ENTRY_STATIC);
     627             : }
     628             : 
     629             : /*
     630             :  * fixup_init is called when:
     631             :  * - an active object is initialized
     632             :  */
     633             : static bool timer_fixup_init(void *addr, enum debug_obj_state state)
     634             : {
     635             :         struct timer_list *timer = addr;
     636             : 
     637             :         switch (state) {
     638             :         case ODEBUG_STATE_ACTIVE:
     639             :                 del_timer_sync(timer);
     640             :                 debug_object_init(timer, &timer_debug_descr);
     641             :                 return true;
     642             :         default:
     643             :                 return false;
     644             :         }
     645             : }
     646             : 
     647             : /* Stub timer callback for improperly used timers. */
     648             : static void stub_timer(struct timer_list *unused)
     649             : {
     650             :         WARN_ON(1);
     651             : }
     652             : 
     653             : /*
     654             :  * fixup_activate is called when:
     655             :  * - an active object is activated
     656             :  * - an unknown non-static object is activated
     657             :  */
     658             : static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
     659             : {
     660             :         struct timer_list *timer = addr;
     661             : 
     662             :         switch (state) {
     663             :         case ODEBUG_STATE_NOTAVAILABLE:
     664             :                 timer_setup(timer, stub_timer, 0);
     665             :                 return true;
     666             : 
     667             :         case ODEBUG_STATE_ACTIVE:
     668             :                 WARN_ON(1);
     669             :                 fallthrough;
     670             :         default:
     671             :                 return false;
     672             :         }
     673             : }
     674             : 
     675             : /*
     676             :  * fixup_free is called when:
     677             :  * - an active object is freed
     678             :  */
     679             : static bool timer_fixup_free(void *addr, enum debug_obj_state state)
     680             : {
     681             :         struct timer_list *timer = addr;
     682             : 
     683             :         switch (state) {
     684             :         case ODEBUG_STATE_ACTIVE:
     685             :                 del_timer_sync(timer);
     686             :                 debug_object_free(timer, &timer_debug_descr);
     687             :                 return true;
     688             :         default:
     689             :                 return false;
     690             :         }
     691             : }
     692             : 
     693             : /*
     694             :  * fixup_assert_init is called when:
     695             :  * - an untracked/uninit-ed object is found
     696             :  */
     697             : static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
     698             : {
     699             :         struct timer_list *timer = addr;
     700             : 
     701             :         switch (state) {
     702             :         case ODEBUG_STATE_NOTAVAILABLE:
     703             :                 timer_setup(timer, stub_timer, 0);
     704             :                 return true;
     705             :         default:
     706             :                 return false;
     707             :         }
     708             : }
     709             : 
     710             : static const struct debug_obj_descr timer_debug_descr = {
     711             :         .name                   = "timer_list",
     712             :         .debug_hint             = timer_debug_hint,
     713             :         .is_static_object       = timer_is_static_object,
     714             :         .fixup_init             = timer_fixup_init,
     715             :         .fixup_activate         = timer_fixup_activate,
     716             :         .fixup_free             = timer_fixup_free,
     717             :         .fixup_assert_init      = timer_fixup_assert_init,
     718             : };
     719             : 
     720             : static inline void debug_timer_init(struct timer_list *timer)
     721             : {
     722             :         debug_object_init(timer, &timer_debug_descr);
     723             : }
     724             : 
     725             : static inline void debug_timer_activate(struct timer_list *timer)
     726             : {
     727             :         debug_object_activate(timer, &timer_debug_descr);
     728             : }
     729             : 
     730             : static inline void debug_timer_deactivate(struct timer_list *timer)
     731             : {
     732             :         debug_object_deactivate(timer, &timer_debug_descr);
     733             : }
     734             : 
     735             : static inline void debug_timer_assert_init(struct timer_list *timer)
     736             : {
     737             :         debug_object_assert_init(timer, &timer_debug_descr);
     738             : }
     739             : 
     740             : static void do_init_timer(struct timer_list *timer,
     741             :                           void (*func)(struct timer_list *),
     742             :                           unsigned int flags,
     743             :                           const char *name, struct lock_class_key *key);
     744             : 
     745             : void init_timer_on_stack_key(struct timer_list *timer,
     746             :                              void (*func)(struct timer_list *),
     747             :                              unsigned int flags,
     748             :                              const char *name, struct lock_class_key *key)
     749             : {
     750             :         debug_object_init_on_stack(timer, &timer_debug_descr);
     751             :         do_init_timer(timer, func, flags, name, key);
     752             : }
     753             : EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
     754             : 
     755             : void destroy_timer_on_stack(struct timer_list *timer)
     756             : {
     757             :         debug_object_free(timer, &timer_debug_descr);
     758             : }
     759             : EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
     760             : 
     761             : #else
     762        5606 : static inline void debug_timer_init(struct timer_list *timer) { }
     763        5544 : static inline void debug_timer_activate(struct timer_list *timer) { }
     764        5511 : static inline void debug_timer_deactivate(struct timer_list *timer) { }
     765        4964 : static inline void debug_timer_assert_init(struct timer_list *timer) { }
     766             : #endif
     767             : 
     768        5606 : static inline void debug_init(struct timer_list *timer)
     769             : {
     770        5606 :         debug_timer_init(timer);
     771        5606 :         trace_timer_init(timer);
     772             : }
     773             : 
     774        5511 : static inline void debug_deactivate(struct timer_list *timer)
     775             : {
     776        5511 :         debug_timer_deactivate(timer);
     777        5511 :         trace_timer_cancel(timer);
     778             : }
     779             : 
     780        4964 : static inline void debug_assert_init(struct timer_list *timer)
     781             : {
     782        4964 :         debug_timer_assert_init(timer);
     783             : }
     784             : 
     785        5606 : static void do_init_timer(struct timer_list *timer,
     786             :                           void (*func)(struct timer_list *),
     787             :                           unsigned int flags,
     788             :                           const char *name, struct lock_class_key *key)
     789             : {
     790        5606 :         timer->entry.pprev = NULL;
     791        5606 :         timer->function = func;
     792        5606 :         if (WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS))
     793           0 :                 flags &= TIMER_INIT_FLAGS;
     794        5606 :         timer->flags = flags | raw_smp_processor_id();
     795        5606 :         lockdep_init_map(&timer->lockdep_map, name, key, 0);
     796        5606 : }
     797             : 
     798             : /**
     799             :  * init_timer_key - initialize a timer
     800             :  * @timer: the timer to be initialized
     801             :  * @func: timer callback function
     802             :  * @flags: timer flags
     803             :  * @name: name of the timer
     804             :  * @key: lockdep class key of the fake lock used for tracking timer
     805             :  *       sync lock dependencies
     806             :  *
      807             :  * init_timer_key() must be done to a timer prior to calling *any* of the
     808             :  * other timer functions.
     809             :  */
     810        5606 : void init_timer_key(struct timer_list *timer,
     811             :                     void (*func)(struct timer_list *), unsigned int flags,
     812             :                     const char *name, struct lock_class_key *key)
     813             : {
     814        5606 :         debug_init(timer);
     815        5606 :         do_init_timer(timer, func, flags, name, key);
     816        1037 : }
     817             : EXPORT_SYMBOL(init_timer_key);
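/*
 * Illustrative usage sketch (not part of timer.c): drivers normally reach
 * init_timer_key() through the timer_setup() wrapper from <linux/timer.h>.
 * With a hypothetical driver structure "my_dev" embedding a timer_list
 * member "timer":
 *
 *	static void my_timeout(struct timer_list *t)
 *	{
 *		struct my_dev *dev = from_timer(dev, t, timer);
 *		...
 *	}
 *
 *	timer_setup(&dev->timer, my_timeout, 0);
 *	mod_timer(&dev->timer, jiffies + msecs_to_jiffies(100));
 */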
     818             : 
     819        5511 : static inline void detach_timer(struct timer_list *timer, bool clear_pending)
     820             : {
     821        5511 :         struct hlist_node *entry = &timer->entry;
     822             : 
     823        5511 :         debug_deactivate(timer);
     824             : 
     825        5515 :         __hlist_del(entry);
     826        5515 :         if (clear_pending)
     827        5350 :                 entry->pprev = NULL;
     828        5515 :         entry->next = LIST_POISON2;
     829        5515 : }
     830             : 
     831       10055 : static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
     832             :                              bool clear_pending)
     833             : {
     834       10055 :         unsigned idx = timer_get_idx(timer);
     835             : 
     836       10055 :         if (!timer_pending(timer))
     837             :                 return 0;
     838             : 
     839        3260 :         if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) {
     840        1613 :                 __clear_bit(idx, base->pending_map);
     841        1613 :                 base->next_expiry_recalc = true;
     842             :         }
     843             : 
     844        1630 :         detach_timer(timer, clear_pending);
     845        1630 :         return 1;
     846             : }
     847             : 
     848       15773 : static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
     849             : {
     850       15773 :         struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
     851             : 
     852             :         /*
     853             :          * If the timer is deferrable and NO_HZ_COMMON is set then we need
     854             :          * to use the deferrable base.
     855             :          */
     856       15773 :         if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
     857         345 :                 base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
     858       15773 :         return base;
     859             : }
     860             : 
     861          20 : static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
     862             : {
     863          40 :         struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
     864             : 
     865             :         /*
     866             :          * If the timer is deferrable and NO_HZ_COMMON is set then we need
     867             :          * to use the deferrable base.
     868             :          */
     869          20 :         if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
     870           1 :                 base = this_cpu_ptr(&timer_bases[BASE_DEF]);
     871          20 :         return base;
     872             : }
     873             : 
     874       10249 : static inline struct timer_base *get_timer_base(u32 tflags)
     875             : {
     876       10249 :         return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
     877             : }
     878             : 
     879             : static inline struct timer_base *
     880        5350 : get_target_base(struct timer_base *base, unsigned tflags)
     881             : {
     882             : #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
     883        5350 :         if (static_branch_likely(&timers_migration_enabled) &&
     884        5334 :             !(tflags & TIMER_PINNED))
     885        5378 :                 return get_timer_cpu_base(tflags, get_nohz_timer_target());
     886             : #endif
     887          40 :         return get_timer_this_cpu_base(tflags);
     888             : }
     889             : 
     890        5860 : static inline void forward_timer_base(struct timer_base *base)
     891             : {
     892        5860 :         unsigned long jnow = READ_ONCE(jiffies);
     893             : 
     894             :         /*
     895             :          * No need to forward if we are close enough below jiffies.
     896             :          * Also while executing timers, base->clk is 1 offset ahead
      897             :  * of jiffies to avoid endless requeuing to current jiffies.
     898             :          */
     899        5860 :         if ((long)(jnow - base->clk) < 1)
     900             :                 return;
     901             : 
     902             :         /*
     903             :          * If the next expiry value is > jiffies, then we fast forward to
     904             :          * jiffies otherwise we forward to the next expiry value.
     905             :          */
     906        1305 :         if (time_after(base->next_expiry, jnow)) {
     907        1123 :                 base->clk = jnow;
     908             :         } else {
     909         364 :                 if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
     910             :                         return;
     911         182 :                 base->clk = base->next_expiry;
     912             :         }
     913             : }
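/*
 * Illustrative worked example (not part of timer.c): if base->clk is 1000
 * while jiffies has advanced to 1010, the base is forwarded.  With
 * base->next_expiry == 1020 (still in the future) clk jumps straight to
 * 1010; with base->next_expiry == 1005 clk is only forwarded to 1005, so
 * the pending expiry is not skipped over.
 */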
     914             : 
     915             : 
     916             : /*
     917             :  * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
     918             :  * that all timers which are tied to this base are locked, and the base itself
     919             :  * is locked too.
     920             :  *
     921             :  * So __run_timers/migrate_timers can safely modify all timers which could
     922             :  * be found in the base->vectors array.
     923             :  *
     924             :  * When a timer is migrating then the TIMER_MIGRATING flag is set and we need
     925             :  * to wait until the migration is done.
     926             :  */
     927       10249 : static struct timer_base *lock_timer_base(struct timer_list *timer,
     928             :                                           unsigned long *flags)
     929             :         __acquires(timer->base->lock)
     930             : {
     931       10249 :         for (;;) {
     932       10249 :                 struct timer_base *base;
     933       10249 :                 u32 tf;
     934             : 
     935             :                 /*
     936             :                  * We need to use READ_ONCE() here, otherwise the compiler
     937             :                  * might re-read @tf between the check for TIMER_MIGRATING
     938             :                  * and spin_lock().
     939             :                  */
     940       10249 :                 tf = READ_ONCE(timer->flags);
     941             : 
     942       10249 :                 if (!(tf & TIMER_MIGRATING)) {
     943       10249 :                         base = get_timer_base(tf);
     944       10249 :                         raw_spin_lock_irqsave(&base->lock, *flags);
     945       10249 :                         if (timer->flags == tf)
     946       10249 :                                 return base;
     947           0 :                         raw_spin_unlock_irqrestore(&base->lock, *flags);
     948             :                 }
     949           0 :                 cpu_relax();
     950             :         }
     951             : }
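/*
 * Illustrative usage sketch (not part of timer.c): callers such as
 * del_timer() pair lock_timer_base() with an irq-restoring unlock of the
 * returned base:
 *
 *	unsigned long flags;
 *	struct timer_base *base;
 *
 *	base = lock_timer_base(timer, &flags);
 *	... operate on the timer while it cannot change its base ...
 *	raw_spin_unlock_irqrestore(&base->lock, flags);
 */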
     952             : 
     953             : #define MOD_TIMER_PENDING_ONLY          0x01
     954             : #define MOD_TIMER_REDUCE                0x02
     955             : #define MOD_TIMER_NOTPENDING            0x04
     956             : 
     957             : static inline int
     958        5534 : __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options)
     959             : {
     960        5534 :         unsigned long clk = 0, flags, bucket_expiry;
     961        5534 :         struct timer_base *base, *new_base;
     962        5534 :         unsigned int idx = UINT_MAX;
     963        5534 :         int ret = 0;
     964             : 
     965        5534 :         BUG_ON(!timer->function);
     966             : 
     967             :         /*
     968             :          * This is a common optimization triggered by the networking code - if
     969             :          * the timer is re-modified to have the same timeout or ends up in the
     970             :          * same array bucket then just return:
     971             :          */
     972        5534 :         if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) {
     973             :                 /*
     974             :                  * The downside of this optimization is that it can result in
     975             :                  * larger granularity than you would get from adding a new
     976             :                  * timer with this expiry.
     977             :                  */
     978         349 :                 long diff = timer->expires - expires;
     979             : 
     980         349 :                 if (!diff)
     981             :                         return 1;
     982         165 :                 if (options & MOD_TIMER_REDUCE && diff <= 0)
     983             :                         return 1;
     984             : 
     985             :                 /*
     986             :                  * We lock timer base and calculate the bucket index right
     987             :                  * here. If the timer ends up in the same bucket, then we
     988             :                  * just update the expiry time and avoid the whole
     989             :                  * dequeue/enqueue dance.
     990             :                  */
     991         165 :                 base = lock_timer_base(timer, &flags);
     992         165 :                 forward_timer_base(base);
     993             : 
     994         165 :                 if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) &&
     995           0 :                     time_before_eq(timer->expires, expires)) {
     996           0 :                         ret = 1;
     997           0 :                         goto out_unlock;
     998             :                 }
     999             : 
    1000         165 :                 clk = base->clk;
    1001         165 :                 idx = calc_wheel_index(expires, clk, &bucket_expiry);
    1002             : 
    1003             :                 /*
    1004             :                  * Retrieve and compare the array index of the pending
     1005             :                  * timer. If it matches, set the expiry to the new value so a
    1006             :                  * subsequent call will exit in the expires check above.
    1007             :                  */
    1008         165 :                 if (idx == timer_get_idx(timer)) {
    1009           0 :                         if (!(options & MOD_TIMER_REDUCE))
    1010           0 :                                 timer->expires = expires;
    1011           0 :                         else if (time_after(timer->expires, expires))
    1012           0 :                                 timer->expires = expires;
    1013           0 :                         ret = 1;
    1014           0 :                         goto out_unlock;
    1015             :                 }
    1016             :         } else {
    1017        5185 :                 base = lock_timer_base(timer, &flags);
    1018        5185 :                 forward_timer_base(base);
    1019             :         }
    1020             : 
    1021        5350 :         ret = detach_if_pending(timer, base, false);
    1022        5350 :         if (!ret && (options & MOD_TIMER_PENDING_ONLY))
    1023           0 :                 goto out_unlock;
    1024             : 
    1025        5350 :         new_base = get_target_base(base, timer->flags);
    1026             : 
    1027        5350 :         if (base != new_base) {
    1028             :                 /*
    1029             :                  * We are trying to schedule the timer on the new base.
     1030             :                  * However, we can't change the timer's base while it is running,
     1031             :                  * otherwise del_timer_sync() can't detect that the timer's
     1032             :                  * handler has not finished yet. This also guarantees that the
    1033             :                  * timer is serialized wrt itself.
    1034             :                  */
    1035         318 :                 if (likely(base->running_timer != timer)) {
    1036             :                         /* See the comment in lock_timer_base() */
    1037         316 :                         timer->flags |= TIMER_MIGRATING;
    1038             : 
    1039         316 :                         raw_spin_unlock(&base->lock);
    1040         316 :                         base = new_base;
    1041         316 :                         raw_spin_lock(&base->lock);
    1042         316 :                         WRITE_ONCE(timer->flags,
    1043             :                                    (timer->flags & ~TIMER_BASEMASK) | base->cpu);
    1044         316 :                         forward_timer_base(base);
    1045             :                 }
    1046             :         }
    1047             : 
    1048        5350 :         debug_timer_activate(timer);
    1049             : 
    1050        5350 :         timer->expires = expires;
    1051             :         /*
    1052             :          * If 'idx' was calculated above and the base time did not advance
    1053             :          * between calculating 'idx' and possibly switching the base, only
    1054             :          * enqueue_timer() is required. Otherwise we need to (re)calculate
    1055             :          * the wheel index via internal_add_timer().
    1056             :          */
    1057        5350 :         if (idx != UINT_MAX && clk == base->clk)
    1058         162 :                 enqueue_timer(base, timer, idx, bucket_expiry);
    1059             :         else
    1060        5188 :                 internal_add_timer(base, timer);
    1061             : 
    1062        5350 : out_unlock:
    1063        5350 :         raw_spin_unlock_irqrestore(&base->lock, flags);
    1064             : 
    1065        5350 :         return ret;
    1066             : }
    1067             : 
    1068             : /**
    1069             :  * mod_timer_pending - modify a pending timer's timeout
    1070             :  * @timer: the pending timer to be modified
    1071             :  * @expires: new timeout in jiffies
    1072             :  *
    1073             :  * mod_timer_pending() is the same for pending timers as mod_timer(),
    1074             :  * but will not re-activate and modify already deleted timers.
    1075             :  *
    1076             :  * It is useful for unserialized use of timers.
    1077             :  */
    1078           0 : int mod_timer_pending(struct timer_list *timer, unsigned long expires)
    1079             : {
    1080           0 :         return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY);
    1081             : }
    1082             : EXPORT_SYMBOL(mod_timer_pending);
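/*
 * Editor's illustrative sketch, not part of timer.c: a hypothetical driver
 * context reused by the usage examples below.  All identifiers here
 * (struct mydev, mydev_timeout_fn, mydev_saw_activity, ...) are invented
 * for illustration only.
 */
#include <linux/timer.h>
#include <linux/jiffies.h>

struct mydev {
	struct timer_list inactivity;	/* armed while the device is idle */
	bool shutting_down;		/* set by the (hypothetical) teardown path */
};

static void mydev_timeout_fn(struct timer_list *t)
{
	struct mydev *dev = from_timer(dev, t, inactivity);

	if (READ_ONCE(dev->shutting_down))
		return;
	/* ... power the device down, or re-arm as needed ... */
}

/*
 * mod_timer_pending() pushes the timeout back only while the timer is still
 * pending, so a teardown path that has already deleted the timer is not
 * raced against and the timer is not re-activated behind its back.
 */
static void mydev_saw_activity(struct mydev *dev)
{
	mod_timer_pending(&dev->inactivity, jiffies + 5 * HZ);
}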
    1083             : 
    1084             : /**
    1085             :  * mod_timer - modify a timer's timeout
    1086             :  * @timer: the timer to be modified
    1087             :  * @expires: new timeout in jiffies
    1088             :  *
     1089             :  * mod_timer() is a more efficient way to update the ->expires field of an
     1090             :  * active timer (if the timer is inactive it will be activated).
    1091             :  *
    1092             :  * mod_timer(timer, expires) is equivalent to:
    1093             :  *
    1094             :  *     del_timer(timer); timer->expires = expires; add_timer(timer);
    1095             :  *
    1096             :  * Note that if there are multiple unserialized concurrent users of the
    1097             :  * same timer, then mod_timer() is the only safe way to modify the timeout,
     1098             :  * since add_timer() cannot modify an already pending timer.
    1099             :  *
    1100             :  * The function returns whether it has modified a pending timer or not.
     1101             :  * (i.e. mod_timer() of an inactive timer returns 0, mod_timer() of an
    1102             :  * active timer returns 1.)
    1103             :  */
    1104         411 : int mod_timer(struct timer_list *timer, unsigned long expires)
    1105             : {
    1106         411 :         return __mod_timer(timer, expires, 0);
    1107             : }
    1108             : EXPORT_SYMBOL(mod_timer);
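/*
 * Editor's sketch (reusing the hypothetical struct mydev above): arm or
 * re-arm the inactivity timeout in a single call.  mod_timer() activates an
 * inactive timer and merely updates ->expires of a pending one, so callers
 * do not need to distinguish the two cases.
 */
static void mydev_kick_inactivity(struct mydev *dev)
{
	/* Return value 1 means a pending timer was modified, 0 means it was inactive. */
	mod_timer(&dev->inactivity, jiffies + 5 * HZ);
}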
    1109             : 
    1110             : /**
    1111             :  * timer_reduce - Modify a timer's timeout if it would reduce the timeout
    1112             :  * @timer:      The timer to be modified
    1113             :  * @expires:    New timeout in jiffies
    1114             :  *
    1115             :  * timer_reduce() is very similar to mod_timer(), except that it will only
    1116             :  * modify a running timer if that would reduce the expiration time (it will
     1117             :  * modify a pending timer if that would reduce the expiration time (it will
     1118             :  * start a timer that isn't pending).
    1119           1 : int timer_reduce(struct timer_list *timer, unsigned long expires)
    1120             : {
    1121           1 :         return __mod_timer(timer, expires, MOD_TIMER_REDUCE);
    1122             : }
    1123             : EXPORT_SYMBOL(timer_reduce);
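/*
 * Editor's sketch (hypothetical struct mydev again): several unserialized
 * paths may request a wakeup; because timer_reduce() only ever pulls the
 * expiry closer, the earliest requested deadline wins without the callers
 * having to compare deadlines themselves.
 */
static void mydev_request_wakeup(struct mydev *dev, unsigned long delay)
{
	timer_reduce(&dev->inactivity, jiffies + delay);
}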
    1124             : 
    1125             : /**
    1126             :  * add_timer - start a timer
    1127             :  * @timer: the timer to be added
    1128             :  *
    1129             :  * The kernel will do a ->function(@timer) callback from the
    1130             :  * timer interrupt at the ->expires point in the future. The
    1131             :  * current time is 'jiffies'.
    1132             :  *
     1133             :  * The timer's ->expires and ->function fields must be set prior to calling this
    1134             :  * function.
    1135             :  *
    1136             :  * Timers with an ->expires field in the past will be executed in the next
    1137             :  * timer tick.
    1138             :  */
    1139         553 : void add_timer(struct timer_list *timer)
    1140             : {
    1141         553 :         BUG_ON(timer_pending(timer));
    1142         553 :         __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
    1143         553 : }
    1144             : EXPORT_SYMBOL(add_timer);
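/*
 * Editor's sketch: add_timer() is the "arm a not-yet-pending timer"
 * primitive; ->function is set via timer_setup() and ->expires by hand
 * before the call.  The probe function and the one-minute initial timeout
 * are invented for the example.
 */
static int mydev_probe(struct mydev *dev)
{
	timer_setup(&dev->inactivity, mydev_timeout_fn, 0);
	dev->inactivity.expires = jiffies + 60 * HZ;
	add_timer(&dev->inactivity);
	return 0;
}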
    1145             : 
    1146             : /**
    1147             :  * add_timer_on - start a timer on a particular CPU
    1148             :  * @timer: the timer to be added
    1149             :  * @cpu: the CPU to start it on
    1150             :  *
    1151             :  * This is not very scalable on SMP. Double adds are not possible.
    1152             :  */
    1153         194 : void add_timer_on(struct timer_list *timer, int cpu)
    1154             : {
    1155         194 :         struct timer_base *new_base, *base;
    1156         194 :         unsigned long flags;
    1157             : 
    1158         194 :         BUG_ON(timer_pending(timer) || !timer->function);
    1159             : 
    1160         194 :         new_base = get_timer_cpu_base(timer->flags, cpu);
    1161             : 
    1162             :         /*
    1163             :          * If @timer was on a different CPU, it should be migrated with the
    1164             :          * old base locked to prevent other operations proceeding with the
    1165             :          * wrong base locked.  See lock_timer_base().
    1166             :          */
    1167         194 :         base = lock_timer_base(timer, &flags);
    1168         194 :         if (base != new_base) {
    1169          72 :                 timer->flags |= TIMER_MIGRATING;
    1170             : 
    1171          72 :                 raw_spin_unlock(&base->lock);
    1172          72 :                 base = new_base;
    1173          72 :                 raw_spin_lock(&base->lock);
    1174          72 :                 WRITE_ONCE(timer->flags,
    1175             :                            (timer->flags & ~TIMER_BASEMASK) | cpu);
    1176             :         }
    1177         194 :         forward_timer_base(base);
    1178             : 
    1179         194 :         debug_timer_activate(timer);
    1180         194 :         internal_add_timer(base, timer);
    1181         194 :         raw_spin_unlock_irqrestore(&base->lock, flags);
    1182         194 : }
    1183             : EXPORT_SYMBOL_GPL(add_timer_on);
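/*
 * Editor's sketch: pin a housekeeping timer to a particular CPU.  Note the
 * restriction from the comment above: the timer must not already be pending
 * ("double adds are not possible").  The helper name is hypothetical.
 */
static void mydev_arm_on_cpu(struct timer_list *t, int cpu)
{
	t->expires = jiffies + HZ;
	add_timer_on(t, cpu);
}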
    1184             : 
    1185             : /**
    1186             :  * del_timer - deactivate a timer.
    1187             :  * @timer: the timer to be deactivated
    1188             :  *
    1189             :  * del_timer() deactivates a timer - this works on both active and inactive
    1190             :  * timers.
    1191             :  *
    1192             :  * The function returns whether it has deactivated a pending timer or not.
     1193             :  * (i.e. del_timer() of an inactive timer returns 0, del_timer() of an
    1194             :  * active timer returns 1.)
    1195             :  */
    1196         272 : int del_timer(struct timer_list *timer)
    1197             : {
    1198         272 :         struct timer_base *base;
    1199         272 :         unsigned long flags;
    1200         272 :         int ret = 0;
    1201             : 
    1202         272 :         debug_assert_init(timer);
    1203             : 
    1204         272 :         if (timer_pending(timer)) {
    1205          13 :                 base = lock_timer_base(timer, &flags);
    1206          13 :                 ret = detach_if_pending(timer, base, true);
    1207          13 :                 raw_spin_unlock_irqrestore(&base->lock, flags);
    1208             :         }
    1209             : 
    1210         272 :         return ret;
    1211             : }
    1212             : EXPORT_SYMBOL(del_timer);
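/*
 * Editor's sketch: del_timer() only dequeues a pending timer; unlike
 * del_timer_sync() below it does not wait for a callback that is already
 * running on another CPU, so it is only sufficient when the caller can
 * tolerate one final expiry.
 */
static bool mydev_cancel_fast(struct mydev *dev)
{
	/* true if a pending timer was actually dequeued */
	return del_timer(&dev->inactivity);
}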
    1213             : 
    1214             : /**
    1215             :  * try_to_del_timer_sync - Try to deactivate a timer
    1216             :  * @timer: timer to delete
    1217             :  *
    1218             :  * This function tries to deactivate a timer. Upon successful (ret >= 0)
    1219             :  * exit the timer is not queued and the handler is not running on any CPU.
    1220             :  */
    1221        4692 : int try_to_del_timer_sync(struct timer_list *timer)
    1222             : {
    1223        4692 :         struct timer_base *base;
    1224        4692 :         unsigned long flags;
    1225        4692 :         int ret = -1;
    1226             : 
    1227        4692 :         debug_assert_init(timer);
    1228             : 
    1229        4692 :         base = lock_timer_base(timer, &flags);
    1230             : 
    1231        4692 :         if (base->running_timer != timer)
    1232        4692 :                 ret = detach_if_pending(timer, base, true);
    1233             : 
    1234        4692 :         raw_spin_unlock_irqrestore(&base->lock, flags);
    1235             : 
    1236        4692 :         return ret;
    1237             : }
    1238             : EXPORT_SYMBOL(try_to_del_timer_sync);
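/*
 * Editor's sketch: try_to_del_timer_sync() returns -1 while the callback is
 * running on another CPU.  Open-coding the retry loop (which is what
 * del_timer_sync() does internally) is mainly useful when a lock must be
 * dropped between attempts; the lock here is a hypothetical example.
 */
#include <linux/spinlock.h>

static void mydev_cancel_under_lock(struct mydev *dev, spinlock_t *lock)
{
	while (try_to_del_timer_sync(&dev->inactivity) < 0) {
		spin_unlock(lock);
		cpu_relax();
		spin_lock(lock);
	}
}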
    1239             : 
    1240           0 : bool timer_curr_running(struct timer_list *timer)
    1241             : {
    1242           0 :         int i;
    1243             : 
    1244           0 :         for (i = 0; i < NR_BASES; i++) {
    1245           0 :                 struct timer_base *base = this_cpu_ptr(&timer_bases[i]);
    1246             : 
    1247           0 :                 if (base->running_timer == timer)
    1248             :                         return true;
    1249             :         }
    1250             : 
    1251             :         return false;
    1252             : }
    1253             : 
    1254             : #ifdef CONFIG_PREEMPT_RT
    1255             : static __init void timer_base_init_expiry_lock(struct timer_base *base)
    1256             : {
    1257             :         spin_lock_init(&base->expiry_lock);
    1258             : }
    1259             : 
    1260             : static inline void timer_base_lock_expiry(struct timer_base *base)
    1261             : {
    1262             :         spin_lock(&base->expiry_lock);
    1263             : }
    1264             : 
    1265             : static inline void timer_base_unlock_expiry(struct timer_base *base)
    1266             : {
    1267             :         spin_unlock(&base->expiry_lock);
    1268             : }
    1269             : 
    1270             : /*
    1271             :  * The counterpart to del_timer_wait_running().
    1272             :  *
    1273             :  * If there is a waiter for base->expiry_lock, then it was waiting for the
     1274             :  * timer callback to finish. Drop expiry_lock and reacquire it. That allows
    1275             :  * the waiter to acquire the lock and make progress.
    1276             :  */
    1277             : static void timer_sync_wait_running(struct timer_base *base)
    1278             : {
    1279             :         if (atomic_read(&base->timer_waiters)) {
    1280             :                 spin_unlock(&base->expiry_lock);
    1281             :                 spin_lock(&base->expiry_lock);
    1282             :         }
    1283             : }
    1284             : 
    1285             : /*
    1286             :  * This function is called on PREEMPT_RT kernels when the fast path
    1287             :  * deletion of a timer failed because the timer callback function was
    1288             :  * running.
    1289             :  *
     1290             :  * This prevents priority inversion if the softirq thread on a remote CPU
     1291             :  * got preempted, and it prevents a livelock when the task which tries to
     1292             :  * delete a timer has preempted the softirq thread running the timer callback
    1293             :  * function.
    1294             :  */
    1295             : static void del_timer_wait_running(struct timer_list *timer)
    1296             : {
    1297             :         u32 tf;
    1298             : 
    1299             :         tf = READ_ONCE(timer->flags);
    1300             :         if (!(tf & (TIMER_MIGRATING | TIMER_IRQSAFE))) {
    1301             :                 struct timer_base *base = get_timer_base(tf);
    1302             : 
    1303             :                 /*
    1304             :                  * Mark the base as contended and grab the expiry lock,
    1305             :                  * which is held by the softirq across the timer
    1306             :                  * callback. Drop the lock immediately so the softirq can
    1307             :                  * expire the next timer. In theory the timer could already
    1308             :                  * be running again, but that's more than unlikely and just
    1309             :                  * causes another wait loop.
    1310             :                  */
    1311             :                 atomic_inc(&base->timer_waiters);
    1312             :                 spin_lock_bh(&base->expiry_lock);
    1313             :                 atomic_dec(&base->timer_waiters);
    1314             :                 spin_unlock_bh(&base->expiry_lock);
    1315             :         }
    1316             : }
    1317             : #else
    1318           8 : static inline void timer_base_init_expiry_lock(struct timer_base *base) { }
    1319        4508 : static inline void timer_base_lock_expiry(struct timer_base *base) { }
    1320        4517 : static inline void timer_base_unlock_expiry(struct timer_base *base) { }
    1321        3286 : static inline void timer_sync_wait_running(struct timer_base *base) { }
    1322           0 : static inline void del_timer_wait_running(struct timer_list *timer) { }
    1323             : #endif
    1324             : 
    1325             : #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
    1326             : /**
    1327             :  * del_timer_sync - deactivate a timer and wait for the handler to finish.
    1328             :  * @timer: the timer to be deactivated
    1329             :  *
    1330             :  * This function only differs from del_timer() on SMP: besides deactivating
    1331             :  * the timer it also makes sure the handler has finished executing on other
    1332             :  * CPUs.
    1333             :  *
    1334             :  * Synchronization rules: Callers must prevent restarting of the timer,
    1335             :  * otherwise this function is meaningless. It must not be called from
    1336             :  * interrupt contexts unless the timer is an irqsafe one. The caller must
    1337             :  * not hold locks which would prevent completion of the timer's
    1338             :  * handler. The timer's handler must not call add_timer_on(). Upon exit the
    1339             :  * timer is not queued and the handler is not running on any CPU.
    1340             :  *
    1341             :  * Note: For !irqsafe timers, you must not hold locks that are held in
     1342             :  *   interrupt context while calling this function, even if the lock has
     1343             :  *   nothing to do with the timer in question.  Here's why::
    1344             :  *
    1345             :  *    CPU0                             CPU1
    1346             :  *    ----                             ----
    1347             :  *                                     <SOFTIRQ>
    1348             :  *                                       call_timer_fn();
    1349             :  *                                       base->running_timer = mytimer;
    1350             :  *    spin_lock_irq(somelock);
    1351             :  *                                     <IRQ>
    1352             :  *                                        spin_lock(somelock);
    1353             :  *    del_timer_sync(mytimer);
    1354             :  *    while (base->running_timer == mytimer);
    1355             :  *
    1356             :  * Now del_timer_sync() will never return and never release somelock.
    1357             :  * The interrupt on the other CPU is waiting to grab somelock but
    1358             :  * it has interrupted the softirq that CPU0 is waiting to finish.
    1359             :  *
    1360             :  * The function returns whether it has deactivated a pending timer or not.
    1361             :  */
    1362        4692 : int del_timer_sync(struct timer_list *timer)
    1363             : {
    1364        4692 :         int ret;
    1365             : 
    1366             : #ifdef CONFIG_LOCKDEP
    1367        4692 :         unsigned long flags;
    1368             : 
    1369             :         /*
    1370             :          * If lockdep gives a backtrace here, please reference
    1371             :          * the synchronization rules above.
    1372             :          */
    1373        9384 :         local_irq_save(flags);
    1374        4692 :         lock_map_acquire(&timer->lockdep_map);
    1375        4692 :         lock_map_release(&timer->lockdep_map);
    1376        4692 :         local_irq_restore(flags);
    1377             : #endif
    1378             :         /*
     1379             :          * Don't use this in hardirq context, because it
    1380             :          * could lead to deadlock.
    1381             :          */
    1382        9384 :         WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE));
    1383             : 
    1384             :         /*
    1385             :          * Must be able to sleep on PREEMPT_RT because of the slowpath in
    1386             :          * del_timer_wait_running().
    1387             :          */
    1388             :         if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE))
    1389        4692 :                 lockdep_assert_preemption_enabled();
    1390             : 
    1391        4692 :         do {
    1392        4692 :                 ret = try_to_del_timer_sync(timer);
    1393             : 
    1394        4692 :                 if (unlikely(ret < 0)) {
    1395           0 :                         del_timer_wait_running(timer);
    1396           0 :                         cpu_relax();
    1397             :                 }
    1398        4692 :         } while (ret < 0);
    1399             : 
    1400        4692 :         return ret;
    1401             : }
    1402             : EXPORT_SYMBOL(del_timer_sync);
    1403             : #endif
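/*
 * Editor's sketch: the canonical teardown order implied by the
 * synchronization rules documented above.  First make sure nothing re-arms
 * the timer (here the hypothetical dev->shutting_down flag, checked by the
 * callback and the activity paths), then del_timer_sync() guarantees the
 * callback is neither queued nor running anywhere.
 */
static void mydev_remove(struct mydev *dev)
{
	WRITE_ONCE(dev->shutting_down, true);	/* stop all re-arm paths */
	del_timer_sync(&dev->inactivity);
	/* From here on the callback cannot run; dev may be freed. */
}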
    1404             : 
    1405        3887 : static void call_timer_fn(struct timer_list *timer,
    1406             :                           void (*fn)(struct timer_list *),
    1407             :                           unsigned long baseclk)
    1408             : {
    1409        3887 :         int count = preempt_count();
    1410             : 
    1411             : #ifdef CONFIG_LOCKDEP
    1412             :         /*
    1413             :          * It is permissible to free the timer from inside the
    1414             :          * function that is called from it, this we need to take into
    1415             :          * account for lockdep too. To avoid bogus "held lock freed"
    1416             :          * warnings as well as problems when looking into
    1417             :          * timer->lockdep_map, make a copy and use that here.
    1418             :          */
    1419        3887 :         struct lockdep_map lockdep_map;
    1420             : 
    1421        3887 :         lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
    1422             : #endif
    1423             :         /*
    1424             :          * Couple the lock chain with the lock chain at
    1425             :          * del_timer_sync() by acquiring the lock_map around the fn()
    1426             :          * call here and in del_timer_sync().
    1427             :          */
    1428        3887 :         lock_map_acquire(&lockdep_map);
    1429             : 
    1430        3887 :         trace_timer_expire_entry(timer, baseclk);
    1431        3887 :         fn(timer);
    1432        3890 :         trace_timer_expire_exit(timer);
    1433             : 
    1434        3890 :         lock_map_release(&lockdep_map);
    1435             : 
    1436        3890 :         if (count != preempt_count()) {
    1437           0 :                 WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n",
    1438             :                           fn, count, preempt_count());
    1439             :                 /*
    1440             :                  * Restore the preempt count. That gives us a decent
    1441             :                  * chance to survive and extract information. If the
    1442             :                  * callback kept a lock held, bad luck, but not worse
    1443             :                  * than the BUG() we had.
    1444             :                  */
    1445           0 :                 preempt_count_set(count);
    1446             :         }
    1447        3890 : }
    1448             : 
    1449        3796 : static void expire_timers(struct timer_base *base, struct hlist_head *head)
    1450             : {
    1451             :         /*
    1452             :          * This value is required only for tracing. base->clk was
    1453             :          * incremented directly before expire_timers was called. But expiry
    1454             :          * is related to the old base->clk value.
    1455             :          */
    1456        3796 :         unsigned long baseclk = base->clk - 1;
    1457             : 
    1458        7686 :         while (!hlist_empty(head)) {
    1459        3884 :                 struct timer_list *timer;
    1460        3884 :                 void (*fn)(struct timer_list *);
    1461             : 
    1462        3884 :                 timer = hlist_entry(head->first, struct timer_list, entry);
    1463             : 
    1464        3884 :                 base->running_timer = timer;
    1465        3884 :                 detach_timer(timer, true);
    1466             : 
    1467        3890 :                 fn = timer->function;
    1468             : 
    1469        3890 :                 if (timer->flags & TIMER_IRQSAFE) {
    1470         604 :                         raw_spin_unlock(&base->lock);
    1471         604 :                         call_timer_fn(timer, fn, baseclk);
    1472         602 :                         base->running_timer = NULL;
    1473         602 :                         raw_spin_lock(&base->lock);
    1474             :                 } else {
    1475        3286 :                         raw_spin_unlock_irq(&base->lock);
    1476        3285 :                         call_timer_fn(timer, fn, baseclk);
    1477        3286 :                         base->running_timer = NULL;
    1478        3286 :                         timer_sync_wait_running(base);
    1479        3286 :                         raw_spin_lock_irq(&base->lock);
    1480             :                 }
    1481             :         }
    1482        3802 : }
    1483             : 
    1484        4611 : static int collect_expired_timers(struct timer_base *base,
    1485             :                                   struct hlist_head *heads)
    1486             : {
    1487        4611 :         unsigned long clk = base->clk = base->next_expiry;
    1488        4611 :         struct hlist_head *vec;
    1489        4611 :         int i, levels = 0;
    1490        4611 :         unsigned int idx;
    1491             : 
    1492        5588 :         for (i = 0; i < LVL_DEPTH; i++) {
    1493        5588 :                 idx = (clk & LVL_MASK) + i * LVL_SIZE;
    1494             : 
    1495        5588 :                 if (__test_and_clear_bit(idx, base->pending_map)) {
    1496        3797 :                         vec = base->vectors + idx;
    1497        3797 :                         hlist_move_list(vec, heads++);
    1498        3797 :                         levels++;
    1499             :                 }
    1500             :                 /* Is it time to look at the next level? */
    1501        5593 :                 if (clk & LVL_CLK_MASK)
    1502             :                         break;
    1503             :                 /* Shift clock for the next level granularity */
    1504         977 :                 clk >>= LVL_CLK_SHIFT;
    1505             :         }
    1506        4616 :         return levels;
    1507             : }
    1508             : 
    1509             : /*
    1510             :  * Find the next pending bucket of a level. Search from level start (@offset)
    1511             :  * + @clk upwards and if nothing there, search from start of the level
    1512             :  * (@offset) up to @offset + clk.
    1513             :  */
    1514       26633 : static int next_pending_bucket(struct timer_base *base, unsigned offset,
    1515             :                                unsigned clk)
    1516             : {
    1517       26633 :         unsigned pos, start = offset + clk;
    1518       26633 :         unsigned end = offset + LVL_SIZE;
    1519             : 
    1520       26633 :         pos = find_next_bit(base->pending_map, end, start);
    1521       26646 :         if (pos < end)
    1522        5650 :                 return pos - start;
    1523             : 
    1524       20996 :         pos = find_next_bit(base->pending_map, start, offset);
    1525       21004 :         return pos < start ? pos + LVL_SIZE - start : -1;
    1526             : }
    1527             : 
    1528             : /*
    1529             :  * Search the first expiring timer in the various clock levels. Caller must
    1530             :  * hold base->lock.
    1531             :  */
    1532        4718 : static unsigned long __next_timer_interrupt(struct timer_base *base)
    1533             : {
    1534        4718 :         unsigned long clk, next, adj;
    1535        4718 :         unsigned lvl, offset = 0;
    1536             : 
    1537        4718 :         next = base->clk + NEXT_TIMER_MAX_DELTA;
    1538        4718 :         clk = base->clk;
    1539       29090 :         for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
    1540       26638 :                 int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
    1541       26656 :                 unsigned long lvl_clk = clk & LVL_CLK_MASK;
    1542             : 
    1543       26656 :                 if (pos >= 0) {
    1544        7962 :                         unsigned long tmp = clk + (unsigned long) pos;
    1545             : 
    1546        7962 :                         tmp <<= LVL_SHIFT(lvl);
    1547        7962 :                         if (time_before(tmp, next))
    1548        4586 :                                 next = tmp;
    1549             : 
    1550             :                         /*
    1551             :                          * If the next expiration happens before we reach
    1552             :                          * the next level, no need to check further.
    1553             :                          */
    1554        7962 :                         if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK))
    1555             :                                 break;
    1556             :                 }
    1557             :                 /*
    1558             :                  * Clock for the next level. If the current level clock lower
     1559             :                  * bits are zero, we look at the next level as is. If not, we
    1560             :                  * need to advance it by one because that's going to be the
    1561             :                  * next expiring bucket in that level. base->clk is the next
     1562             :                  * expiring jiffy. So in case of:
    1563             :                  *
    1564             :                  * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
    1565             :                  *  0    0    0    0    0    0
    1566             :                  *
    1567             :                  * we have to look at all levels @index 0. With
    1568             :                  *
    1569             :                  * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
    1570             :                  *  0    0    0    0    0    2
    1571             :                  *
    1572             :                  * LVL0 has the next expiring bucket @index 2. The upper
    1573             :                  * levels have the next expiring bucket @index 1.
    1574             :                  *
    1575             :                  * In case that the propagation wraps the next level the same
    1576             :                  * rules apply:
    1577             :                  *
    1578             :                  * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
    1579             :                  *  0    0    0    0    F    2
    1580             :                  *
    1581             :                  * So after looking at LVL0 we get:
    1582             :                  *
    1583             :                  * LVL5 LVL4 LVL3 LVL2 LVL1
    1584             :                  *  0    0    0    1    0
    1585             :                  *
    1586             :                  * So no propagation from LVL1 to LVL2 because that happened
    1587             :                  * with the add already, but then we need to propagate further
    1588             :                  * from LVL2 to LVL3.
    1589             :                  *
    1590             :                  * So the simple check whether the lower bits of the current
    1591             :                  * level are 0 or not is sufficient for all cases.
    1592             :                  */
    1593       24372 :                 adj = lvl_clk ? 1 : 0;
    1594       24372 :                 clk >>= LVL_CLK_SHIFT;
    1595       24372 :                 clk += adj;
    1596             :         }
    1597             : 
    1598        4736 :         base->next_expiry_recalc = false;
    1599             : 
    1600        4736 :         return next;
    1601             : }
    1602             : 
    1603             : #ifdef CONFIG_NO_HZ_COMMON
    1604             : /*
    1605             :  * Check, if the next hrtimer event is before the next timer wheel
    1606             :  * event:
    1607             :  */
    1608        1221 : static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
    1609             : {
    1610        1221 :         u64 nextevt = hrtimer_get_next_event();
    1611             : 
    1612             :         /*
    1613             :          * If high resolution timers are enabled
    1614             :          * hrtimer_get_next_event() returns KTIME_MAX.
    1615             :          */
    1616        1218 :         if (expires <= nextevt)
    1617             :                 return expires;
    1618             : 
    1619             :         /*
    1620             :          * If the next timer is already expired, return the tick base
    1621             :          * time so the tick is fired immediately.
    1622             :          */
    1623          76 :         if (nextevt <= basem)
    1624             :                 return basem;
    1625             : 
    1626             :         /*
    1627             :          * Round up to the next jiffie. High resolution timers are
    1628             :          * off, so the hrtimers are expired in the tick and we need to
    1629             :          * make sure that this tick really expires the timer to avoid
    1630             :          * a ping pong of the nohz stop code.
    1631             :          *
    1632             :          * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3
    1633             :          */
    1634          75 :         return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC;
    1635             : }
    1636             : 
    1637             : /**
    1638             :  * get_next_timer_interrupt - return the time (clock mono) of the next timer
    1639             :  * @basej:      base time jiffies
    1640             :  * @basem:      base time clock monotonic
    1641             :  *
    1642             :  * Returns the tick aligned clock monotonic time of the next pending
    1643             :  * timer or KTIME_MAX if no timer is pending.
    1644             :  */
    1645        1218 : u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
    1646             : {
    1647        1218 :         struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
    1648        1220 :         u64 expires = KTIME_MAX;
    1649        1220 :         unsigned long nextevt;
    1650        1220 :         bool is_max_delta;
    1651             : 
    1652             :         /*
    1653             :          * Pretend that there is no timer pending if the cpu is offline.
    1654             :          * Possible pending timers will be migrated later to an active cpu.
    1655             :          */
    1656        1220 :         if (cpu_is_offline(smp_processor_id()))
    1657             :                 return expires;
    1658             : 
    1659        1221 :         raw_spin_lock(&base->lock);
    1660        1223 :         if (base->next_expiry_recalc)
    1661         117 :                 base->next_expiry = __next_timer_interrupt(base);
    1662        1223 :         nextevt = base->next_expiry;
    1663        1223 :         is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
    1664             : 
    1665             :         /*
    1666             :          * We have a fresh next event. Check whether we can forward the
    1667             :          * base. We can only do that when @basej is past base->clk
    1668             :          * otherwise we might rewind base->clk.
    1669             :          */
    1670        1223 :         if (time_after(basej, base->clk)) {
    1671         345 :                 if (time_after(nextevt, basej))
    1672         345 :                         base->clk = basej;
    1673           0 :                 else if (time_after(nextevt, base->clk))
    1674           0 :                         base->clk = nextevt;
    1675             :         }
    1676             : 
    1677        1223 :         if (time_before_eq(nextevt, basej)) {
    1678           2 :                 expires = basem;
    1679           2 :                 base->is_idle = false;
    1680             :         } else {
    1681        1221 :                 if (!is_max_delta)
    1682        1217 :                         expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
    1683             :                 /*
    1684             :                  * If we expect to sleep more than a tick, mark the base idle.
     1685             :                  * Also, the tick is stopped so any added timer must forward
     1686             :                  * the base clk itself to keep granularity small. This idle
     1687             :                  * logic is only maintained for the BASE_STD base; deferrable
    1688             :                  * timers may still see large granularity skew (by design).
    1689             :                  */
    1690        1221 :                 if ((expires - basem) > TICK_NSEC)
    1691        1096 :                         base->is_idle = true;
    1692             :         }
    1693        1223 :         raw_spin_unlock(&base->lock);
    1694             : 
    1695        1222 :         return cmp_next_hrtimer_event(basem, expires);
    1696             : }
    1697             : 
    1698             : /**
    1699             :  * timer_clear_idle - Clear the idle state of the timer base
    1700             :  *
    1701             :  * Called with interrupts disabled
    1702             :  */
    1703       17376 : void timer_clear_idle(void)
    1704             : {
    1705       17376 :         struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
    1706             : 
    1707             :         /*
    1708             :          * We do this unlocked. The worst outcome is a remote enqueue sending
    1709             :          * a pointless IPI, but taking the lock would just make the window for
    1710             :          * sending the IPI a few instructions smaller for the cost of taking
    1711             :          * the lock in the exit from idle path.
    1712             :          */
    1713       17410 :         base->is_idle = false;
    1714       17410 : }
    1715             : #endif
    1716             : 
    1717             : /**
    1718             :  * __run_timers - run all expired timers (if any) on this CPU.
    1719             :  * @base: the timer vector to be processed.
    1720             :  */
    1721        8972 : static inline void __run_timers(struct timer_base *base)
    1722             : {
    1723        8972 :         struct hlist_head heads[LVL_DEPTH];
    1724        8972 :         int levels;
    1725             : 
    1726        8972 :         if (time_before(jiffies, base->next_expiry))
    1727        4464 :                 return;
    1728             : 
    1729        4508 :         timer_base_lock_expiry(base);
    1730        4508 :         raw_spin_lock_irq(&base->lock);
    1731             : 
    1732        9123 :         while (time_after_eq(jiffies, base->clk) &&
    1733        5173 :                time_after_eq(jiffies, base->next_expiry)) {
    1734        4614 :                 levels = collect_expired_timers(base, heads);
    1735             :                 /*
    1736             :                  * The only possible reason for not finding any expired
    1737             :                  * timer at this clk is that all matching timers have been
    1738             :                  * dequeued.
    1739             :                  */
    1740        9220 :                 WARN_ON_ONCE(!levels && !base->next_expiry_recalc);
    1741        4610 :                 base->clk++;
    1742        4610 :                 base->next_expiry = __next_timer_interrupt(base);
    1743             : 
    1744        8412 :                 while (levels--)
    1745        3797 :                         expire_timers(base, heads + levels);
    1746             :         }
    1747        4518 :         raw_spin_unlock_irq(&base->lock);
    1748        4517 :         timer_base_unlock_expiry(base);
    1749             : }
    1750             : 
    1751             : /*
    1752             :  * This function runs timers and the timer-tq in bottom half context.
    1753             :  */
    1754        4476 : static __latent_entropy void run_timer_softirq(struct softirq_action *h)
    1755             : {
    1756        4476 :         struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
    1757             : 
    1758        4479 :         __run_timers(base);
    1759        4488 :         if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
    1760        4488 :                 __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
    1761        4492 : }
    1762             : 
    1763             : /*
    1764             :  * Called by the local, per-CPU timer interrupt on SMP.
    1765             :  */
    1766       28314 : static void run_local_timers(void)
    1767             : {
    1768       28314 :         struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
    1769             : 
    1770       28332 :         hrtimer_run_queues();
    1771             :         /* Raise the softirq only if required. */
    1772       28560 :         if (time_before(jiffies, base->next_expiry)) {
    1773       24039 :                 if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
    1774             :                         return;
    1775             :                 /* CPU is awake, so check the deferrable base. */
    1776       24039 :                 base++;
    1777       24039 :                 if (time_before(jiffies, base->next_expiry))
    1778             :                         return;
    1779             :         }
    1780        4624 :         raise_softirq(TIMER_SOFTIRQ);
    1781             : }
    1782             : 
    1783             : /*
    1784             :  * Called from the timer interrupt handler to charge one tick to the current
    1785             :  * process.  user_tick is 1 if the tick is user time, 0 for system.
    1786             :  */
    1787       28034 : void update_process_times(int user_tick)
    1788             : {
    1789       28034 :         struct task_struct *p = current;
    1790             : 
    1791       28034 :         PRANDOM_ADD_NOISE(jiffies, user_tick, p, 0);
    1792             : 
    1793             :         /* Note: this timer irq context must be accounted for as well. */
    1794       28034 :         account_process_tick(p, user_tick);
    1795       28396 :         run_local_timers();
    1796       28468 :         rcu_sched_clock_irq(user_tick);
    1797             : #ifdef CONFIG_IRQ_WORK
    1798       28280 :         if (in_irq())
    1799       28294 :                 irq_work_tick();
    1800             : #endif
    1801       28211 :         scheduler_tick();
    1802       28288 :         if (IS_ENABLED(CONFIG_POSIX_TIMERS))
    1803       28288 :                 run_posix_cpu_timers();
    1804       28528 : }
    1805             : 
    1806             : /*
    1807             :  * Since schedule_timeout()'s timer is defined on the stack, it must store
    1808             :  * the target task on the stack as well.
    1809             :  */
    1810             : struct process_timer {
    1811             :         struct timer_list timer;
    1812             :         struct task_struct *task;
    1813             : };
    1814             : 
    1815        3189 : static void process_timeout(struct timer_list *t)
    1816             : {
    1817        3189 :         struct process_timer *timeout = from_timer(timeout, t, timer);
    1818             : 
    1819        3189 :         wake_up_process(timeout->task);
    1820        3188 : }
    1821             : 
    1822             : /**
    1823             :  * schedule_timeout - sleep until timeout
    1824             :  * @timeout: timeout value in jiffies
    1825             :  *
    1826             :  * Make the current task sleep until @timeout jiffies have elapsed.
    1827             :  * The function behavior depends on the current task state
    1828             :  * (see also set_current_state() description):
    1829             :  *
    1830             :  * %TASK_RUNNING - the scheduler is called, but the task does not sleep
    1831             :  * at all. That happens because sched_submit_work() does nothing for
    1832             :  * tasks in %TASK_RUNNING state.
    1833             :  *
    1834             :  * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
    1835             :  * pass before the routine returns unless the current task is explicitly
    1836             :  * woken up, (e.g. by wake_up_process()).
    1837             :  *
    1838             :  * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
    1839             :  * delivered to the current task or the current task is explicitly woken
    1840             :  * up.
    1841             :  *
    1842             :  * The current task state is guaranteed to be %TASK_RUNNING when this
    1843             :  * routine returns.
    1844             :  *
    1845             :  * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
    1846             :  * the CPU away without a bound on the timeout. In this case the return
    1847             :  * value will be %MAX_SCHEDULE_TIMEOUT.
    1848             :  *
     1849             :  * Returns 0 when the timer has expired, otherwise the remaining time in
    1850             :  * jiffies will be returned. In all cases the return value is guaranteed
    1851             :  * to be non-negative.
    1852             :  */
    1853        4999 : signed long __sched schedule_timeout(signed long timeout)
    1854             : {
    1855        4999 :         struct process_timer timer;
    1856        4999 :         unsigned long expire;
    1857             : 
    1858        4999 :         switch (timeout)
    1859             :         {
    1860         430 :         case MAX_SCHEDULE_TIMEOUT:
    1861             :                 /*
    1862             :                  * These two special cases are useful to be comfortable
    1863             :                  * in the caller. Nothing more. We could take
     1864             :                  * MAX_SCHEDULE_TIMEOUT from one of the negative values,
     1865             :                  * but I'd like to return a valid offset (>=0) to allow
     1866             :                  * the caller to do everything it wants with the retval.
    1867             :                  */
    1868         430 :                 schedule();
    1869         429 :                 goto out;
    1870        4569 :         default:
    1871             :                 /*
     1872             :                  * Another bit of paranoia. Note that the retval will be
     1873             :                  * 0 since no piece of the kernel is supposed to check
     1874             :                  * for a negative retval of schedule_timeout() (since it
     1875             :                  * should never happen anyway). You just have the printk()
     1876             :                  * that will tell you if something has gone wrong and where.
    1877             :                  */
    1878        4569 :                 if (timeout < 0) {
    1879           0 :                         printk(KERN_ERR "schedule_timeout: wrong timeout "
    1880             :                                 "value %lx\n", timeout);
    1881           0 :                         dump_stack();
    1882           0 :                         current->state = TASK_RUNNING;
    1883           0 :                         goto out;
    1884             :                 }
    1885             :         }
    1886             : 
    1887        4569 :         expire = timeout + jiffies;
    1888             : 
    1889        4569 :         timer.task = current;
    1890        4569 :         timer_setup_on_stack(&timer.timer, process_timeout, 0);
    1891        4569 :         __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING);
    1892        4569 :         schedule();
    1893        4566 :         del_singleshot_timer_sync(&timer.timer);
    1894             : 
    1895             :         /* Remove the timer from the object tracker */
    1896        4566 :         destroy_timer_on_stack(&timer.timer);
    1897             : 
    1898        4566 :         timeout = expire - jiffies;
    1899             : 
    1900        4995 :  out:
    1901        4995 :         return timeout < 0 ? 0 : timeout;
    1902             : }
    1903             : EXPORT_SYMBOL(schedule_timeout);
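/*
 * Editor's sketch: the task state must be set before calling
 * schedule_timeout(); with TASK_RUNNING the call does not sleep at all, as
 * documented above.  This two-liner is essentially what the
 * schedule_timeout_uninterruptible() wrapper below provides.
 */
#include <linux/sched.h>

static signed long mydev_nap(signed long timeout)
{
	set_current_state(TASK_UNINTERRUPTIBLE);
	return schedule_timeout(timeout);	/* 0 if expired, else jiffies remaining */
}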
    1904             : 
    1905             : /*
    1906             :  * We can use __set_current_state() here because schedule_timeout() calls
    1907             :  * schedule() unconditionally.
    1908             :  */
    1909           0 : signed long __sched schedule_timeout_interruptible(signed long timeout)
    1910             : {
    1911           0 :         __set_current_state(TASK_INTERRUPTIBLE);
    1912           0 :         return schedule_timeout(timeout);
    1913             : }
    1914             : EXPORT_SYMBOL(schedule_timeout_interruptible);
    1915             : 
    1916           0 : signed long __sched schedule_timeout_killable(signed long timeout)
    1917             : {
    1918           0 :         __set_current_state(TASK_KILLABLE);
    1919           0 :         return schedule_timeout(timeout);
    1920             : }
    1921             : EXPORT_SYMBOL(schedule_timeout_killable);
    1922             : 
    1923           0 : signed long __sched schedule_timeout_uninterruptible(signed long timeout)
    1924             : {
    1925           0 :         __set_current_state(TASK_UNINTERRUPTIBLE);
    1926           0 :         return schedule_timeout(timeout);
    1927             : }
    1928             : EXPORT_SYMBOL(schedule_timeout_uninterruptible);
    1929             : 
    1930             : /*
    1931             :  * Like schedule_timeout_uninterruptible(), except this task will not contribute
    1932             :  * to load average.
    1933             :  */
    1934           0 : signed long __sched schedule_timeout_idle(signed long timeout)
    1935             : {
    1936           0 :         __set_current_state(TASK_IDLE);
    1937           0 :         return schedule_timeout(timeout);
    1938             : }
    1939             : EXPORT_SYMBOL(schedule_timeout_idle);
    1940             : 
    1941             : #ifdef CONFIG_HOTPLUG_CPU
    1942           0 : static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head)
    1943             : {
    1944           0 :         struct timer_list *timer;
    1945           0 :         int cpu = new_base->cpu;
    1946             : 
    1947           0 :         while (!hlist_empty(head)) {
    1948           0 :                 timer = hlist_entry(head->first, struct timer_list, entry);
    1949           0 :                 detach_timer(timer, false);
    1950           0 :                 timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
    1951           0 :                 internal_add_timer(new_base, timer);
    1952             :         }
    1953           0 : }
    1954             : 
    1955           3 : int timers_prepare_cpu(unsigned int cpu)
    1956             : {
    1957           3 :         struct timer_base *base;
    1958           3 :         int b;
    1959             : 
    1960           9 :         for (b = 0; b < NR_BASES; b++) {
    1961           6 :                 base = per_cpu_ptr(&timer_bases[b], cpu);
    1962           6 :                 base->clk = jiffies;
    1963           6 :                 base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
    1964           6 :                 base->is_idle = false;
    1965             :         }
    1966           3 :         return 0;
    1967             : }
    1968             : 
    1969           0 : int timers_dead_cpu(unsigned int cpu)
    1970             : {
    1971           0 :         struct timer_base *old_base;
    1972           0 :         struct timer_base *new_base;
    1973           0 :         int b, i;
    1974             : 
    1975           0 :         BUG_ON(cpu_online(cpu));
    1976             : 
    1977           0 :         for (b = 0; b < NR_BASES; b++) {
    1978           0 :                 old_base = per_cpu_ptr(&timer_bases[b], cpu);
    1979           0 :                 new_base = get_cpu_ptr(&timer_bases[b]);
    1980             :                 /*
    1981             :                  * The caller is globally serialized and nobody else
    1982             :                  * takes two locks at once, so deadlock is not possible.
    1983             :                  */
    1984           0 :                 raw_spin_lock_irq(&new_base->lock);
    1985           0 :                 raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
    1986             : 
    1987             :                 /*
    1988             :                  * The current CPU's base clock might be stale. Update it
    1989             :                  * before moving the timers over.
    1990             :                  */
    1991           0 :                 forward_timer_base(new_base);
    1992             : 
    1993           0 :                 BUG_ON(old_base->running_timer);
    1994             : 
    1995           0 :                 for (i = 0; i < WHEEL_SIZE; i++)
    1996           0 :                         migrate_timer_list(new_base, old_base->vectors + i);
    1997             : 
    1998           0 :                 raw_spin_unlock(&old_base->lock);
    1999           0 :                 raw_spin_unlock_irq(&new_base->lock);
    2000           0 :                 put_cpu_ptr(&timer_bases);
    2001             :         }
    2002           0 :         return 0;
    2003             : }
    2004             : 
    2005             : #endif /* CONFIG_HOTPLUG_CPU */
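
timers_prepare_cpu() and timers_dead_cpu() are the timer core's CPU-hotplug callbacks; they are wired into the kernel's static hotplug state table rather than registered at runtime. For illustration only, a hypothetical subsystem with an analogous prepare/dead pair might register it dynamically, roughly as sketched below (the foo_* names are assumptions):

        #include <linux/cpuhotplug.h>
        #include <linux/init.h>

        static int foo_prepare_cpu(unsigned int cpu)
        {
                /* Set up per-CPU state for @cpu before it comes online. */
                return 0;
        }

        static int foo_dead_cpu(unsigned int cpu)
        {
                /* Migrate pending work away from the now-offline @cpu. */
                return 0;
        }

        static int __init foo_hotplug_init(void)
        {
                int ret;

                /* Dynamic prepare-stage state: startup on prepare, teardown on dead. */
                ret = cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "foo:prepare",
                                        foo_prepare_cpu, foo_dead_cpu);
                return ret < 0 ? ret : 0;
        }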
    2006             : 
    2007           4 : static void __init init_timer_cpu(int cpu)
    2008             : {
    2009           4 :         struct timer_base *base;
    2010           4 :         int i;
    2011             : 
    2012          12 :         for (i = 0; i < NR_BASES; i++) {
    2013           8 :                 base = per_cpu_ptr(&timer_bases[i], cpu);
    2014           8 :                 base->cpu = cpu;
    2015           8 :                 raw_spin_lock_init(&base->lock);
    2016           8 :                 base->clk = jiffies;
    2017           8 :                 base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
    2018           8 :                 timer_base_init_expiry_lock(base);
    2019             :         }
    2020           4 : }
    2021             : 
    2022           1 : static void __init init_timer_cpus(void)
    2023             : {
    2024           1 :         int cpu;
    2025             : 
    2026           5 :         for_each_possible_cpu(cpu)
    2027           4 :                 init_timer_cpu(cpu);
    2028           1 : }
    2029             : 
    2030           1 : void __init init_timers(void)
    2031             : {
    2032           1 :         init_timer_cpus();
    2033           1 :         posix_cputimers_init_work();
    2034           1 :         open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
    2035           1 : }
    2036             : 
    2037             : /**
    2038             :  * msleep - sleep safely even with waitqueue interruptions
    2039             :  * @msecs: Time in milliseconds to sleep for
    2040             :  */
    2041           0 : void msleep(unsigned int msecs)
    2042             : {
    2043           0 :         unsigned long timeout = msecs_to_jiffies(msecs) + 1;
    2044             : 
    2045           0 :         while (timeout)
    2046           0 :                 timeout = schedule_timeout_uninterruptible(timeout);
    2047           0 : }
    2048             : 
    2049             : EXPORT_SYMBOL(msleep);
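
msleep() rounds up to jiffies and always sleeps at least the requested time, often noticeably longer for small values. A hedged usage sketch (the polling loop and its bounds are assumptions, not part of timer.c):

        #include <linux/delay.h>

        /* Hypothetical polling loop: check a condition every ~50ms for up to ~1s. */
        static bool foo_poll_ready(bool (*ready)(void))
        {
                int i;

                for (i = 0; i < 20; i++) {
                        if (ready())
                                return true;
                        msleep(50);     /* sleeps at least 50ms per iteration */
                }
                return false;
        }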
    2050             : 
    2051             : /**
    2052             :  * msleep_interruptible - sleep waiting for signals
    2053             :  * @msecs: Time in milliseconds to sleep for
    2054             :  */
    2055           0 : unsigned long msleep_interruptible(unsigned int msecs)
    2056             : {
    2057           0 :         unsigned long timeout = msecs_to_jiffies(msecs) + 1;
    2058             : 
    2059           0 :         while (timeout && !signal_pending(current))
    2060           0 :                 timeout = schedule_timeout_interruptible(timeout);
    2061           0 :         return jiffies_to_msecs(timeout);
    2062             : }
    2063             : 
    2064             : EXPORT_SYMBOL(msleep_interruptible);
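
Unlike msleep(), this variant wakes up when a signal becomes pending and returns the remaining time in milliseconds; zero means the full period elapsed. A hedged caller sketch with a hypothetical foo_* helper:

        #include <linux/delay.h>
        #include <linux/errno.h>

        /* Sleep up to 500ms, bailing out promptly if the task is signalled. */
        static int foo_interruptible_delay(void)
        {
                if (msleep_interruptible(500))
                        return -EINTR;  /* woken early; remaining time was non-zero */
                return 0;
        }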
    2065             : 
    2066             : /**
    2067             :  * usleep_range - Sleep for an approximate time
    2068             :  * @min: Minimum time in usecs to sleep
    2069             :  * @max: Maximum time in usecs to sleep
    2070             :  *
    2071             :  * In non-atomic context where the exact wakeup time is flexible, use
    2072             :  * usleep_range() instead of udelay().  The sleep improves responsiveness
    2073             :  * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces
    2074             :  * power usage by allowing hrtimers to take advantage of an already-
    2075             :  * scheduled interrupt instead of scheduling a new one just for this sleep.
    2076             :  */
    2077           0 : void __sched usleep_range(unsigned long min, unsigned long max)
    2078             : {
    2079           0 :         ktime_t exp = ktime_add_us(ktime_get(), min);
    2080           0 :         u64 delta = (u64)(max - min) * NSEC_PER_USEC;
    2081             : 
    2082           0 :         for (;;) {
    2083           0 :                 __set_current_state(TASK_UNINTERRUPTIBLE);
    2084             :                 /* Do not return before the requested sleep time has elapsed */
    2085           0 :                 if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS))
    2086             :                         break;
    2087             :         }
    2088           0 : }
    2089             : EXPORT_SYMBOL(usleep_range);
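
A hedged example of the recommended pattern: polling a device status bit with a 100-200us gap per iteration, so the hrtimer may place the wakeup anywhere inside the range. FOO_READY, the register layout, and the timeout are assumptions for illustration:

        #include <linux/bits.h>
        #include <linux/delay.h>
        #include <linux/errno.h>
        #include <linux/io.h>

        #define FOO_READY       BIT(0)

        /* Wait up to ~10ms (50 * 200us) for the hypothetical ready bit to assert. */
        static int foo_wait_ready(void __iomem *status_reg)
        {
                int i;

                for (i = 0; i < 50; i++) {
                        if (readl(status_reg) & FOO_READY)
                                return 0;
                        usleep_range(100, 200); /* flexible wakeup within the range */
                }
                return -ETIMEDOUT;
        }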

Generated by: LCOV version 1.14