Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * sched_clock() for unstable CPU clocks
4 : *
5 : * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra
6 : *
7 : * Updates and enhancements:
8 : * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com>
9 : *
10 : * Based on code by:
11 : * Ingo Molnar <mingo@redhat.com>
12 : * Guillaume Chazarain <guichaz@gmail.com>
13 : *
14 : *
15 : * What this file implements:
16 : *
17 : * cpu_clock(i) provides a fast (execution time) high resolution
18 : * clock with bounded drift between CPUs. The value of cpu_clock(i)
19 : * is monotonic for constant i. The timestamp returned is in nanoseconds.
20 : *
21 : * ######################### BIG FAT WARNING ##########################
22 : * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
23 : * # go backwards !! #
24 : * ####################################################################
25 : *
26 : * There is no strict promise about the base, although it tends to start
27 : * at 0 on boot (but people really shouldn't rely on that).
28 : *
29 : * cpu_clock(i) -- can be used from any context, including NMI.
30 : * local_clock() -- is cpu_clock() on the current CPU.
31 : *
32 : * sched_clock_cpu(i) -- the per-CPU implementation backing both of the above.
33 : *
34 : * How it is implemented:
35 : *
36 : * When !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, the implementation simply uses
37 : * sched_clock(); in that case sched_clock() is assumed to provide these
38 : * properties (mostly it means the architecture provides a globally
39 : * synchronized highres time source).
40 : *
41 : * Otherwise it tries to create a semi-stable clock from a mixture of other
42 : * clocks, including:
43 : *
44 : * - GTOD (clock monotonic)
45 : * - sched_clock()
46 : * - explicit idle events
47 : *
48 : * We use GTOD as base and use sched_clock() deltas to improve resolution. The
49 : * deltas are filtered to provide monotonicity and to keep the result within
50 : * an expected window.
51 : *
52 : * Furthermore, explicit sleep and wakeup hooks allow us to account for time
53 : * that is otherwise invisible (TSC gets stopped).
54 : *
55 : */
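/*
 * Illustrative sketch (not part of this file): a typical use of the
 * interface above is to take two local_clock() samples on the same
 * CPU and look at their difference.  do_something() below is a
 * stand-in for whatever is being timed; if the caller can migrate
 * between the samples, the delta is only accurate up to the bounded
 * cross-CPU drift described above, and as the warning box says, raw
 * values from different CPUs must not be compared directly.
 *
 *	u64 t0, t1;
 *
 *	t0 = local_clock();
 *	do_something();
 *	t1 = local_clock();
 *	pr_info("section took %llu ns\n", (unsigned long long)(t1 - t0));
 */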
56 : #include "sched.h"
57 : #include <linux/sched_clock.h>
58 :
59 : /*
60 : * Scheduler clock - returns current time in nanosec units.
61 : * This is the default implementation.
62 : * Architectures and sub-architectures can override this.
63 : */
64 0 : unsigned long long __weak sched_clock(void)
65 : {
66 0 : return (unsigned long long)(jiffies - INITIAL_JIFFIES)
67 0 : * (NSEC_PER_SEC / HZ);
68 : }
69 : EXPORT_SYMBOL_GPL(sched_clock);
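/*
 * Worked example for the fallback above: with a common HZ value of 250,
 * NSEC_PER_SEC / HZ == 4,000,000, so this default sched_clock() only
 * advances in 4 ms steps (one step per jiffy).
 */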
70 :
71 : static DEFINE_STATIC_KEY_FALSE(sched_clock_running);
72 :
73 : #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
74 : /*
75 : * We must start with !__sched_clock_stable because the unstable -> stable
76 : * transition is accurate, while the stable -> unstable transition is not.
77 : *
78 : * Similarly we start with __sched_clock_stable_early, thereby assuming we
79 : * will become stable, such that there's only a single 1 -> 0 transition.
80 : */
81 : static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable);
82 : static int __sched_clock_stable_early = 1;
83 :
84 : /*
85 : * We want: ktime_get_ns() + __gtod_offset == sched_clock() + __sched_clock_offset
86 : */
87 : __read_mostly u64 __sched_clock_offset;
88 : static __read_mostly u64 __gtod_offset;
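/*
 * Rearranging that invariant gives the two assignments used later in
 * this file (in __set_sched_clock_stable() and __sched_clock_gtod_offset()
 * respectively):
 *
 *	__sched_clock_offset = (scd->tick_gtod + __gtod_offset) - scd->tick_raw;
 *	__gtod_offset        = (scd->tick_raw + __sched_clock_offset) - scd->tick_gtod;
 *
 * where tick_raw is a sched_clock() sample and tick_gtod a ktime_get_ns()
 * sample taken at (nearly) the same instant.
 */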
89 :
90 : struct sched_clock_data {
91 : u64 tick_raw;
92 : u64 tick_gtod;
93 : u64 clock;
94 : };
95 :
96 : static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
97 :
98 1468 : static inline struct sched_clock_data *this_scd(void)
99 : {
100 2933 : return this_cpu_ptr(&sched_clock_data);
101 : }
102 :
103 3931 : static inline struct sched_clock_data *cpu_sdc(int cpu)
104 : {
105 3931 : return &per_cpu(sched_clock_data, cpu);
106 : }
107 :
108 138702 : int sched_clock_stable(void)
109 : {
110 1 : return static_branch_likely(&__sched_clock_stable);
111 : }
112 :
113 908 : static void __scd_stamp(struct sched_clock_data *scd)
114 : {
115 908 : scd->tick_gtod = ktime_get_ns();
116 918 : scd->tick_raw = sched_clock();
117 912 : }
118 :
119 1 : static void __set_sched_clock_stable(void)
120 : {
121 1 : struct sched_clock_data *scd;
122 :
123 : /*
124 : * Since we're still unstable and the tick is already running, we have
125 : * to disable IRQs in order to get a consistent scd->tick* reading.
126 : */
127 1 : local_irq_disable();
128 1 : scd = this_scd();
129 : /*
130 : * Attempt to make the (initial) unstable->stable transition continuous.
131 : */
132 1 : __sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw);
133 1 : local_irq_enable();
134 :
135 1 : printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n",
136 : scd->tick_gtod, __gtod_offset,
137 : scd->tick_raw, __sched_clock_offset);
138 :
139 1 : static_branch_enable(&__sched_clock_stable);
140 1 : tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
141 1 : }
142 :
143 : /*
144 : * If we ever get here, we're screwed, because we found out -- typically after
145 : * the fact -- that TSC wasn't good. This means all our clocksources (including
146 : * ktime) could have reported wrong values.
147 : *
148 : * What we do here is an attempt to fix up and continue sort of where we left
149 : * off in a coherent manner.
150 : *
151 : * The only way to fully avoid random clock jumps is to boot with:
152 : * "tsc=unstable".
153 : */
154 0 : static void __sched_clock_work(struct work_struct *work)
155 : {
156 0 : struct sched_clock_data *scd;
157 0 : int cpu;
158 :
159 : /* take a current timestamp and set 'now' */
160 0 : preempt_disable();
161 0 : scd = this_scd();
162 0 : __scd_stamp(scd);
163 0 : scd->clock = scd->tick_gtod + __gtod_offset;
164 0 : preempt_enable();
165 :
166 : /* clone to all CPUs */
167 0 : for_each_possible_cpu(cpu)
168 0 : per_cpu(sched_clock_data, cpu) = *scd;
169 :
170 0 : printk(KERN_WARNING "TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.\n");
171 0 : printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
172 : scd->tick_gtod, __gtod_offset,
173 : scd->tick_raw, __sched_clock_offset);
174 :
175 0 : static_branch_disable(&__sched_clock_stable);
176 0 : }
177 :
178 : static DECLARE_WORK(sched_clock_work, __sched_clock_work);
179 :
180 0 : static void __clear_sched_clock_stable(void)
181 : {
182 0 : if (!sched_clock_stable())
183 : return;
184 :
185 0 : tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);
186 0 : schedule_work(&sched_clock_work);
187 : }
188 :
189 0 : void clear_sched_clock_stable(void)
190 : {
191 0 : __sched_clock_stable_early = 0;
192 :
193 0 : smp_mb(); /* matches sched_clock_init_late() */
194 :
195 0 : if (static_key_count(&sched_clock_running.key) == 2)
196 0 : __clear_sched_clock_stable();
197 0 : }
198 :
199 1 : static void __sched_clock_gtod_offset(void)
200 : {
201 1 : struct sched_clock_data *scd = this_scd();
202 :
203 1 : __scd_stamp(scd);
204 1 : __gtod_offset = (scd->tick_raw + __sched_clock_offset) - scd->tick_gtod;
205 1 : }
206 :
207 1 : void __init sched_clock_init(void)
208 : {
209 : /*
210 : * Set __gtod_offset such that once we mark sched_clock_running,
211 : * sched_clock_tick() continues where sched_clock() left off.
212 : *
213 : * Even if TSC is buggered, we're still UP at this point so it
214 : * can't really be out of sync.
215 : */
216 1 : local_irq_disable();
217 1 : __sched_clock_gtod_offset();
218 1 : local_irq_enable();
219 :
220 1 : static_branch_inc(&sched_clock_running);
221 1 : }
222 : /*
223 : * We run this as late_initcall() such that it runs after all built-in drivers,
224 : * notably: acpi_processor and intel_idle, which can mark the TSC as unstable.
225 : */
226 1 : static int __init sched_clock_init_late(void)
227 : {
228 1 : static_branch_inc(&sched_clock_running);
229 : /*
230 : * Ensure that it is impossible to not do a static_key update.
231 : *
232 : * Either {set,clear}_sched_clock_stable() must see sched_clock_running
233 : * and do the update, or we must see their __sched_clock_stable_early
234 : * and do the update, or both.
235 : */
236 1 : smp_mb(); /* matches {set,clear}_sched_clock_stable() */
237 :
238 1 : if (__sched_clock_stable_early)
239 1 : __set_sched_clock_stable();
240 :
241 1 : return 0;
242 : }
243 : late_initcall(sched_clock_init_late);
244 :
245 : /*
246 : * min()/max(), except these take u64 wrapping into account
247 : */
248 :
249 4847 : static inline u64 wrap_min(u64 x, u64 y)
250 : {
251 4847 : return (s64)(x - y) < 0 ? x : y;
252 : }
253 :
254 14541 : static inline u64 wrap_max(u64 x, u64 y)
255 : {
256 14541 : return (s64)(x - y) > 0 ? x : y;
257 : }
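/*
 * The signed-difference trick keeps these correct across a u64 wrap.
 * For example, if x has just wrapped around to 1 while y == ULLONG_MAX,
 * then x - y == 2 and (s64)(x - y) > 0, so wrap_max() correctly picks x
 * even though x < y when compared as plain u64 values.
 */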
258 :
259 : /*
260 : * update the percpu scd from the raw @now value
261 : *
262 : * - filter out backward motion
263 : * - use the GTOD tick value to create a window to filter crazy TSC values
264 : */
265 4825 : static u64 sched_clock_local(struct sched_clock_data *scd)
266 : {
267 4825 : u64 now, clock, old_clock, min_clock, max_clock, gtod;
268 4825 : s64 delta;
269 :
270 4825 : again:
271 4825 : now = sched_clock();
272 4847 : delta = now - scd->tick_raw;
273 4847 : if (unlikely(delta < 0))
274 0 : delta = 0;
275 :
276 4847 : old_clock = scd->clock;
277 :
278 : /*
279 : * scd->clock = clamp(scd->tick_gtod + delta,
280 : * max(scd->tick_gtod, scd->clock),
281 : * scd->tick_gtod + TICK_NSEC);
282 : */
283 :
284 4847 : gtod = scd->tick_gtod + __gtod_offset;
285 4847 : clock = gtod + delta;
286 4847 : min_clock = wrap_max(gtod, old_clock);
287 4847 : max_clock = wrap_max(old_clock, gtod + TICK_NSEC);
288 :
289 4847 : clock = wrap_max(clock, min_clock);
290 4847 : clock = wrap_min(clock, max_clock);
291 :
292 4847 : if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
293 0 : goto again;
294 :
295 4848 : return clock;
296 : }
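/*
 * Worked example for the clamping above, with made-up numbers:
 * gtod = 1000000, old_clock = 1000500, TICK_NSEC = 1000000.  A sane
 * delta of 700 gives clock = 1000700, which already lies between
 * min_clock (1000500) and max_clock (2000000) and is returned as is.
 * A wild delta of 5000000 is clamped down to max_clock, and a delta
 * that would land below old_clock is lifted up to min_clock, so the
 * per-CPU clock never goes backwards and never runs more than about
 * a tick ahead of the GTOD base.
 */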
297 :
298 554 : static u64 sched_clock_remote(struct sched_clock_data *scd)
299 : {
300 554 : struct sched_clock_data *my_scd = this_scd();
301 554 : u64 this_clock, remote_clock;
302 554 : u64 *ptr, old_val, val;
303 :
304 : #if BITS_PER_LONG != 64
305 : again:
306 : /*
307 : * Careful here: The local and the remote clock values need to
308 : * be read out atomically as we need to compare the values and
309 : * then update either the local or the remote side. So the
310 : * cmpxchg64 below only protects one readout.
311 : *
312 : * We must reread via sched_clock_local() in the retry case on
313 : * 32-bit kernels as an NMI could use sched_clock_local() via the
314 : * tracer and hit between the readout of
315 : * the low 32-bit and the high 32-bit portion.
316 : */
317 : this_clock = sched_clock_local(my_scd);
318 : /*
319 : * We must enforce atomic readout on 32-bit, otherwise the
320 : * update on the remote CPU can hit in between the readout of
321 : * the low 32-bit and the high 32-bit portion.
322 : */
323 : remote_clock = cmpxchg64(&scd->clock, 0, 0);
324 : #else
325 : /*
326 : * On 64-bit kernels the read of [my]scd->clock is atomic versus the
327 : * update, so we can avoid the above 32-bit dance.
328 : */
329 554 : sched_clock_local(my_scd);
330 554 : again:
331 554 : this_clock = my_scd->clock;
332 554 : remote_clock = scd->clock;
333 : #endif
334 :
335 : /*
336 : * Use the opportunity that we have both locks
337 : * taken to couple the two clocks: we take the
338 : * larger time as the latest time for both
339 : * runqueues. (this creates monotonic movement)
340 : */
341 554 : if (likely((s64)(remote_clock - this_clock) < 0)) {
342 512 : ptr = &scd->clock;
343 512 : old_val = remote_clock;
344 512 : val = this_clock;
345 : } else {
346 : /*
347 : * Should be rare, but possible:
348 : */
349 42 : ptr = &my_scd->clock;
350 42 : old_val = this_clock;
351 42 : val = remote_clock;
352 : }
353 :
354 554 : if (cmpxchg64(ptr, old_val, val) != old_val)
355 0 : goto again;
356 :
357 554 : return val;
358 : }
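/*
 * Illustration of the coupling above: if this_clock == 2000 and
 * remote_clock == 1500, the remote scd->clock is advanced to 2000; in
 * the rarer case where the remote clock is ahead, the local clock is
 * advanced to the remote value instead.  Either way both CPUs end up
 * at the larger of the two values, which keeps observed time moving
 * forward.
 */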
359 :
360 : /*
361 : * Similar to cpu_clock(), but requires local IRQs to be disabled.
362 : *
363 : * See cpu_clock().
364 : */
365 99206 : u64 sched_clock_cpu(int cpu)
366 : {
367 99206 : struct sched_clock_data *scd;
368 99206 : u64 clock;
369 :
370 99206 : if (sched_clock_stable())
371 95109 : return sched_clock() + __sched_clock_offset;
372 :
373 4062 : if (!static_branch_likely(&sched_clock_running))
374 131 : return sched_clock();
375 :
376 3935 : preempt_disable_notrace();
377 3931 : scd = cpu_sdc(cpu);
378 :
379 3931 : if (cpu != smp_processor_id())
380 554 : clock = sched_clock_remote(scd);
381 : else
382 3377 : clock = sched_clock_local(scd);
383 3934 : preempt_enable_notrace();
384 :
385 3934 : return clock;
386 : }
387 : EXPORT_SYMBOL_GPL(sched_clock_cpu);
388 :
389 24615 : void sched_clock_tick(void)
390 : {
391 24615 : struct sched_clock_data *scd;
392 :
393 24615 : if (sched_clock_stable())
394 : return;
395 :
396 911 : if (!static_branch_likely(&sched_clock_running))
397 : return;
398 :
399 1816 : lockdep_assert_irqs_disabled();
400 :
401 912 : scd = this_scd();
402 909 : __scd_stamp(scd);
403 912 : sched_clock_local(scd);
404 : }
405 :
406 0 : void sched_clock_tick_stable(void)
407 : {
408 0 : if (!sched_clock_stable())
409 : return;
410 :
411 : /*
412 : * Called under watchdog_lock.
413 : *
414 : * The watchdog just found this TSC to (still) be stable, so now is a
415 : * good moment to update our __gtod_offset. Because once we find the
416 : * TSC to be unstable, any computation will be computing crap.
417 : */
418 0 : local_irq_disable();
419 0 : __sched_clock_gtod_offset();
420 0 : local_irq_enable();
421 : }
422 :
423 : /*
424 : * We are going deep-idle (irqs are disabled):
425 : */
426 16244 : void sched_clock_idle_sleep_event(void)
427 : {
428 16244 : sched_clock_cpu(smp_processor_id());
429 16284 : }
430 : EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
431 :
432 : /*
433 : * We just idled; resync with ktime.
434 : */
435 14880 : void sched_clock_idle_wakeup_event(void)
436 : {
437 14880 : unsigned long flags;
438 :
439 14880 : if (sched_clock_stable())
440 : return;
441 :
442 493 : if (unlikely(timekeeping_suspended))
443 : return;
444 :
445 988 : local_irq_save(flags);
446 495 : sched_clock_tick();
447 500 : local_irq_restore(flags);
448 : }
449 : EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
450 :
451 : #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
452 :
453 : void __init sched_clock_init(void)
454 : {
455 : static_branch_inc(&sched_clock_running);
456 : local_irq_disable();
457 : generic_sched_clock_init();
458 : local_irq_enable();
459 : }
460 :
461 : u64 sched_clock_cpu(int cpu)
462 : {
463 : if (!static_branch_likely(&sched_clock_running))
464 : return 0;
465 :
466 : return sched_clock();
467 : }
468 :
469 : #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
470 :
471 : /*
472 : * Running clock - returns the time that has elapsed while a guest has been
473 : * running.
474 : * On a guest this value should be local_clock() minus the time the guest was
475 : * suspended by the hypervisor (for any reason).
476 : * On bare metal this function should return the same as local_clock().
477 : * Architectures and sub-architectures can override this.
478 : */
479 0 : u64 __weak running_clock(void)
480 : {
481 0 : return local_clock();
482 : }