Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * Copyright (C) 1991, 1992 Linus Torvalds
4 : * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
5 : * Copyright (C) 2011 Don Zickus Red Hat, Inc.
6 : *
7 : * Pentium III FXSR, SSE support
8 : * Gareth Hughes <gareth@valinux.com>, May 2000
9 : */
10 :
11 : /*
12 : * Handle hardware traps and faults.
13 : */
14 : #include <linux/spinlock.h>
15 : #include <linux/kprobes.h>
16 : #include <linux/kdebug.h>
17 : #include <linux/sched/debug.h>
18 : #include <linux/nmi.h>
19 : #include <linux/debugfs.h>
20 : #include <linux/delay.h>
21 : #include <linux/hardirq.h>
22 : #include <linux/ratelimit.h>
23 : #include <linux/slab.h>
24 : #include <linux/export.h>
25 : #include <linux/atomic.h>
26 : #include <linux/sched/clock.h>
27 :
28 : #include <asm/cpu_entry_area.h>
29 : #include <asm/traps.h>
30 : #include <asm/mach_traps.h>
31 : #include <asm/nmi.h>
32 : #include <asm/x86_init.h>
33 : #include <asm/reboot.h>
34 : #include <asm/cache.h>
35 : #include <asm/nospec-branch.h>
36 : #include <asm/sev-es.h>
37 :
38 : #define CREATE_TRACE_POINTS
39 : #include <trace/events/nmi.h>
40 :
41 : struct nmi_desc {
42 : raw_spinlock_t lock;
43 : struct list_head head;
44 : };
45 :
46 : static struct nmi_desc nmi_desc[NMI_MAX] =
47 : {
48 : {
49 : .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock),
50 : .head = LIST_HEAD_INIT(nmi_desc[0].head),
51 : },
52 : {
53 : .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
54 : .head = LIST_HEAD_INIT(nmi_desc[1].head),
55 : },
56 : {
57 : .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
58 : .head = LIST_HEAD_INIT(nmi_desc[2].head),
59 : },
60 : {
61 : .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
62 : .head = LIST_HEAD_INIT(nmi_desc[3].head),
63 : },
64 :
65 : };
66 :
67 : struct nmi_stats {
68 : unsigned int normal;
69 : unsigned int unknown;
70 : unsigned int external;
71 : unsigned int swallow;
72 : };
73 :
74 : static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);
75 :
76 : static int ignore_nmis __read_mostly;
77 :
78 : int unknown_nmi_panic;
79 : /*
80 : * Prevent the NMI reason port (0x61) from being accessed simultaneously;
81 : * this lock may only be taken from within an NMI handler.
82 : */
83 : static DEFINE_RAW_SPINLOCK(nmi_reason_lock);
84 :
85 0 : static int __init setup_unknown_nmi_panic(char *str)
86 : {
87 0 : unknown_nmi_panic = 1;
88 0 : return 1;
89 : }
90 : __setup("unknown_nmi_panic", setup_unknown_nmi_panic);
91 :
92 : #define nmi_to_desc(type) (&nmi_desc[type])
93 :
94 : static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC;
95 :
96 1 : static int __init nmi_warning_debugfs(void)
97 : {
98 1 : debugfs_create_u64("nmi_longest_ns", 0644,
99 : arch_debugfs_dir, &nmi_longest_ns);
100 1 : return 0;
101 : }
102 : fs_initcall(nmi_warning_debugfs);
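
/*
 * Editorial note: with debugfs mounted in its usual location this knob
 * is expected to show up as /sys/kernel/debug/x86/nmi_longest_ns (the
 * exact path follows arch_debugfs_dir). Handlers that run longer than
 * this many nanoseconds are reported by nmi_check_duration() below.
 */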
103 :
104 0 : static void nmi_check_duration(struct nmiaction *action, u64 duration)
105 : {
106 0 : int remainder_ns, decimal_msecs;
107 :
108 0 : if (duration < nmi_longest_ns || duration < action->max_duration)
109 : return;
110 :
111 0 : action->max_duration = duration;
112 :
113 0 : remainder_ns = do_div(duration, (1000 * 1000));
114 0 : decimal_msecs = remainder_ns / 1000;
115 :
116 0 : printk_ratelimited(KERN_INFO
117 : "INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n",
118 : action->handler, duration, decimal_msecs);
119 : }
120 :
121 0 : static int nmi_handle(unsigned int type, struct pt_regs *regs)
122 : {
123 0 : struct nmi_desc *desc = nmi_to_desc(type);
124 0 : struct nmiaction *a;
125 0 : int handled = 0;
126 :
127 0 : rcu_read_lock();
128 :
129 : /*
130 : * NMIs are edge-triggered, which means if you have enough
131 : * of them concurrently, you can lose some because only one
132 : * can be latched at any given time. Walk the whole list
133 : * to handle those situations.
134 : */
135 0 : list_for_each_entry_rcu(a, &desc->head, list) {
136 0 : int thishandled;
137 0 : u64 delta;
138 :
139 0 : delta = sched_clock();
140 0 : thishandled = a->handler(type, regs);
141 0 : handled += thishandled;
142 0 : delta = sched_clock() - delta;
143 0 : trace_nmi_handler(a->handler, (int)delta, thishandled);
144 :
145 0 : nmi_check_duration(a, delta);
146 : }
147 :
148 0 : rcu_read_unlock();
149 :
150 : /* return total number of NMI events handled */
151 0 : return handled;
152 : }
153 : NOKPROBE_SYMBOL(nmi_handle);
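
/*
 * Editorial sketch (not part of this file): a handler hung on one of
 * these chains has the shape nmi_handle() expects above -- it inspects
 * its own hardware, returns the number of events it claimed (0 if the
 * NMI was not ours), and must be NMI-safe. The example_device_*()
 * helpers are hypothetical.
 */
static int example_nmi_handler(unsigned int type, struct pt_regs *regs)
{
	/* Hypothetical check of our device's "NMI pending" status. */
	if (!example_device_nmi_pending())
		return 0;	/* not ours; let other handlers look */

	example_device_ack_nmi();	/* hypothetical acknowledge */
	return 1;			/* one event handled */
}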
154 :
155 2 : int __register_nmi_handler(unsigned int type, struct nmiaction *action)
156 : {
157 2 : struct nmi_desc *desc = nmi_to_desc(type);
158 2 : unsigned long flags;
159 :
160 2 : if (!action->handler)
161 : return -EINVAL;
162 :
163 2 : raw_spin_lock_irqsave(&desc->lock, flags);
164 :
165 : /*
166 : * Indicate if there are multiple registrations on the
167 : * internal NMI handler call chains (SERR and IO_CHECK).
168 : */
169 2 : WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head));
170 2 : WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head));
171 :
172 : /*
173 : * Some handlers need to be executed first, otherwise a fake
174 : * event confuses the remaining handlers (kdump uses this flag).
175 : */
176 2 : if (action->flags & NMI_FLAG_FIRST)
177 0 : list_add_rcu(&action->list, &desc->head);
178 : else
179 2 : list_add_tail_rcu(&action->list, &desc->head);
180 :
181 2 : raw_spin_unlock_irqrestore(&desc->lock, flags);
182 2 : return 0;
183 : }
184 : EXPORT_SYMBOL(__register_nmi_handler);
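
/*
 * Editorial sketch (not part of this file): registering the
 * hypothetical handler above on the CPU-local chain, using the
 * register_nmi_handler() wrapper from <asm/nmi.h>, which builds the
 * struct nmiaction and calls __register_nmi_handler(). Pass
 * NMI_FLAG_FIRST instead of 0 only if the handler must run before all
 * others, as kdump does.
 */
static int __init example_nmi_init(void)
{
	return register_nmi_handler(NMI_LOCAL, example_nmi_handler, 0,
				    "example_nmi");
}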
185 :
186 0 : void unregister_nmi_handler(unsigned int type, const char *name)
187 : {
188 0 : struct nmi_desc *desc = nmi_to_desc(type);
189 0 : struct nmiaction *n;
190 0 : unsigned long flags;
191 :
192 0 : raw_spin_lock_irqsave(&desc->lock, flags);
193 :
194 0 : list_for_each_entry_rcu(n, &desc->head, list) {
195 : /*
196 : * The name passed in to describe the NMI handler
197 : * is used as the lookup key.
198 : */
199 0 : if (!strcmp(n->name, name)) {
200 0 : WARN(in_nmi(),
201 : "Trying to free NMI (%s) from NMI context!\n", n->name);
202 0 : list_del_rcu(&n->list);
203 : break;
204 : }
205 : }
206 :
207 0 : raw_spin_unlock_irqrestore(&desc->lock, flags);
208 0 : synchronize_rcu();
209 0 : }
210 : EXPORT_SYMBOL_GPL(unregister_nmi_handler);
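
/*
 * Editorial sketch (not part of this file): tearing down the
 * hypothetical registration above. The name string is the lookup key,
 * this must not be called from NMI context (see the WARN above), and
 * synchronize_rcu() guarantees no handler invocation is still in
 * flight once unregister_nmi_handler() returns.
 */
static void example_nmi_exit(void)
{
	unregister_nmi_handler(NMI_LOCAL, "example_nmi");
}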
211 :
212 : static void
213 0 : pci_serr_error(unsigned char reason, struct pt_regs *regs)
214 : {
215 : /* check to see if anyone registered against these types of errors */
216 0 : if (nmi_handle(NMI_SERR, regs))
217 : return;
218 :
219 0 : pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
220 : reason, smp_processor_id());
221 :
222 0 : if (panic_on_unrecovered_nmi)
223 0 : nmi_panic(regs, "NMI: Not continuing");
224 :
225 0 : pr_emerg("Dazed and confused, but trying to continue\n");
226 :
227 : /* Clear and disable the PCI SERR error line. */
228 0 : reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
229 0 : outb(reason, NMI_REASON_PORT);
230 : }
231 : NOKPROBE_SYMBOL(pci_serr_error);
232 :
233 : static void
234 0 : io_check_error(unsigned char reason, struct pt_regs *regs)
235 : {
236 0 : unsigned long i;
237 :
238 : /* check to see if anyone registered against these types of errors */
239 0 : if (nmi_handle(NMI_IO_CHECK, regs))
240 : return;
241 :
242 0 : pr_emerg(
243 : "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
244 : reason, smp_processor_id());
245 0 : show_regs(regs);
246 :
247 0 : if (panic_on_io_nmi) {
248 0 : nmi_panic(regs, "NMI IOCK error: Not continuing");
249 :
250 : /*
251 : * If we end up here, it means we have received an NMI while
252 : * processing panic(). Simply return without delaying and
253 : * re-enabling NMIs.
254 : */
255 0 : return;
256 : }
257 :
258 : /* Re-enable the IOCK line, wait for a few seconds */
259 0 : reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
260 0 : outb(reason, NMI_REASON_PORT);
261 :
262 0 : i = 20000;
263 0 : while (--i) {
264 0 : touch_nmi_watchdog();
265 0 : udelay(100);
266 : }
267 :
268 0 : reason &= ~NMI_REASON_CLEAR_IOCHK;
269 0 : outb(reason, NMI_REASON_PORT);
270 : }
271 : NOKPROBE_SYMBOL(io_check_error);
272 :
273 : static void
274 0 : unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
275 : {
276 0 : int handled;
277 :
278 : /*
279 : * Back-to-back NMIs are dealt with one level up. Of course this
280 : * makes having multiple 'unknown' handlers of limited use, as only
281 : * the first one is ever run (unless it can actually determine
282 : * whether it caused the NMI).
283 : */
284 0 : handled = nmi_handle(NMI_UNKNOWN, regs);
285 0 : if (handled) {
286 0 : __this_cpu_add(nmi_stats.unknown, handled);
287 0 : return;
288 : }
289 :
290 0 : __this_cpu_add(nmi_stats.unknown, 1);
291 :
292 0 : pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
293 : reason, smp_processor_id());
294 :
295 0 : pr_emerg("Do you have a strange power saving mode enabled?\n");
296 0 : if (unknown_nmi_panic || panic_on_unrecovered_nmi)
297 0 : nmi_panic(regs, "NMI: Not continuing");
298 :
299 0 : pr_emerg("Dazed and confused, but trying to continue\n");
300 : }
301 : NOKPROBE_SYMBOL(unknown_nmi_error);
302 :
303 : static DEFINE_PER_CPU(bool, swallow_nmi);
304 : static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
305 :
306 0 : static noinstr void default_do_nmi(struct pt_regs *regs)
307 : {
308 0 : unsigned char reason = 0;
309 0 : int handled;
310 0 : bool b2b = false;
311 :
312 : /*
313 : * CPU-specific NMI must be processed before non-CPU-specific
314 : * NMI, otherwise we may lose it, because the CPU-specific
315 : * NMI can not be detected/processed on other CPUs.
316 : */
317 :
318 : /*
319 : * Back-to-back NMIs are interesting because they can either
320 : * be exactly two NMIs or more than two (anything over two is
321 : * dropped because NMIs are edge-triggered). If this is the second
322 : * half of a back-to-back pair, assume we dropped events and process
323 : * more handlers. Otherwise reset the 'swallow' NMI behaviour.
324 : */
325 0 : if (regs->ip == __this_cpu_read(last_nmi_rip))
326 : b2b = true;
327 : else
328 0 : __this_cpu_write(swallow_nmi, false);
329 :
330 0 : __this_cpu_write(last_nmi_rip, regs->ip);
331 :
332 0 : instrumentation_begin();
333 :
334 0 : handled = nmi_handle(NMI_LOCAL, regs);
335 0 : __this_cpu_add(nmi_stats.normal, handled);
336 0 : if (handled) {
337 : /*
338 : * There are cases where an NMI handler handles multiple
339 : * events in the current NMI. One of those events may have
340 : * another NMI queued up behind it. Because that event is
341 : * already handled, the next NMI would show up as an unknown
342 : * NMI. Instead, flag this so a potential follow-up NMI can
343 : * be swallowed.
344 : */
345 0 : if (handled > 1)
346 0 : __this_cpu_write(swallow_nmi, true);
347 0 : goto out;
348 : }
349 :
350 : /*
351 : * Non-CPU-specific NMI: NMI sources can be processed on any CPU.
352 : *
353 : * Another CPU may be processing panic routines while holding
354 : * nmi_reason_lock. Check whether that CPU issued the crash-dump IPI
355 : * to us and, if so, run its callback directly. If no CPU is preparing
356 : * a crash dump, we simply spin here.
357 : */
358 0 : while (!raw_spin_trylock(&nmi_reason_lock)) {
359 0 : run_crash_ipi_callback(regs);
360 0 : cpu_relax();
361 : }
362 :
363 0 : reason = x86_platform.get_nmi_reason();
364 :
365 0 : if (reason & NMI_REASON_MASK) {
366 0 : if (reason & NMI_REASON_SERR)
367 0 : pci_serr_error(reason, regs);
368 0 : else if (reason & NMI_REASON_IOCHK)
369 0 : io_check_error(reason, regs);
370 : #ifdef CONFIG_X86_32
371 : /*
372 : * Reassert NMI in case it became active
373 : * meanwhile as it's edge-triggered:
374 : */
375 : reassert_nmi();
376 : #endif
377 0 : __this_cpu_add(nmi_stats.external, 1);
378 0 : raw_spin_unlock(&nmi_reason_lock);
379 0 : goto out;
380 : }
381 0 : raw_spin_unlock(&nmi_reason_lock);
382 :
383 : /*
384 : * Only one NMI can be latched at a time. To handle
385 : * this we may process multiple nmi handlers at once to
386 : * cover the case where an NMI is dropped. The downside
387 : * to this approach is we may process an NMI prematurely,
388 : * while its real NMI is sitting latched. This will cause
389 : * an unknown NMI on the next run of the NMI processing.
390 : *
391 : * We tried to flag that condition above, by setting the
392 : * swallow_nmi flag when we process more than one event.
393 : * This condition is also only present on the second half
394 : * of a back-to-back NMI, so we flag that condition too.
395 : *
396 : * If both are true, we assume we already processed this
397 : * NMI previously and we swallow it. Otherwise we reset
398 : * the logic.
399 : *
400 : * There are scenarios where we may accidentally swallow
401 : * a 'real' unknown NMI. For example, while processing
402 : * a perf NMI another perf NMI comes in along with a
403 : * 'real' unknown NMI. These two NMIs get combined into
404 : * one (as described above). When the next NMI gets
405 : * processed, it will be flagged by perf as handled, but
406 : * no one will know that there was a 'real' unknown NMI sent
407 : * also. As a result it gets swallowed. Or if the first
408 : * perf NMI returns two events handled then the second
409 : * NMI will get eaten by the logic below, again losing a
410 : * 'real' unknown NMI. But this is the best we can do
411 : * for now.
412 : */
413 0 : if (b2b && __this_cpu_read(swallow_nmi))
414 0 : __this_cpu_add(nmi_stats.swallow, 1);
415 : else
416 0 : unknown_nmi_error(reason, regs);
417 :
418 0 : out:
419 0 : instrumentation_end();
420 0 : }
421 :
422 : /*
423 : * An NMI can page fault or hit a breakpoint, which will cause it to lose
424 : * its NMI context with the CPU when the breakpoint or page fault does an IRET.
425 : *
426 : * As a result, NMIs can nest if NMIs get unmasked due to an IRET during
427 : * NMI processing. On x86_64, the asm glue protects us from nested NMIs
428 : * if the outer NMI came from kernel mode, but we can still nest if the
429 : * outer NMI came from user mode.
430 : *
431 : * To handle these nested NMIs, we have three states:
432 : *
433 : * 1) not running
434 : * 2) executing
435 : * 3) latched
436 : *
437 : * When no NMI is in progress, it is in the "not running" state.
438 : * When an NMI comes in, it goes into the "executing" state.
439 : * Normally, if another NMI is triggered, it does not interrupt
440 : * the running NMI and the HW will simply latch it so that when
441 : * the first NMI finishes, it will restart the second NMI.
442 : * (Note, the latch is binary; any NMIs that trigger while
443 : * one is running are ignored, and only one NMI is restarted.)
444 : *
445 : * If an NMI executes an iret, another NMI can preempt it. We do not
446 : * want to allow this new NMI to run, but we want to execute it when the
447 : * first one finishes. We set the state to "latched", and the exit of
448 : * the first NMI will perform a dec_return; if the result is zero
449 : * (NOT_RUNNING), then it will simply exit the NMI handler. If not, the
450 : * dec_return has set the state back to NMI_EXECUTING (what we want it
451 : * to be while we are running). In this case, we simply jump back to
452 : * rerun the NMI handler again, and restart the 'latched' NMI.
453 : *
454 : * No trap (breakpoint or page fault) should be hit before nmi_restart,
455 : * thus there is no race between the first check of state for NOT_RUNNING
456 : * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs
457 : * at this point.
458 : *
459 : * In case the NMI takes a page fault, we need to save off the CR2
460 : * because the NMI could have preempted another page fault and would
461 : * corrupt the CR2 that is about to be read. As nested NMIs must be restarted
462 : * and cannot take breakpoints or page faults, the update of the
463 : * CR2 must be done before converting the NMI state back to NOT_RUNNING.
464 : * Otherwise, there would be a race where another nested NMI comes in
465 : * after setting state to NOT_RUNNING but before updating the nmi_cr2.
466 : */
467 : enum nmi_states {
468 : NMI_NOT_RUNNING = 0,
469 : NMI_EXECUTING,
470 : NMI_LATCHED,
471 : };
472 : static DEFINE_PER_CPU(enum nmi_states, nmi_state);
473 : static DEFINE_PER_CPU(unsigned long, nmi_cr2);
474 : static DEFINE_PER_CPU(unsigned long, nmi_dr7);
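
/*
 * Editorial sketch (not part of this file): the nesting protocol
 * described above, reduced to plain C with an int instead of the
 * per-CPU state and without the atomicity that this_cpu_dec_return()
 * provides in exc_nmi() below. A nested entry that finds the state
 * already EXECUTING only records LATCHED and returns; the outer
 * invocation reruns as long as the decrement does not land on
 * NMI_NOT_RUNNING.
 */
static void example_nesting_protocol(int *state)
{
	if (*state != NMI_NOT_RUNNING) {	/* nested entry */
		*state = NMI_LATCHED;
		return;
	}
	*state = NMI_EXECUTING;
	do {
		/* ... run the NMI handlers (default_do_nmi) ... */
	} while (--(*state) != NMI_NOT_RUNNING); /* LATCHED -> EXECUTING: rerun */
}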
475 :
476 0 : DEFINE_IDTENTRY_RAW(exc_nmi)
477 : {
478 0 : irqentry_state_t irq_state;
479 :
480 : /*
481 : * Re-enable NMIs right here when running as an SEV-ES guest. This might
482 : * cause nested NMIs, but those can be handled safely.
483 : */
484 0 : sev_es_nmi_complete();
485 :
486 0 : if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id()))
487 0 : return;
488 :
489 0 : if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
490 0 : this_cpu_write(nmi_state, NMI_LATCHED);
491 0 : return;
492 : }
493 0 : this_cpu_write(nmi_state, NMI_EXECUTING);
494 0 : this_cpu_write(nmi_cr2, read_cr2());
495 0 : nmi_restart:
496 :
497 : /*
498 : * Needs to happen before DR7 is accessed, because the hypervisor can
499 : * intercept DR7 reads/writes, turning those into #VC exceptions.
500 : */
501 0 : sev_es_ist_enter(regs);
502 :
503 0 : this_cpu_write(nmi_dr7, local_db_save());
504 :
505 0 : irq_state = irqentry_nmi_enter(regs);
506 :
507 0 : inc_irq_stat(__nmi_count);
508 :
509 0 : if (!ignore_nmis)
510 0 : default_do_nmi(regs);
511 :
512 0 : irqentry_nmi_exit(regs, irq_state);
513 :
514 0 : local_db_restore(this_cpu_read(nmi_dr7));
515 :
516 0 : sev_es_ist_exit();
517 :
518 0 : if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
519 0 : write_cr2(this_cpu_read(nmi_cr2));
520 0 : if (this_cpu_dec_return(nmi_state))
521 0 : goto nmi_restart;
522 :
523 0 : if (user_mode(regs))
524 0 : mds_user_clear_cpu_buffers();
525 : }
526 :
527 1 : void stop_nmi(void)
528 : {
529 1 : ignore_nmis++;
530 1 : }
531 :
532 1 : void restart_nmi(void)
533 : {
534 1 : ignore_nmis--;
535 1 : }
536 :
537 : /* reset the back-to-back NMI logic */
538 50528 : void local_touch_nmi(void)
539 : {
540 50528 : __this_cpu_write(last_nmi_rip, 0);
541 50528 : }
542 : EXPORT_SYMBOL_GPL(local_touch_nmi);
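
/*
 * Editorial note: clearing last_nmi_rip here means the next NMI on this
 * CPU will not match the saved RIP in default_do_nmi(), so it cannot be
 * misread as the second half of a back-to-back pair and will reset the
 * 'swallow' logic instead.
 */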