// SPDX-License-Identifier: GPL-2.0

#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/highmem.h>
#include <linux/livepatch.h>
#include <linux/audit.h>

#include "common.h"

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

/* See comment for enter_from_user_mode() in entry-common.h */
static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
{
        arch_check_user_regs(regs);
        lockdep_hardirqs_off(CALLER_ADDR0);

        CT_WARN_ON(ct_state() != CONTEXT_USER);
        user_exit_irqoff();

        instrumentation_begin();
        trace_hardirqs_off_finish();
        instrumentation_end();
}

void noinstr enter_from_user_mode(struct pt_regs *regs)
{
        __enter_from_user_mode(regs);
}

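/*
 * Feed the syscall number and the first four syscall arguments to the
 * audit subsystem, but only when an audit context is active for the
 * current task.
 */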
static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
        if (unlikely(audit_context())) {
                unsigned long args[6];

                syscall_get_arguments(current, regs, args);
                audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
        }
}

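/*
 * Slow-path syscall entry work: syscall user dispatch, ptrace, seccomp,
 * tracepoints and audit. Returns the (possibly rewritten) syscall
 * number, or -1 when the syscall should be skipped.
 */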
static long syscall_trace_enter(struct pt_regs *regs, long syscall,
                                unsigned long work)
{
        long ret = 0;

        /*
         * Handle Syscall User Dispatch. This must come first, since
         * the ABI here can be something that doesn't make sense for
         * other syscall_work features.
         */
        if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
                if (syscall_user_dispatch(regs))
                        return -1L;
        }

        /* Handle ptrace */
        if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
                ret = arch_syscall_enter_tracehook(regs);
                if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
                        return -1L;
        }

        /* Do seccomp after ptrace, to catch any tracer changes. */
        if (work & SYSCALL_WORK_SECCOMP) {
                ret = __secure_computing(NULL);
                if (ret == -1L)
                        return ret;
        }

        /* Either of the above might have changed the syscall number */
        syscall = syscall_get_nr(current, regs);

        if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
                trace_sys_enter(regs, syscall);

        syscall_enter_audit(regs, syscall);

        return ret ? : syscall;
}

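/*
 * Fast path: drop into syscall_trace_enter() only when one of the
 * SYSCALL_WORK_ENTER bits is set for the current task.
 */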
static __always_inline long
__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
{
        unsigned long work = READ_ONCE(current_thread_info()->syscall_work);

        if (work & SYSCALL_WORK_ENTER)
                syscall = syscall_trace_enter(regs, syscall, work);

        return syscall;
}

long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
        return __syscall_enter_from_user_work(regs, syscall);
}

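/*
 * Combined entry point: establishes kernel context (lockdep, context
 * tracking), enables interrupts and runs the syscall entry work.
 *
 * A typical architecture syscall entry path is expected to look roughly
 * like this (sketch only; arch_invoke_syscall() is a placeholder for
 * whatever dispatch mechanism the architecture actually uses):
 *
 *	nr = syscall_enter_from_user_mode(regs, nr);
 *	if (nr != -1)
 *		arch_invoke_syscall(regs, nr);
 *	syscall_exit_to_user_mode(regs);
 */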
noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
        long ret;

        __enter_from_user_mode(regs);

        instrumentation_begin();
        local_irq_enable();
        ret = __syscall_enter_from_user_work(regs, syscall);
        instrumentation_end();

        return ret;
}

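/*
 * Split variant for architectures which need to do additional work
 * between establishing state and running the entry work: this only sets
 * up context and enables interrupts; the caller is expected to invoke
 * syscall_enter_from_user_mode_work() afterwards.
 */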
noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
{
        __enter_from_user_mode(regs);
        instrumentation_begin();
        local_irq_enable();
        instrumentation_end();
}

/* See comment for exit_to_user_mode() in entry-common.h */
static __always_inline void __exit_to_user_mode(void)
{
        instrumentation_begin();
        trace_hardirqs_on_prepare();
        lockdep_hardirqs_on_prepare(CALLER_ADDR0);
        instrumentation_end();

        user_enter_irqoff();
        arch_exit_to_user_mode();
        lockdep_hardirqs_on(CALLER_ADDR0);
}

void noinstr exit_to_user_mode(void)
{
        __exit_to_user_mode();
}

/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { }

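/*
 * Process TIF_NOTIFY_SIGNAL notifications (task_work) first, then let
 * the architecture deliver pending signals or restart the syscall.
 */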
static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work)
{
        if (ti_work & _TIF_NOTIFY_SIGNAL)
                tracehook_notify_signal();

        arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
}

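/*
 * The work loop runs with interrupts enabled and rechecks the flags
 * with interrupts disabled at the bottom of each iteration, so work
 * queued by one handler (or by an interrupt) is not lost on the way
 * out to user space.
 */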
static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
                                            unsigned long ti_work)
{
        /*
         * Before returning to user space ensure that all pending work
         * items have been completed.
         */
        while (ti_work & EXIT_TO_USER_MODE_WORK) {

                local_irq_enable_exit_to_user(ti_work);

                if (ti_work & _TIF_NEED_RESCHED)
                        schedule();

                if (ti_work & _TIF_UPROBE)
                        uprobe_notify_resume(regs);

                if (ti_work & _TIF_PATCH_PENDING)
                        klp_update_patch_state(current);

                if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
                        handle_signal_work(regs, ti_work);

                if (ti_work & _TIF_NOTIFY_RESUME) {
                        tracehook_notify_resume(regs);
                        rseq_handle_notify_resume(NULL, regs);
                }

                /* Architecture specific TIF work */
                arch_exit_to_user_mode_work(regs, ti_work);

                /*
                 * Disable interrupts and reevaluate the work flags as they
                 * might have changed while interrupts and preemption were
                 * enabled above.
                 */
                local_irq_disable_exit_to_user();

                /* Check if any of the above work has queued a deferred wakeup */
                rcu_nocb_flush_deferred_wakeup();

                ti_work = READ_ONCE(current_thread_info()->flags);
        }

        /* Return the latest work state for arch_exit_to_user_mode() */
        return ti_work;
}

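/*
 * Common exit preparation for both the syscall and the interrupt return
 * paths: handle pending TIF work, give the architecture a final pass
 * with the frozen work state, then sanity check the kernel state.
 */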
static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
        unsigned long ti_work = READ_ONCE(current_thread_info()->flags);

        lockdep_assert_irqs_disabled();

        /* Flush pending rcuog wakeup before the last need_resched() check */
        rcu_nocb_flush_deferred_wakeup();

        if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
                ti_work = exit_to_user_mode_loop(regs, ti_work);

        arch_exit_to_user_mode_prepare(regs, ti_work);

        /* Ensure that the address limit is intact and no locks are held */
        addr_limit_user_check();
        kmap_assert_nomap();
        lockdep_assert_irqs_disabled();
        lockdep_sys_exit();
}

/*
 * If SYSCALL_EMU is set, then the only reason to report is when
 * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
 * instruction has already been reported in syscall_enter_from_user_mode().
 */
static inline bool report_single_step(unsigned long work)
{
        if (work & SYSCALL_WORK_SYSCALL_EMU)
                return false;

        return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
}

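/*
 * Slow-path syscall exit work: audit, tracepoints and the ptrace exit
 * report, skipped entirely for syscalls rolled back by user dispatch.
 */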
static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
{
        bool step;

        /*
         * If the syscall was rolled back due to syscall user dispatching,
         * then the tracers below are not invoked, for the same reason the
         * entry-side hooks were skipped in syscall_trace_enter(): the ABI
         * of these syscalls is unknown.
         */
        if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
                if (unlikely(current->syscall_dispatch.on_dispatch)) {
                        current->syscall_dispatch.on_dispatch = false;
                        return;
                }
        }

        audit_syscall_exit(regs);

        if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
                trace_sys_exit(regs, syscall_get_return_value(current, regs));

        step = report_single_step(work);
        if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
                arch_syscall_exit_tracehook(regs, step);
}

/*
 * Syscall specific exit to user mode preparation. Runs with interrupts
 * enabled.
 */
static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
{
        unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
        unsigned long nr = syscall_get_nr(current, regs);

        CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

        if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
                if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
                        local_irq_enable();
        }

        rseq_syscall(regs);

        /*
         * Do one-time syscall specific work. If these work items are
         * enabled, we want to run them exactly once per syscall exit with
         * interrupts enabled.
         */
        if (unlikely(work & SYSCALL_WORK_EXIT))
                syscall_exit_work(regs, work);
}

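/*
 * Run the syscall exit work with interrupts enabled, then disable
 * interrupts and do the common exit-to-user preparation.
 */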
static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
        syscall_exit_to_user_mode_prepare(regs);
        local_irq_disable_exit_to_user();
        exit_to_user_mode_prepare(regs);
}

void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
        __syscall_exit_to_user_mode_work(regs);
}

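/*
 * Counterpart to syscall_enter_from_user_mode(): all instrumentable
 * work runs inside the instrumentation_begin()/end() section, and the
 * final non-instrumentable transition is done by __exit_to_user_mode().
 */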
__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
{
        instrumentation_begin();
        __syscall_exit_to_user_mode_work(regs);
        instrumentation_end();
        __exit_to_user_mode();
}

noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
        __enter_from_user_mode(regs);
}

noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
{
        instrumentation_begin();
        exit_to_user_mode_prepare(regs);
        instrumentation_end();
        __exit_to_user_mode();
}

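/*
 * irqentry_enter() returns state which must be handed back to the
 * matching irqentry_exit() call. A typical interrupt handler wrapper
 * is expected to look roughly like this (sketch only; handle_the_irq()
 * is a placeholder for the real handler):
 *
 *	irqentry_state_t state = irqentry_enter(regs);
 *
 *	instrumentation_begin();
 *	handle_the_irq(regs);
 *	instrumentation_end();
 *
 *	irqentry_exit(regs, state);
 */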
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
        irqentry_state_t ret = {
                .exit_rcu = false,
        };

        if (user_mode(regs)) {
                irqentry_enter_from_user_mode(regs);
                return ret;
        }

        /*
         * If this entry hit the idle task invoke rcu_irq_enter() whether
         * RCU is watching or not.
         *
         * Interrupts can nest when the first interrupt invokes softirq
         * processing on return which enables interrupts.
         *
         * Scheduler ticks in the idle task can mark quiescent state and
         * terminate a grace period, if and only if the timer interrupt is
         * not nested into another interrupt.
         *
         * Checking for rcu_is_watching() here would prevent the nesting
         * interrupt from invoking rcu_irq_enter(). If that nested interrupt
         * is the tick then rcu_flavor_sched_clock_irq() would wrongfully
         * assume that it is the first interrupt and eventually claim
         * quiescent state and end grace periods prematurely.
         *
         * Unconditionally invoke rcu_irq_enter() so RCU state stays
         * consistent.
         *
         * TINY_RCU does not support EQS, so let the compiler eliminate
         * this part when enabled.
         */
        if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
                /*
                 * If RCU is not watching then the same careful
                 * sequence vs. lockdep and tracing is required
                 * as in irqentry_enter_from_user_mode().
                 */
                lockdep_hardirqs_off(CALLER_ADDR0);
                rcu_irq_enter();
                instrumentation_begin();
                trace_hardirqs_off_finish();
                instrumentation_end();

                ret.exit_rcu = true;
                return ret;
        }

        /*
         * If RCU is watching then RCU only wants to check whether it needs
         * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
         * already contains a warning when RCU is not watching, so no point
         * in having another one here.
         */
        lockdep_hardirqs_off(CALLER_ADDR0);
        instrumentation_begin();
        rcu_irq_enter_check_tick();
        trace_hardirqs_off_finish();
        instrumentation_end();

        return ret;
}

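/*
 * Preempt on return from interrupt when nothing prevents it: the
 * preempt count must be zero, and when a reschedule is actually needed
 * preempt_schedule_irq() re-enables interrupts across the schedule.
 */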
void irqentry_exit_cond_resched(void)
{
        if (!preempt_count()) {
                /* Sanity check RCU and thread stack */
                rcu_irq_exit_check_preempt();
                if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
                        WARN_ON_ONCE(!on_thread_stack());
                if (need_resched())
                        preempt_schedule_irq();
        }
}
#ifdef CONFIG_PREEMPT_DYNAMIC
DEFINE_STATIC_CALL(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
#endif

noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
        lockdep_assert_irqs_disabled();

        /* Check whether this returns to user mode */
        if (user_mode(regs)) {
                irqentry_exit_to_user_mode(regs);
        } else if (!regs_irqs_disabled(regs)) {
                /*
                 * If RCU was not watching on entry this needs to be done
                 * carefully and needs the same ordering of lockdep/tracing
                 * and RCU as the return to user mode path.
                 */
                if (state.exit_rcu) {
                        instrumentation_begin();
                        /* Tell the tracer that IRET will enable interrupts */
                        trace_hardirqs_on_prepare();
                        lockdep_hardirqs_on_prepare(CALLER_ADDR0);
                        instrumentation_end();
                        rcu_irq_exit();
                        lockdep_hardirqs_on(CALLER_ADDR0);
                        return;
                }

                instrumentation_begin();
                if (IS_ENABLED(CONFIG_PREEMPTION)) {
#ifdef CONFIG_PREEMPT_DYNAMIC
                        static_call(irqentry_exit_cond_resched)();
#else
                        irqentry_exit_cond_resched();
#endif
                }
                /* Covers both tracing and lockdep */
                trace_hardirqs_on();
                instrumentation_end();
        } else {
                /*
                 * IRQ flags state is correct already. Just tell RCU if it
                 * was not watching on entry.
                 */
                if (state.exit_rcu)
                        rcu_irq_exit();
        }
}

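/*
 * NMI-safe variant: NMIs can interrupt any context, so the previous
 * lockdep hardirq state is captured in the returned state and restored
 * by irqentry_nmi_exit() in exact reverse order.
 */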
irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
{
        irqentry_state_t irq_state;

        irq_state.lockdep = lockdep_hardirqs_enabled();

        __nmi_enter();
        lockdep_hardirqs_off(CALLER_ADDR0);
        lockdep_hardirq_enter();
        rcu_nmi_enter();

        instrumentation_begin();
        trace_hardirqs_off_finish();
        ftrace_nmi_enter();
        instrumentation_end();

        return irq_state;
}

void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
{
        instrumentation_begin();
        ftrace_nmi_exit();
        if (irq_state.lockdep) {
                trace_hardirqs_on_prepare();
                lockdep_hardirqs_on_prepare(CALLER_ADDR0);
        }
        instrumentation_end();

        rcu_nmi_exit();
        lockdep_hardirq_exit();
        if (irq_state.lockdep)
                lockdep_hardirqs_on(CALLER_ADDR0);
        __nmi_exit();
}