LCOV - code coverage report
Current view: top level - kernel/entry - common.c (source / functions)
Test: landlock.info
Date: 2021-04-22 12:43:58

                  Hit    Total    Coverage
Lines:            151      211      71.6 %
Functions:         12       21      57.1 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : 
       3             : #include <linux/context_tracking.h>
       4             : #include <linux/entry-common.h>
       5             : #include <linux/highmem.h>
       6             : #include <linux/livepatch.h>
       7             : #include <linux/audit.h>
       8             : 
       9             : #include "common.h"
      10             : 
      11             : #define CREATE_TRACE_POINTS
      12             : #include <trace/events/syscalls.h>
      13             : 
      14             : /* See comment for enter_from_user_mode() in entry-common.h */
      15      442540 : static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
      16             : {
      17      442540 :         arch_check_user_regs(regs);
      18      442540 :         lockdep_hardirqs_off(CALLER_ADDR0);
      19             : 
      20      442549 :         CT_WARN_ON(ct_state() != CONTEXT_USER);
      21      442549 :         user_exit_irqoff();
      22             : 
      23      442549 :         instrumentation_begin();
      24      442549 :         trace_hardirqs_off_finish();
      25      442555 :         instrumentation_end();
      26             : }
      27             : 
      28           0 : void noinstr enter_from_user_mode(struct pt_regs *regs)
      29             : {
      30           0 :         __enter_from_user_mode(regs);
      31           0 : }
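
The __always_inline/noinstr split above is this file's recurring pattern: the inlined helper runs in a context where instrumentation (tracing, kprobes, KASAN) is forbidden, and instrumentation_begin()/instrumentation_end() carve out the only region where instrumentable calls are permitted. A minimal sketch of that contract, with trace_example_event() standing in as a hypothetical tracepoint:

        noinstr void example_entry_helper(struct pt_regs *regs)
        {
                /* only noinstr-safe code here: no tracepoints, no kprobes */
                instrumentation_begin();
                trace_example_event(regs);      /* instrumentable calls OK */
                instrumentation_end();
                /* back to noinstr-only territory */
        }
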
      32             : 
      33             : static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
      34             : {
      35             :         if (unlikely(audit_context())) {
      36             :                 unsigned long args[6];
      37             : 
      38             :                 syscall_get_arguments(current, regs, args);
      39             :                 audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
      40             :         }
      41             : }
      42             : 
      43           0 : static long syscall_trace_enter(struct pt_regs *regs, long syscall,
      44             :                                 unsigned long work)
      45             : {
      46           0 :         long ret = 0;
      47             : 
      48             :         /*
      49             :          * Handle Syscall User Dispatch.  This must come first, since
      50             :          * the ABI here can be something that doesn't make sense for
      51             :          * other syscall_work features.
      52             :          */
      53           0 :         if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
      54           0 :                 if (syscall_user_dispatch(regs))
      55             :                         return -1L;
      56             :         }
      57             : 
      58             :         /* Handle ptrace */
      59           0 :         if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
      60           0 :                 ret = arch_syscall_enter_tracehook(regs);
      61           0 :                 if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
      62             :                         return -1L;
      63             :         }
      64             : 
      65             :         /* Do seccomp after ptrace, to catch any tracer changes. */
      66           0 :         if (work & SYSCALL_WORK_SECCOMP) {
      67           0 :                 ret = __secure_computing(NULL);
      68             :                 if (ret == -1L)
      69             :                         return ret;
      70             :         }
      71             : 
      72             :         /* Either of the above might have changed the syscall number */
      73           0 :         syscall = syscall_get_nr(current, regs);
      74             : 
      75           0 :         if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
      76           0 :                 trace_sys_enter(regs, syscall);
      77             : 
      78           0 :         syscall_enter_audit(regs, syscall);
      79             : 
      80             :         return ret ? : syscall;
      81             : }
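
The SYSCALL_WORK_SYSCALL_TRACE branch above is exactly what a ptrace()-based tracer exercises. A minimal userspace sketch (Linux, x86-64 register names, error handling omitted) that stops the child at every syscall entry and exit:

        #include <stdio.h>
        #include <sys/ptrace.h>
        #include <sys/types.h>
        #include <sys/user.h>
        #include <sys/wait.h>
        #include <unistd.h>

        int main(void)
        {
                pid_t child = fork();
                if (child == 0) {
                        ptrace(PTRACE_TRACEME, 0, NULL, NULL);
                        execlp("true", "true", (char *)NULL);
                        return 1;
                }

                int status;
                waitpid(child, &status, 0);     /* initial stop after execve */
                for (;;) {
                        /* resume until the next syscall-entry/exit stop */
                        ptrace(PTRACE_SYSCALL, child, NULL, NULL);
                        waitpid(child, &status, 0);
                        if (WIFEXITED(status))
                                break;
                        struct user_regs_struct regs;
                        ptrace(PTRACE_GETREGS, child, NULL, &regs);
                        printf("syscall %llu\n", regs.orig_rax); /* x86-64 */
                }
                return 0;
        }

Note that PTRACE_SYSCALL produces two stops per syscall (entry and exit), so each number prints twice; a real tracer tracks entry/exit parity.
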
      82             : 
      83             : static __always_inline long
      84      296014 : __syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
      85             : {
      86      296014 :         unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
      87             : 
      88      296014 :         if (work & SYSCALL_WORK_ENTER)
      89           0 :                 syscall = syscall_trace_enter(regs, syscall, work);
      90             : 
      91      296014 :         return syscall;
      92             : }
      93             : 
      94           0 : long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
      95             : {
      96           0 :         return __syscall_enter_from_user_work(regs, syscall);
      97             : }
      98             : 
      99      295977 : noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
     100             : {
     101      295977 :         long ret;
     102             : 
     103      295977 :         __enter_from_user_mode(regs);
     104             : 
     105      295990 :         instrumentation_begin();
     106      295990 :         local_irq_enable();
     107      296014 :         ret = __syscall_enter_from_user_work(regs, syscall);
     108      296014 :         instrumentation_end();
     109             : 
     110      296014 :         return ret;
     111             : }
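
For orientation, this is how an architecture typically strings these helpers together. A sketch loosely patterned on x86's do_syscall_64(); the entry point name and the regs->ax dispatch convention here are arch-specific and simplified:

        __visible noinstr void arch_do_syscall(struct pt_regs *regs, long nr)
        {
                nr = syscall_enter_from_user_mode(regs, nr);

                instrumentation_begin();
                if (nr >= 0 && nr < NR_syscalls)
                        regs->ax = sys_call_table[nr](regs);
                instrumentation_end();

                syscall_exit_to_user_mode(regs);
        }
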
     112             : 
     113           0 : noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
     114             : {
     115           0 :         __enter_from_user_mode(regs);
     116           0 :         instrumentation_begin();
     117           0 :         local_irq_enable();
     118           0 :         instrumentation_end();
     119           0 : }
     120             : 
     121             : /* See comment for exit_to_user_mode() in entry-common.h */
     122      442569 : static __always_inline void __exit_to_user_mode(void)
     123             : {
     124      442569 :         instrumentation_begin();
     125      442569 :         trace_hardirqs_on_prepare();
     126      442551 :         lockdep_hardirqs_on_prepare(CALLER_ADDR0);
     127      442589 :         instrumentation_end();
     128             : 
     129      442589 :         user_enter_irqoff();
     130      442589 :         arch_exit_to_user_mode();
     131      442605 :         lockdep_hardirqs_on(CALLER_ADDR0);
     132             : }
     133             : 
     134           0 : void noinstr exit_to_user_mode(void)
     135             : {
     136           0 :         __exit_to_user_mode();
     137           0 : }
     138             : 
     139             : /* Workaround to allow gradual conversion of architecture code */
     140           0 : void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { }
     141             : 
     142         563 : static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work)
     143             : {
     144         563 :         if (ti_work & _TIF_NOTIFY_SIGNAL)
     145           0 :                 tracehook_notify_signal();
     146             : 
     147         563 :         arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
     148         563 : }
     149             : 
     150       54834 : static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
     151             :                                             unsigned long ti_work)
     152             : {
     153             :         /*
     154             :          * Before returning to user space ensure that all pending work
     155             :          * items have been completed.
     156             :          */
     157      109730 :         while (ti_work & EXIT_TO_USER_MODE_WORK) {
     158             : 
     159       54909 :                 local_irq_enable_exit_to_user(ti_work);
     160             : 
     161       54911 :                 if (ti_work & _TIF_NEED_RESCHED)
     162        2635 :                         schedule();
     163             : 
     164       54911 :                 if (ti_work & _TIF_UPROBE)
     165       54911 :                         uprobe_notify_resume(regs);
     166             : 
     167       54911 :                 if (ti_work & _TIF_PATCH_PENDING)
     168           0 :                         klp_update_patch_state(current);
     169             : 
     170       54911 :                 if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
     171         563 :                         handle_signal_work(regs, ti_work);
     172             : 
     173       54911 :                 if (ti_work & _TIF_NOTIFY_RESUME) {
     174       51835 :                         tracehook_notify_resume(regs);
     175       51835 :                         rseq_handle_notify_resume(NULL, regs);
     176             :                 }
     177             : 
     178             :                 /* Architecture specific TIF work */
     179       54912 :                 arch_exit_to_user_mode_work(regs, ti_work);
     180             : 
     181             :                 /*
     182             :                  * Disable interrupts and reevaluate the work flags as they
     183             :          * might have changed while interrupts and preemption were
     184             :                  * enabled above.
     185             :                  */
     186       54912 :                 local_irq_disable_exit_to_user();
     187             : 
     188             :                 /* Check if any of the above work has queued a deferred wakeup */
     189       54896 :                 rcu_nocb_flush_deferred_wakeup();
     190             : 
     191       54896 :                 ti_work = READ_ONCE(current_thread_info()->flags);
     192             :         }
     193             : 
     194             :         /* Return the latest work state for arch_exit_to_user_mode() */
     195       54821 :         return ti_work;
     196             : }
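
The shape of this loop is worth spelling out, because the correctness argument lives in the interrupt state: work is handled with IRQs enabled (it may block or schedule), but the flags are only trusted when re-read with IRQs disabled. As a pseudocode-level sketch of the invariant:

        ti_work = read_flags();                 /* IRQs off */
        while (ti_work & WORK_MASK) {
                irqs_on();
                handle(ti_work);                /* may set new flags */
                irqs_off();                     /* close the race window */
                ti_work = read_flags();         /* decide only with IRQs off */
        }
        /* return with IRQs off and no unhandled work */
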
     197             : 
     198      442491 : static void exit_to_user_mode_prepare(struct pt_regs *regs)
     199             : {
     200      442491 :         unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
     201             : 
     202      884996 :         lockdep_assert_irqs_disabled();
     203             : 
     204             :         /* Flush pending rcuog wakeup before the last need_resched() check */
     205      442505 :         rcu_nocb_flush_deferred_wakeup();
     206             : 
     207      442505 :         if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
     208       54836 :                 ti_work = exit_to_user_mode_loop(regs, ti_work);
     209             : 
     210      442490 :         arch_exit_to_user_mode_prepare(regs, ti_work);
     211             : 
     212             :         /* Ensure that the address limit is intact and no locks are held */
     213      442496 :         addr_limit_user_check();
     214      442496 :         kmap_assert_nomap();
     215      885008 :         lockdep_assert_irqs_disabled();
     216      442506 :         lockdep_sys_exit();
     217      442538 : }
     218             : 
     219             : /*
     220             :  * If SYSCALL_EMU is set, then the only reason to report is when
     221             :  * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP).  This syscall
     222             :  * instruction has already been reported in syscall_enter_from_user_mode().
     223             :  */
     224           0 : static inline bool report_single_step(unsigned long work)
     225             : {
     226           0 :         if (work & SYSCALL_WORK_SYSCALL_EMU)
     227             :                 return false;
     228             : 
     229           0 :         return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
     230             : }
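
Enumerating the cases makes the comment above concrete (derived directly from the code):

        SYSCALL_EMU   SYSCALL_EXIT_TRAP   report_single_step()
        -----------   -----------------   --------------------
        set           (any)               false  (already reported at entry)
        clear         set                 true
        clear         clear               false
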
     231             : 
     232           0 : static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
     233             : {
     234           0 :         bool step;
     235             : 
     236             :         /*
     237             :          * If the syscall was rolled back due to syscall user dispatching,
     238             :          * then the tracers below are not invoked for the same reason as
     239             :          * the entry side was not invoked in syscall_trace_enter(): The ABI
     240             :          * of these syscalls is unknown.
     241             :          */
     242           0 :         if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
     243           0 :                 if (unlikely(current->syscall_dispatch.on_dispatch)) {
     244           0 :                         current->syscall_dispatch.on_dispatch = false;
     245           0 :                         return;
     246             :                 }
     247             :         }
     248             : 
     249           0 :         audit_syscall_exit(regs);
     250             : 
     251           0 :         if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
     252           0 :                 trace_sys_exit(regs, syscall_get_return_value(current, regs));
     253             : 
     254           0 :         step = report_single_step(work);
     255           0 :         if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
     256           0 :                 arch_syscall_exit_tracehook(regs, step);
     257             : }
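
The on_dispatch rollback above is the kernel side of Syscall User Dispatch, configured from userspace via prctl() (Linux >= 5.11). A minimal userspace sketch; the constants are defined defensively in case older headers lack them, and error handling is omitted:

        #define _GNU_SOURCE
        #include <signal.h>
        #include <stdio.h>
        #include <sys/prctl.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        #ifndef PR_SET_SYSCALL_USER_DISPATCH
        #define PR_SET_SYSCALL_USER_DISPATCH 59
        #define PR_SYS_DISPATCH_ON 1
        #endif
        #ifndef SYSCALL_DISPATCH_FILTER_ALLOW
        #define SYSCALL_DISPATCH_FILTER_ALLOW 0
        #define SYSCALL_DISPATCH_FILTER_BLOCK 1
        #endif

        static volatile char selector = SYSCALL_DISPATCH_FILTER_ALLOW;

        static void on_sigsys(int sig, siginfo_t *info, void *uc)
        {
                /* re-allow syscalls so the handler (and sigreturn) can run */
                selector = SYSCALL_DISPATCH_FILTER_ALLOW;
        }

        int main(void)
        {
                struct sigaction sa = { 0 };
                sa.sa_sigaction = on_sigsys;
                sa.sa_flags = SA_SIGINFO;
                sigaction(SIGSYS, &sa, NULL);

                /* empty allowed region: every syscall hits the selector */
                if (prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON,
                          0, 0, &selector))
                        return 1;

                selector = SYSCALL_DISPATCH_FILTER_BLOCK;
                syscall(SYS_getpid);    /* rolled back; arrives as SIGSYS */
                printf("back after SIGSYS\n");
                return 0;
        }
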
     258             : 
     259             : /*
     260             :  * Syscall specific exit to user mode preparation. Runs with interrupts
     261             :  * enabled.
     262             :  */
     263      295952 : static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
     264             : {
     265      295952 :         unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
     266      295952 :         unsigned long nr = syscall_get_nr(current, regs);
     267             : 
     268      295952 :         CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
     269             : 
     270      295952 :         if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
     271      295952 :                 if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
     272           0 :                         local_irq_enable();
     273             :         }
     274             : 
     275      296002 :         rseq_syscall(regs);
     276             : 
     277             :         /*
     278             :          * Do one-time syscall specific work. If these work items are
     279             :          * enabled, we want to run them exactly once per syscall exit with
     280             :          * interrupts enabled.
     281             :          */
     282      296002 :         if (unlikely(work & SYSCALL_WORK_EXIT))
     283           0 :                 syscall_exit_work(regs, work);
     284      296002 : }
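
The CONFIG_PROVE_LOCKING check above is what catches a syscall that returns to this path with interrupts still disabled. A hypothetical offender, purely for illustration:

        SYSCALL_DEFINE0(buggy_example)
        {
                local_irq_disable();
                return 0;  /* exit warns: "syscall NNN left IRQs disabled" */
        }
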
     285             : 
     286      295968 : static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs)
     287             : {
     288      295968 :         syscall_exit_to_user_mode_prepare(regs);
     289      296021 :         local_irq_disable_exit_to_user();
     290      295979 :         exit_to_user_mode_prepare(regs);
     291             : }
     292             : 
     293           0 : void syscall_exit_to_user_mode_work(struct pt_regs *regs)
     294             : {
     295           0 :         __syscall_exit_to_user_mode_work(regs);
     296           0 : }
     297             : 
     298      295968 : __visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
     299             : {
     300      295968 :         instrumentation_begin();
     301      295968 :         __syscall_exit_to_user_mode_work(regs);
     302      296001 :         instrumentation_end();
     303      296001 :         __exit_to_user_mode();
     304      296048 : }
     305             : 
     306      146563 : noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
     307             : {
     308      146563 :         __enter_from_user_mode(regs);
     309      146565 : }
     310             : 
     311      146556 : noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
     312             : {
     313      146556 :         instrumentation_begin();
     314      146556 :         exit_to_user_mode_prepare(regs);
     315      146568 :         instrumentation_end();
     316      146568 :         __exit_to_user_mode();
     317      146562 : }
     318             : 
     319      191042 : noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
     320             : {
     321      191042 :         irqentry_state_t ret = {
     322             :                 .exit_rcu = false,
     323             :         };
     324             : 
     325      191042 :         if (user_mode(regs)) {
     326      146562 :                 irqentry_enter_from_user_mode(regs);
     327      146565 :                 return ret;
     328             :         }
     329             : 
     330             :         /*
     331             :          * If this entry hit the idle task invoke rcu_irq_enter() whether
     332             :          * RCU is watching or not.
     333             :          *
     334             :          * Interrupts can nest when the first interrupt invokes softirq
     335             :          * processing on return which enables interrupts.
     336             :          *
     337             :          * Scheduler ticks in the idle task can mark quiescent state and
     338             :          * terminate a grace period, if and only if the timer interrupt is
     339             :          * not nested into another interrupt.
     340             :          *
     341             :          * Checking for rcu_is_watching() here would prevent the nesting
     342             :          * interrupt from invoking rcu_irq_enter(). If that nested interrupt
     343             :          * is the tick then rcu_flavor_sched_clock_irq() would wrongfully
     344             :          * assume that it is the first interrupt and eventually claim
     345             :          * quiescent state and end grace periods prematurely.
     346             :          *
     347             :          * Unconditionally invoke rcu_irq_enter() so RCU state stays
     348             :          * consistent.
     349             :          *
     350             :          * TINY_RCU does not support EQS, so let the compiler eliminate
     351             :          * this part when enabled.
     352             :          */
     353       44480 :         if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
     354             :                 /*
     355             :                  * If RCU is not watching then the same careful
     356             :                  * sequence vs. lockdep and tracing is required
     357             :                  * as in irqentry_enter_from_user_mode().
     358             :                  */
     359       18363 :                 lockdep_hardirqs_off(CALLER_ADDR0);
     360       19003 :                 rcu_irq_enter();
     361       19211 :                 instrumentation_begin();
     362       19211 :                 trace_hardirqs_off_finish();
     363       18989 :                 instrumentation_end();
     364             : 
     365       18989 :                 ret.exit_rcu = true;
     366       18989 :                 return ret;
     367             :         }
     368             : 
     369             :         /*
     370             :          * If RCU is watching then RCU only wants to check whether it needs
     371             :          * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
     372             :          * already contains a warning when RCU is not watching, so no point
     373             :          * in having another one here.
     374             :          */
     375       26117 :         lockdep_hardirqs_off(CALLER_ADDR0);
     376       26089 :         instrumentation_begin();
     377       26089 :         rcu_irq_enter_check_tick();
     378       26089 :         trace_hardirqs_off_finish();
     379       26080 :         instrumentation_end();
     380             : 
     381       26080 :         return ret;
     382             : }
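
Architectures use irqentry_enter()/irqentry_exit() as a bracket around the actual handler, in the style of the x86 IDT entry wrappers. A sketch with hypothetical names:

        noinstr void arch_handle_device_irq(struct pt_regs *regs)
        {
                irqentry_state_t state = irqentry_enter(regs);

                instrumentation_begin();
                my_device_irq_handler(regs);    /* hypothetical handler */
                instrumentation_end();

                irqentry_exit(regs, state);
        }
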
     383             : 
     384           0 : void irqentry_exit_cond_resched(void)
     385             : {
     386           0 :         if (!preempt_count()) {
     387             :                 /* Sanity check RCU and thread stack */
     388           0 :                 rcu_irq_exit_check_preempt();
     389           0 :                 if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
     390             :                         WARN_ON_ONCE(!on_thread_stack());
     391           0 :                 if (need_resched())
     392           0 :                         preempt_schedule_irq();
     393             :         }
     394           0 : }
     395             : #ifdef CONFIG_PREEMPT_DYNAMIC
     396             : DEFINE_STATIC_CALL(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
     397             : #endif
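
With CONFIG_PREEMPT_DYNAMIC the preemption model chosen on the command line (preempt=none/voluntary/full) retargets this static call at boot; a NULL target turns a void static call site into a no-op, so the model-switching code can disable it entirely. A sketch patterned on the scheduler's model-switching code:

        static_call_update(irqentry_exit_cond_resched, NULL);  /* NOP */
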
     398             : 
     399      192196 : noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
     400             : {
     401      384434 :         lockdep_assert_irqs_disabled();
     402             : 
     403             :         /* Check whether this returns to user mode */
     404      192236 :         if (user_mode(regs)) {
     405      146555 :                 irqentry_exit_to_user_mode(regs);
     406       45681 :         } else if (!regs_irqs_disabled(regs)) {
     407             :                 /*
     408             :                  * If RCU was not watching on entry this needs to be done
     409             :                  * carefully and needs the same ordering of lockdep/tracing
     410             :                  * and RCU as the return to user mode path.
     411             :                  */
     412       45681 :                 if (state.exit_rcu) {
     413       19392 :                         instrumentation_begin();
     414             :                         /* Tell the tracer that IRET will enable interrupts */
     415       19392 :                         trace_hardirqs_on_prepare();
     416       19397 :                         lockdep_hardirqs_on_prepare(CALLER_ADDR0);
     417       19395 :                         instrumentation_end();
     418       19395 :                         rcu_irq_exit();
     419       19420 :                         lockdep_hardirqs_on(CALLER_ADDR0);
     420       19421 :                         return;
     421             :                 }
     422             : 
     423       26289 :                 instrumentation_begin();
     424       26289 :                 if (IS_ENABLED(CONFIG_PREEMPTION)) {
     425             : #ifdef CONFIG_PREEMPT_DYNAMIC
     426             :                         static_call(irqentry_exit_cond_resched)();
     427             : #else
     428             :                         irqentry_exit_cond_resched();
     429             : #endif
     430             :                 }
     431             :                 /* Covers both tracing and lockdep */
     432       26289 :                 trace_hardirqs_on();
     433       26289 :                 instrumentation_end();
     434             :         } else {
     435             :                 /*
     436             :                  * IRQ flags state is correct already. Just tell RCU if it
     437             :                  * was not watching on entry.
     438             :                  */
     439           0 :                 if (state.exit_rcu)
     440           0 :                         rcu_irq_exit();
     441             :         }
     442             : }
     443             : 
     444           1 : irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
     445             : {
     446           1 :         irqentry_state_t irq_state;
     447             : 
     448           1 :         irq_state.lockdep = lockdep_hardirqs_enabled();
     449             : 
     450           1 :         __nmi_enter();
     451           1 :         lockdep_hardirqs_off(CALLER_ADDR0);
     452           1 :         lockdep_hardirq_enter();
     453           1 :         rcu_nmi_enter();
     454             : 
     455           1 :         instrumentation_begin();
     456           1 :         trace_hardirqs_off_finish();
     457           1 :         ftrace_nmi_enter();
     458           1 :         instrumentation_end();
     459             : 
     460           1 :         return irq_state;
     461             : }
     462             : 
     463           1 : void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
     464             : {
     465           1 :         instrumentation_begin();
     466           1 :         ftrace_nmi_exit();
     467           1 :         if (irq_state.lockdep) {
     468           1 :                 trace_hardirqs_on_prepare();
     469           1 :                 lockdep_hardirqs_on_prepare(CALLER_ADDR0);
     470             :         }
     471           1 :         instrumentation_end();
     472             : 
     473           1 :         rcu_nmi_exit();
     474           1 :         lockdep_hardirq_exit();
     475           1 :         if (irq_state.lockdep)
     476           1 :                 lockdep_hardirqs_on(CALLER_ADDR0);
     477           1 :         __nmi_exit();
     478           1 : }
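
The saved irq_state.lockdep matters because an NMI can fire with hardirqs either enabled or disabled in the interrupted context, and the exit path must restore whichever state it found. Like the irqentry pair, these are used bracket-style; a sketch with hypothetical names:

        noinstr void arch_handle_nmi(struct pt_regs *regs)
        {
                irqentry_state_t state = irqentry_nmi_enter(regs);

                instrumentation_begin();
                my_nmi_handler(regs);           /* hypothetical handler */
                instrumentation_end();

                irqentry_nmi_exit(regs, state);
        }
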

Generated by: LCOV version 1.14