LCOV - code coverage report
Current view: top level - arch/x86/entry - common.c (source / functions)
Test: landlock.info
Date: 2021-04-22 12:43:58

                 Hit    Total    Coverage
Lines:              9      57      15.8 %
Functions:          1       6      16.7 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : /*
       3             :  * common.c - C code for kernel entry and exit
       4             :  * Copyright (c) 2015 Andrew Lutomirski
       5             :  *
       6             :  * Based on asm and ptrace code by many authors.  The code here originated
       7             :  * in ptrace.c and signal.c.
       8             :  */
       9             : 
      10             : #include <linux/kernel.h>
      11             : #include <linux/sched.h>
      12             : #include <linux/sched/task_stack.h>
      13             : #include <linux/entry-common.h>
      14             : #include <linux/mm.h>
      15             : #include <linux/smp.h>
      16             : #include <linux/errno.h>
      17             : #include <linux/ptrace.h>
      18             : #include <linux/export.h>
      19             : #include <linux/nospec.h>
      20             : #include <linux/syscalls.h>
      21             : #include <linux/uaccess.h>
      22             : 
      23             : #ifdef CONFIG_XEN_PV
      24             : #include <xen/xen-ops.h>
      25             : #include <xen/events.h>
      26             : #endif
      27             : 
      28             : #include <asm/desc.h>
      29             : #include <asm/traps.h>
      30             : #include <asm/vdso.h>
      31             : #include <asm/cpufeature.h>
      32             : #include <asm/fpu/api.h>
      33             : #include <asm/nospec-branch.h>
      34             : #include <asm/io_bitmap.h>
      35             : #include <asm/syscall.h>
      36             : #include <asm/irq_stack.h>
      37             : 
      38             : #ifdef CONFIG_X86_64
      39      389079 : __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
      40             : {
      41      389079 :         nr = syscall_enter_from_user_mode(regs, nr);
      42             : 
      43      389112 :         instrumentation_begin();
      44      389112 :         if (likely(nr < NR_syscalls)) {
      45      389112 :                 nr = array_index_nospec(nr, NR_syscalls);
      46      389114 :                 regs->ax = sys_call_table[nr](regs);
      47             : #ifdef CONFIG_X86_X32_ABI
      48             :         } else if (likely((nr & __X32_SYSCALL_BIT) &&
      49             :                           (nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) {
      50             :                 nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT,
      51             :                                         X32_NR_syscalls);
      52             :                 regs->ax = x32_sys_call_table[nr](regs);
      53             : #endif
      54             :         }
      55      386631 :         instrumentation_end();
      56      386631 :         syscall_exit_to_user_mode(regs);
      57      386683 : }
      58             : #endif
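                      : 
                      : /*
                      :  * Worked example (editorial, not in the kernel source): with
                      :  * __X32_SYSCALL_BIT == 0x40000000, an x32 getpid() arrives as
                      :  * nr == 0x40000027.  The bit test above succeeds, masking the
                      :  * bit off yields 39 < X32_NR_syscalls, and the dispatch becomes
                      :  * x32_sys_call_table[39](regs).  Ordinary 64-bit syscall numbers
                      :  * are far below __X32_SYSCALL_BIT and take the first branch.
                      :  */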
      59             : 
      60             : #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
      61           0 : static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs)
      62             : {
      63           0 :         if (IS_ENABLED(CONFIG_IA32_EMULATION))
      64           0 :                 current_thread_info()->status |= TS_COMPAT;
      65             : 
      66           0 :         return (unsigned int)regs->orig_ax;
      67             : }
      68             : 
      69             : /*
      70             :  * Invoke a 32-bit syscall.  Called with IRQs on in CONTEXT_KERNEL.
      71             :  */
      72           0 : static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs,
      73             :                                                   unsigned int nr)
      74             : {
      75           0 :         if (likely(nr < IA32_NR_syscalls)) {
      76           0 :                 nr = array_index_nospec(nr, IA32_NR_syscalls);
      77           0 :                 regs->ax = ia32_sys_call_table[nr](regs);
      78             :         }
      79             : }
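                      : 
                      : /*
                      :  * Editorial note on the pattern above: array_index_nospec() forces
                      :  * nr to 0 whenever nr >= IA32_NR_syscalls, even while the CPU is
                      :  * still speculating past a mispredicted 'nr < IA32_NR_syscalls'
                      :  * check, so the table load cannot be steered out of bounds
                      :  * (Spectre v1 hardening).  Architecturally the clamp is a no-op,
                      :  * because the branch already guarantees nr is in range.
                      :  */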
      80             : 
      81             : /* Handles int $0x80 */
      82           0 : __visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
      83             : {
      84           0 :         unsigned int nr = syscall_32_enter(regs);
      85             : 
      86             :         /*
      87             :          * Subtlety here: if ptrace pokes something larger than 2^32-1 into
      88             :          * orig_ax, the unsigned int return value truncates it.  This may
      89             :          * or may not be necessary, but it matches the old asm behavior.
      90             :          */
      91           0 :         nr = (unsigned int)syscall_enter_from_user_mode(regs, nr);
      92           0 :         instrumentation_begin();
      93             : 
      94           0 :         do_syscall_32_irqs_on(regs, nr);
      95             : 
      96           0 :         instrumentation_end();
      97           0 :         syscall_exit_to_user_mode(regs);
      98           0 : }
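                      : 
                      : /*
                      :  * Worked example for the truncation above (editorial): if ptrace
                      :  * pokes 0x100000001 into orig_ax, the (unsigned int) casts reduce
                      :  * it modulo 2^32 to 1, so the range check and table index see a
                      :  * small 32-bit number, never the full 64-bit value.
                      :  */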
      99             : 
     100           0 : static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
     101             : {
     102           0 :         unsigned int nr = syscall_32_enter(regs);
     103           0 :         int res;
     104             : 
     105             :         /*
     106             :          * This cannot use syscall_enter_from_user_mode() as it has to
     107             :          * fetch EBP before invoking any of the syscall entry work
     108             :          * functions.
     109             :          */
     110           0 :         syscall_enter_from_user_mode_prepare(regs);
     111             : 
     112           0 :         instrumentation_begin();
     113             :         /* Fetch EBP from where the vDSO stashed it. */
     114           0 :         if (IS_ENABLED(CONFIG_X86_64)) {
     115             :                 /*
     116             :                  * Micro-optimization: the pointer we're following is
     117             :                  * explicitly 32 bits, so it can't be out of range.
     118             :                  */
     119           0 :                 res = __get_user(*(u32 *)&regs->bp,
     120             :                          (u32 __user __force *)(unsigned long)(u32)regs->sp);
     121             :         } else {
     122             :                 res = get_user(*(u32 *)&regs->bp,
     123             :                        (u32 __user __force *)(unsigned long)(u32)regs->sp);
     124             :         }
     125             : 
     126           0 :         if (res) {
     127             :                 /* User code screwed up. */
     128           0 :                 regs->ax = -EFAULT;
     129             : 
     130           0 :                 instrumentation_end();
     131           0 :                 local_irq_disable();
     132           0 :                 irqentry_exit_to_user_mode(regs);
     133           0 :                 return false;
     134             :         }
     135             : 
      136             :         /* The cast truncates any ptrace-induced syscall nr > 2^32 - 1 */
     137           0 :         nr = (unsigned int)syscall_enter_from_user_mode_work(regs, nr);
     138             : 
     139             :         /* Now this is just like a normal syscall. */
     140           0 :         do_syscall_32_irqs_on(regs, nr);
     141             : 
     142           0 :         instrumentation_end();
     143           0 :         syscall_exit_to_user_mode(regs);
     144           0 :         return true;
     145             : }
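                      : 
                      : /*
                      :  * Background (editorial sketch of the user-space side): the 32-bit
                      :  * syscall ABI passes the sixth argument in %ebp, but the fast-entry
                      :  * path needs %ebp to carry the user stack pointer across SYSENTER.
                      :  * The vDSO's __kernel_vsyscall therefore stashes the real %ebp on
                      :  * the user stack first, roughly:
                      :  *
                      :  *      push %ecx; push %edx; push %ebp
                      :  *      mov  %esp, %ebp
                      :  *      sysenter
                      :  *
                      :  * which is why __do_fast_syscall_32() reloads regs->bp from the
                      :  * word at (u32)regs->sp above.
                      :  */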
     146             : 
     147             : /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
     148           0 : __visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
     149             : {
     150             :         /*
     151             :          * Called using the internal vDSO SYSENTER/SYSCALL32 calling
     152             :          * convention.  Adjust regs so it looks like we entered using int80.
     153             :          */
     154           0 :         unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
     155           0 :                                         vdso_image_32.sym_int80_landing_pad;
     156             : 
     157             :         /*
     158             :          * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
     159             :          * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
     160             :          * Fix it up.
     161             :          */
     162           0 :         regs->ip = landing_pad;
     163             : 
     164             :         /* Invoke the syscall. If it failed, keep it simple: use IRET. */
     165           0 :         if (!__do_fast_syscall_32(regs))
     166             :                 return 0;
     167             : 
     168             : #ifdef CONFIG_X86_64
     169             :         /*
     170             :          * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
     171             :          * SYSRETL is available on all 64-bit CPUs, so we don't need to
     172             :          * bother with SYSEXIT.
     173             :          *
     174             :          * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
     175             :          * because the ECX fixup above will ensure that this is essentially
     176             :          * never the case.
     177             :          */
     178           0 :         return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
     179           0 :                 regs->ip == landing_pad &&
     180           0 :                 (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
     181             : #else
     182             :         /*
     183             :          * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
     184             :          *
     185             :          * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
     186             :          * because the ECX fixup above will ensure that this is essentially
     187             :          * never the case.
     188             :          *
     189             :          * We don't allow syscalls at all from VM86 mode, but we still
     190             :          * need to check VM, because we might be returning from sys_vm86.
     191             :          */
     192             :         return static_cpu_has(X86_FEATURE_SEP) &&
     193             :                 regs->cs == __USER_CS && regs->ss == __USER_DS &&
     194             :                 regs->ip == landing_pad &&
     195             :                 (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
     196             : #endif
     197             : }
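                      : 
                      : /*
                      :  * Editorial note on the checks above: SYSEXIT/SYSRETL reload only a
                      :  * fixed subset of user state, so the fast return is offered only
                      :  * when that subset is all that matters: the standard flat cs/ss
                      :  * selectors, ip still at the vDSO landing pad, and none of the
                      :  * RF/TF (or VM) flags that only IRET restores faithfully.  Anything
                      :  * else returns 0 and the asm falls back to the fully general IRET.
                      :  */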
     198             : 
     199             : /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
     200           0 : __visible noinstr long do_SYSENTER_32(struct pt_regs *regs)
     201             : {
     202             :         /* SYSENTER loses RSP, but the vDSO saved it in RBP. */
     203           0 :         regs->sp = regs->bp;
     204             : 
     205             :         /* SYSENTER clobbers EFLAGS.IF.  Assume it was set in usermode. */
     206           0 :         regs->flags |= X86_EFLAGS_IF;
     207             : 
     208           0 :         return do_fast_syscall_32(regs);
     209             : }
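                      : 
                      : /*
                      :  * Editorial note: SYSENTER loads CS, EIP and ESP from the
                      :  * IA32_SYSENTER_* MSRs and clears EFLAGS.IF without saving any user
                      :  * state, which is why the user stack pointer has to be recovered
                      :  * from %ebp (the vDSO convention sketched earlier) and IF can only
                      :  * be assumed to have been set in user mode.
                      :  */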
     210             : #endif
     211             : 
     212           0 : SYSCALL_DEFINE0(ni_syscall)
     213             : {
     214           0 :         return -ENOSYS;
     215             : }
     216             : 
     217             : #ifdef CONFIG_XEN_PV
     218             : #ifndef CONFIG_PREEMPTION
     219             : /*
     220             :  * Some hypercalls issued by the toolstack can take many 10s of
     221             :  * seconds. Allow tasks running hypercalls via the privcmd driver to
     222             :  * be voluntarily preempted even if full kernel preemption is
     223             :  * disabled.
     224             :  *
     225             :  * Such preemptible hypercalls are bracketed by
     226             :  * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
     227             :  * calls.
     228             :  */
     229             : DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
     230             : EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
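                      : 
                      : /*
                      :  * Usage sketch (editorial, mirroring drivers/xen/privcmd.c): a
                      :  * long-running hypercall opts in to voluntary preemption with
                      :  *
                      :  *      xen_preemptible_hcall_begin();
                      :  *      ret = HYPERVISOR_dm_op(dom, num, xbufs);
                      :  *      xen_preemptible_hcall_end();
                      :  *
                      :  * The begin/end helpers merely set and clear this per-CPU flag,
                      :  * which the upcall path below consults before rescheduling.
                      :  */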
     231             : 
     232             : /*
     233             :  * In case of scheduling the flag must be cleared and restored after
     234             :  * returning from schedule as the task might move to a different CPU.
     235             :  */
     236             : static __always_inline bool get_and_clear_inhcall(void)
     237             : {
     238             :         bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);
     239             : 
     240             :         __this_cpu_write(xen_in_preemptible_hcall, false);
     241             :         return inhcall;
     242             : }
     243             : 
     244             : static __always_inline void restore_inhcall(bool inhcall)
     245             : {
     246             :         __this_cpu_write(xen_in_preemptible_hcall, inhcall);
     247             : }
     248             : #else
     249             : static __always_inline bool get_and_clear_inhcall(void) { return false; }
     250             : static __always_inline void restore_inhcall(bool inhcall) { }
     251             : #endif
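                      : 
                      : /*
                      :  * Editorial worked case for the helpers above: task A is mid-
                      :  * hypercall on CPU0 when an upcall offers to reschedule it.  The
                      :  * flag is cleared before schedule() so whatever runs next on CPU0
                      :  * is not mistaken for a preemptible hypercall; when A resumes,
                      :  * possibly on another CPU, restore_inhcall() sets the flag again
                      :  * on the CPU it now runs on.
                      :  */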
     252             : 
     253             : static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
     254             : {
     255             :         struct pt_regs *old_regs = set_irq_regs(regs);
     256             : 
     257             :         inc_irq_stat(irq_hv_callback_count);
     258             : 
     259             :         xen_hvm_evtchn_do_upcall();
     260             : 
     261             :         set_irq_regs(old_regs);
     262             : }
     263             : 
     264             : __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
     265             : {
     266             :         irqentry_state_t state = irqentry_enter(regs);
     267             :         bool inhcall;
     268             : 
     269             :         run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);
     270             : 
     271             :         inhcall = get_and_clear_inhcall();
     272             :         if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
     273             :                 instrumentation_begin();
     274             :                 irqentry_exit_cond_resched();
     275             :                 instrumentation_end();
     276             :                 restore_inhcall(inhcall);
     277             :         } else {
     278             :                 irqentry_exit(regs, state);
     279             :         }
     280             : }
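                      : 
                      : /*
                      :  * Editorial note: the resched branch runs only when the interrupted
                      :  * context was a preemptible hypercall and irqentry_enter() found
                      :  * RCU already watching (state.exit_rcu clear); an upcall that
                      :  * arrived in an RCU-idle context must take the plain
                      :  * irqentry_exit() path instead of scheduling.
                      :  */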
     281             : #endif /* CONFIG_XEN_PV */

Generated by: LCOV version 1.14