LCOV - code coverage report
Current view: top level - arch/x86/kernel - process_64.c
Test: landlock.info
Date: 2021-04-22 12:43:58

                       Hit    Total    Coverage
Lines:                 154      282      54.6 %
Functions:              13       25      52.0 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : /*
       3             :  *  Copyright (C) 1995  Linus Torvalds
       4             :  *
       5             :  *  Pentium III FXSR, SSE support
       6             :  *      Gareth Hughes <gareth@valinux.com>, May 2000
       7             :  *
       8             :  *  X86-64 port
       9             :  *      Andi Kleen.
      10             :  *
      11             :  *      CPU hotplug support - ashok.raj@intel.com
      12             :  */
      13             : 
      14             : /*
       15             :  * This file handles the architecture-dependent parts of process handling.
      16             :  */
      17             : 
      18             : #include <linux/cpu.h>
      19             : #include <linux/errno.h>
      20             : #include <linux/sched.h>
      21             : #include <linux/sched/task.h>
      22             : #include <linux/sched/task_stack.h>
      23             : #include <linux/fs.h>
      24             : #include <linux/kernel.h>
      25             : #include <linux/mm.h>
      26             : #include <linux/elfcore.h>
      27             : #include <linux/smp.h>
      28             : #include <linux/slab.h>
      29             : #include <linux/user.h>
      30             : #include <linux/interrupt.h>
      31             : #include <linux/delay.h>
      32             : #include <linux/export.h>
      33             : #include <linux/ptrace.h>
      34             : #include <linux/notifier.h>
      35             : #include <linux/kprobes.h>
      36             : #include <linux/kdebug.h>
      37             : #include <linux/prctl.h>
      38             : #include <linux/uaccess.h>
      39             : #include <linux/io.h>
      40             : #include <linux/ftrace.h>
      41             : #include <linux/syscalls.h>
      42             : 
      43             : #include <asm/processor.h>
      44             : #include <asm/fpu/internal.h>
      45             : #include <asm/mmu_context.h>
      46             : #include <asm/prctl.h>
      47             : #include <asm/desc.h>
      48             : #include <asm/proto.h>
      49             : #include <asm/ia32.h>
      50             : #include <asm/debugreg.h>
      51             : #include <asm/switch_to.h>
      52             : #include <asm/xen/hypervisor.h>
      53             : #include <asm/vdso.h>
      54             : #include <asm/resctrl.h>
      55             : #include <asm/unistd.h>
      56             : #include <asm/fsgsbase.h>
      57             : #ifdef CONFIG_IA32_EMULATION
      58             : /* Not included via unistd.h */
      59             : #include <asm/unistd_32_ia32.h>
      60             : #endif
      61             : 
      62             : #include "process.h"
      63             : 
       64             : /* Also prints some state that isn't saved in the pt_regs */
      65           2 : void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
      66             :                  const char *log_lvl)
      67             : {
      68           2 :         unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
      69           2 :         unsigned long d0, d1, d2, d3, d6, d7;
      70           2 :         unsigned int fsindex, gsindex;
      71           2 :         unsigned int ds, es;
      72             : 
      73           2 :         show_iret_regs(regs, log_lvl);
      74             : 
      75           2 :         if (regs->orig_ax != -1)
      76           1 :                 pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
      77             :         else
      78           1 :                 pr_cont("\n");
      79             : 
      80           2 :         printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n",
      81             :                log_lvl, regs->ax, regs->bx, regs->cx);
      82           2 :         printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n",
      83             :                log_lvl, regs->dx, regs->si, regs->di);
      84           2 :         printk("%sRBP: %016lx R08: %016lx R09: %016lx\n",
      85             :                log_lvl, regs->bp, regs->r8, regs->r9);
      86           2 :         printk("%sR10: %016lx R11: %016lx R12: %016lx\n",
      87             :                log_lvl, regs->r10, regs->r11, regs->r12);
      88           2 :         printk("%sR13: %016lx R14: %016lx R15: %016lx\n",
      89             :                log_lvl, regs->r13, regs->r14, regs->r15);
      90             : 
      91           2 :         if (mode == SHOW_REGS_SHORT)
      92             :                 return;
      93             : 
      94           1 :         if (mode == SHOW_REGS_USER) {
      95           0 :                 rdmsrl(MSR_FS_BASE, fs);
      96           0 :                 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
      97           0 :                 printk("%sFS:  %016lx GS:  %016lx\n",
      98             :                        log_lvl, fs, shadowgs);
      99           0 :                 return;
     100             :         }
     101             : 
     102           1 :         asm("movl %%ds,%0" : "=r" (ds));
     103           1 :         asm("movl %%es,%0" : "=r" (es));
     104           1 :         asm("movl %%fs,%0" : "=r" (fsindex));
     105           1 :         asm("movl %%gs,%0" : "=r" (gsindex));
     106             : 
     107           1 :         rdmsrl(MSR_FS_BASE, fs);
     108           1 :         rdmsrl(MSR_GS_BASE, gs);
     109           1 :         rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
     110             : 
     111           1 :         cr0 = read_cr0();
     112           1 :         cr2 = read_cr2();
     113           1 :         cr3 = __read_cr3();
     114           1 :         cr4 = __read_cr4();
     115             : 
     116           1 :         printk("%sFS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
     117             :                log_lvl, fs, fsindex, gs, gsindex, shadowgs);
     118           1 :         printk("%sCS:  %04lx DS: %04x ES: %04x CR0: %016lx\n",
     119             :                 log_lvl, regs->cs, ds, es, cr0);
     120           1 :         printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
     121             :                 log_lvl, cr2, cr3, cr4);
     122             : 
     123           1 :         get_debugreg(d0, 0);
     124           1 :         get_debugreg(d1, 1);
     125           1 :         get_debugreg(d2, 2);
     126           1 :         get_debugreg(d3, 3);
     127           1 :         get_debugreg(d6, 6);
     128           1 :         get_debugreg(d7, 7);
     129             : 
     130             :         /* Only print out debug registers if they are in their non-default state. */
     131           1 :         if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
     132           1 :             (d6 == DR6_RESERVED) && (d7 == 0x400))) {
     133           0 :                 printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n",
     134             :                        log_lvl, d0, d1, d2);
     135           0 :                 printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n",
     136             :                        log_lvl, d3, d6, d7);
     137             :         }
     138             : 
     139           1 :         if (boot_cpu_has(X86_FEATURE_OSPKE))
     140           0 :                 printk("%sPKRU: %08x\n", log_lvl, read_pkru());
     141             : }
     142             : 
     143        2465 : void release_thread(struct task_struct *dead_task)
     144             : {
     145        2465 :         WARN_ON(dead_task->mm);
     146        2465 : }
     147             : 
     148             : enum which_selector {
     149             :         FS,
     150             :         GS
     151             : };
     152             : 
     153             : /*
      154             :  * Out of line to be protected from kprobes and tracing. If this were
      155             :  * traced or probed, any access to a per-CPU variable would happen with
      156             :  * the wrong GS.
      157             :  *
      158             :  * It is not used on Xen paravirt. If paravirt support is ever needed, it
      159             :  * needs to be renamed with a native_ prefix.
     160             :  */
     161       56003 : static noinstr unsigned long __rdgsbase_inactive(void)
     162             : {
     163       56003 :         unsigned long gsbase;
     164             : 
     165      112008 :         lockdep_assert_irqs_disabled();
     166             : 
     167       56005 :         if (!static_cpu_has(X86_FEATURE_XENPV)) {
     168       56005 :                 native_swapgs();
     169       56005 :                 gsbase = rdgsbase();
     170       56006 :                 native_swapgs();
     171             :         } else {
     172           0 :                 instrumentation_begin();
     173           0 :                 rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
     174       56007 :                 instrumentation_end();
     175             :         }
     176             : 
     177       56007 :         return gsbase;
     178             : }
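
/*
 * A minimal model of the swapgs dance above (illustrative helper, not
 * kernel API): SWAPGS exchanges the active GS base with
 * MSR_KERNEL_GS_BASE, so the swapgs/rdgsbase/swapgs sequence reads the
 * *inactive* base while leaving both registers as they were.
 */
static unsigned long model_rdgsbase_inactive(unsigned long *gs_base,
                                             unsigned long *kernel_gs_base)
{
        unsigned long tmp, inactive;

        tmp = *gs_base; *gs_base = *kernel_gs_base; *kernel_gs_base = tmp; /* swapgs   */
        inactive = *gs_base;                                               /* rdgsbase */
        tmp = *gs_base; *gs_base = *kernel_gs_base; *kernel_gs_base = tmp; /* swapgs   */

        return inactive;
}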
     179             : 
     180             : /*
      181             :  * Out of line to be protected from kprobes and tracing. If this were
      182             :  * traced or probed, any access to a per-CPU variable would happen with
      183             :  * the wrong GS.
      184             :  *
      185             :  * It is not used on Xen paravirt. If paravirt support is ever needed, it
      186             :  * needs to be renamed with a native_ prefix.
     187             :  */
     188       53468 : static noinstr void __wrgsbase_inactive(unsigned long gsbase)
     189             : {
     190      106941 :         lockdep_assert_irqs_disabled();
     191             : 
     192       53473 :         if (!static_cpu_has(X86_FEATURE_XENPV)) {
     193       53471 :                 native_swapgs();
     194       53473 :                 wrgsbase(gsbase);
     195       53473 :                 native_swapgs();
     196             :         } else {
     197           0 :                 instrumentation_begin();
     198           0 :                 wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
     199       53472 :                 instrumentation_end();
     200             :         }
     201       53472 : }
     202             : 
     203             : /*
     204             :  * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
     205             :  * not available.  The goal is to be reasonably fast on non-FSGSBASE systems.
     206             :  * It's forcibly inlined because it'll generate better code and this function
     207             :  * is hot.
     208             :  */
     209           0 : static __always_inline void save_base_legacy(struct task_struct *prev_p,
     210             :                                              unsigned short selector,
     211             :                                              enum which_selector which)
     212             : {
     213           0 :         if (likely(selector == 0)) {
     214             :                 /*
     215             :                  * On Intel (without X86_BUG_NULL_SEG), the segment base could
     216             :                  * be the pre-existing saved base or it could be zero.  On AMD
     217             :                  * (with X86_BUG_NULL_SEG), the segment base could be almost
     218             :                  * anything.
     219             :                  *
     220             :                  * This branch is very hot (it's hit twice on almost every
     221             :                  * context switch between 64-bit programs), and avoiding
     222             :                  * the RDMSR helps a lot, so we just assume that whatever
     223             :                  * value is already saved is correct.  This matches historical
     224             :                  * Linux behavior, so it won't break existing applications.
     225             :                  *
     226             :                  * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
     227             :                  * report that the base is zero, it needs to actually be zero:
     228             :                  * see the corresponding logic in load_seg_legacy.
     229             :                  */
     230             :         } else {
     231             :                 /*
     232             :                  * If the selector is 1, 2, or 3, then the base is zero on
     233             :                  * !X86_BUG_NULL_SEG CPUs and could be anything on
     234             :                  * X86_BUG_NULL_SEG CPUs.  In the latter case, Linux
     235             :                  * has never attempted to preserve the base across context
     236             :                  * switches.
     237             :                  *
     238             :                  * If selector > 3, then it refers to a real segment, and
     239             :                  * saving the base isn't necessary.
     240             :                  */
     241           0 :                 if (which == FS)
     242           0 :                         prev_p->thread.fsbase = 0;
     243             :                 else
     244           0 :                         prev_p->thread.gsbase = 0;
     245             :         }
     246             : }
     247             : 
     248       56004 : static __always_inline void save_fsgs(struct task_struct *task)
     249             : {
     250       56004 :         savesegment(fs, task->thread.fsindex);
     251       56004 :         savesegment(gs, task->thread.gsindex);
     252       56002 :         if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
     253             :                 /*
     254             :                  * If FSGSBASE is enabled, we can't make any useful guesses
     255             :                  * about the base, and user code expects us to save the current
     256             :                  * value.  Fortunately, reading the base directly is efficient.
     257             :                  */
     258       56003 :                 task->thread.fsbase = rdfsbase();
     259       56003 :                 task->thread.gsbase = __rdgsbase_inactive();
     260             :         } else {
     261           0 :                 save_base_legacy(task, task->thread.fsindex, FS);
     262           0 :                 save_base_legacy(task, task->thread.gsindex, GS);
     263             :         }
     264             : }
     265             : 
     266             : /*
      267             :  * While a process is running, current->thread.fsbase and current->thread.gsbase
     268             :  * may not match the corresponding CPU registers (see save_base_legacy()).
     269             :  */
     270        2542 : void current_save_fsgs(void)
     271             : {
     272        2542 :         unsigned long flags;
     273             : 
     274             :         /* Interrupts need to be off for FSGSBASE */
     275        5084 :         local_irq_save(flags);
     276        2542 :         save_fsgs(current);
     277        2542 :         local_irq_restore(flags);
     278        2542 : }
     279             : #if IS_ENABLED(CONFIG_KVM)
     280             : EXPORT_SYMBOL_GPL(current_save_fsgs);
     281             : #endif
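
/*
 * User-space counterpart, as a hedged sketch: once the kernel enables
 * FSGSBASE (X86_FEATURE_FSGSBASE above), unprivileged code can read its
 * own FS base without a syscall.  Assumes a supporting CPU and a kernel
 * with FSGSBASE enabled; build with -mfsgsbase.  _readfsbase_u64() is
 * the GCC/Clang intrinsic for RDFSBASE.
 */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
        unsigned long long fsbase = _readfsbase_u64();   /* RDFSBASE */

        printf("fs base = %#llx\n", fsbase);   /* typically the TLS block */
        return 0;
}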
     282             : 
     283        2177 : static __always_inline void loadseg(enum which_selector which,
     284             :                                     unsigned short sel)
     285             : {
     286        2177 :         if (which == FS)
     287        2177 :                 loadsegment(fs, sel);
     288             :         else
     289      106937 :                 load_gs_index(sel);
     290             : }
     291             : 
     292           0 : static __always_inline void load_seg_legacy(unsigned short prev_index,
     293             :                                             unsigned long prev_base,
     294             :                                             unsigned short next_index,
     295             :                                             unsigned long next_base,
     296             :                                             enum which_selector which)
     297             : {
     298           0 :         if (likely(next_index <= 3)) {
     299             :                 /*
     300             :                  * The next task is using 64-bit TLS, is not using this
     301             :                  * segment at all, or is having fun with arcane CPU features.
     302             :                  */
     303           0 :                 if (next_base == 0) {
     304             :                         /*
     305             :                          * Nasty case: on AMD CPUs, we need to forcibly zero
     306             :                          * the base.
     307             :                          */
     308           0 :                         if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
     309           0 :                                 loadseg(which, __USER_DS);
     310           0 :                                 loadseg(which, next_index);
     311             :                         } else {
     312             :                                 /*
     313             :                                  * We could try to exhaustively detect cases
     314             :                                  * under which we can skip the segment load,
     315             :                                  * but there's really only one case that matters
     316             :                                  * for performance: if both the previous and
     317             :                                  * next states are fully zeroed, we can skip
     318             :                                  * the load.
     319             :                                  *
     320             :                                  * (This assumes that prev_base == 0 has no
     321             :                                  * false positives.  This is the case on
     322             :                                  * Intel-style CPUs.)
     323             :                                  */
     324           0 :                                 if (likely(prev_index | next_index | prev_base))
     325           0 :                                         loadseg(which, next_index);
     326             :                         }
     327             :                 } else {
     328           0 :                         if (prev_index != next_index)
     329           0 :                                 loadseg(which, next_index);
     330           0 :                         wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
     331             :                                next_base);
     332             :                 }
     333             :         } else {
     334             :                 /*
     335             :                  * The next task is using a real segment.  Loading the selector
     336             :                  * is sufficient.
     337             :                  */
     338           0 :                 loadseg(which, next_index);
     339             :         }
     340             : }
     341             : 
     342       53464 : static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
     343             :                                               struct thread_struct *next)
     344             : {
     345       53466 :         if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
     346             :                 /* Update the FS and GS selectors if they could have changed. */
     347       53467 :                 if (unlikely(prev->fsindex || next->fsindex))
     348           0 :                         loadseg(FS, next->fsindex);
     349       53467 :                 if (unlikely(prev->gsindex || next->gsindex))
     350           0 :                         loadseg(GS, next->gsindex);
     351             : 
     352             :                 /* Update the bases. */
     353       53467 :                 wrfsbase(next->fsbase);
     354       53469 :                 __wrgsbase_inactive(next->gsbase);
     355             :         } else {
     356           0 :                 load_seg_legacy(prev->fsindex, prev->fsbase,
     357           0 :                                 next->fsindex, next->fsbase, FS);
     358       53470 :                 load_seg_legacy(prev->gsindex, prev->gsbase,
     359           0 :                                 next->gsindex, next->gsbase, GS);
     360             :         }
     361             : }
     362             : 
     363           0 : unsigned long x86_fsgsbase_read_task(struct task_struct *task,
     364             :                                      unsigned short selector)
     365             : {
     366           0 :         unsigned short idx = selector >> 3;
     367           0 :         unsigned long base;
     368             : 
     369           0 :         if (likely((selector & SEGMENT_TI_MASK) == 0)) {
     370           0 :                 if (unlikely(idx >= GDT_ENTRIES))
     371             :                         return 0;
     372             : 
     373             :                 /*
     374             :                  * There are no user segments in the GDT with nonzero bases
     375             :                  * other than the TLS segments.
     376             :                  */
     377           0 :                 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
     378             :                         return 0;
     379             : 
     380           0 :                 idx -= GDT_ENTRY_TLS_MIN;
     381           0 :                 base = get_desc_base(&task->thread.tls_array[idx]);
     382             :         } else {
     383             : #ifdef CONFIG_MODIFY_LDT_SYSCALL
     384             :                 struct ldt_struct *ldt;
     385             : 
     386             :                 /*
     387             :                  * If performance here mattered, we could protect the LDT
     388             :                  * with RCU.  This is a slow path, though, so we can just
     389             :                  * take the mutex.
     390             :                  */
     391             :                 mutex_lock(&task->mm->context.lock);
     392             :                 ldt = task->mm->context.ldt;
     393             :                 if (unlikely(!ldt || idx >= ldt->nr_entries))
     394             :                         base = 0;
     395             :                 else
     396             :                         base = get_desc_base(ldt->entries + idx);
     397             :                 mutex_unlock(&task->mm->context.lock);
     398             : #else
     399             :                 base = 0;
     400             : #endif
     401             :         }
     402             : 
     403             :         return base;
     404             : }
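
/*
 * Sketch of the selector anatomy that x86_fsgsbase_read_task() relies
 * on: bits 0-1 are the RPL, bit 2 is the table indicator (0 = GDT,
 * 1 = LDT, i.e. SEGMENT_TI_MASK), and bits 3-15 index the descriptor
 * table -- hence "selector >> 3".  decode_selector() is illustrative,
 * not kernel API.
 */
struct selector_fields {
        unsigned short rpl;     /* requested privilege level */
        unsigned short ti;      /* 0: GDT, 1: LDT            */
        unsigned short index;   /* descriptor table index    */
};

static struct selector_fields decode_selector(unsigned short sel)
{
        return (struct selector_fields){
                .rpl   = sel & 0x3,
                .ti    = (sel >> 2) & 0x1,
                .index = sel >> 3,
        };
}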
     405             : 
     406           0 : unsigned long x86_gsbase_read_cpu_inactive(void)
     407             : {
     408           0 :         unsigned long gsbase;
     409             : 
     410           0 :         if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
     411           0 :                 unsigned long flags;
     412             : 
     413           0 :                 local_irq_save(flags);
     414           0 :                 gsbase = __rdgsbase_inactive();
     415           0 :                 local_irq_restore(flags);
     416             :         } else {
     417           0 :                 rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
     418             :         }
     419             : 
     420           0 :         return gsbase;
     421             : }
     422             : 
     423           0 : void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
     424             : {
     425           0 :         if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
     426           0 :                 unsigned long flags;
     427             : 
     428           0 :                 local_irq_save(flags);
     429           0 :                 __wrgsbase_inactive(gsbase);
     430           0 :                 local_irq_restore(flags);
     431             :         } else {
     432           0 :                 wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
     433             :         }
     434           0 : }
     435             : 
     436           0 : unsigned long x86_fsbase_read_task(struct task_struct *task)
     437             : {
     438           0 :         unsigned long fsbase;
     439             : 
     440           0 :         if (task == current)
     441           0 :                 fsbase = x86_fsbase_read_cpu();
     442           0 :         else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
     443           0 :                  (task->thread.fsindex == 0))
     444           0 :                 fsbase = task->thread.fsbase;
     445             :         else
     446           0 :                 fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
     447             : 
     448           0 :         return fsbase;
     449             : }
     450             : 
     451           0 : unsigned long x86_gsbase_read_task(struct task_struct *task)
     452             : {
     453           0 :         unsigned long gsbase;
     454             : 
     455           0 :         if (task == current)
     456           0 :                 gsbase = x86_gsbase_read_cpu_inactive();
     457           0 :         else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
     458           0 :                  (task->thread.gsindex == 0))
     459           0 :                 gsbase = task->thread.gsbase;
     460             :         else
     461           0 :                 gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
     462             : 
     463           0 :         return gsbase;
     464             : }
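
/*
 * These task helpers back ptrace's view of the bases.  A hedged
 * user-space sketch: a tracer can fetch a stopped tracee's FS base with
 * PTRACE_PEEKUSER at the fs_base slot of the x86-64 user_regs_struct
 * (peek_fs_base() is an illustrative name).
 */
#include <stddef.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>

static long peek_fs_base(pid_t tracee)
{
        return ptrace(PTRACE_PEEKUSER, tracee,
                      (void *)offsetof(struct user_regs_struct, fs_base),
                      NULL);
}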
     465             : 
     466           6 : void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
     467             : {
     468           6 :         WARN_ON_ONCE(task == current);
     469             : 
     470           6 :         task->thread.fsbase = fsbase;
     471           6 : }
     472             : 
     473           0 : void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
     474             : {
     475           0 :         WARN_ON_ONCE(task == current);
     476             : 
     477           0 :         task->thread.gsbase = gsbase;
     478           0 : }
     479             : 
     480             : static void
     481        2177 : start_thread_common(struct pt_regs *regs, unsigned long new_ip,
     482             :                     unsigned long new_sp,
     483             :                     unsigned int _cs, unsigned int _ss, unsigned int _ds)
     484             : {
     485        2177 :         WARN_ON_ONCE(regs != current_pt_regs());
     486             : 
     487        2177 :         if (static_cpu_has(X86_BUG_NULL_SEG)) {
     488             :                 /* Loading zero below won't clear the base. */
     489           0 :                 loadsegment(fs, __USER_DS);
     490           0 :                 load_gs_index(__USER_DS);
     491             :         }
     492             : 
     493        2177 :         loadsegment(fs, 0);
     494        2177 :         loadsegment(es, _ds);
     495        2177 :         loadsegment(ds, _ds);
     496        2177 :         load_gs_index(0);
     497             : 
     498        2177 :         regs->ip             = new_ip;
     499        2177 :         regs->sp             = new_sp;
     500        2177 :         regs->cs             = _cs;
     501        2177 :         regs->ss             = _ss;
     502        2177 :         regs->flags          = X86_EFLAGS_IF;
     503        2177 : }
     504             : 
     505             : void
     506        2177 : start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
     507             : {
     508        2177 :         start_thread_common(regs, new_ip, new_sp,
     509             :                             __USER_CS, __USER_DS, 0);
     510        2177 : }
     511             : EXPORT_SYMBOL_GPL(start_thread);
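
/*
 * Usage sketch, simplified from the exec path: after a binary-format
 * handler (e.g. the ELF loader) has mapped the new image, it points the
 * saved registers at it so the syscall return lands in user space.
 * entry and sp stand in for the loader's computed values.
 */
static void finish_exec_sketch(unsigned long entry, unsigned long sp)
{
        start_thread(current_pt_regs(), entry, sp);
}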
     512             : 
     513             : #ifdef CONFIG_COMPAT
     514           0 : void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32)
     515             : {
     516           0 :         start_thread_common(regs, new_ip, new_sp,
     517             :                             x32 ? __USER_CS : __USER32_CS,
     518             :                             __USER_DS, __USER_DS);
     519           0 : }
     520             : #endif
     521             : 
     522             : /*
     523             :  *      switch_to(x,y) should switch tasks from x to y.
     524             :  *
     525             :  * This could still be optimized:
     526             :  * - fold all the options into a flag word and test it with a single test.
     527             :  * - could test fs/gs bitsliced
     528             :  *
     529             :  * Kprobes not supported here. Set the probe on schedule instead.
      530             :  * Function graph tracer not supported either.
     531             :  */
     532             : __visible __notrace_funcgraph struct task_struct *
     533       53460 : __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
     534             : {
     535       53460 :         struct thread_struct *prev = &prev_p->thread;
     536       53460 :         struct thread_struct *next = &next_p->thread;
     537       53460 :         struct fpu *prev_fpu = &prev->fpu;
     538       53460 :         struct fpu *next_fpu = &next->fpu;
     539       53460 :         int cpu = smp_processor_id();
     540             : 
     541       53460 :         WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
     542             :                      this_cpu_read(hardirq_stack_inuse));
     543             : 
     544       53460 :         if (!test_thread_flag(TIF_NEED_FPU_LOAD))
     545       14352 :                 switch_fpu_prepare(prev_fpu, cpu);
     546             : 
     547             :         /* We must save %fs and %gs before load_TLS() because
     548             :          * %fs and %gs may be cleared by load_TLS().
     549             :          *
     550             :          * (e.g. xen_load_tls())
     551             :          */
     552       53462 :         save_fsgs(prev_p);
     553             : 
     554             :         /*
     555             :          * Load TLS before restoring any segments so that segment loads
     556             :          * reference the correct GDT entries.
     557             :          */
     558       53464 :         load_TLS(next, cpu);
     559             : 
     560             :         /*
     561             :          * Leave lazy mode, flushing any hypercalls made here.  This
     562             :          * must be done after loading TLS entries in the GDT but before
     563             :          * loading segments that might reference them.
     564             :          */
     565       53464 :         arch_end_context_switch(next_p);
     566             : 
     567             :         /* Switch DS and ES.
     568             :          *
     569             :          * Reading them only returns the selectors, but writing them (if
     570             :          * nonzero) loads the full descriptor from the GDT or LDT.  The
     571             :          * LDT for next is loaded in switch_mm, and the GDT is loaded
     572             :          * above.
     573             :          *
     574             :          * We therefore need to write new values to the segment
     575             :          * registers on every context switch unless both the new and old
     576             :          * values are zero.
     577             :          *
     578             :          * Note that we don't need to do anything for CS and SS, as
     579             :          * those are saved and restored as part of pt_regs.
     580             :          */
     581       53464 :         savesegment(es, prev->es);
     582       53464 :         if (unlikely(next->es | prev->es))
     583           0 :                 loadsegment(es, next->es);
     584             : 
     585       53464 :         savesegment(ds, prev->ds);
     586       53464 :         if (unlikely(next->ds | prev->ds))
     587           0 :                 loadsegment(ds, next->ds);
     588             : 
     589       53464 :         x86_fsgsbase_load(prev, next);
     590             : 
     591             :         /*
     592             :          * Switch the PDA and FPU contexts.
     593             :          */
     594       53470 :         this_cpu_write(current_task, next_p);
     595       53463 :         this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
     596             : 
     597       53467 :         switch_fpu_finish(next_fpu);
     598             : 
     599             :         /* Reload sp0. */
     600       53473 :         update_task_stack(next_p);
     601             : 
     602       53471 :         switch_to_extra(prev_p, next_p);
     603             : 
     604       53464 :         if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
     605             :                 /*
     606             :                  * AMD CPUs have a misfeature: SYSRET sets the SS selector but
     607             :                  * does not update the cached descriptor.  As a result, if we
     608             :                  * do SYSRET while SS is NULL, we'll end up in user mode with
     609             :                  * SS apparently equal to __USER_DS but actually unusable.
     610             :                  *
     611             :                  * The straightforward workaround would be to fix it up just
     612             :                  * before SYSRET, but that would slow down the system call
     613             :                  * fast paths.  Instead, we ensure that SS is never NULL in
     614             :                  * system call context.  We do this by replacing NULL SS
     615             :                  * selectors at every context switch.  SYSCALL sets up a valid
     616             :                  * SS, so the only way to get NULL is to re-enter the kernel
     617             :                  * from CPL 3 through an interrupt.  Since that can't happen
     618             :                  * in the same task as a running syscall, we are guaranteed to
     619             :                  * context switch between every interrupt vector entry and a
     620             :                  * subsequent SYSRET.
     621             :                  *
     622             :                  * We read SS first because SS reads are much faster than
     623             :                  * writes.  Out of caution, we force SS to __KERNEL_DS even if
     624             :                  * it previously had a different non-NULL value.
     625             :                  */
     626           0 :                 unsigned short ss_sel;
     627           0 :                 savesegment(ss, ss_sel);
     628           0 :                 if (ss_sel != __KERNEL_DS)
     629           0 :                         loadsegment(ss, __KERNEL_DS);
     630             :         }
     631             : 
     632             :         /* Load the Intel cache allocation PQR MSR. */
     633       53468 :         resctrl_sched_in();
     634             : 
     635       53468 :         return prev_p;
     636             : }
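
/*
 * For reference, the scheduler reaches __switch_to() through the
 * switch_to() macro and an asm stub that swaps kernel stacks first;
 * in this kernel era (asm/switch_to.h) the macro is roughly:
 */
#define switch_to(prev, next, last)                                     \
do {                                                                    \
        ((last) = __switch_to_asm((prev), (next)));                     \
} while (0)
/*
 * __switch_to_asm (entry_64.S) saves prev's callee-saved registers and
 * stack pointer, loads next's, then jumps to __switch_to() above, which
 * returns the previous task as 'last'.
 */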
     637             : 
     638        2177 : void set_personality_64bit(void)
     639             : {
     640             :         /* inherit personality from parent */
     641             : 
     642             :         /* Make sure to be in 64bit mode */
     643        2177 :         clear_thread_flag(TIF_ADDR32);
     644             :         /* Pretend that this comes from a 64bit execve */
     645        2177 :         task_pt_regs(current)->orig_ax = __NR_execve;
     646        2177 :         current_thread_info()->status &= ~TS_COMPAT;
     647        2177 :         if (current->mm)
     648        2177 :                 current->mm->context.flags = MM_CONTEXT_HAS_VSYSCALL;
     649             : 
     650             :         /* TBD: overwrites user setup. Should have two bits.
     651             :            But 64bit processes have always behaved this way,
     652             :            so it's not too bad. The main problem is just that
     653             :            32bit children are affected again. */
     654        2177 :         current->personality &= ~READ_IMPLIES_EXEC;
     655        2177 : }
     656             : 
     657             : static void __set_personality_x32(void)
     658             : {
     659             : #ifdef CONFIG_X86_X32
     660             :         if (current->mm)
     661             :                 current->mm->context.flags = 0;
     662             : 
     663             :         current->personality &= ~READ_IMPLIES_EXEC;
     664             :         /*
     665             :          * in_32bit_syscall() uses the presence of the x32 syscall bit
     666             :          * flag to determine compat status.  The x86 mmap() code relies on
     667             :          * the syscall bitness so set x32 syscall bit right here to make
     668             :          * in_32bit_syscall() work during exec().
     669             :          *
     670             :          * Pretend to come from a x32 execve.
     671             :          */
     672             :         task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
     673             :         current_thread_info()->status &= ~TS_COMPAT;
     674             : #endif
     675             : }
     676             : 
     677           0 : static void __set_personality_ia32(void)
     678             : {
     679             : #ifdef CONFIG_IA32_EMULATION
     680           0 :         if (current->mm) {
     681             :                 /*
     682             :                  * uprobes applied to this MM need to know this and
     683             :                  * cannot use user_64bit_mode() at that time.
     684             :                  */
     685           0 :                 current->mm->context.flags = MM_CONTEXT_UPROBE_IA32;
     686             :         }
     687             : 
     688           0 :         current->personality |= force_personality32;
     689             :         /* Prepare the first "return" to user space */
     690           0 :         task_pt_regs(current)->orig_ax = __NR_ia32_execve;
     691           0 :         current_thread_info()->status |= TS_COMPAT;
     692             : #endif
     693           0 : }
     694             : 
     695           0 : void set_personality_ia32(bool x32)
     696             : {
     697             :         /* Make sure to be in 32bit mode */
     698           0 :         set_thread_flag(TIF_ADDR32);
     699             : 
     700           0 :         if (x32)
     701             :                 __set_personality_x32();
     702             :         else
     703           0 :                 __set_personality_ia32();
     704           0 : }
     705             : EXPORT_SYMBOL_GPL(set_personality_ia32);
     706             : 
     707             : #ifdef CONFIG_CHECKPOINT_RESTORE
     708             : static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
     709             : {
     710             :         int ret;
     711             : 
     712             :         ret = map_vdso_once(image, addr);
     713             :         if (ret)
     714             :                 return ret;
     715             : 
     716             :         return (long)image->size;
     717             : }
     718             : #endif
     719             : 
     720        2183 : long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
     721             : {
     722        2183 :         int ret = 0;
     723             : 
     724        2183 :         switch (option) {
     725           0 :         case ARCH_SET_GS: {
     726           0 :                 if (unlikely(arg2 >= TASK_SIZE_MAX))
     727             :                         return -EPERM;
     728             : 
     729           0 :                 preempt_disable();
     730             :                 /*
     731             :                  * ARCH_SET_GS has always overwritten the index
     732             :                  * and the base. Zero is the most sensible value
     733             :                  * to put in the index, and is the only value that
     734             :                  * makes any sense if FSGSBASE is unavailable.
     735             :                  */
     736           0 :                 if (task == current) {
     737           0 :                         loadseg(GS, 0);
     738           0 :                         x86_gsbase_write_cpu_inactive(arg2);
     739             : 
     740             :                         /*
     741             :                          * On non-FSGSBASE systems, save_base_legacy() expects
     742             :                          * that we also fill in thread.gsbase.
     743             :                          */
     744           0 :                         task->thread.gsbase = arg2;
     745             : 
     746             :                 } else {
     747           0 :                         task->thread.gsindex = 0;
     748           0 :                         x86_gsbase_write_task(task, arg2);
     749             :                 }
     750           0 :                 preempt_enable();
     751             :                 break;
     752             :         }
     753        2183 :         case ARCH_SET_FS: {
     754             :                 /*
     755             :                  * Not strictly needed for %fs, but do it for symmetry
     756             :                  * with %gs
     757             :                  */
     758        2183 :                 if (unlikely(arg2 >= TASK_SIZE_MAX))
     759             :                         return -EPERM;
     760             : 
     761        2183 :                 preempt_disable();
     762             :                 /*
     763             :                  * Set the selector to 0 for the same reason
     764             :                  * as %gs above.
     765             :                  */
     766        2183 :                 if (task == current) {
     767        2177 :                         loadseg(FS, 0);
     768        2177 :                         x86_fsbase_write_cpu(arg2);
     769             : 
     770             :                         /*
     771             :                          * On non-FSGSBASE systems, save_base_legacy() expects
     772             :                          * that we also fill in thread.fsbase.
     773             :                          */
     774        2177 :                         task->thread.fsbase = arg2;
     775             :                 } else {
     776           6 :                         task->thread.fsindex = 0;
     777           6 :                         x86_fsbase_write_task(task, arg2);
     778             :                 }
     779        2183 :                 preempt_enable();
     780             :                 break;
     781             :         }
     782           0 :         case ARCH_GET_FS: {
     783           0 :                 unsigned long base = x86_fsbase_read_task(task);
     784             : 
     785           0 :                 ret = put_user(base, (unsigned long __user *)arg2);
     786           0 :                 break;
     787             :         }
     788           0 :         case ARCH_GET_GS: {
     789           0 :                 unsigned long base = x86_gsbase_read_task(task);
     790             : 
     791           0 :                 ret = put_user(base, (unsigned long __user *)arg2);
     792           0 :                 break;
     793             :         }
     794             : 
     795             : #ifdef CONFIG_CHECKPOINT_RESTORE
     796             : # ifdef CONFIG_X86_X32_ABI
     797             :         case ARCH_MAP_VDSO_X32:
     798             :                 return prctl_map_vdso(&vdso_image_x32, arg2);
     799             : # endif
     800             : # if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
     801             :         case ARCH_MAP_VDSO_32:
     802             :                 return prctl_map_vdso(&vdso_image_32, arg2);
     803             : # endif
     804             :         case ARCH_MAP_VDSO_64:
     805             :                 return prctl_map_vdso(&vdso_image_64, arg2);
     806             : #endif
     807             : 
     808             :         default:
     809             :                 ret = -EINVAL;
     810             :                 break;
     811             :         }
     812             : 
     813        2183 :         return ret;
     814             : }
     815             : 
     816        4354 : SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
     817             : {
     818        2177 :         long ret;
     819             : 
     820        2177 :         ret = do_arch_prctl_64(current, option, arg2);
     821        2177 :         if (ret == -EINVAL)
     822           0 :                 ret = do_arch_prctl_common(current, option, arg2);
     823             : 
     824        2177 :         return ret;
     825             : }
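
/*
 * Hedged user-space sketch of the syscall above: read back this
 * thread's FS base via arch_prctl(2).  (Calling ARCH_SET_FS from a
 * running glibc program would break TLS, so only the read is shown.)
 */
#include <asm/prctl.h>          /* ARCH_GET_FS */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        unsigned long base = 0;

        if (syscall(SYS_arch_prctl, ARCH_GET_FS, &base) == 0)
                printf("fs base = %#lx\n", base);
        return 0;
}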
     826             : 
     827             : #ifdef CONFIG_IA32_EMULATION
     828           0 : COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
     829             : {
     830           0 :         return do_arch_prctl_common(current, option, arg2);
     831             : }
     832             : #endif
     833             : 
     834           0 : unsigned long KSTK_ESP(struct task_struct *task)
     835             : {
     836           0 :         return task_pt_regs(task)->sp;
     837             : }

Generated by: LCOV version 1.14