LCOV - code coverage report
Current view: top level - fs - exec.c (source / functions) Hit Total Coverage
Test: landlock.info Lines: 642 875 73.4 %
Date: 2021-04-22 12:43:58 Functions: 45 57 78.9 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : /*
       3             :  *  linux/fs/exec.c
       4             :  *
       5             :  *  Copyright (C) 1991, 1992  Linus Torvalds
       6             :  */
       7             : 
       8             : /*
       9             :  * #!-checking implemented by tytso.
      10             :  */
      11             : /*
      12             :  * Demand-loading implemented 01.12.91 - no need to read anything but
      13             :  * the header into memory. The inode of the executable is put into
      14             :  * "current->executable", and page faults do the actual loading. Clean.
      15             :  *
      16             :  * Once more I can proudly say that linux stood up to being changed: it
      17             :  * was less than 2 hours work to get demand-loading completely implemented.
      18             :  *
      19             :  * Demand loading changed July 1993 by Eric Youngdale.   Use mmap instead,
      20             :  * current->executable is only used by the procfs.  This allows a dispatch
      21             :  * table to check for several different types  of binary formats.  We keep
      22             :  * trying until we recognize the file or we run out of supported binary
      23             :  * formats.
      24             :  */
      25             : 
      26             : #include <linux/kernel_read_file.h>
      27             : #include <linux/slab.h>
      28             : #include <linux/file.h>
      29             : #include <linux/fdtable.h>
      30             : #include <linux/mm.h>
      31             : #include <linux/vmacache.h>
      32             : #include <linux/stat.h>
      33             : #include <linux/fcntl.h>
      34             : #include <linux/swap.h>
      35             : #include <linux/string.h>
      36             : #include <linux/init.h>
      37             : #include <linux/sched/mm.h>
      38             : #include <linux/sched/coredump.h>
      39             : #include <linux/sched/signal.h>
      40             : #include <linux/sched/numa_balancing.h>
      41             : #include <linux/sched/task.h>
      42             : #include <linux/pagemap.h>
      43             : #include <linux/perf_event.h>
      44             : #include <linux/highmem.h>
      45             : #include <linux/spinlock.h>
      46             : #include <linux/key.h>
      47             : #include <linux/personality.h>
      48             : #include <linux/binfmts.h>
      49             : #include <linux/utsname.h>
      50             : #include <linux/pid_namespace.h>
      51             : #include <linux/module.h>
      52             : #include <linux/namei.h>
      53             : #include <linux/mount.h>
      54             : #include <linux/security.h>
      55             : #include <linux/syscalls.h>
      56             : #include <linux/tsacct_kern.h>
      57             : #include <linux/cn_proc.h>
      58             : #include <linux/audit.h>
      59             : #include <linux/tracehook.h>
      60             : #include <linux/kmod.h>
      61             : #include <linux/fsnotify.h>
      62             : #include <linux/fs_struct.h>
      63             : #include <linux/oom.h>
      64             : #include <linux/compat.h>
      65             : #include <linux/vmalloc.h>
      66             : #include <linux/io_uring.h>
      67             : #include <linux/syscall_user_dispatch.h>
      68             : 
      69             : #include <linux/uaccess.h>
      70             : #include <asm/mmu_context.h>
      71             : #include <asm/tlb.h>
      72             : 
      73             : #include <trace/events/task.h>
      74             : #include "internal.h"
      75             : 
      76             : #include <trace/events/sched.h>
      77             : 
      78             : static int bprm_creds_from_file(struct linux_binprm *bprm);
      79             : 
      80             : int suid_dumpable = 0;
      81             : 
      82             : static LIST_HEAD(formats);
      83             : static DEFINE_RWLOCK(binfmt_lock);
      84             : 
      85           3 : void __register_binfmt(struct linux_binfmt * fmt, int insert)
      86             : {
      87           3 :         BUG_ON(!fmt);
      88           3 :         if (WARN_ON(!fmt->load_binary))
      89             :                 return;
      90           3 :         write_lock(&binfmt_lock);
      91           3 :         insert ? list_add(&fmt->lh, &formats) :
      92           3 :                  list_add_tail(&fmt->lh, &formats);
      93           3 :         write_unlock(&binfmt_lock);
      94             : }
      95             : 
      96             : EXPORT_SYMBOL(__register_binfmt);
      97             : 
      98           0 : void unregister_binfmt(struct linux_binfmt * fmt)
      99             : {
     100           0 :         write_lock(&binfmt_lock);
     101           0 :         list_del(&fmt->lh);
     102           0 :         write_unlock(&binfmt_lock);
     103           0 : }
     104             : 
     105             : EXPORT_SYMBOL(unregister_binfmt);
     106             : 
     107        2084 : static inline void put_binfmt(struct linux_binfmt * fmt)
     108             : {
     109        2084 :         module_put(fmt->module);
     110             : }
     111             : 
     112       30239 : bool path_noexec(const struct path *path)
     113             : {
     114       30239 :         return (path->mnt->mnt_flags & MNT_NOEXEC) ||
     115       30239 :                (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
     116             : }
     117             : 
     118             : #ifdef CONFIG_USELIB
     119             : /*
     120             :  * Note that a shared library must be both readable and executable due to
     121             :  * security reasons.
     122             :  *
      123             :  * Also note that we take the load address from the file itself.
     124             :  */
     125             : SYSCALL_DEFINE1(uselib, const char __user *, library)
     126             : {
     127             :         struct linux_binfmt *fmt;
     128             :         struct file *file;
     129             :         struct filename *tmp = getname(library);
     130             :         int error = PTR_ERR(tmp);
     131             :         static const struct open_flags uselib_flags = {
     132             :                 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
     133             :                 .acc_mode = MAY_READ | MAY_EXEC,
     134             :                 .intent = LOOKUP_OPEN,
     135             :                 .lookup_flags = LOOKUP_FOLLOW,
     136             :         };
     137             : 
     138             :         if (IS_ERR(tmp))
     139             :                 goto out;
     140             : 
     141             :         file = do_filp_open(AT_FDCWD, tmp, &uselib_flags);
     142             :         putname(tmp);
     143             :         error = PTR_ERR(file);
     144             :         if (IS_ERR(file))
     145             :                 goto out;
     146             : 
     147             :         /*
     148             :          * may_open() has already checked for this, so it should be
     149             :          * impossible to trip now. But we need to be extra cautious
     150             :          * and check again at the very end too.
     151             :          */
     152             :         error = -EACCES;
     153             :         if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) ||
     154             :                          path_noexec(&file->f_path)))
     155             :                 goto exit;
     156             : 
     157             :         fsnotify_open(file);
     158             : 
     159             :         error = -ENOEXEC;
     160             : 
     161             :         read_lock(&binfmt_lock);
     162             :         list_for_each_entry(fmt, &formats, lh) {
     163             :                 if (!fmt->load_shlib)
     164             :                         continue;
     165             :                 if (!try_module_get(fmt->module))
     166             :                         continue;
     167             :                 read_unlock(&binfmt_lock);
     168             :                 error = fmt->load_shlib(file);
     169             :                 read_lock(&binfmt_lock);
     170             :                 put_binfmt(fmt);
     171             :                 if (error != -ENOEXEC)
     172             :                         break;
     173             :         }
     174             :         read_unlock(&binfmt_lock);
     175             : exit:
     176             :         fput(file);
     177             : out:
     178             :         return error;
     179             : }
     180             : #endif /* #ifdef CONFIG_USELIB */
     181             : 
     182             : #ifdef CONFIG_MMU
     183             : /*
     184             :  * The nascent bprm->mm is not visible until exec_mmap() but it can
      185             :  * use a lot of memory, account these pages in current->mm temporarily
     186             :  * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
     187             :  * change the counter back via acct_arg_size(0).
     188             :  */
     189       11273 : static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
     190             : {
     191       11273 :         struct mm_struct *mm = current->mm;
     192       11273 :         long diff = (long)(pages - bprm->vma_pages);
     193             : 
     194       11273 :         if (!mm || !diff)
     195             :                 return;
     196             : 
     197        5590 :         bprm->vma_pages = pages;
     198        5590 :         add_mm_counter(mm, MM_ANONPAGES, diff);
     199             : }
     200             : 
     201        8518 : static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
     202             :                 int write)
     203             : {
     204        8518 :         struct page *page;
     205        8518 :         int ret;
     206        8518 :         unsigned int gup_flags = FOLL_FORCE;
     207             : 
     208             : #ifdef CONFIG_STACK_GROWSUP
     209             :         if (write) {
     210             :                 ret = expand_downwards(bprm->vma, pos);
     211             :                 if (ret < 0)
     212             :                         return NULL;
     213             :         }
     214             : #endif
     215             : 
     216        8518 :         if (write)
     217        8476 :                 gup_flags |= FOLL_WRITE;
     218             : 
     219             :         /*
     220             :          * We are doing an exec().  'current' is the process
     221             :          * doing the exec and bprm->mm is the new process's mm.
     222             :          */
     223        8518 :         ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags,
     224             :                         &page, NULL, NULL);
     225        8519 :         if (ret <= 0)
     226             :                 return NULL;
     227             : 
     228        8519 :         if (write)
     229        8477 :                 acct_arg_size(bprm, vma_pages(bprm->vma));
     230             : 
     231        8519 :         return page;
     232             : }
     233             : 
     234        8519 : static void put_arg_page(struct page *page)
     235             : {
     236        8519 :         put_page(page);
     237        8477 : }
     238             : 
     239        2796 : static void free_arg_pages(struct linux_binprm *bprm)
     240             : {
     241        2796 : }
     242             : 
     243        8477 : static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
     244             :                 struct page *page)
     245             : {
     246        8477 :         flush_cache_page(bprm->vma, pos, page_to_pfn(page));
     247        5587 : }
     248             : 
     249        2796 : static int __bprm_mm_init(struct linux_binprm *bprm)
     250             : {
     251        2796 :         int err;
     252        2796 :         struct vm_area_struct *vma = NULL;
     253        2796 :         struct mm_struct *mm = bprm->mm;
     254             : 
     255        2796 :         bprm->vma = vma = vm_area_alloc(mm);
     256        2796 :         if (!vma)
     257             :                 return -ENOMEM;
     258        2796 :         vma_set_anonymous(vma);
     259             : 
     260        2796 :         if (mmap_write_lock_killable(mm)) {
     261           0 :                 err = -EINTR;
     262           0 :                 goto err_free;
     263             :         }
     264             : 
     265             :         /*
     266             :          * Place the stack at the largest stack address the architecture
     267             :          * supports. Later, we'll move this to an appropriate place. We don't
     268             :          * use STACK_TOP because that can depend on attributes which aren't
     269             :          * configured yet.
     270             :          */
     271        2796 :         BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
     272        2796 :         vma->vm_end = STACK_TOP_MAX;
     273        2796 :         vma->vm_start = vma->vm_end - PAGE_SIZE;
     274        2796 :         vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
     275        2796 :         vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
     276             : 
     277        2796 :         err = insert_vm_struct(mm, vma);
     278        2796 :         if (err)
     279           0 :                 goto err;
     280             : 
     281        2796 :         mm->stack_vm = mm->total_vm = 1;
     282        2796 :         mmap_write_unlock(mm);
     283        2796 :         bprm->p = vma->vm_end - sizeof(void *);
     284        2796 :         return 0;
     285           0 : err:
     286           0 :         mmap_write_unlock(mm);
     287           0 : err_free:
     288           0 :         bprm->vma = NULL;
     289           0 :         vm_area_free(vma);
     290           0 :         return err;
     291             : }
     292             : 
     293       60269 : static bool valid_arg_len(struct linux_binprm *bprm, long len)
     294             : {
     295       60269 :         return len <= MAX_ARG_STRLEN;
     296             : }
     297             : 
     298             : #else
     299             : 
     300             : static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
     301             : {
     302             : }
     303             : 
     304             : static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
     305             :                 int write)
     306             : {
     307             :         struct page *page;
     308             : 
     309             :         page = bprm->page[pos / PAGE_SIZE];
     310             :         if (!page && write) {
     311             :                 page = alloc_page(GFP_HIGHUSER|__GFP_ZERO);
     312             :                 if (!page)
     313             :                         return NULL;
     314             :                 bprm->page[pos / PAGE_SIZE] = page;
     315             :         }
     316             : 
     317             :         return page;
     318             : }
     319             : 
     320             : static void put_arg_page(struct page *page)
     321             : {
     322             : }
     323             : 
     324             : static void free_arg_page(struct linux_binprm *bprm, int i)
     325             : {
     326             :         if (bprm->page[i]) {
     327             :                 __free_page(bprm->page[i]);
     328             :                 bprm->page[i] = NULL;
     329             :         }
     330             : }
     331             : 
     332             : static void free_arg_pages(struct linux_binprm *bprm)
     333             : {
     334             :         int i;
     335             : 
     336             :         for (i = 0; i < MAX_ARG_PAGES; i++)
     337             :                 free_arg_page(bprm, i);
     338             : }
     339             : 
     340             : static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
     341             :                 struct page *page)
     342             : {
     343             : }
     344             : 
     345             : static int __bprm_mm_init(struct linux_binprm *bprm)
     346             : {
     347             :         bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
     348             :         return 0;
     349             : }
     350             : 
     351             : static bool valid_arg_len(struct linux_binprm *bprm, long len)
     352             : {
     353             :         return len <= bprm->p;
     354             : }
     355             : 
     356             : #endif /* CONFIG_MMU */
     357             : 
     358             : /*
     359             :  * Create a new mm_struct and populate it with a temporary stack
     360             :  * vm_area_struct.  We don't have enough context at this point to set the stack
     361             :  * flags, permissions, and offset, so we use temporary values.  We'll update
     362             :  * them later in setup_arg_pages().
     363             :  */
     364        2796 : static int bprm_mm_init(struct linux_binprm *bprm)
     365             : {
     366        2796 :         int err;
     367        2796 :         struct mm_struct *mm = NULL;
     368             : 
     369        2796 :         bprm->mm = mm = mm_alloc();
     370        2796 :         err = -ENOMEM;
     371        2796 :         if (!mm)
     372           0 :                 goto err;
     373             : 
     374             :         /* Save current stack limit for all calculations made during exec. */
     375        2796 :         task_lock(current->group_leader);
     376        2796 :         bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK];
     377        2796 :         task_unlock(current->group_leader);
     378             : 
     379        2796 :         err = __bprm_mm_init(bprm);
     380        2796 :         if (err)
     381           0 :                 goto err;
     382             : 
     383             :         return 0;
     384             : 
     385           0 : err:
     386           0 :         if (mm) {
     387           0 :                 bprm->mm = NULL;
     388           0 :                 mmdrop(mm);
     389             :         }
     390             : 
     391             :         return err;
     392             : }
     393             : 
     394             : struct user_arg_ptr {
     395             : #ifdef CONFIG_COMPAT
     396             :         bool is_compat;
     397             : #endif
     398             :         union {
     399             :                 const char __user *const __user *native;
     400             : #ifdef CONFIG_COMPAT
     401             :                 const compat_uptr_t __user *compat;
     402             : #endif
     403             :         } ptr;
     404             : };
     405             : 
     406      120345 : static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
     407             : {
     408      120345 :         const char __user *native;
     409             : 
     410             : #ifdef CONFIG_COMPAT
     411      120345 :         if (unlikely(argv.is_compat)) {
     412           0 :                 compat_uptr_t compat;
     413             : 
     414           0 :                 if (get_user(compat, argv.ptr.compat + nr))
     415      120345 :                         return ERR_PTR(-EFAULT);
     416             : 
     417           0 :                 return compat_ptr(compat);
     418             :         }
     419             : #endif
     420             : 
     421      120345 :         if (get_user(native, argv.ptr.native + nr))
     422           0 :                 return ERR_PTR(-EFAULT);
     423             : 
     424             :         return native;
     425             : }
     426             : 
     427             : /*
     428             :  * count() counts the number of strings in array ARGV.
     429             :  */
     430        5590 : static int count(struct user_arg_ptr argv, int max)
     431             : {
     432        5590 :         int i = 0;
     433             : 
     434        5590 :         if (argv.ptr.native != NULL) {
     435       57379 :                 for (;;) {
     436       62966 :                         const char __user *p = get_user_arg_ptr(argv, i);
     437             : 
     438       62966 :                         if (!p)
     439             :                                 break;
     440             : 
     441       57379 :                         if (IS_ERR(p))
     442             :                                 return -EFAULT;
     443             : 
     444       57379 :                         if (i >= max)
     445             :                                 return -E2BIG;
     446       57379 :                         ++i;
     447             : 
     448       57379 :                         if (fatal_signal_pending(current))
     449             :                                 return -ERESTARTNOHAND;
     450       57379 :                         cond_resched();
     451             :                 }
     452             :         }
     453             :         return i;
     454             : }
     455             : 
     456           2 : static int count_strings_kernel(const char *const *argv)
     457             : {
     458           2 :         int i;
     459             : 
     460           2 :         if (!argv)
     461             :                 return 0;
     462             : 
     463           5 :         for (i = 0; argv[i]; ++i) {
     464           3 :                 if (i >= MAX_ARG_STRINGS)
     465             :                         return -E2BIG;
     466           3 :                 if (fatal_signal_pending(current))
     467             :                         return -ERESTARTNOHAND;
     468           3 :                 cond_resched();
     469             :         }
     470             :         return i;
     471             : }
     472             : 
     473        2796 : static int bprm_stack_limits(struct linux_binprm *bprm)
     474             : {
     475        2796 :         unsigned long limit, ptr_size;
     476             : 
     477             :         /*
     478             :          * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
     479             :          * (whichever is smaller) for the argv+env strings.
     480             :          * This ensures that:
     481             :          *  - the remaining binfmt code will not run out of stack space,
     482             :          *  - the program will have a reasonable amount of stack left
     483             :          *    to work from.
     484             :          */
     485        2796 :         limit = _STK_LIM / 4 * 3;
     486        2796 :         limit = min(limit, bprm->rlim_stack.rlim_cur / 4);
     487             :         /*
     488             :          * We've historically supported up to 32 pages (ARG_MAX)
     489             :          * of argument strings even with small stacks
     490             :          */
     491        2796 :         limit = max_t(unsigned long, limit, ARG_MAX);
     492             :         /*
     493             :          * We must account for the size of all the argv and envp pointers to
     494             :          * the argv and envp strings, since they will also take up space in
     495             :          * the stack. They aren't stored until much later when we can't
     496             :          * signal to the parent that the child has run out of stack space.
     497             :          * Instead, calculate it here so it's possible to fail gracefully.
     498             :          */
     499        2796 :         ptr_size = (bprm->argc + bprm->envc) * sizeof(void *);
     500        2796 :         if (limit <= ptr_size)
     501             :                 return -E2BIG;
     502        2796 :         limit -= ptr_size;
     503             : 
     504        2796 :         bprm->argmin = bprm->p - limit;
     505        2796 :         return 0;
     506             : }
     507             : 
     508             : /*
     509             :  * 'copy_strings()' copies argument/environment strings from the old
      510             :  * process's memory to the new process's stack.  The call to get_user_pages()
     511             :  * ensures the destination page is created and not swapped out.
     512             :  */
     513        5590 : static int copy_strings(int argc, struct user_arg_ptr argv,
     514             :                         struct linux_binprm *bprm)
     515             : {
     516        5590 :         struct page *kmapped_page = NULL;
     517        5590 :         char *kaddr = NULL;
     518        5590 :         unsigned long kpos = 0;
     519        5590 :         int ret;
     520             : 
     521       62969 :         while (argc-- > 0) {
     522       57379 :                 const char __user *str;
     523       57379 :                 int len;
     524       57379 :                 unsigned long pos;
     525             : 
     526       57379 :                 ret = -EFAULT;
     527       57379 :                 str = get_user_arg_ptr(argv, argc);
     528       57379 :                 if (IS_ERR(str))
     529           0 :                         goto out;
     530             : 
     531       57379 :                 len = strnlen_user(str, MAX_ARG_STRLEN);
     532       57379 :                 if (!len)
     533           0 :                         goto out;
     534             : 
     535       57379 :                 ret = -E2BIG;
     536       57379 :                 if (!valid_arg_len(bprm, len))
     537           0 :                         goto out;
     538             : 
      539             :                 /* We're going to work our way backwards. */
     540       57379 :                 pos = bprm->p;
     541       57379 :                 str += len;
     542       57379 :                 bprm->p -= len;
     543             : #ifdef CONFIG_MMU
     544       57379 :                 if (bprm->p < bprm->argmin)
     545           0 :                         goto out;
     546             : #endif
     547             : 
     548      114758 :                 while (len > 0) {
     549       57379 :                         int offset, bytes_to_copy;
     550             : 
     551       57379 :                         if (fatal_signal_pending(current)) {
     552           0 :                                 ret = -ERESTARTNOHAND;
     553           0 :                                 goto out;
     554             :                         }
     555       57379 :                         cond_resched();
     556             : 
     557       57379 :                         offset = pos % PAGE_SIZE;
     558       57379 :                         if (offset == 0)
     559           0 :                                 offset = PAGE_SIZE;
     560             : 
     561       57379 :                         bytes_to_copy = offset;
     562       57379 :                         if (bytes_to_copy > len)
     563             :                                 bytes_to_copy = len;
     564             : 
     565       57379 :                         offset -= bytes_to_copy;
     566       57379 :                         pos -= bytes_to_copy;
     567       57379 :                         str -= bytes_to_copy;
     568       57379 :                         len -= bytes_to_copy;
     569             : 
     570       57379 :                         if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
     571        5587 :                                 struct page *page;
     572             : 
     573        5587 :                                 page = get_arg_page(bprm, pos, 1);
     574        5587 :                                 if (!page) {
     575           0 :                                         ret = -E2BIG;
     576           0 :                                         goto out;
     577             :                                 }
     578             : 
     579        5587 :                                 if (kmapped_page) {
     580           0 :                                         flush_kernel_dcache_page(kmapped_page);
     581           0 :                                         kunmap(kmapped_page);
     582           0 :                                         put_arg_page(kmapped_page);
     583             :                                 }
     584        5587 :                                 kmapped_page = page;
     585        5587 :                                 kaddr = kmap(kmapped_page);
     586        5587 :                                 kpos = pos & PAGE_MASK;
     587        5587 :                                 flush_arg_page(bprm, kpos, kmapped_page);
     588             :                         }
     589      114758 :                         if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
     590           0 :                                 ret = -EFAULT;
     591           0 :                                 goto out;
     592             :                         }
     593             :                 }
     594             :         }
     595             :         ret = 0;
     596        5590 : out:
     597        5590 :         if (kmapped_page) {
     598        5587 :                 flush_kernel_dcache_page(kmapped_page);
     599        5587 :                 kunmap(kmapped_page);
     600        5587 :                 put_arg_page(kmapped_page);
     601             :         }
     602        5590 :         return ret;
     603             : }
     604             : 
     605             : /*
      606             :  * Copy an argument/environment string from the kernel to the process's stack.
     607             :  */
     608        2890 : int copy_string_kernel(const char *arg, struct linux_binprm *bprm)
     609             : {
     610        2890 :         int len = strnlen(arg, MAX_ARG_STRLEN) + 1 /* terminating NUL */;
     611        2890 :         unsigned long pos = bprm->p;
     612             : 
     613        2890 :         if (len == 0)
     614             :                 return -EFAULT;
     615        2890 :         if (!valid_arg_len(bprm, len))
     616             :                 return -E2BIG;
     617             : 
     618             :         /* We're going to work our way backwards. */
     619        2890 :         arg += len;
     620        2890 :         bprm->p -= len;
     621        2890 :         if (IS_ENABLED(CONFIG_MMU) && bprm->p < bprm->argmin)
     622             :                 return -E2BIG;
     623             : 
     624        5780 :         while (len > 0) {
     625        2890 :                 unsigned int bytes_to_copy = min_t(unsigned int, len,
     626             :                                 min_not_zero(offset_in_page(pos), PAGE_SIZE));
     627        2890 :                 struct page *page;
     628        2890 :                 char *kaddr;
     629             : 
     630        2890 :                 pos -= bytes_to_copy;
     631        2890 :                 arg -= bytes_to_copy;
     632        2890 :                 len -= bytes_to_copy;
     633             : 
     634        2890 :                 page = get_arg_page(bprm, pos, 1);
     635        2890 :                 if (!page)
     636             :                         return -E2BIG;
     637        2890 :                 kaddr = kmap_atomic(page);
     638        2890 :                 flush_arg_page(bprm, pos & PAGE_MASK, page);
     639        2890 :                 memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy);
     640        2890 :                 flush_kernel_dcache_page(page);
     641        2890 :                 kunmap_atomic(kaddr);
     642        8670 :                 put_arg_page(page);
     643             :         }
     644             : 
     645             :         return 0;
     646             : }
     647             : EXPORT_SYMBOL(copy_string_kernel);
     648             : 
     649           2 : static int copy_strings_kernel(int argc, const char *const *argv,
     650             :                                struct linux_binprm *bprm)
     651             : {
     652           5 :         while (argc-- > 0) {
     653           3 :                 int ret = copy_string_kernel(argv[argc], bprm);
     654           3 :                 if (ret < 0)
     655           0 :                         return ret;
     656           3 :                 if (fatal_signal_pending(current))
     657             :                         return -ERESTARTNOHAND;
     658           3 :                 cond_resched();
     659             :         }
     660             :         return 0;
     661             : }
     662             : 
     663             : #ifdef CONFIG_MMU
     664             : 
     665             : /*
     666             :  * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX.  Once
     667             :  * the binfmt code determines where the new stack should reside, we shift it to
     668             :  * its final location.  The process proceeds as follows:
     669             :  *
     670             :  * 1) Use shift to calculate the new vma endpoints.
     671             :  * 2) Extend vma to cover both the old and new ranges.  This ensures the
     672             :  *    arguments passed to subsequent functions are consistent.
     673             :  * 3) Move vma's page tables to the new range.
     674             :  * 4) Free up any cleared pgd range.
     675             :  * 5) Shrink the vma to cover only the new range.
     676             :  */
     677        1021 : static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
     678             : {
     679        1021 :         struct mm_struct *mm = vma->vm_mm;
     680        1021 :         unsigned long old_start = vma->vm_start;
     681        1021 :         unsigned long old_end = vma->vm_end;
     682        1021 :         unsigned long length = old_end - old_start;
     683        1021 :         unsigned long new_start = old_start - shift;
     684        1021 :         unsigned long new_end = old_end - shift;
     685        1021 :         struct mmu_gather tlb;
     686             : 
     687        1021 :         BUG_ON(new_start > new_end);
     688             : 
     689             :         /*
     690             :          * ensure there are no vmas between where we want to go
     691             :          * and where we are
     692             :          */
     693        1021 :         if (vma != find_vma(mm, new_start))
     694             :                 return -EFAULT;
     695             : 
     696             :         /*
     697             :          * cover the whole range: [new_start, old_end)
     698             :          */
     699        1021 :         if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL))
     700             :                 return -ENOMEM;
     701             : 
     702             :         /*
     703             :          * move the page tables downwards, on failure we rely on
     704             :          * process cleanup to remove whatever mess we made.
     705             :          */
     706        1021 :         if (length != move_page_tables(vma, old_start,
     707             :                                        vma, new_start, length, false))
     708             :                 return -ENOMEM;
     709             : 
     710        1021 :         lru_add_drain();
     711        1021 :         tlb_gather_mmu(&tlb, mm);
     712        1021 :         if (new_end > old_start) {
     713             :                 /*
     714             :                  * when the old and new regions overlap clear from new_end.
     715             :                  */
     716           0 :                 free_pgd_range(&tlb, new_end, old_end, new_end,
     717           0 :                         vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
     718             :         } else {
     719             :                 /*
     720             :                  * otherwise, clean from old_start; this is done to not touch
     721             :                  * the address space in [new_end, old_start) some architectures
     722             :                  * have constraints on va-space that make this illegal (IA64) -
     723             :                  * for the others its just a little faster.
     724             :                  */
     725        1021 :                 free_pgd_range(&tlb, old_start, old_end, new_end,
     726        1021 :                         vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
     727             :         }
     728        1021 :         tlb_finish_mmu(&tlb);
     729             : 
     730             :         /*
     731             :          * Shrink the vma to just the new range.  Always succeeds.
     732             :          */
     733        1021 :         vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);
     734             : 
     735        1021 :         return 0;
     736             : }
     737             : 
     738             : /*
     739             :  * Finalizes the stack vm_area_struct. The flags and permissions are updated,
     740             :  * the stack is optionally relocated, and some extra space is added.
     741             :  */
     742        1021 : int setup_arg_pages(struct linux_binprm *bprm,
     743             :                     unsigned long stack_top,
     744             :                     int executable_stack)
     745             : {
     746        1021 :         unsigned long ret;
     747        1021 :         unsigned long stack_shift;
     748        1021 :         struct mm_struct *mm = current->mm;
     749        1021 :         struct vm_area_struct *vma = bprm->vma;
     750        1021 :         struct vm_area_struct *prev = NULL;
     751        1021 :         unsigned long vm_flags;
     752        1021 :         unsigned long stack_base;
     753        1021 :         unsigned long stack_size;
     754        1021 :         unsigned long stack_expand;
     755        1021 :         unsigned long rlim_stack;
     756             : 
     757             : #ifdef CONFIG_STACK_GROWSUP
     758             :         /* Limit stack size */
     759             :         stack_base = bprm->rlim_stack.rlim_max;
     760             : 
     761             :         stack_base = calc_max_stack_size(stack_base);
     762             : 
     763             :         /* Add space for stack randomization. */
     764             :         stack_base += (STACK_RND_MASK << PAGE_SHIFT);
     765             : 
     766             :         /* Make sure we didn't let the argument array grow too large. */
     767             :         if (vma->vm_end - vma->vm_start > stack_base)
     768             :                 return -ENOMEM;
     769             : 
     770             :         stack_base = PAGE_ALIGN(stack_top - stack_base);
     771             : 
     772             :         stack_shift = vma->vm_start - stack_base;
     773             :         mm->arg_start = bprm->p - stack_shift;
     774             :         bprm->p = vma->vm_end - stack_shift;
     775             : #else
     776        1021 :         stack_top = arch_align_stack(stack_top);
     777        1021 :         stack_top = PAGE_ALIGN(stack_top);
     778             : 
     779        1021 :         if (unlikely(stack_top < mmap_min_addr) ||
     780        1021 :             unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
     781             :                 return -ENOMEM;
     782             : 
     783        1021 :         stack_shift = vma->vm_end - stack_top;
     784             : 
     785        1021 :         bprm->p -= stack_shift;
     786        1021 :         mm->arg_start = bprm->p;
     787             : #endif
     788             : 
     789        1021 :         if (bprm->loader)
     790           0 :                 bprm->loader -= stack_shift;
     791        1021 :         bprm->exec -= stack_shift;
     792             : 
     793        1021 :         if (mmap_write_lock_killable(mm))
     794             :                 return -EINTR;
     795             : 
     796        1021 :         vm_flags = VM_STACK_FLAGS;
     797             : 
     798             :         /*
     799             :          * Adjust stack execute permissions; explicitly enable for
     800             :          * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
     801             :          * (arch default) otherwise.
     802             :          */
     803        1021 :         if (unlikely(executable_stack == EXSTACK_ENABLE_X))
     804             :                 vm_flags |= VM_EXEC;
     805        1021 :         else if (executable_stack == EXSTACK_DISABLE_X)
     806        1021 :                 vm_flags &= ~VM_EXEC;
     807        1021 :         vm_flags |= mm->def_flags;
     808        1021 :         vm_flags |= VM_STACK_INCOMPLETE_SETUP;
     809             : 
     810        1021 :         ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
     811             :                         vm_flags);
     812        1021 :         if (ret)
     813           0 :                 goto out_unlock;
     814        1021 :         BUG_ON(prev != vma);
     815             : 
     816        1021 :         if (unlikely(vm_flags & VM_EXEC)) {
     817           0 :                 pr_warn_once("process '%pD4' started with executable stack\n",
     818             :                              bprm->file);
     819             :         }
     820             : 
     821             :         /* Move stack pages down in memory. */
     822        1021 :         if (stack_shift) {
     823        1021 :                 ret = shift_arg_pages(vma, stack_shift);
     824        1021 :                 if (ret)
     825           0 :                         goto out_unlock;
     826             :         }
     827             : 
     828             :         /* mprotect_fixup is overkill to remove the temporary stack flags */
     829        1021 :         vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
     830             : 
     831        1021 :         stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
     832        1021 :         stack_size = vma->vm_end - vma->vm_start;
     833             :         /*
     834             :          * Align this down to a page boundary as expand_stack
     835             :          * will align it up.
     836             :          */
     837        1021 :         rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK;
     838             : #ifdef CONFIG_STACK_GROWSUP
     839             :         if (stack_size + stack_expand > rlim_stack)
     840             :                 stack_base = vma->vm_start + rlim_stack;
     841             :         else
     842             :                 stack_base = vma->vm_end + stack_expand;
     843             : #else
     844        1021 :         if (stack_size + stack_expand > rlim_stack)
     845           0 :                 stack_base = vma->vm_end - rlim_stack;
     846             :         else
     847        1021 :                 stack_base = vma->vm_start - stack_expand;
     848             : #endif
     849        1021 :         current->mm->start_stack = bprm->p;
     850        1021 :         ret = expand_stack(vma, stack_base);
     851        1021 :         if (ret)
     852           0 :                 ret = -EFAULT;
     853             : 
     854        1021 : out_unlock:
     855        1021 :         mmap_write_unlock(mm);
     856        1021 :         return ret;
     857             : }
     858             : EXPORT_SYMBOL(setup_arg_pages);
     859             : 
     860             : #else
     861             : 
     862             : /*
     863             :  * Transfer the program arguments and environment from the holding pages
     864             :  * onto the stack. The provided stack pointer is adjusted accordingly.
     865             :  */
     866             : int transfer_args_to_stack(struct linux_binprm *bprm,
     867             :                            unsigned long *sp_location)
     868             : {
     869             :         unsigned long index, stop, sp;
     870             :         int ret = 0;
     871             : 
     872             :         stop = bprm->p >> PAGE_SHIFT;
     873             :         sp = *sp_location;
     874             : 
     875             :         for (index = MAX_ARG_PAGES - 1; index >= stop; index--) {
     876             :                 unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0;
     877             :                 char *src = kmap(bprm->page[index]) + offset;
     878             :                 sp -= PAGE_SIZE - offset;
     879             :                 if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0)
     880             :                         ret = -EFAULT;
     881             :                 kunmap(bprm->page[index]);
     882             :                 if (ret)
     883             :                         goto out;
     884             :         }
     885             : 
     886             :         *sp_location = sp;
     887             : 
     888             : out:
     889             :         return ret;
     890             : }
     891             : EXPORT_SYMBOL(transfer_args_to_stack);
     892             : 
     893             : #endif /* CONFIG_MMU */
     894             : 
     895        3857 : static struct file *do_open_execat(int fd, struct filename *name, int flags)
     896             : {
     897        3857 :         struct file *file;
     898        3857 :         int err;
     899        3857 :         struct open_flags open_exec_flags = {
     900             :                 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
     901             :                 .acc_mode = MAY_EXEC,
     902             :                 .intent = LOOKUP_OPEN,
     903             :                 .lookup_flags = LOOKUP_FOLLOW,
     904             :         };
     905             : 
     906        3857 :         if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
     907        3857 :                 return ERR_PTR(-EINVAL);
     908        3857 :         if (flags & AT_SYMLINK_NOFOLLOW)
     909           0 :                 open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
     910        3857 :         if (flags & AT_EMPTY_PATH)
     911           0 :                 open_exec_flags.lookup_flags |= LOOKUP_EMPTY;
     912             : 
     913        3857 :         file = do_filp_open(fd, name, &open_exec_flags);
     914        3857 :         if (IS_ERR(file))
     915        1775 :                 goto out;
     916             : 
     917             :         /*
     918             :          * may_open() has already checked for this, so it should be
     919             :          * impossible to trip now. But we need to be extra cautious
     920             :          * and check again at the very end too.
     921             :          */
     922        2082 :         err = -EACCES;
     923        2082 :         if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) ||
     924             :                          path_noexec(&file->f_path)))
     925           0 :                 goto exit;
     926             : 
     927        2082 :         err = deny_write_access(file);
     928        2082 :         if (err)
     929           0 :                 goto exit;
     930             : 
     931        2082 :         if (name->name[0] != '\0')
     932        2082 :                 fsnotify_open(file);
     933             : 
     934           0 : out:
     935             :         return file;
     936             : 
     937           0 : exit:
     938           0 :         fput(file);
     939           0 :         return ERR_PTR(err);
     940             : }
     941             : 
     942        1061 : struct file *open_exec(const char *name)
     943             : {
     944        1061 :         struct filename *filename = getname_kernel(name);
     945        1061 :         struct file *f = ERR_CAST(filename);
     946             : 
     947        1061 :         if (!IS_ERR(filename)) {
     948        1061 :                 f = do_open_execat(AT_FDCWD, filename, 0);
     949        1061 :                 putname(filename);
     950             :         }
     951        1061 :         return f;
     952             : }
     953             : EXPORT_SYMBOL(open_exec);
     954             : 
     955             : #if defined(CONFIG_HAVE_AOUT) || defined(CONFIG_BINFMT_FLAT) || \
     956             :     defined(CONFIG_BINFMT_ELF_FDPIC)
     957             : ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
     958             : {
     959             :         ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
     960             :         if (res > 0)
     961             :                 flush_icache_user_range(addr, addr + len);
     962             :         return res;
     963             : }
     964             : EXPORT_SYMBOL(read_code);
     965             : #endif
     966             : 
     967             : /*
     968             :  * Maps the mm_struct mm into the current task struct.
     969             :  * On success, this function returns with exec_update_lock
     970             :  * held for writing.
     971             :  */
     972        1021 : static int exec_mmap(struct mm_struct *mm)
     973             : {
     974        1021 :         struct task_struct *tsk;
     975        1021 :         struct mm_struct *old_mm, *active_mm;
     976        1021 :         int ret;
     977             : 
     978             :         /* Notify parent that we're no longer interested in the old VM */
     979        1021 :         tsk = current;
     980        1021 :         old_mm = current->mm;
     981        1021 :         exec_mm_release(tsk, old_mm);
     982        1021 :         if (old_mm)
     983        1020 :                 sync_mm_rss(old_mm);
     984             : 
     985        1021 :         ret = down_write_killable(&tsk->signal->exec_update_lock);
     986        1021 :         if (ret)
     987             :                 return ret;
     988             : 
     989        1021 :         if (old_mm) {
     990             :                 /*
     991             :                  * Make sure that if there is a core dump in progress
     992             :                  * for the old mm, we get out and die instead of going
     993             :                  * through with the exec.  We must hold mmap_lock around
     994             :                  * checking core_state and changing tsk->mm.
     995             :                  */
     996        1020 :                 mmap_read_lock(old_mm);
     997        1020 :                 if (unlikely(old_mm->core_state)) {
     998           0 :                         mmap_read_unlock(old_mm);
     999           0 :                         up_write(&tsk->signal->exec_update_lock);
    1000           0 :                         return -EINTR;
    1001             :                 }
    1002             :         }
    1003             : 
    1004        1021 :         task_lock(tsk);
    1005        1021 :         membarrier_exec_mmap(mm);
    1006             : 
    1007        1021 :         local_irq_disable();
    1008        1021 :         active_mm = tsk->active_mm;
    1009        1021 :         tsk->active_mm = mm;
    1010        1021 :         tsk->mm = mm;
    1011             :         /*
    1012             :          * This prevents preemption while active_mm is being loaded and
    1013             :          * it and mm are being updated, which could cause problems for
    1014             :          * lazy tlb mm refcounting when these are updated by context
    1015             :          * switches. Not all architectures can handle irqs off over
    1016             :          * activate_mm yet.
    1017             :          */
    1018        1021 :         if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
    1019        1021 :                 local_irq_enable();
    1020        1021 :         activate_mm(active_mm, mm);
    1021        1021 :         if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
    1022        1021 :                 local_irq_enable();
    1023        1021 :         tsk->mm->vmacache_seqnum = 0;
    1024        1021 :         vmacache_flush(tsk);
    1025        1021 :         task_unlock(tsk);
    1026        1021 :         if (old_mm) {
    1027        1020 :                 mmap_read_unlock(old_mm);
    1028        1020 :                 BUG_ON(active_mm != old_mm);
    1029        1020 :                 setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
    1030        1020 :                 mm_update_next_owner(old_mm);
    1031        1020 :                 mmput(old_mm);
    1032        1020 :                 return 0;
    1033             :         }
    1034           1 :         mmdrop(active_mm);
    1035           1 :         return 0;
    1036             : }
    1037             : 
    1038        1021 : static int de_thread(struct task_struct *tsk)
    1039             : {
    1040        1021 :         struct signal_struct *sig = tsk->signal;
    1041        1021 :         struct sighand_struct *oldsighand = tsk->sighand;
    1042        1021 :         spinlock_t *lock = &oldsighand->siglock;
    1043             : 
    1044        1021 :         if (thread_group_empty(tsk))
    1045        1021 :                 goto no_thread_group;
    1046             : 
    1047             :         /*
    1048             :          * Kill all other threads in the thread group.
    1049             :          */
    1050           0 :         spin_lock_irq(lock);
    1051           0 :         if (signal_group_exit(sig)) {
    1052             :                 /*
    1053             :                  * Another group action in progress, just
    1054             :                  * return so that the signal is processed.
    1055             :                  */
    1056           0 :                 spin_unlock_irq(lock);
    1057           0 :                 return -EAGAIN;
    1058             :         }
    1059             : 
    1060           0 :         sig->group_exit_task = tsk;
    1061           0 :         sig->notify_count = zap_other_threads(tsk);
    1062           0 :         if (!thread_group_leader(tsk))
    1063           0 :                 sig->notify_count--;
    1064             : 
    1065           0 :         while (sig->notify_count) {
    1066           0 :                 __set_current_state(TASK_KILLABLE);
    1067           0 :                 spin_unlock_irq(lock);
    1068           0 :                 schedule();
    1069           0 :                 if (__fatal_signal_pending(tsk))
    1070           0 :                         goto killed;
    1071           0 :                 spin_lock_irq(lock);
    1072             :         }
    1073           0 :         spin_unlock_irq(lock);
    1074             : 
    1075             :         /*
    1076             :          * At this point all other threads have exited, all we have to
    1077             :          * do is to wait for the thread group leader to become inactive,
    1078             :          * and to assume its PID:
    1079             :          */
    1080           0 :         if (!thread_group_leader(tsk)) {
    1081           0 :                 struct task_struct *leader = tsk->group_leader;
    1082             : 
    1083           0 :                 for (;;) {
    1084           0 :                         cgroup_threadgroup_change_begin(tsk);
    1085           0 :                         write_lock_irq(&tasklist_lock);
    1086             :                         /*
    1087             :                          * Do this under tasklist_lock to ensure that
    1088             :                          * exit_notify() can't miss ->group_exit_task
    1089             :                          */
    1090           0 :                         sig->notify_count = -1;
    1091           0 :                         if (likely(leader->exit_state))
    1092             :                                 break;
    1093           0 :                         __set_current_state(TASK_KILLABLE);
    1094           0 :                         write_unlock_irq(&tasklist_lock);
    1095           0 :                         cgroup_threadgroup_change_end(tsk);
    1096           0 :                         schedule();
    1097           0 :                         if (__fatal_signal_pending(tsk))
    1098           0 :                                 goto killed;
    1099             :                 }
    1100             : 
    1101             :                 /*
    1102             :                  * The only record we have of the real-time age of a
    1103             :                  * process, regardless of execs it's done, is start_time.
    1104             :                  * All the past CPU time is accumulated in signal_struct
    1105             :                  * from sister threads now dead.  But in this non-leader
    1106             :                  * exec, nothing survives from the original leader thread,
    1107             :                  * whose birth marks the true age of this process now.
    1108             :                  * When we take on its identity by switching to its PID, we
    1109             :                  * also take its birthdate (always earlier than our own).
    1110             :                  */
    1111           0 :                 tsk->start_time = leader->start_time;
    1112           0 :                 tsk->start_boottime = leader->start_boottime;
    1113             : 
    1114           0 :                 BUG_ON(!same_thread_group(leader, tsk));
    1115             :                 /*
    1116             :                  * An exec() starts a new thread group with the
    1117             :                  * TGID of the previous thread group. Rehash the
    1118             :                  * two threads with a switched PID, and release
    1119             :                  * the former thread group leader:
    1120             :                  */
    1121             : 
    1122             :                 /* Become a process group leader with the old leader's pid.
    1123             :                  * The old leader becomes a thread of the this thread group.
    1124             :                  */
    1125           0 :                 exchange_tids(tsk, leader);
    1126           0 :                 transfer_pid(leader, tsk, PIDTYPE_TGID);
    1127           0 :                 transfer_pid(leader, tsk, PIDTYPE_PGID);
    1128           0 :                 transfer_pid(leader, tsk, PIDTYPE_SID);
    1129             : 
    1130           0 :                 list_replace_rcu(&leader->tasks, &tsk->tasks);
    1131           0 :                 list_replace_init(&leader->sibling, &tsk->sibling);
    1132             : 
    1133           0 :                 tsk->group_leader = tsk;
    1134           0 :                 leader->group_leader = tsk;
    1135             : 
    1136           0 :                 tsk->exit_signal = SIGCHLD;
    1137           0 :                 leader->exit_signal = -1;
    1138             : 
    1139           0 :                 BUG_ON(leader->exit_state != EXIT_ZOMBIE);
    1140           0 :                 leader->exit_state = EXIT_DEAD;
    1141             : 
    1142             :                 /*
    1143             :                  * We are going to release_task()->ptrace_unlink() silently,
    1144             :                  * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
    1145             :                  * the tracer wont't block again waiting for this thread.
    1146             :                  */
    1147           0 :                 if (unlikely(leader->ptrace))
    1148           0 :                         __wake_up_parent(leader, leader->parent);
    1149           0 :                 write_unlock_irq(&tasklist_lock);
    1150           0 :                 cgroup_threadgroup_change_end(tsk);
    1151             : 
    1152           0 :                 release_task(leader);
    1153             :         }
    1154             : 
    1155           0 :         sig->group_exit_task = NULL;
    1156           0 :         sig->notify_count = 0;
    1157             : 
    1158        1021 : no_thread_group:
    1159             :         /* we have changed execution domain */
    1160        1021 :         tsk->exit_signal = SIGCHLD;
    1161             : 
    1162        1021 :         BUG_ON(!thread_group_leader(tsk));
    1163             :         return 0;
    1164             : 
    1165           0 : killed:
    1166             :         /* protects against exit_notify() and __exit_signal() */
    1167           0 :         read_lock(&tasklist_lock);
    1168           0 :         sig->group_exit_task = NULL;
    1169           0 :         sig->notify_count = 0;
    1170           0 :         read_unlock(&tasklist_lock);
    1171           0 :         return -EAGAIN;
    1172             : }
    1173             : 
    1174             : 
    1175             : /*
    1176             :  * Give @me a private signal-handler table when the current one has other
    1177             :  * users (e.g. tasks created with CLONE_SIGHAND), so that
    1178             :  * flush_signal_handlers can later reset the handlers without disturbing
    1179             :  * them.  Returns 0, or -ENOMEM if the private copy cannot be allocated.
    1180             :  */
    1181        1021 : static int unshare_sighand(struct task_struct *me)
    1182             : {
    1183        1021 :         struct sighand_struct *oldsighand = me->sighand;
    1184             : 
    1185        1021 :         if (refcount_read(&oldsighand->count) != 1) {   /* table has other users */
    1186           0 :                 struct sighand_struct *newsighand;
    1187             :                 /*
    1188             :                  * This ->sighand is shared with the CLONE_SIGHAND
    1189             :                  * but not CLONE_THREAD task, switch to the new one.
    1190             :                  */
    1191           0 :                 newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
    1192           0 :                 if (!newsighand)
    1193             :                         return -ENOMEM; /* nothing has been modified yet */
    1194             : 
    1195           0 :                 refcount_set(&newsighand->count, 1);    /* me is the only user of the copy */
    1196           0 :                 memcpy(newsighand->action, oldsighand->action,
    1197             :                        sizeof(newsighand->action));     /* start from the current handlers */
    1198             : 
    1199           0 :                 write_lock_irq(&tasklist_lock);
    1200           0 :                 spin_lock(&oldsighand->siglock);
    1201           0 :                 rcu_assign_pointer(me->sighand, newsighand);    /* switch done under both locks */
    1202           0 :                 spin_unlock(&oldsighand->siglock);
    1203           0 :                 write_unlock_irq(&tasklist_lock);
    1204             : 
    1205           0 :                 __cleanup_sighand(oldsighand);  /* NOTE(review): presumably drops our reference to the shared table - confirm */
    1206             :         }
    1207             :         return 0;
    1208             : }
    1209             : /* Copy tsk->comm into buf (at most buf_size bytes) while holding task_lock(tsk); returns buf. */
    1210         281 : char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk)
    1211             : {
    1212         281 :         task_lock(tsk);
    1213         281 :         strncpy(buf, tsk->comm, buf_size);      /* NOTE(review): strncpy() leaves buf unterminated if buf_size <= strlen(tsk->comm) - confirm callers pass at least sizeof(tsk->comm) */
    1214         281 :         task_unlock(tsk);
    1215         281 :         return buf;
    1216             : }
    1217             : EXPORT_SYMBOL_GPL(__get_task_comm);
    1218             : 
    1219             : /*
    1220             :  * These functions flush out all traces of the currently running executable
    1221             :  * so that a new one can be started
    1222             :  */
    1223             : /* Rename tsk to buf (truncated to fit tsk->comm) under task_lock(tsk); exec is passed on to perf_event_comm(). */
    1224        1147 : void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
    1225             : {
    1226        1147 :         task_lock(tsk);
    1227        1147 :         trace_task_rename(tsk, buf);    /* traced before tsk->comm is overwritten */
    1228        1147 :         strlcpy(tsk->comm, buf, sizeof(tsk->comm));
    1229        1147 :         task_unlock(tsk);
    1230        1147 :         perf_event_comm(tsk, exec);     /* called after the task lock is released */
    1231        1147 : }
    1232             : 
    1233             : /*
    1234             :  * Calling this is the point of no return. None of the failures will be
    1235             :  * seen by userspace since either the process is already taking a fatal
    1236             :  * signal (via de_thread() or coredump), or will have SEGV raised
    1237             :  * (after exec_mmap()) by search_binary_handler (see below).
    1238             :  */
    1239        1021 : int begin_new_exec(struct linux_binprm * bprm)
    1240             : {       /* Returns 0 on success, otherwise the non-zero result of the step that failed. */
    1241        1021 :         struct task_struct *me = current;
    1242        1021 :         int retval;
    1243             : 
    1244             :         /* Once we are committed compute the creds */
    1245        1021 :         retval = bprm_creds_from_file(bprm);
    1246        1021 :         if (retval)
    1247             :                 return retval;  /* the only failure returned before point_of_no_return is set */
    1248             : 
    1249             :         /*
    1250             :          * Ensure all future errors are fatal.
    1251             :          */
    1252        1021 :         bprm->point_of_no_return = true;
    1253             : 
    1254             :         /*
    1255             :          * Make this the only thread in the thread group.
    1256             :          */
    1257        1021 :         retval = de_thread(me);
    1258        1021 :         if (retval)
    1259           0 :                 goto out;
    1260             : 
    1261             :         /*
    1262             :          * Cancel any io_uring activity across execve
    1263             :          */
    1264        1021 :         io_uring_task_cancel();
    1265             : 
    1266             :         /* Ensure the files table is not shared. */
    1267        1021 :         retval = unshare_files();
    1268        1021 :         if (retval)
    1269           0 :                 goto out;
    1270             : 
    1271             :         /*
    1272             :          * Must be called _before_ exec_mmap() as bprm->mm is
    1273             :          * not visible until then. This also enables the update
    1274             :          * to be lockless.
    1275             :          */
    1276        1021 :         set_mm_exe_file(bprm->mm, bprm->file);
    1277             : 
    1278             :         /* If the binary is not readable then enforce mm->dumpable=0 */
    1279        1021 :         would_dump(bprm, bprm->file);
    1280        1021 :         if (bprm->have_execfd)
    1281           0 :                 would_dump(bprm, bprm->executable);
    1282             : 
    1283             :         /*
    1284             :          * Release all of the old mmap stuff
    1285             :          */
    1286        1021 :         acct_arg_size(bprm, 0);
    1287        1021 :         retval = exec_mmap(bprm->mm);
    1288        1021 :         if (retval)
    1289           0 :                 goto out;
    1290             : 
    1291        1021 :         bprm->mm = NULL;        /* so free_bprm() will not mmput() the mm handed to exec_mmap() */
    1292             : 
    1293             : #ifdef CONFIG_POSIX_TIMERS
    1294        1021 :         exit_itimers(me->signal);
    1295        1021 :         flush_itimer_signals();
    1296             : #endif
    1297             : 
    1298             :         /*
    1299             :          * Make the signal table private.
    1300             :          */
    1301        1021 :         retval = unshare_sighand(me);
    1302        1021 :         if (retval)
    1303           0 :                 goto out_unlock;        /* from here on failures also release exec_update_lock */
    1304             : 
    1305             :         /*
    1306             :          * Ensure that the uaccess routines can actually operate on userspace
    1307             :          * pointers:
    1308             :          */
    1309        1021 :         force_uaccess_begin();
    1310             : 
    1311        1021 :         me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
    1312             :                                         PF_NOFREEZE | PF_NO_SETAFFINITY);
    1313        1021 :         flush_thread();
    1314        1021 :         me->personality &= ~bprm->per_clear;
    1315             : 
    1316        1021 :         clear_syscall_work_syscall_user_dispatch(me);
    1317             : 
    1318             :         /*
    1319             :          * We have to apply CLOEXEC before we change whether the process is
    1320             :          * dumpable (set_dumpable() below) to avoid a race with a process in userspace
    1321             :          * trying to access the should-be-closed file descriptors of a process
    1322             :          * undergoing exec(2).
    1323             :          */
    1324        1021 :         do_close_on_exec(me->files);
    1325             : 
    1326        1021 :         if (bprm->secureexec) {
    1327             :                 /* Make sure parent cannot signal privileged process. */
    1328           0 :                 me->pdeath_signal = 0;
    1329             : 
    1330             :                 /*
    1331             :                  * For secureexec, reset the stack limit to sane default to
    1332             :                  * avoid bad behavior from the prior rlimits. This has to
    1333             :                  * happen before arch_pick_mmap_layout(), which examines
    1334             :                  * RLIMIT_STACK, but after the point of no return to avoid
    1335             :                  * needing to clean up the change on failure.
    1336             :                  */
    1337           0 :                 if (bprm->rlim_stack.rlim_cur > _STK_LIM)
    1338           0 :                         bprm->rlim_stack.rlim_cur = _STK_LIM;
    1339             :         }
    1340             : 
    1341        1021 :         me->sas_ss_sp = me->sas_ss_size = 0;    /* clear both alternate-signal-stack fields */
    1342             : 
    1343             :         /*
    1344             :          * Figure out dumpability. Note that this checking only of current
    1345             :          * is wrong, but userspace depends on it. This should be testing
    1346             :          * bprm->secureexec instead.
    1347             :          */
    1348        2042 :         if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
    1349        2042 :             !(uid_eq(current_euid(), current_uid()) &&
    1350        1021 :               gid_eq(current_egid(), current_gid())))
    1351           0 :                 set_dumpable(current->mm, suid_dumpable);
    1352             :         else
    1353        1021 :                 set_dumpable(current->mm, SUID_DUMP_USER);
    1354             : 
    1355        1021 :         perf_event_exec();
    1356        1021 :         __set_task_comm(me, kbasename(bprm->filename), true);
    1357             : 
    1358             :         /* An exec changes our domain. We are no longer part of the thread
    1359             :            group */
    1360        1021 :         WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1);
    1361        1021 :         flush_signal_handlers(me, 0);
    1362             : 
    1363             :         /*
    1364             :          * install the new credentials for this executable
    1365             :          */
    1366        1021 :         security_bprm_committing_creds(bprm);
    1367             : 
    1368        1021 :         commit_creds(bprm->cred);
    1369        1021 :         bprm->cred = NULL;      /* so free_bprm() will neither abort the creds nor unlock cred_guard_mutex */
    1370             : 
    1371             :         /*
    1372             :          * Disable monitoring for regular users
    1373             :          * when executing setuid binaries. Must
    1374             :          * wait until new credentials are committed
    1375             :          * by commit_creds() above
    1376             :          */
    1377        1021 :         if (get_dumpable(me->mm) != SUID_DUMP_USER)
    1378           0 :                 perf_event_exit_task(me);
    1379             :         /*
    1380             :          * cred_guard_mutex must be held at least to this point to prevent
    1381             :          * ptrace_attach() from altering our determination of the task's
    1382             :          * credentials; any time after this it may be unlocked.
    1383             :          */
    1384        1021 :         security_bprm_committed_creds(bprm);
    1385             : 
    1386             :         /* Pass the opened binary to the interpreter. */
    1387        1021 :         if (bprm->have_execfd) {
    1388           0 :                 retval = get_unused_fd_flags(0);
    1389           0 :                 if (retval < 0)
    1390           0 :                         goto out_unlock;
    1391           0 :                 fd_install(retval, bprm->executable);
    1392           0 :                 bprm->executable = NULL;        /* the file now belongs to the fd table; free_bprm() must not fput() it */
    1393           0 :                 bprm->execfd = retval;
    1394             :         }
    1395             :         return 0;       /* NOTE(review): exec_update_lock appears to stay held on success; setup_new_exec() releases it */
    1396             : 
    1397           0 : out_unlock:
    1398           0 :         up_write(&me->signal->exec_update_lock);        /* NOTE(review): acquisition is not in this function - presumably inside exec_mmap(); confirm */
    1399             : out:
    1400             :         return retval;
    1401             : }
    1402             : EXPORT_SYMBOL(begin_new_exec);
    1403             : /* If the MAY_READ permission check on @file fails, flag the exec as non-dumpable and adjust bprm->mm->user_ns. */
    1404        2040 : void would_dump(struct linux_binprm *bprm, struct file *file)
    1405             : {
    1406        2040 :         struct inode *inode = file_inode(file);
    1407        2040 :         struct user_namespace *mnt_userns = file_mnt_user_ns(file);
    1408        2040 :         if (inode_permission(mnt_userns, inode, MAY_READ) < 0) {
    1409           0 :                 struct user_namespace *old, *user_ns;
    1410           0 :                 bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;     /* consulted when begin_new_exec() picks dumpability */
    1411             : 
    1412             :                 /* Ensure mm->user_ns contains the executable */
    1413           0 :                 user_ns = old = bprm->mm->user_ns;
    1414           0 :                 while ((user_ns != &init_user_ns) &&
    1415           0 :                        !privileged_wrt_inode_uidgid(user_ns, mnt_userns, inode))
    1416           0 :                         user_ns = user_ns->parent;      /* walk up; stops at init_user_ns at the latest */
    1417             : 
    1418           0 :                 if (old != user_ns) {
    1419           0 :                         bprm->mm->user_ns = get_user_ns(user_ns);       /* take the new reference first ... */
    1420           0 :                         put_user_ns(old);                               /* ... then drop the old one */
    1421             :                 }
    1422             :         }
    1423        2040 : }
    1424             : EXPORT_SYMBOL(would_dump);
    1425             : /* Pick the mmap layout and task size for the new image, then release exec_update_lock and cred_guard_mutex. */
    1426        1021 : void setup_new_exec(struct linux_binprm * bprm)
    1427             : {
    1428             :         /* Setup things that can depend upon the personality */
    1429        1021 :         struct task_struct *me = current;
    1430             : 
    1431        1021 :         arch_pick_mmap_layout(me->mm, &bprm->rlim_stack);       /* uses the stack limit staged in bprm */
    1432             : 
    1433        1021 :         arch_setup_new_exec();
    1434             : 
    1435             :         /* Set the new mm task size. We have to do that late because it may
    1436             :          * depend on TIF_32BIT which is only updated in flush_thread() on
    1437             :          * some architectures like powerpc
    1438             :          */
    1439        1021 :         me->mm->task_size = TASK_SIZE;
    1440        1021 :         up_write(&me->signal->exec_update_lock);
    1441        1021 :         mutex_unlock(&me->signal->cred_guard_mutex);    /* the mutex locked in prepare_bprm_creds() */
    1442        1021 : }
    1443             : EXPORT_SYMBOL(setup_new_exec);
    1444             : 
    1445             : /* Runs immediately before start_thread() takes over; publishes bprm->rlim_stack as the task's RLIMIT_STACK. */
    1446        1021 : void finalize_exec(struct linux_binprm *bprm)
    1447             : {
    1448             :         /* Store any stack rlimit changes before starting thread. */
    1449        1021 :         task_lock(current->group_leader);       /* the store below is done under the group leader's task lock */
    1450        1021 :         current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack;
    1451        1021 :         task_unlock(current->group_leader);
    1452        1021 : }
    1453             : EXPORT_SYMBOL(finalize_exec);
    1454             : 
    1455             : /*
    1456             :  * Prepare credentials and lock ->cred_guard_mutex.
    1457             :  * begin_new_exec() commits the new creds; setup_new_exec() drops the lock.
    1458             :  * Or, if exec fails before, free_bprm() should release ->cred
    1459             :  * and unlock.  Returns 0, -ERESTARTNOINTR or -ENOMEM.
    1460             :  */
    1461        2796 : static int prepare_bprm_creds(struct linux_binprm *bprm)
    1462             : {
    1463        2796 :         if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
    1464             :                 return -ERESTARTNOINTR; /* lock not acquired */
    1465             : 
    1466        2796 :         bprm->cred = prepare_exec_creds();
    1467        2796 :         if (likely(bprm->cred))
    1468             :                 return 0;       /* success: mutex stays held, bprm->cred is set */
    1469             : 
    1470           0 :         mutex_unlock(&current->signal->cred_guard_mutex);       /* no creds: do not leave the mutex held */
    1471           0 :         return -ENOMEM;
    1472             : }
    1473             : /* Release everything still attached to bprm (fields already handed over were set to NULL), then bprm itself. */
    1474        2796 : static void free_bprm(struct linux_binprm *bprm)
    1475             : {
    1476        2796 :         if (bprm->mm) {         /* begin_new_exec() clears ->mm after exec_mmap() succeeds */
    1477        1775 :                 acct_arg_size(bprm, 0);
    1478        1775 :                 mmput(bprm->mm);
    1479             :         }
    1480        2796 :         free_arg_pages(bprm);
    1481        2796 :         if (bprm->cred) {       /* creds never committed: unlock cred_guard_mutex and discard them */
    1482        1775 :                 mutex_unlock(&current->signal->cred_guard_mutex);
    1483        1775 :                 abort_creds(bprm->cred);
    1484             :         }
    1485        2796 :         if (bprm->file) {
    1486        1021 :                 allow_write_access(bprm->file);
    1487        1021 :                 fput(bprm->file);
    1488             :         }
    1489        2796 :         if (bprm->executable)
    1490           0 :                 fput(bprm->executable);
    1491             :         /* If a binfmt changed the interp, free it. */
    1492        2796 :         if (bprm->interp != bprm->filename)
    1493          42 :                 kfree(bprm->interp);
    1494        2796 :         kfree(bprm->fdpath);    /* NULL unless alloc_bprm() built a /dev/fd/... path */
    1495        2796 :         kfree(bprm);
    1496        2796 : }
    1497             : /* Allocate a zeroed bprm, choose its filename/interp and set up its mm; returns the bprm or an ERR_PTR(). */
    1498        2796 : static struct linux_binprm *alloc_bprm(int fd, struct filename *filename)
    1499             : {
    1500        2796 :         struct linux_binprm *bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
    1501        2796 :         int retval = -ENOMEM;   /* covers both allocation failures below */
    1502        2796 :         if (!bprm)
    1503           0 :                 goto out;
    1504             : 
    1505        2796 :         if (fd == AT_FDCWD || filename->name[0] == '/') {
    1506        2796 :                 bprm->filename = filename->name;        /* borrowed from @filename, not copied */
    1507             :         } else {
    1508           0 :                 if (filename->name[0] == '\0')          /* empty name: the fd itself is the target */
    1509           0 :                         bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd);
    1510             :                 else
    1511           0 :                         bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s",
    1512             :                                                   fd, filename->name);
    1513           0 :                 if (!bprm->fdpath)
    1514           0 :                         goto out_free;
    1515             : 
    1516           0 :                 bprm->filename = bprm->fdpath;
    1517             :         }
    1518        2796 :         bprm->interp = bprm->filename;  /* same pointer until bprm_change_interp() replaces it */
    1519             : 
    1520        2796 :         retval = bprm_mm_init(bprm);
    1521        2796 :         if (retval)
    1522           0 :                 goto out_free;
    1523             :         return bprm;
    1524             : 
    1525           0 : out_free:
    1526           0 :         free_bprm(bprm);        /* bprm was zeroed by kzalloc(), so only the fields set above are released */
    1527           0 : out:
    1528           0 :         return ERR_PTR(retval);
    1529             : }
    1530             : /* Replace bprm->interp with a kstrdup() copy of @interp; returns 0, or -ENOMEM (leaving bprm->interp NULL). */
    1531          42 : int bprm_change_interp(const char *interp, struct linux_binprm *bprm)
    1532             : {
    1533             :         /* If a binfmt changed the interp, free it first. */
    1534          42 :         if (bprm->interp != bprm->filename)     /* interp equal to filename is not a separate allocation */
    1535           0 :                 kfree(bprm->interp);
    1536          42 :         bprm->interp = kstrdup(interp, GFP_KERNEL);
    1537          42 :         if (!bprm->interp)
    1538           0 :                 return -ENOMEM;
    1539             :         return 0;
    1540             : }
    1541             : EXPORT_SYMBOL(bprm_change_interp);
    1542             : 
    1543             : /*
    1544             :  * determine how safe it is to execute the proposed program and record the
    1545             :  * result as LSM_UNSAFE_* bits in bprm->unsafe
    1546             :  * - the caller must hold ->cred_guard_mutex to protect against
    1547             :  *   PTRACE_ATTACH or seccomp thread-sync
    1548             :  */
    1549        2796 : static void check_unsafe_exec(struct linux_binprm *bprm)
    1550             : {
    1551        2796 :         struct task_struct *p = current, *t;
    1552        2796 :         unsigned n_fs;
    1553             : 
    1554        2796 :         if (p->ptrace)
    1555           0 :                 bprm->unsafe |= LSM_UNSAFE_PTRACE;
    1556             : 
    1557             :         /*
    1558             :          * This isn't strictly necessary, but it makes it harder for LSMs to
    1559             :          * mess up.
    1560             :          */
    1561        2796 :         if (task_no_new_privs(current))
    1562           6 :                 bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
    1563             : 
    1564        2796 :         t = p;
    1565        2796 :         n_fs = 1;       /* counts p itself */
    1566        2796 :         spin_lock(&p->fs->lock);
    1567        2796 :         rcu_read_lock();
    1568        2796 :         while_each_thread(p, t) {
    1569           0 :                 if (t->fs == p->fs)
    1570           0 :                         n_fs++; /* another visited thread using the same fs */
    1571             :         }
    1572        2796 :         rcu_read_unlock();
    1573             : 
    1574        2796 :         if (p->fs->users > n_fs)        /* fs has users beyond the threads counted above */
    1575           1 :                 bprm->unsafe |= LSM_UNSAFE_SHARE;
    1576             :         else
    1577        2795 :                 p->fs->in_exec = 1;     /* set while fs->lock is still held */
    1578        2796 :         spin_unlock(&p->fs->lock);
    1579        2796 : }
    1579             : /* Apply @file's setuid/setgid bits to bprm->cred (euid/egid) and bprm->per_clear, unless one of the checks below vetoes it. */
    1580        1021 : static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
    1581             : {
    1582             :         /* Handle suid and sgid on files */
    1583        1021 :         struct user_namespace *mnt_userns;
    1584        1021 :         struct inode *inode;
    1585        1021 :         unsigned int mode;
    1586        1021 :         kuid_t uid;
    1587        1021 :         kgid_t gid;
    1588             : 
    1589        1021 :         if (!mnt_may_suid(file->f_path.mnt))
    1590        1021 :                 return;
    1591             : 
    1592        1021 :         if (task_no_new_privs(current))
    1593             :                 return;
    1594             : 
    1595        1016 :         inode = file->f_path.dentry->d_inode;
    1596        1016 :         mode = READ_ONCE(inode->i_mode);        /* unlocked peek; re-read under the inode lock below */
    1597        1016 :         if (!(mode & (S_ISUID|S_ISGID)))
    1598             :                 return;
    1599             : 
    1600           6 :         mnt_userns = file_mnt_user_ns(file);
    1601             : 
    1602             :         /* Be careful if suid/sgid is set */
    1603           6 :         inode_lock(inode);
    1604             : 
    1605             :         /* reload atomically mode/uid/gid now that lock held */
    1606           6 :         mode = inode->i_mode;
    1607           6 :         uid = i_uid_into_mnt(mnt_userns, inode);
    1608           6 :         gid = i_gid_into_mnt(mnt_userns, inode);
    1609           6 :         inode_unlock(inode);
    1610             : 
    1611             :         /* We ignore suid/sgid if there are no mappings for them in the ns */
    1612           6 :         if (!kuid_has_mapping(bprm->cred->user_ns, uid) ||
    1613           6 :                  !kgid_has_mapping(bprm->cred->user_ns, gid))
    1614             :                 return;
    1615             : 
    1616           6 :         if (mode & S_ISUID) {
    1617           6 :                 bprm->per_clear |= PER_CLEAR_ON_SETID;
    1618           6 :                 bprm->cred->euid = uid;
    1619             :         }
    1620             : 
    1621           6 :         if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {      /* setgid is honoured only together with group-execute */
    1622           0 :                 bprm->per_clear |= PER_CLEAR_ON_SETID;
    1623           0 :                 bprm->cred->egid = gid;
    1624             :         }
    1625             : }
    1626             : 
    1627             : /*
    1628             :  * Compute bprm->cred based upon the final binary.
    1629             :  */
    1630        1021 : static int bprm_creds_from_file(struct linux_binprm *bprm)
    1631             : {
    1632             :         /* Compute creds based on which file? */
    1633        1021 :         struct file *file = bprm->execfd_creds ? bprm->executable : bprm->file;
    1634             : 
    1635        1021 :         bprm_fill_uid(bprm, file);      /* setuid/setgid handling first */
    1636        1021 :         return security_bprm_creds_from_file(bprm, file);       /* the security hook's result is returned as-is */
    1637             : }
    1638             : 
    1639             : /*
    1640             :  * Zero bprm->buf, then read up to BINPRM_BUF_SIZE bytes from the start of
    1641             :  * bprm->file into it.  Returns whatever kernel_read() returns.
    1642             :  *
    1643             :  * This may be called multiple times for binary chains (scripts for example).
    1644             :  */
    1645        1063 : static int prepare_binprm(struct linux_binprm *bprm)
    1646             : {
    1647        1063 :         loff_t pos = 0; /* always read from offset 0 */
    1648             : 
    1649        1063 :         memset(bprm->buf, 0, BINPRM_BUF_SIZE);  /* clear stale bytes so a short read leaves zeroes behind */
    1650        1063 :         return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos);
    1651             : }
    1652             : 
    1653             : /*
    1654             :  * Arguments are '\0' separated strings found at the location bprm->p
    1655             :  * points to; chop off the first by relocating bprm->p to right after
    1656             :  * the first '\0' encountered.  Returns 0, or -EFAULT if a page is missing.
    1657             :  */
    1658          42 : int remove_arg_zero(struct linux_binprm *bprm)
    1659             : {
    1660          42 :         int ret = 0;
    1661          42 :         unsigned long offset;
    1662          42 :         char *kaddr;
    1663          42 :         struct page *page;
    1664             : 
    1665          42 :         if (!bprm->argc)
    1666             :                 return 0;       /* no arguments: nothing to remove */
    1667             : 
    1668          42 :         do {
    1669          42 :                 offset = bprm->p & ~PAGE_MASK;  /* offset of bprm->p within its page */
    1670          42 :                 page = get_arg_page(bprm, bprm->p, 0);
    1671          42 :                 if (!page) {
    1672           0 :                         ret = -EFAULT;
    1673           0 :                         goto out;
    1674             :                 }
    1675          42 :                 kaddr = kmap_atomic(page);
    1676             : 
    1677        1219 :                 for (; offset < PAGE_SIZE && kaddr[offset];
    1678        1135 :                                 offset++, bprm->p++)
    1679        1135 :                         ;       /* advance to the '\0' or to the end of this page */
    1680             : 
    1681          42 :                 kunmap_atomic(kaddr);
    1682          42 :                 put_arg_page(page);
    1683          42 :         } while (offset == PAGE_SIZE);  /* no '\0' yet: continue on the next page */
    1684             : 
    1685          42 :         bprm->p++;      /* step over the '\0' itself */
    1686          42 :         bprm->argc--;
    1687          42 :         ret = 0;
    1688             : 
    1689             : out:
    1690             :         return ret;
    1691             : }
    1692             : EXPORT_SYMBOL(remove_arg_zero);
    1693             : 
    1694             : #define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))   /* tab, newline, or 0x20..0x7e */
    1695             : /*
    1696             :  * cycle the list of binary formats handler, until one recognizes the image;
    1697             :  * returns that handler's result, or an earlier error / -ENOENT if none ran
    1698        1063 : static int search_binary_handler(struct linux_binprm *bprm)
    1699             : {
    1700        1063 :         bool need_retry = IS_ENABLED(CONFIG_MODULES);
    1701        1063 :         struct linux_binfmt *fmt;
    1702        1063 :         int retval;
    1703             : 
    1704        1063 :         retval = prepare_binprm(bprm);
    1705        1063 :         if (retval < 0)
    1706             :                 return retval;
    1707             : 
    1708        1063 :         retval = security_bprm_check(bprm);
    1709        1063 :         if (retval)
    1710             :                 return retval;
    1711             : 
    1712        1063 :         retval = -ENOENT;       /* result if no handler is ever invoked */
    1713        1063 :  retry:
    1714        1063 :         read_lock(&binfmt_lock);
    1715        2084 :         list_for_each_entry(fmt, &formats, lh) {
    1716        2084 :                 if (!try_module_get(fmt->module))
    1717             :                         continue;
    1718        2084 :                 read_unlock(&binfmt_lock);      /* lock is dropped around load_binary() */
    1719             : 
    1720        2084 :                 retval = fmt->load_binary(bprm);
    1721             : 
    1722        2084 :                 read_lock(&binfmt_lock);
    1723        2084 :                 put_binfmt(fmt);
    1724        2084 :                 if (bprm->point_of_no_return || (retval != -ENOEXEC)) {        /* only a pre-commit -ENOEXEC moves on to the next format */
    1725        1063 :                         read_unlock(&binfmt_lock);
    1726        1063 :                         return retval;
    1727             :                 }
    1728             :         }
    1729           0 :         read_unlock(&binfmt_lock);
    1730             : 
    1731           0 :         if (need_retry) {
    1732             :                 if (printable(bprm->buf[0]) && printable(bprm->buf[1]) &&
    1733             :                     printable(bprm->buf[2]) && printable(bprm->buf[3]))
    1734             :                         return retval;  /* four printable leading bytes: no module is requested */
    1735             :                 if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0)     /* name built from the 16-bit value at buf+2 */
    1736             :                         return retval;
    1737             :                 need_retry = false;     /* the list is rescanned at most once */
    1738             :                 goto retry;
    1739             :         }
    1740             : 
    1741           0 :         return retval;
    1742             : }
    1743             : /* Run search_binary_handler(), following interpreter substitutions, then emit the exec audit/trace/ptrace/connector events. */
    1744        1021 : static int exec_binprm(struct linux_binprm *bprm)
    1745             : {
    1746        1021 :         pid_t old_pid, old_vpid;
    1747        1021 :         int ret, depth;
    1748             : 
    1749             :         /* Need to fetch pid before load_binary changes it */
    1750        1021 :         old_pid = current->pid;
    1751        1021 :         rcu_read_lock();
    1752        1021 :         old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));       /* pid as numbered in the parent's pid namespace */
    1753        1021 :         rcu_read_unlock();
    1754             : 
    1755             :         /* depth 0..5 is accepted: up to five binfmt rewrites can succeed, a sixth fails with -ELOOP. */
    1756        1063 :         for (depth = 0;; depth++) {
    1757        1063 :                 struct file *exec;
    1758        1063 :                 if (depth > 5)
    1759             :                         return -ELOOP;
    1760             : 
    1761        1063 :                 ret = search_binary_handler(bprm);
    1762        1063 :                 if (ret < 0)
    1763           0 :                         return ret;
    1764        1063 :                 if (!bprm->interpreter)
    1765             :                         break;  /* no interpreter requested: this was the final binary */
    1766             : 
    1767          42 :                 exec = bprm->file;
    1768          42 :                 bprm->file = bprm->interpreter; /* next pass examines the interpreter */
    1769          42 :                 bprm->interpreter = NULL;
    1770             : 
    1771          42 :                 allow_write_access(exec);
    1772          42 :                 if (unlikely(bprm->have_execfd)) {
    1773           0 :                         if (bprm->executable) { /* an executable is already kept for the execfd */
    1774           0 :                                 fput(exec);
    1775           0 :                                 return -ENOEXEC;
    1776             :                         }
    1777           0 :                         bprm->executable = exec;        /* kept so begin_new_exec() can install it as an fd */
    1778             :                 } else
    1779          42 :                         fput(exec);
    1780             :         }
    1781             : 
    1782        1021 :         audit_bprm(bprm);
    1783        1021 :         trace_sched_process_exec(current, old_pid, bprm);
    1784        1021 :         ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
    1785        1021 :         proc_exec_connector(current);
    1786        1021 :         return 0;
    1787             : }
    1788             : 
    1789             : /*
    1790             :  * sys_execve() executes a new program.
    1791             :  */
    1792        2796 : static int bprm_execve(struct linux_binprm *bprm,
    1793             :                        int fd, struct filename *filename, int flags)
    1794             : {
    1795        2796 :         struct file *file;
    1796        2796 :         int retval;
    1797             : 
    1798        2796 :         retval = prepare_bprm_creds(bprm);
    1799        2796 :         if (retval)
    1800             :                 return retval;
    1801             : 
    1802        2796 :         check_unsafe_exec(bprm);
    1803        2796 :         current->in_execve = 1;
    1804             : 
    1805        2796 :         file = do_open_execat(fd, filename, flags);
    1806        2796 :         retval = PTR_ERR(file);
    1807        2796 :         if (IS_ERR(file))
    1808        1775 :                 goto out_unmark;
    1809             : 
    1810        1021 :         sched_exec();
    1811             : 
    1812        1021 :         bprm->file = file;
    1813             :         /*
    1814             :          * Record that a name derived from an O_CLOEXEC fd will be
    1815             :          * inaccessible after exec.  This allows the code in exec to
    1816             :          * choose to fail when the executable is not mmaped into the
    1817             :          * interpreter and an open file descriptor is not passed to
    1818             :          * the interpreter.  This makes for a better user experience
    1819             :          * than having the interpreter start and then immediately fail
    1820             :          * when it finds the executable is inaccessible.
    1821             :          */
    1822        1021 :         if (bprm->fdpath && get_close_on_exec(fd))
    1823           0 :                 bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
    1824             : 
    1825             :         /* Set the unchanging part of bprm->cred */
    1826        1021 :         retval = security_bprm_creds_for_exec(bprm);
    1827        1021 :         if (retval)
    1828           0 :                 goto out;
    1829             : 
    1830        1021 :         retval = exec_binprm(bprm);
    1831        1021 :         if (retval < 0)
    1832           0 :                 goto out;
    1833             : 
    1834             :         /* execve succeeded */
    1835        1021 :         current->fs->in_exec = 0;
    1836        1021 :         current->in_execve = 0;
    1837        1021 :         rseq_execve(current);
    1838        1021 :         acct_update_integrals(current);
    1839        1021 :         task_numa_free(current, false);
    1840        1021 :         return retval;
    1841             : 
    1842           0 : out:
    1843             :         /*
    1844             :          * If past the point of no return ensure the code never
    1845             :          * returns to the userspace process.  Use an existing fatal
    1846             :          * signal if present otherwise terminate the process with
    1847             :          * SIGSEGV.
    1848             :          */
    1849           0 :         if (bprm->point_of_no_return && !fatal_signal_pending(current))
    1850           0 :                 force_sigsegv(SIGSEGV);
    1851             : 
    1852           0 : out_unmark:
    1853        1775 :         current->fs->in_exec = 0;
    1854        1775 :         current->in_execve = 0;
    1855             : 
    1856        1775 :         return retval;
    1857             : }
    1858             : 
    1859        2795 : static int do_execveat_common(int fd, struct filename *filename,
    1860             :                               struct user_arg_ptr argv,
    1861             :                               struct user_arg_ptr envp,
    1862             :                               int flags)
    1863             : {
    1864        2795 :         struct linux_binprm *bprm;
    1865        2795 :         int retval;
    1866             : 
    1867        2795 :         if (IS_ERR(filename))
    1868           0 :                 return PTR_ERR(filename);
    1869             : 
    1870             :         /*
    1871             :          * We move the actual failure in case of RLIMIT_NPROC excess from
    1872             :          * set*uid() to execve() because too many poorly written programs
    1873             :          * don't check setuid() return code.  Here we additionally recheck
    1874             :          * whether NPROC limit is still exceeded.
    1875             :          */
    1876        2795 :         if ((current->flags & PF_NPROC_EXCEEDED) &&
    1877           0 :             atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
    1878           0 :                 retval = -EAGAIN;
    1879           0 :                 goto out_ret;
    1880             :         }
    1881             : 
    1882             :         /* We're below the limit (still or again), so we don't want to make
    1883             :          * further execve() calls fail. */
    1884        2795 :         current->flags &= ~PF_NPROC_EXCEEDED;
    1885             : 
    1886        2795 :         bprm = alloc_bprm(fd, filename);
    1887        2795 :         if (IS_ERR(bprm)) {
    1888           0 :                 retval = PTR_ERR(bprm);
    1889           0 :                 goto out_ret;
    1890             :         }
    1891             : 
    1892        2795 :         retval = count(argv, MAX_ARG_STRINGS);
    1893        2795 :         if (retval < 0)
    1894           0 :                 goto out_free;
    1895        2795 :         bprm->argc = retval;
    1896             : 
    1897        2795 :         retval = count(envp, MAX_ARG_STRINGS);
    1898        2795 :         if (retval < 0)
    1899           0 :                 goto out_free;
    1900        2795 :         bprm->envc = retval;
    1901             : 
    1902        2795 :         retval = bprm_stack_limits(bprm);
    1903        2795 :         if (retval < 0)
    1904           0 :                 goto out_free;
    1905             : 
    1906        2795 :         retval = copy_string_kernel(bprm->filename, bprm);
    1907        2795 :         if (retval < 0)
    1908           0 :                 goto out_free;
    1909        2795 :         bprm->exec = bprm->p;
    1910             : 
    1911        2795 :         retval = copy_strings(bprm->envc, envp, bprm);
    1912        2795 :         if (retval < 0)
    1913           0 :                 goto out_free;
    1914             : 
    1915        2795 :         retval = copy_strings(bprm->argc, argv, bprm);
    1916        2795 :         if (retval < 0)
    1917           0 :                 goto out_free;
    1918             : 
    1919        2795 :         retval = bprm_execve(bprm, fd, filename, flags);
    1920        2795 : out_free:
    1921        2795 :         free_bprm(bprm);
    1922             : 
    1923        2795 : out_ret:
    1924        2795 :         putname(filename);
    1925        2795 :         return retval;
    1926             : }
    1927             : 
    1928           1 : int kernel_execve(const char *kernel_filename,
    1929             :                   const char *const *argv, const char *const *envp)
    1930             : {
    1931           1 :         struct filename *filename;
    1932           1 :         struct linux_binprm *bprm;
    1933           1 :         int fd = AT_FDCWD;
    1934           1 :         int retval;
    1935             : 
    1936           1 :         filename = getname_kernel(kernel_filename);
    1937           1 :         if (IS_ERR(filename))
    1938           0 :                 return PTR_ERR(filename);
    1939             : 
    1940           1 :         bprm = alloc_bprm(fd, filename);
    1941           1 :         if (IS_ERR(bprm)) {
    1942           0 :                 retval = PTR_ERR(bprm);
    1943           0 :                 goto out_ret;
    1944             :         }
    1945             : 
    1946           1 :         retval = count_strings_kernel(argv);
    1947           1 :         if (retval < 0)
    1948           0 :                 goto out_free;
    1949           1 :         bprm->argc = retval;
    1950             : 
    1951           1 :         retval = count_strings_kernel(envp);
    1952           1 :         if (retval < 0)
    1953           0 :                 goto out_free;
    1954           1 :         bprm->envc = retval;
    1955             : 
    1956           1 :         retval = bprm_stack_limits(bprm);
    1957           1 :         if (retval < 0)
    1958           0 :                 goto out_free;
    1959             : 
    1960           1 :         retval = copy_string_kernel(bprm->filename, bprm);
    1961           1 :         if (retval < 0)
    1962           0 :                 goto out_free;
    1963           1 :         bprm->exec = bprm->p;
    1964             : 
    1965           1 :         retval = copy_strings_kernel(bprm->envc, envp, bprm);
    1966           1 :         if (retval < 0)
    1967           0 :                 goto out_free;
    1968             : 
    1969           1 :         retval = copy_strings_kernel(bprm->argc, argv, bprm);
    1970           1 :         if (retval < 0)
    1971           0 :                 goto out_free;
    1972             : 
    1973           1 :         retval = bprm_execve(bprm, fd, filename, 0);
    1974           1 : out_free:
    1975           1 :         free_bprm(bprm);
    1976           1 : out_ret:
    1977           1 :         putname(filename);
    1978           1 :         return retval;
    1979             : }
    1980             : 
    1981        2795 : static int do_execve(struct filename *filename,
    1982             :         const char __user *const __user *__argv,
    1983             :         const char __user *const __user *__envp)
    1984             : {
    1985        2795 :         struct user_arg_ptr argv = { .ptr.native = __argv };
    1986        2795 :         struct user_arg_ptr envp = { .ptr.native = __envp };
    1987        2795 :         return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
    1988             : }
    1989             : 
    1990           0 : static int do_execveat(int fd, struct filename *filename,
    1991             :                 const char __user *const __user *__argv,
    1992             :                 const char __user *const __user *__envp,
    1993             :                 int flags)
    1994             : {
    1995           0 :         struct user_arg_ptr argv = { .ptr.native = __argv };
    1996           0 :         struct user_arg_ptr envp = { .ptr.native = __envp };
    1997             : 
    1998           0 :         return do_execveat_common(fd, filename, argv, envp, flags);
    1999             : }
    2000             : 
    2001             : #ifdef CONFIG_COMPAT
    2002           0 : static int compat_do_execve(struct filename *filename,
    2003             :         const compat_uptr_t __user *__argv,
    2004             :         const compat_uptr_t __user *__envp)
    2005             : {
    2006           0 :         struct user_arg_ptr argv = {
    2007             :                 .is_compat = true,
    2008             :                 .ptr.compat = __argv,
    2009             :         };
    2010           0 :         struct user_arg_ptr envp = {
    2011             :                 .is_compat = true,
    2012             :                 .ptr.compat = __envp,
    2013             :         };
    2014           0 :         return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
    2015             : }
    2016             : 
    2017           0 : static int compat_do_execveat(int fd, struct filename *filename,
    2018             :                               const compat_uptr_t __user *__argv,
    2019             :                               const compat_uptr_t __user *__envp,
    2020             :                               int flags)
    2021             : {
    2022           0 :         struct user_arg_ptr argv = {
    2023             :                 .is_compat = true,
    2024             :                 .ptr.compat = __argv,
    2025             :         };
    2026           0 :         struct user_arg_ptr envp = {
    2027             :                 .is_compat = true,
    2028             :                 .ptr.compat = __envp,
    2029             :         };
    2030           0 :         return do_execveat_common(fd, filename, argv, envp, flags);
    2031             : }
    2032             : #endif
    2033             : 
    2034        1021 : void set_binfmt(struct linux_binfmt *new)
    2035             : {
    2036        1021 :         struct mm_struct *mm = current->mm;
    2037             : 
    2038        1021 :         if (mm->binfmt)
    2039        1021 :                 module_put(mm->binfmt->module);
    2040             : 
    2041        1021 :         mm->binfmt = new;
    2042        1021 :         if (new)
    2043        1021 :                 __module_get(new->module);
    2044        1021 : }
    2045             : EXPORT_SYMBOL(set_binfmt);
    2046             : 
    2047             : /*
    2048             :  * set_dumpable stores three-value SUID_DUMP_* into mm->flags.
    2049             :  */
    2050        1067 : void set_dumpable(struct mm_struct *mm, int value)
    2051             : {
    2052        1067 :         if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
    2053             :                 return;
    2054             : 
    2055        1067 :         set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value);
    2056             : }
    2057             : 
    2058        5590 : SYSCALL_DEFINE3(execve,
    2059             :                 const char __user *, filename,
    2060             :                 const char __user *const __user *, argv,
    2061             :                 const char __user *const __user *, envp)
    2062             : {
    2063        2795 :         return do_execve(getname(filename), argv, envp);
    2064             : }
    2065             : 
    2066           0 : SYSCALL_DEFINE5(execveat,
    2067             :                 int, fd, const char __user *, filename,
    2068             :                 const char __user *const __user *, argv,
    2069             :                 const char __user *const __user *, envp,
    2070             :                 int, flags)
    2071             : {
    2072           0 :         int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
    2073             : 
    2074           0 :         return do_execveat(fd,
    2075             :                            getname_flags(filename, lookup_flags, NULL),
    2076             :                            argv, envp, flags);
    2077             : }
    2078             : 
    2079             : #ifdef CONFIG_COMPAT
    2080           0 : COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
    2081             :         const compat_uptr_t __user *, argv,
    2082             :         const compat_uptr_t __user *, envp)
    2083             : {
    2084           0 :         return compat_do_execve(getname(filename), argv, envp);
    2085             : }
    2086             : 
    2087           0 : COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
    2088             :                        const char __user *, filename,
    2089             :                        const compat_uptr_t __user *, argv,
    2090             :                        const compat_uptr_t __user *, envp,
    2091             :                        int,  flags)
    2092             : {
    2093           0 :         int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
    2094             : 
    2095           0 :         return compat_do_execveat(fd,
    2096             :                                   getname_flags(filename, lookup_flags, NULL),
    2097             :                                   argv, envp, flags);
    2098             : }
    2099             : #endif

Generated by: LCOV version 1.14