LCOV - code coverage report
Current view: top level - mm - util.c (source / functions)
Test:         landlock.info
Date:         2021-04-22 12:43:58

                     Hit    Total    Coverage
Lines:               138      325      42.5 %
Functions:            20       38      52.6 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : #include <linux/mm.h>
       3             : #include <linux/slab.h>
       4             : #include <linux/string.h>
       5             : #include <linux/compiler.h>
       6             : #include <linux/export.h>
       7             : #include <linux/err.h>
       8             : #include <linux/sched.h>
       9             : #include <linux/sched/mm.h>
      10             : #include <linux/sched/signal.h>
      11             : #include <linux/sched/task_stack.h>
      12             : #include <linux/security.h>
      13             : #include <linux/swap.h>
      14             : #include <linux/swapops.h>
      15             : #include <linux/mman.h>
      16             : #include <linux/hugetlb.h>
      17             : #include <linux/vmalloc.h>
      18             : #include <linux/userfaultfd_k.h>
      19             : #include <linux/elf.h>
      20             : #include <linux/elf-randomize.h>
      21             : #include <linux/personality.h>
      22             : #include <linux/random.h>
      23             : #include <linux/processor.h>
      24             : #include <linux/sizes.h>
      25             : #include <linux/compat.h>
      26             : 
      27             : #include <linux/uaccess.h>
      28             : 
      29             : #include "internal.h"
      30             : 
      31             : /**
      32             :  * kfree_const - conditionally free memory
      33             :  * @x: pointer to the memory
      34             :  *
       35             :  * This function calls kfree() only if @x is not in the .rodata section.
      36             :  */
      37        2556 : void kfree_const(const void *x)
      38             : {
      39        5112 :         if (!is_kernel_rodata((unsigned long)x))
      40        1996 :                 kfree(x);
      41        2556 : }
      42             : EXPORT_SYMBOL(kfree_const);
      43             : 
      44             : /**
      45             :  * kstrdup - allocate space for and copy an existing string
      46             :  * @s: the string to duplicate
      47             :  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
      48             :  *
      49             :  * Return: newly allocated copy of @s or %NULL in case of error
      50             :  */
      51        5779 : char *kstrdup(const char *s, gfp_t gfp)
      52             : {
      53        5779 :         size_t len;
      54        5779 :         char *buf;
      55             : 
      56        5779 :         if (!s)
      57             :                 return NULL;
      58             : 
      59        5777 :         len = strlen(s) + 1;
      60        5777 :         buf = kmalloc_track_caller(len, gfp);
      61        5777 :         if (buf)
      62        5777 :                 memcpy(buf, s, len);
      63             :         return buf;
      64             : }
      65             : EXPORT_SYMBOL(kstrdup);
      66             : 
      67             : /**
      68             :  * kstrdup_const - conditionally duplicate an existing const string
      69             :  * @s: the string to duplicate
      70             :  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
      71             :  *
      72             :  * Note: Strings allocated by kstrdup_const should be freed by kfree_const and
      73             :  * must not be passed to krealloc().
      74             :  *
       75             :  * Return: source string if it is in the .rodata section, otherwise
       76             :  * fall back to kstrdup().
      77             :  */
      78       10272 : const char *kstrdup_const(const char *s, gfp_t gfp)
      79             : {
      80       20544 :         if (is_kernel_rodata((unsigned long)s))
      81             :                 return s;
      82             : 
      83        2605 :         return kstrdup(s, gfp);
      84             : }
      85             : EXPORT_SYMBOL(kstrdup_const);
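
A minimal usage sketch of the kstrdup_const()/kfree_const() pairing documented above; the demo_* identifiers are illustrative and not part of util.c:

        #include <linux/slab.h>
        #include <linux/string.h>

        struct demo_attr {
                const char *name;       /* may point into .rodata or at a kmalloc'd copy */
        };

        static int demo_attr_set_name(struct demo_attr *attr, const char *name)
        {
                const char *copy = kstrdup_const(name, GFP_KERNEL);

                if (!copy)
                        return -ENOMEM;
                /* The old value may be shared .rodata or a copy; kfree_const()
                 * handles both. Per the note above, such strings must never be
                 * passed to krealloc(). */
                kfree_const(attr->name);
                attr->name = copy;
                return 0;
        }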
      86             : 
      87             : /**
      88             :  * kstrndup - allocate space for and copy an existing string
      89             :  * @s: the string to duplicate
      90             :  * @max: read at most @max chars from @s
      91             :  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
      92             :  *
      93             :  * Note: Use kmemdup_nul() instead if the size is known exactly.
      94             :  *
      95             :  * Return: newly allocated copy of @s or %NULL in case of error
      96             :  */
      97           0 : char *kstrndup(const char *s, size_t max, gfp_t gfp)
      98             : {
      99           0 :         size_t len;
     100           0 :         char *buf;
     101             : 
     102           0 :         if (!s)
     103             :                 return NULL;
     104             : 
     105           0 :         len = strnlen(s, max);
     106           0 :         buf = kmalloc_track_caller(len+1, gfp);
     107           0 :         if (buf) {
     108           0 :                 memcpy(buf, s, len);
     109           0 :                 buf[len] = '\0';
     110             :         }
     111             :         return buf;
     112             : }
     113             : EXPORT_SYMBOL(kstrndup);
     114             : 
     115             : /**
     116             :  * kmemdup - duplicate region of memory
     117             :  *
     118             :  * @src: memory region to duplicate
     119             :  * @len: memory region length
     120             :  * @gfp: GFP mask to use
     121             :  *
     122             :  * Return: newly allocated copy of @src or %NULL in case of error
     123             :  */
     124         479 : void *kmemdup(const void *src, size_t len, gfp_t gfp)
     125             : {
     126         479 :         void *p;
     127             : 
     128         479 :         p = kmalloc_track_caller(len, gfp);
     129         479 :         if (p)
     130         479 :                 memcpy(p, src, len);
     131         479 :         return p;
     132             : }
     133             : EXPORT_SYMBOL(kmemdup);
     134             : 
     135             : /**
     136             :  * kmemdup_nul - Create a NUL-terminated string from unterminated data
     137             :  * @s: The data to stringify
     138             :  * @len: The size of the data
     139             :  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
     140             :  *
     141             :  * Return: newly allocated copy of @s with NUL-termination or %NULL in
     142             :  * case of error
     143             :  */
     144         337 : char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
     145             : {
     146         337 :         char *buf;
     147             : 
     148         337 :         if (!s)
     149             :                 return NULL;
     150             : 
     151         337 :         buf = kmalloc_track_caller(len + 1, gfp);
     152         337 :         if (buf) {
     153         337 :                 memcpy(buf, s, len);
     154         337 :                 buf[len] = '\0';
     155             :         }
     156             :         return buf;
     157             : }
     158             : EXPORT_SYMBOL(kmemdup_nul);
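
The note in kstrndup()'s kerneldoc above points at the distinction sketched here: when the caller already knows the exact length, kmemdup_nul() avoids the strnlen() pass that kstrndup() performs. The demo_* helpers are hypothetical:

        #include <linux/slab.h>
        #include <linux/string.h>

        /* 'data' is unterminated and known to be exactly 'data_len' bytes. */
        static char *demo_copy_token(const char *data, size_t data_len)
        {
                return kmemdup_nul(data, data_len, GFP_KERNEL);
        }

        /* 'text' may or may not contain a NUL within the first 'max' bytes. */
        static char *demo_copy_bounded(const char *text, size_t max)
        {
                return kstrndup(text, max, GFP_KERNEL);
        }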
     159             : 
     160             : /**
     161             :  * memdup_user - duplicate memory region from user space
     162             :  *
     163             :  * @src: source address in user space
     164             :  * @len: number of bytes to copy
     165             :  *
     166             :  * Return: an ERR_PTR() on failure.  Result is physically
     167             :  * contiguous, to be freed by kfree().
     168             :  */
     169         322 : void *memdup_user(const void __user *src, size_t len)
     170             : {
     171         322 :         void *p;
     172             : 
     173         322 :         p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);
     174         322 :         if (!p)
     175         322 :                 return ERR_PTR(-ENOMEM);
     176             : 
     177         644 :         if (copy_from_user(p, src, len)) {
     178           0 :                 kfree(p);
     179           0 :                 return ERR_PTR(-EFAULT);
     180             :         }
     181             : 
     182             :         return p;
     183             : }
     184             : EXPORT_SYMBOL(memdup_user);
     185             : 
     186             : /**
     187             :  * vmemdup_user - duplicate memory region from user space
     188             :  *
     189             :  * @src: source address in user space
     190             :  * @len: number of bytes to copy
     191             :  *
      192             :  * Return: an ERR_PTR() on failure.  Result may not be
      193             :  * physically contiguous.  Use kvfree() to free.
     194             :  */
     195           0 : void *vmemdup_user(const void __user *src, size_t len)
     196             : {
     197           0 :         void *p;
     198             : 
     199           0 :         p = kvmalloc(len, GFP_USER);
     200           0 :         if (!p)
     201           0 :                 return ERR_PTR(-ENOMEM);
     202             : 
     203           0 :         if (copy_from_user(p, src, len)) {
     204           0 :                 kvfree(p);
     205           0 :                 return ERR_PTR(-EFAULT);
     206             :         }
     207             : 
     208             :         return p;
     209             : }
     210             : EXPORT_SYMBOL(vmemdup_user);
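
An illustrative contrast between the two user-copy helpers above; as their kerneldoc states, the essential difference is the backing allocator and therefore which free function must be paired with the result. The demo_* handlers are hypothetical:

        #include <linux/err.h>
        #include <linux/mm.h>
        #include <linux/slab.h>

        static int demo_copy_small(const void __user *uarg, size_t len)
        {
                void *buf = memdup_user(uarg, len);     /* kmalloc-backed */

                if (IS_ERR(buf))
                        return PTR_ERR(buf);            /* -ENOMEM or -EFAULT */
                /* ... use buf ... */
                kfree(buf);                             /* physically contiguous: kfree() */
                return 0;
        }

        static int demo_copy_large(const void __user *uarg, size_t len)
        {
                void *buf = vmemdup_user(uarg, len);    /* may come from vmalloc */

                if (IS_ERR(buf))
                        return PTR_ERR(buf);
                /* ... use buf ... */
                kvfree(buf);                            /* handles kmalloc or vmalloc memory */
                return 0;
        }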
     211             : 
     212             : /**
     213             :  * strndup_user - duplicate an existing string from user space
     214             :  * @s: The string to duplicate
     215             :  * @n: Maximum number of bytes to copy, including the trailing NUL.
     216             :  *
     217             :  * Return: newly allocated copy of @s or an ERR_PTR() in case of error
     218             :  */
     219         321 : char *strndup_user(const char __user *s, long n)
     220             : {
     221         321 :         char *p;
     222         321 :         long length;
     223             : 
     224         321 :         length = strnlen_user(s, n);
     225             : 
     226         321 :         if (!length)
     227         321 :                 return ERR_PTR(-EFAULT);
     228             : 
     229         321 :         if (length > n)
     230         321 :                 return ERR_PTR(-EINVAL);
     231             : 
     232         321 :         p = memdup_user(s, length);
     233             : 
     234         321 :         if (IS_ERR(p))
     235             :                 return p;
     236             : 
     237         321 :         p[length - 1] = '\0';
     238             : 
     239         321 :         return p;
     240             : }
     241             : EXPORT_SYMBOL(strndup_user);
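
Unlike kstrdup(), strndup_user() reports failure through ERR_PTR() rather than NULL, so callers must check with IS_ERR(); a minimal sketch (the demo_* function and the PAGE_SIZE cap are illustrative choices):

        #include <linux/err.h>
        #include <linux/printk.h>
        #include <linux/slab.h>
        #include <linux/string.h>

        static int demo_set_label(const char __user *ulabel)
        {
                char *label = strndup_user(ulabel, PAGE_SIZE);

                if (IS_ERR(label))
                        return PTR_ERR(label);  /* -EFAULT, -EINVAL or -ENOMEM */
                pr_info("demo: label set to \"%s\"\n", label);
                kfree(label);
                return 0;
        }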
     242             : 
     243             : /**
     244             :  * memdup_user_nul - duplicate memory region from user space and NUL-terminate
     245             :  *
     246             :  * @src: source address in user space
     247             :  * @len: number of bytes to copy
     248             :  *
     249             :  * Return: an ERR_PTR() on failure.
     250             :  */
     251           0 : void *memdup_user_nul(const void __user *src, size_t len)
     252             : {
     253           0 :         char *p;
     254             : 
     255             :         /*
     256             :          * Always use GFP_KERNEL, since copy_from_user() can sleep and
      257             :          * cause a page fault, which makes it pointless to use GFP_NOFS
     258             :          * or GFP_ATOMIC.
     259             :          */
     260           0 :         p = kmalloc_track_caller(len + 1, GFP_KERNEL);
     261           0 :         if (!p)
     262           0 :                 return ERR_PTR(-ENOMEM);
     263             : 
     264           0 :         if (copy_from_user(p, src, len)) {
     265           0 :                 kfree(p);
     266           0 :                 return ERR_PTR(-EFAULT);
     267             :         }
     268           0 :         p[len] = '\0';
     269             : 
     270           0 :         return p;
     271             : }
     272             : EXPORT_SYMBOL(memdup_user_nul);
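
A common shape for memdup_user_nul() is a write handler that receives a length-counted, unterminated buffer from user space and wants to parse it as a string; a sketch under that assumption (demo_write() is hypothetical):

        #include <linux/err.h>
        #include <linux/fs.h>
        #include <linux/kernel.h>
        #include <linux/slab.h>
        #include <linux/string.h>

        static ssize_t demo_write(struct file *file, const char __user *ubuf,
                                  size_t count, loff_t *ppos)
        {
                char *kbuf = memdup_user_nul(ubuf, count);
                int val, ret;

                if (IS_ERR(kbuf))
                        return PTR_ERR(kbuf);
                ret = kstrtoint(strstrip(kbuf), 0, &val);
                kfree(kbuf);
                if (ret)
                        return ret;
                /* ... apply 'val' ... */
                return count;
        }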
     273             : 
     274       56376 : void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
     275             :                 struct vm_area_struct *prev)
     276             : {
     277       56376 :         struct vm_area_struct *next;
     278             : 
     279       56376 :         vma->vm_prev = prev;
     280       56376 :         if (prev) {
     281       53092 :                 next = prev->vm_next;
     282       53092 :                 prev->vm_next = vma;
     283             :         } else {
     284        3284 :                 next = mm->mmap;
     285        3284 :                 mm->mmap = vma;
     286             :         }
     287       56376 :         vma->vm_next = next;
     288       56376 :         if (next)
     289       53092 :                 next->vm_prev = vma;
     290       56376 : }
     291             : 
     292           0 : void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma)
     293             : {
     294           0 :         struct vm_area_struct *prev, *next;
     295             : 
     296           0 :         next = vma->vm_next;
     297           0 :         prev = vma->vm_prev;
     298           0 :         if (prev)
     299           0 :                 prev->vm_next = next;
     300             :         else
     301           0 :                 mm->mmap = next;
     302           0 :         if (next)
     303           0 :                 next->vm_prev = prev;
     304           0 : }
     305             : 
     306             : /* Check if the vma is being used as a stack by this task */
     307           0 : int vma_is_stack_for_current(struct vm_area_struct *vma)
     308             : {
     309           0 :         struct task_struct * __maybe_unused t = current;
     310             : 
     311           0 :         return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
     312             : }
     313             : 
     314             : /*
     315             :  * Change backing file, only valid to use during initial VMA setup.
     316             :  */
     317           0 : void vma_set_file(struct vm_area_struct *vma, struct file *file)
     318             : {
     319             :         /* Changing an anonymous vma with this is illegal */
     320           0 :         get_file(file);
     321           0 :         swap(vma->vm_file, file);
     322           0 :         fput(file);
     323           0 : }
     324             : EXPORT_SYMBOL(vma_set_file);
     325             : 
     326             : #ifndef STACK_RND_MASK
     327             : #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))     /* 8MB of VA */
     328             : #endif
     329             : 
     330         902 : unsigned long randomize_stack_top(unsigned long stack_top)
     331             : {
     332         902 :         unsigned long random_variable = 0;
     333             : 
     334         902 :         if (current->flags & PF_RANDOMIZE) {
     335         902 :                 random_variable = get_random_long();
     336         902 :                 random_variable &= STACK_RND_MASK;
     337         902 :                 random_variable <<= PAGE_SHIFT;
     338             :         }
     339             : #ifdef CONFIG_STACK_GROWSUP
     340             :         return PAGE_ALIGN(stack_top) + random_variable;
     341             : #else
     342         902 :         return PAGE_ALIGN(stack_top) - random_variable;
     343             : #endif
     344             : }
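
A quick worked check of the default mask above, assuming 4 KiB pages (PAGE_SHIFT == 12): STACK_RND_MASK is 0x7ff, i.e. 2047 pages, so random_variable is at most 2047 << 12 = 8,384,512 bytes, just under 8 MiB of variation applied below (or, with CONFIG_STACK_GROWSUP, above) the page-aligned stack top, which is what the "8MB of VA" comment refers to. Architectures that define their own STACK_RND_MASK get a correspondingly different range.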
     345             : 
     346             : #ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
     347             : unsigned long arch_randomize_brk(struct mm_struct *mm)
     348             : {
      349             :         /* Is the current task 32-bit? */
     350             :         if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
     351             :                 return randomize_page(mm->brk, SZ_32M);
     352             : 
     353             :         return randomize_page(mm->brk, SZ_1G);
     354             : }
     355             : 
     356             : unsigned long arch_mmap_rnd(void)
     357             : {
     358             :         unsigned long rnd;
     359             : 
     360             : #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
     361             :         if (is_compat_task())
     362             :                 rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
     363             :         else
     364             : #endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
     365             :                 rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
     366             : 
     367             :         return rnd << PAGE_SHIFT;
     368             : }
     369             : 
     370             : static int mmap_is_legacy(struct rlimit *rlim_stack)
     371             : {
     372             :         if (current->personality & ADDR_COMPAT_LAYOUT)
     373             :                 return 1;
     374             : 
     375             :         if (rlim_stack->rlim_cur == RLIM_INFINITY)
     376             :                 return 1;
     377             : 
     378             :         return sysctl_legacy_va_layout;
     379             : }
     380             : 
     381             : /*
     382             :  * Leave enough space between the mmap area and the stack to honour ulimit in
     383             :  * the face of randomisation.
     384             :  */
     385             : #define MIN_GAP         (SZ_128M)
     386             : #define MAX_GAP         (STACK_TOP / 6 * 5)
     387             : 
     388             : static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
     389             : {
     390             :         unsigned long gap = rlim_stack->rlim_cur;
     391             :         unsigned long pad = stack_guard_gap;
     392             : 
     393             :         /* Account for stack randomization if necessary */
     394             :         if (current->flags & PF_RANDOMIZE)
     395             :                 pad += (STACK_RND_MASK << PAGE_SHIFT);
     396             : 
     397             :         /* Values close to RLIM_INFINITY can overflow. */
     398             :         if (gap + pad > gap)
     399             :                 gap += pad;
     400             : 
     401             :         if (gap < MIN_GAP)
     402             :                 gap = MIN_GAP;
     403             :         else if (gap > MAX_GAP)
     404             :                 gap = MAX_GAP;
     405             : 
     406             :         return PAGE_ALIGN(STACK_TOP - gap - rnd);
     407             : }
     408             : 
     409             : void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
     410             : {
     411             :         unsigned long random_factor = 0UL;
     412             : 
     413             :         if (current->flags & PF_RANDOMIZE)
     414             :                 random_factor = arch_mmap_rnd();
     415             : 
     416             :         if (mmap_is_legacy(rlim_stack)) {
     417             :                 mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
     418             :                 mm->get_unmapped_area = arch_get_unmapped_area;
     419             :         } else {
     420             :                 mm->mmap_base = mmap_base(random_factor, rlim_stack);
     421             :                 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
     422             :         }
     423             : }
     424             : #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
     425             : void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
     426             : {
     427             :         mm->mmap_base = TASK_UNMAPPED_BASE;
     428             :         mm->get_unmapped_area = arch_get_unmapped_area;
     429             : }
     430             : #endif
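
To make mmap_base() concrete with assumed numbers: with a stack rlimit of 8 MiB, the default stack_guard_gap of 256 pages (1 MiB with 4 KiB pages) and PF_RANDOMIZE set so that pad also includes STACK_RND_MASK << PAGE_SHIFT (about 8 MiB with the default mask), gap works out to roughly 17 MiB. That is below MIN_GAP, so it is raised to 128 MiB and the top-down base becomes PAGE_ALIGN(STACK_TOP - SZ_128M - rnd). Only a very large stack rlimit brings the MAX_GAP clamp of 5/6 of STACK_TOP into play.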
     431             : 
     432             : /**
     433             :  * __account_locked_vm - account locked pages to an mm's locked_vm
     434             :  * @mm:          mm to account against
     435             :  * @pages:       number of pages to account
     436             :  * @inc:         %true if @pages should be considered positive, %false if not
     437             :  * @task:        task used to check RLIMIT_MEMLOCK
     438             :  * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
     439             :  *
     440             :  * Assumes @task and @mm are valid (i.e. at least one reference on each), and
     441             :  * that mmap_lock is held as writer.
     442             :  *
     443             :  * Return:
     444             :  * * 0       on success
     445             :  * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
     446             :  */
     447           0 : int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
     448             :                         struct task_struct *task, bool bypass_rlim)
     449             : {
     450           0 :         unsigned long locked_vm, limit;
     451           0 :         int ret = 0;
     452             : 
     453           0 :         mmap_assert_write_locked(mm);
     454             : 
     455           0 :         locked_vm = mm->locked_vm;
     456           0 :         if (inc) {
     457           0 :                 if (!bypass_rlim) {
     458           0 :                         limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
     459           0 :                         if (locked_vm + pages > limit)
     460             :                                 ret = -ENOMEM;
     461             :                 }
     462             :                 if (!ret)
     463           0 :                         mm->locked_vm = locked_vm + pages;
     464             :         } else {
     465           0 :                 WARN_ON_ONCE(pages > locked_vm);
     466           0 :                 mm->locked_vm = locked_vm - pages;
     467             :         }
     468             : 
     469           0 :         pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
     470             :                  (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
     471             :                  locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
     472             :                  ret ? " - exceeded" : "");
     473             : 
     474           0 :         return ret;
     475             : }
     476             : EXPORT_SYMBOL_GPL(__account_locked_vm);
     477             : 
     478             : /**
     479             :  * account_locked_vm - account locked pages to an mm's locked_vm
     480             :  * @mm:          mm to account against, may be NULL
     481             :  * @pages:       number of pages to account
     482             :  * @inc:         %true if @pages should be considered positive, %false if not
     483             :  *
     484             :  * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
     485             :  *
     486             :  * Return:
     487             :  * * 0       on success, or if mm is NULL
     488             :  * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
     489             :  */
     490           0 : int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
     491             : {
     492           0 :         int ret;
     493             : 
     494           0 :         if (pages == 0 || !mm)
     495             :                 return 0;
     496             : 
     497           0 :         mmap_write_lock(mm);
     498           0 :         ret = __account_locked_vm(mm, pages, inc, current,
     499           0 :                                   capable(CAP_IPC_LOCK));
     500           0 :         mmap_write_unlock(mm);
     501             : 
     502           0 :         return ret;
     503             : }
     504             : EXPORT_SYMBOL_GPL(account_locked_vm);
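
A minimal sketch of the increment/decrement pairing the kerneldoc above implies, for a driver that pins user pages; the demo_* structure is hypothetical:

        #include <linux/mm.h>

        struct demo_pinned_buf {
                struct mm_struct *mm;           /* mm the pages were pinned from */
                unsigned long npages;
        };

        static int demo_charge_pinned(struct demo_pinned_buf *buf)
        {
                /* Adds to buf->mm->locked_vm; RLIMIT_MEMLOCK and CAP_IPC_LOCK
                 * are checked against the calling task. */
                return account_locked_vm(buf->mm, buf->npages, true);
        }

        static void demo_uncharge_pinned(struct demo_pinned_buf *buf)
        {
                /* Must mirror the earlier charge exactly. */
                account_locked_vm(buf->mm, buf->npages, false);
        }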
     505             : 
     506       28170 : unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
     507             :         unsigned long len, unsigned long prot,
     508             :         unsigned long flag, unsigned long pgoff)
     509             : {
     510       28170 :         unsigned long ret;
     511       28170 :         struct mm_struct *mm = current->mm;
     512       28170 :         unsigned long populate;
     513       28170 :         LIST_HEAD(uf);
     514             : 
     515       28170 :         ret = security_mmap_file(file, prot, flag);
     516       28170 :         if (!ret) {
     517       28170 :                 if (mmap_write_lock_killable(mm))
     518             :                         return -EINTR;
     519       28170 :                 ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate,
     520             :                               &uf);
     521       28170 :                 mmap_write_unlock(mm);
     522       28170 :                 userfaultfd_unmap_complete(mm, &uf);
     523       28170 :                 if (populate)
     524          10 :                         mm_populate(ret, populate);
     525             :         }
     526             :         return ret;
     527             : }
     528             : 
     529        7206 : unsigned long vm_mmap(struct file *file, unsigned long addr,
     530             :         unsigned long len, unsigned long prot,
     531             :         unsigned long flag, unsigned long offset)
     532             : {
     533        7206 :         if (unlikely(offset + PAGE_ALIGN(len) < offset))
     534             :                 return -EINVAL;
     535        7206 :         if (unlikely(offset_in_page(offset)))
     536             :                 return -EINVAL;
     537             : 
     538        7206 :         return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
     539             : }
     540             : EXPORT_SYMBOL(vm_mmap);
     541             : 
     542             : /**
     543             :  * kvmalloc_node - attempt to allocate physically contiguous memory, but upon
     544             :  * failure, fall back to non-contiguous (vmalloc) allocation.
     545             :  * @size: size of the request.
     546             :  * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
     547             :  * @node: numa node to allocate from
     548             :  *
     549             :  * Uses kmalloc to get the memory but if the allocation fails then falls back
     550             :  * to the vmalloc allocator. Use kvfree for freeing the memory.
     551             :  *
     552             :  * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported.
     553             :  * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
     554             :  * preferable to the vmalloc fallback, due to visible performance drawbacks.
     555             :  *
      556             :  * Please note that with any gfp flags outside of GFP_KERNEL the allocation is
      557             :  * careful not to fall back to vmalloc.
      558             :  *
      559             :  * Return: pointer to the allocated memory or %NULL in case of failure
     560             :  */
     561        2427 : void *kvmalloc_node(size_t size, gfp_t flags, int node)
     562             : {
     563        2427 :         gfp_t kmalloc_flags = flags;
     564        2427 :         void *ret;
     565             : 
     566             :         /*
     567             :          * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables)
     568             :          * so the given set of flags has to be compatible.
     569             :          */
     570        2427 :         if ((flags & GFP_KERNEL) != GFP_KERNEL)
     571           0 :                 return kmalloc_node(size, flags, node);
     572             : 
     573             :         /*
     574             :          * We want to attempt a large physically contiguous block first because
      575             :          * it is less likely to fragment multiple larger blocks and therefore
      576             :          * contributes less to long-term fragmentation than the vmalloc fallback.
      577             :          * However, make sure that larger requests are not too disruptive - no
     578             :          * OOM killer and no allocation failure warnings as we have a fallback.
     579             :          */
     580        2427 :         if (size > PAGE_SIZE) {
     581          60 :                 kmalloc_flags |= __GFP_NOWARN;
     582             : 
     583          60 :                 if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL))
     584          60 :                         kmalloc_flags |= __GFP_NORETRY;
     585             :         }
     586             : 
     587        2427 :         ret = kmalloc_node(size, kmalloc_flags, node);
     588             : 
     589             :         /*
      590             :          * It doesn't really make sense to fall back to vmalloc for sub-page
      591             :          * requests.
     592             :          */
     593        2428 :         if (ret || size <= PAGE_SIZE)
     594             :                 return ret;
     595             : 
     596           0 :         return __vmalloc_node(size, 1, flags, node,
     597           0 :                         __builtin_return_address(0));
     598             : }
     599             : EXPORT_SYMBOL(kvmalloc_node);
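
A small sketch of the intended pattern, using the kvmalloc_array() helper from <linux/mm.h> that ends up in kvmalloc_node(); the demo_* table is illustrative:

        #include <linux/mm.h>
        #include <linux/slab.h>
        #include <linux/types.h>

        struct demo_entry {
                u64 key;
                u64 value;
        };

        static struct demo_entry *demo_alloc_table(size_t nr)
        {
                /* Small tables come straight from kmalloc; very large ones may
                 * fall back to vmalloc. Flags must be GFP_KERNEL compatible. */
                return kvmalloc_array(nr, sizeof(struct demo_entry),
                                      GFP_KERNEL | __GFP_ZERO);
        }

        static void demo_free_table(struct demo_entry *table)
        {
                kvfree(table);  /* correct for either backing allocator */
        }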
     600             : 
     601             : /**
     602             :  * kvfree() - Free memory.
     603             :  * @addr: Pointer to allocated memory.
     604             :  *
     605             :  * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
     606             :  * It is slightly more efficient to use kfree() or vfree() if you are certain
     607             :  * that you know which one to use.
     608             :  *
     609             :  * Context: Either preemptible task context or not-NMI interrupt.
     610             :  */
     611       11881 : void kvfree(const void *addr)
     612             : {
     613       11881 :         if (is_vmalloc_addr(addr))
     614           0 :                 vfree(addr);
     615             :         else
     616       11881 :                 kfree(addr);
     617       11881 : }
     618             : EXPORT_SYMBOL(kvfree);
     619             : 
     620             : /**
     621             :  * kvfree_sensitive - Free a data object containing sensitive information.
     622             :  * @addr: address of the data object to be freed.
     623             :  * @len: length of the data object.
     624             :  *
     625             :  * Use the special memzero_explicit() function to clear the content of a
     626             :  * kvmalloc'ed object containing sensitive data to make sure that the
     627             :  * compiler won't optimize out the data clearing.
     628             :  */
     629           0 : void kvfree_sensitive(const void *addr, size_t len)
     630             : {
     631           0 :         if (likely(!ZERO_OR_NULL_PTR(addr))) {
     632           0 :                 memzero_explicit((void *)addr, len);
     633           0 :                 kvfree(addr);
     634             :         }
     635           0 : }
     636             : EXPORT_SYMBOL(kvfree_sensitive);
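
For instance, a buffer holding key material might be torn down as follows; a sketch, with the demo_* structure being hypothetical:

        #include <linux/mm.h>

        struct demo_secret {
                void *data;     /* kvmalloc'd key material */
                size_t len;
        };

        static void demo_secret_destroy(struct demo_secret *s)
        {
                /* Zeroes s->data via memzero_explicit() before freeing it. */
                kvfree_sensitive(s->data, s->len);
                s->data = NULL;
                s->len = 0;
        }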
     637             : 
     638         458 : static inline void *__page_rmapping(struct page *page)
     639             : {
     640         458 :         unsigned long mapping;
     641             : 
     642         458 :         mapping = (unsigned long)page->mapping;
     643         458 :         mapping &= ~PAGE_MAPPING_FLAGS;
     644             : 
     645         458 :         return (void *)mapping;
     646             : }
     647             : 
     648             : /* Neutral page->mapping pointer to address_space or anon_vma or other */
     649         458 : void *page_rmapping(struct page *page)
     650             : {
     651         458 :         page = compound_head(page);
     652         458 :         return __page_rmapping(page);
     653             : }
     654             : 
     655             : /*
     656             :  * Return true if this page is mapped into pagetables.
      657             :  * For a compound page it returns true if any subpage of the compound page is mapped.
     658             :  */
     659        5244 : bool page_mapped(struct page *page)
     660             : {
     661        5244 :         int i;
     662             : 
     663       10488 :         if (likely(!PageCompound(page)))
     664        5244 :                 return atomic_read(&page->_mapcount) >= 0;
     665           0 :         page = compound_head(page);
     666           0 :         if (atomic_read(compound_mapcount_ptr(page)) >= 0)
     667             :                 return true;
     668           0 :         if (PageHuge(page))
     669             :                 return false;
     670           0 :         for (i = 0; i < compound_nr(page); i++) {
     671           0 :                 if (atomic_read(&page[i]._mapcount) >= 0)
     672             :                         return true;
     673             :         }
     674             :         return false;
     675             : }
     676             : EXPORT_SYMBOL(page_mapped);
     677             : 
     678           0 : struct anon_vma *page_anon_vma(struct page *page)
     679             : {
     680           0 :         unsigned long mapping;
     681             : 
     682           0 :         page = compound_head(page);
     683           0 :         mapping = (unsigned long)page->mapping;
     684           0 :         if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
     685             :                 return NULL;
     686           0 :         return __page_rmapping(page);
     687             : }
     688             : 
     689      109010 : struct address_space *page_mapping(struct page *page)
     690             : {
     691      109010 :         struct address_space *mapping;
     692             : 
     693      109010 :         page = compound_head(page);
     694             : 
     695             :         /* This happens if someone calls flush_dcache_page on slab page */
     696      218022 :         if (unlikely(PageSlab(page)))
     697             :                 return NULL;
     698             : 
     699      109012 :         if (unlikely(PageSwapCache(page))) {
     700             :                 swp_entry_t entry;
     701             : 
     702             :                 entry.val = page_private(page);
     703             :                 return swap_address_space(entry);
     704             :         }
     705             : 
     706      109012 :         mapping = page->mapping;
     707      109012 :         if ((unsigned long)mapping & PAGE_MAPPING_ANON)
     708             :                 return NULL;
     709             : 
     710       41647 :         return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS);
     711             : }
     712             : EXPORT_SYMBOL(page_mapping);
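
The masking in __page_rmapping(), page_anon_vma() and page_mapping() relies on the low bits of page->mapping being used as type tags (the PAGE_MAPPING_* flags): for file-backed pages the field is a plain struct address_space pointer, for anonymous pages PAGE_MAPPING_ANON is set and the remaining bits point to an anon_vma, and KSM/non-LRU-movable pages use the second tag bit as well. The sketch below only restates what PageAnon() already does and is shown purely as an illustration:

        #include <linux/mm.h>
        #include <linux/page-flags.h>

        static bool demo_mapping_is_anon(struct page *page)
        {
                unsigned long mapping = (unsigned long)READ_ONCE(page->mapping);

                /* Bit set: an anon_vma pointer; clear: an address_space pointer
                 * (or NULL), which is why page_mapping() returns NULL for anon pages. */
                return mapping & PAGE_MAPPING_ANON;
        }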
     713             : 
     714             : /*
     715             :  * For file cache pages, return the address_space, otherwise return NULL
     716             :  */
     717           0 : struct address_space *page_mapping_file(struct page *page)
     718             : {
     719           0 :         if (unlikely(PageSwapCache(page)))
     720             :                 return NULL;
     721           0 :         return page_mapping(page);
     722             : }
     723             : 
     724             : /* Slow path of page_mapcount() for compound pages */
     725          17 : int __page_mapcount(struct page *page)
     726             : {
     727          17 :         int ret;
     728             : 
     729          17 :         ret = atomic_read(&page->_mapcount) + 1;
     730             :         /*
      731             :          * For file THP, page->_mapcount contains the total number of mappings
      732             :          * of the page: no need to look into compound_mapcount.
     733             :          */
     734          17 :         if (!PageAnon(page) && !PageHuge(page))
     735             :                 return ret;
     736          17 :         page = compound_head(page);
     737          17 :         ret += atomic_read(compound_mapcount_ptr(page)) + 1;
     738          17 :         if (PageDoubleMap(page))
     739           0 :                 ret--;
     740             :         return ret;
     741             : }
     742             : EXPORT_SYMBOL_GPL(__page_mapcount);
     743             : 
     744             : int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
     745             : int sysctl_overcommit_ratio __read_mostly = 50;
     746             : unsigned long sysctl_overcommit_kbytes __read_mostly;
     747             : int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
     748             : unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
     749             : unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
     750             : 
     751           0 : int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer,
     752             :                 size_t *lenp, loff_t *ppos)
     753             : {
     754           0 :         int ret;
     755             : 
     756           0 :         ret = proc_dointvec(table, write, buffer, lenp, ppos);
     757           0 :         if (ret == 0 && write)
     758           0 :                 sysctl_overcommit_kbytes = 0;
     759           0 :         return ret;
     760             : }
     761             : 
     762           0 : static void sync_overcommit_as(struct work_struct *dummy)
     763             : {
     764           0 :         percpu_counter_sync(&vm_committed_as);
     765           0 : }
     766             : 
     767           0 : int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
     768             :                 size_t *lenp, loff_t *ppos)
     769             : {
     770           0 :         struct ctl_table t;
     771           0 :         int new_policy;
     772           0 :         int ret;
     773             : 
     774             :         /*
     775             :          * The deviation of sync_overcommit_as could be big with loose policy
     776             :          * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
     777             :          * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
      778             :          * with the strict "NEVER", and to avoid a possible race condition (even
      779             :          * though users usually won't switch to the OVERCOMMIT_NEVER policy very
      780             :          * often), the switch is done in the following order:
     781             :          *      1. changing the batch
     782             :          *      2. sync percpu count on each CPU
     783             :          *      3. switch the policy
     784             :          */
     785           0 :         if (write) {
     786           0 :                 t = *table;
     787           0 :                 t.data = &new_policy;
     788           0 :                 ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
     789           0 :                 if (ret)
     790             :                         return ret;
     791             : 
     792           0 :                 mm_compute_batch(new_policy);
     793           0 :                 if (new_policy == OVERCOMMIT_NEVER)
     794           0 :                         schedule_on_each_cpu(sync_overcommit_as);
     795           0 :                 sysctl_overcommit_memory = new_policy;
     796             :         } else {
     797           0 :                 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
     798             :         }
     799             : 
     800             :         return ret;
     801             : }
     802             : 
     803           0 : int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer,
     804             :                 size_t *lenp, loff_t *ppos)
     805             : {
     806           0 :         int ret;
     807             : 
     808           0 :         ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
     809           0 :         if (ret == 0 && write)
     810           0 :                 sysctl_overcommit_ratio = 0;
     811           0 :         return ret;
     812             : }
     813             : 
     814             : /*
     815             :  * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
     816             :  */
     817           1 : unsigned long vm_commit_limit(void)
     818             : {
     819           1 :         unsigned long allowed;
     820             : 
     821           1 :         if (sysctl_overcommit_kbytes)
     822           0 :                 allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
     823             :         else
     824           1 :                 allowed = ((totalram_pages() - hugetlb_total_pages())
     825           1 :                            * sysctl_overcommit_ratio / 100);
     826           1 :         allowed += total_swap_pages;
     827             : 
     828           1 :         return allowed;
     829             : }
     830             : 
     831             : /*
      832             :  * Make sure vm_committed_as sits in its own cacheline and does not share a
      833             :  * cacheline with other variables. It can be updated frequently by several CPUs.
     834             :  */
     835             : struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
     836             : 
     837             : /*
     838             :  * The global memory commitment made in the system can be a metric
     839             :  * that can be used to drive ballooning decisions when Linux is hosted
     840             :  * as a guest. On Hyper-V, the host implements a policy engine for dynamically
     841             :  * balancing memory across competing virtual machines that are hosted.
     842             :  * Several metrics drive this policy engine including the guest reported
     843             :  * memory commitment.
     844             :  *
     845             :  * The time cost of this is very low for small platforms, and for big
     846             :  * platform like a 2S/36C/72T Skylake server, in worst case where
     847             :  * vm_committed_as's spinlock is under severe contention, the time cost
     848             :  * could be about 30~40 microseconds.
     849             :  */
     850           1 : unsigned long vm_memory_committed(void)
     851             : {
     852           1 :         return percpu_counter_sum_positive(&vm_committed_as);
     853             : }
     854             : EXPORT_SYMBOL_GPL(vm_memory_committed);
     855             : 
     856             : /*
     857             :  * Check that a process has enough memory to allocate a new virtual
     858             :  * mapping. 0 means there is enough memory for the allocation to
     859             :  * succeed and -ENOMEM implies there is not.
     860             :  *
     861             :  * We currently support three overcommit policies, which are set via the
     862             :  * vm.overcommit_memory sysctl.  See Documentation/vm/overcommit-accounting.rst
     863             :  *
     864             :  * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
     865             :  * Additional code 2002 Jul 20 by Robert Love.
     866             :  *
     867             :  * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
     868             :  *
     869             :  * Note this is a helper function intended to be used by LSMs which
     870             :  * wish to use this logic.
     871             :  */
     872       44064 : int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
     873             : {
     874       44064 :         long allowed;
     875             : 
     876       44064 :         vm_acct_memory(pages);
     877             : 
     878             :         /*
     879             :          * Sometimes we want to use more memory than we have
     880             :          */
     881       44064 :         if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
     882             :                 return 0;
     883             : 
     884       44064 :         if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
     885       44064 :                 if (pages > totalram_pages() + total_swap_pages)
     886           0 :                         goto error;
     887             :                 return 0;
     888             :         }
     889             : 
     890           0 :         allowed = vm_commit_limit();
     891             :         /*
     892             :          * Reserve some for root
     893             :          */
     894           0 :         if (!cap_sys_admin)
     895           0 :                 allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
     896             : 
     897             :         /*
     898             :          * Don't let a single process grow so big a user can't recover
     899             :          */
     900           0 :         if (mm) {
     901           0 :                 long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
     902             : 
     903           0 :                 allowed -= min_t(long, mm->total_vm / 32, reserve);
     904             :         }
     905             : 
     906           0 :         if (percpu_counter_read_positive(&vm_committed_as) < allowed)
     907             :                 return 0;
     908           0 : error:
     909           0 :         vm_unacct_memory(pages);
     910             : 
     911           0 :         return -ENOMEM;
     912             : }
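
A worked example of the OVERCOMMIT_NEVER path with assumed numbers: on a machine with 4 GiB of RAM (1,048,576 pages of 4 KiB), no hugetlb pages, 2 GiB of swap (524,288 pages) and the default overcommit ratio of 50, vm_commit_limit() yields 1,048,576 * 50 / 100 + 524,288 = 1,048,576 pages (4 GiB). A caller without CAP_SYS_ADMIN then loses the admin reserve of 8 MiB (2,048 pages), and a process whose total_vm is 262,144 pages additionally loses min(262,144 / 32, 32,768) = 8,192 pages of user reserve, leaving 1,038,336 pages of allowed commitment to compare against vm_committed_as.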
     913             : 
     914             : /**
     915             :  * get_cmdline() - copy the cmdline value to a buffer.
     916             :  * @task:     the task whose cmdline value to copy.
     917             :  * @buffer:   the buffer to copy to.
     918             :  * @buflen:   the length of the buffer. Larger cmdline values are truncated
     919             :  *            to this length.
     920             :  *
     921             :  * Return: the size of the cmdline field copied. Note that the copy does
     922             :  * not guarantee an ending NULL byte.
     923             :  */
     924           0 : int get_cmdline(struct task_struct *task, char *buffer, int buflen)
     925             : {
     926           0 :         int res = 0;
     927           0 :         unsigned int len;
     928           0 :         struct mm_struct *mm = get_task_mm(task);
     929           0 :         unsigned long arg_start, arg_end, env_start, env_end;
     930           0 :         if (!mm)
     931           0 :                 goto out;
     932           0 :         if (!mm->arg_end)
     933           0 :                 goto out_mm;    /* Shh! No looking before we're done */
     934             : 
     935           0 :         spin_lock(&mm->arg_lock);
     936           0 :         arg_start = mm->arg_start;
     937           0 :         arg_end = mm->arg_end;
     938           0 :         env_start = mm->env_start;
     939           0 :         env_end = mm->env_end;
     940           0 :         spin_unlock(&mm->arg_lock);
     941             : 
     942           0 :         len = arg_end - arg_start;
     943             : 
     944           0 :         if (len > buflen)
     945             :                 len = buflen;
     946             : 
     947           0 :         res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);
     948             : 
     949             :         /*
     950             :          * If the nul at the end of args has been overwritten, then
      951             :          * assume the application is using setproctitle(3).
     952             :          */
     953           0 :         if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
     954           0 :                 len = strnlen(buffer, res);
     955           0 :                 if (len < res) {
     956           0 :                         res = len;
     957             :                 } else {
     958           0 :                         len = env_end - env_start;
     959           0 :                         if (len > buflen - res)
     960             :                                 len = buflen - res;
     961           0 :                         res += access_process_vm(task, env_start,
     962           0 :                                                  buffer+res, len,
     963             :                                                  FOLL_FORCE);
     964           0 :                         res = strnlen(buffer, res);
     965             :                 }
     966             :         }
     967           0 : out_mm:
     968           0 :         mmput(mm);
     969           0 : out:
     970           0 :         return res;
     971             : }
     972             : 
     973           0 : int __weak memcmp_pages(struct page *page1, struct page *page2)
     974             : {
     975           0 :         char *addr1, *addr2;
     976           0 :         int ret;
     977             : 
     978           0 :         addr1 = kmap_atomic(page1);
     979           0 :         addr2 = kmap_atomic(page2);
     980           0 :         ret = memcmp(addr1, addr2, PAGE_SIZE);
     981           0 :         kunmap_atomic(addr2);
     982           0 :         kunmap_atomic(addr1);
     983           0 :         return ret;
     984             : }
     985             : 
     986             : /**
     987             :  * mem_dump_obj - Print available provenance information
     988             :  * @object: object for which to find provenance information.
     989             :  *
     990             :  * This function uses pr_cont(), so that the caller is expected to have
     991             :  * printed out whatever preamble is appropriate.  The provenance information
     992             :  * depends on the type of object and on how much debugging is enabled.
     993             :  * For example, for a slab-cache object, the slab name is printed, and,
     994             :  * if available, the return address and stack trace from the allocation
     995             :  * of that object.
     996             :  */
     997           0 : void mem_dump_obj(void *object)
     998             : {
     999           0 :         if (kmem_valid_obj(object)) {
    1000           0 :                 kmem_dump_obj(object);
    1001           0 :                 return;
    1002             :         }
    1003           0 :         if (vmalloc_dump_obj(object))
    1004             :                 return;
    1005           0 :         if (!virt_addr_valid(object)) {
    1006           0 :                 if (object == NULL)
    1007           0 :                         pr_cont(" NULL pointer.\n");
    1008           0 :                 else if (object == ZERO_SIZE_PTR)
    1009           0 :                         pr_cont(" zero-size pointer.\n");
    1010             :                 else
    1011           0 :                         pr_cont(" non-paged memory.\n");
    1012           0 :                 return;
    1013             :         }
    1014           0 :         pr_cont(" non-slab/vmalloc memory.\n");
    1015             : }
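
Since mem_dump_obj() prints with pr_cont(), a caller is expected to emit its own preamble first; a minimal sketch of that calling convention (demo_report_object() is hypothetical):

        #include <linux/mm.h>
        #include <linux/printk.h>

        static void demo_report_object(void *obj)
        {
                /* mem_dump_obj() continues this line via pr_cont(). */
                pr_err("demo: suspicious object %px", obj);
                mem_dump_obj(obj);
        }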

Generated by: LCOV version 1.14