Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * mm/mmap.c
4 : *
5 : * Written by obz.
6 : *
7 : * Address space accounting code <alan@lxorguk.ukuu.org.uk>
8 : */
9 :
10 : #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11 :
12 : #include <linux/kernel.h>
13 : #include <linux/slab.h>
14 : #include <linux/backing-dev.h>
15 : #include <linux/mm.h>
16 : #include <linux/vmacache.h>
17 : #include <linux/shm.h>
18 : #include <linux/mman.h>
19 : #include <linux/pagemap.h>
20 : #include <linux/swap.h>
21 : #include <linux/syscalls.h>
22 : #include <linux/capability.h>
23 : #include <linux/init.h>
24 : #include <linux/file.h>
25 : #include <linux/fs.h>
26 : #include <linux/personality.h>
27 : #include <linux/security.h>
28 : #include <linux/hugetlb.h>
29 : #include <linux/shmem_fs.h>
30 : #include <linux/profile.h>
31 : #include <linux/export.h>
32 : #include <linux/mount.h>
33 : #include <linux/mempolicy.h>
34 : #include <linux/rmap.h>
35 : #include <linux/mmu_notifier.h>
36 : #include <linux/mmdebug.h>
37 : #include <linux/perf_event.h>
38 : #include <linux/audit.h>
39 : #include <linux/khugepaged.h>
40 : #include <linux/uprobes.h>
41 : #include <linux/rbtree_augmented.h>
42 : #include <linux/notifier.h>
43 : #include <linux/memory.h>
44 : #include <linux/printk.h>
45 : #include <linux/userfaultfd_k.h>
46 : #include <linux/moduleparam.h>
47 : #include <linux/pkeys.h>
48 : #include <linux/oom.h>
49 : #include <linux/sched/mm.h>
50 :
51 : #include <linux/uaccess.h>
52 : #include <asm/cacheflush.h>
53 : #include <asm/tlb.h>
54 : #include <asm/mmu_context.h>
55 :
56 : #define CREATE_TRACE_POINTS
57 : #include <trace/events/mmap.h>
58 :
59 : #include "internal.h"
60 :
61 : #ifndef arch_mmap_check
62 : #define arch_mmap_check(addr, len, flags) (0)
63 : #endif
64 :
65 : #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
66 : const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
67 : const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX;
68 : int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS;
69 : #endif
70 : #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
71 : const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN;
72 : const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
73 : int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
74 : #endif
75 :
76 : static bool ignore_rlimit_data;
77 : core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
78 :
79 : static void unmap_region(struct mm_struct *mm,
80 : struct vm_area_struct *vma, struct vm_area_struct *prev,
81 : unsigned long start, unsigned long end);
82 :
83 : /* Description of effects of mapping type and prot in current implementation.
84 : * This is due to the limited x86 page protection hardware. The expected
85 : * behavior is in parens:
86 : *
87 : * map_type     prot
88 : *              PROT_NONE    PROT_READ      PROT_WRITE      PROT_EXEC
89 : * MAP_SHARED   r: (no) no   r: (yes) yes   r: (no) yes     r: (no) yes
90 : *              w: (no) no   w: (no) no     w: (yes) yes    w: (no) no
91 : *              x: (no) no   x: (no) yes    x: (no) yes     x: (yes) yes
92 : *
93 : * MAP_PRIVATE  r: (no) no   r: (yes) yes   r: (no) yes     r: (no) yes
94 : *              w: (no) no   w: (no) no     w: (copy) copy  w: (no) no
95 : *              x: (no) no   x: (no) yes    x: (no) yes     x: (yes) yes
96 : */
97 : pgprot_t protection_map[16] __ro_after_init = {
98 : __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
99 : __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
100 : };
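
As a concrete illustration of the MAP_PRIVATE "w: (copy)" entries in the table above, here is a minimal userspace sketch (not part of mm/mmap.c), assuming fd is an open, readable file descriptor and a 4 KiB page size:

#include <sys/mman.h>

/* A store through a MAP_PRIVATE mapping is served by a copy-on-write
 * page: the write lands in a private copy, never in the file itself. */
static void private_write_is_cow(int fd)
{
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE, fd, 0);

        if (p != MAP_FAILED) {
                p[0] = 'x';             /* faults in a private copy */
                munmap(p, 4096);        /* the file on disk is unchanged */
        }
}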
101 :
102 : #ifndef CONFIG_ARCH_HAS_FILTER_PGPROT
103 : static inline pgprot_t arch_filter_pgprot(pgprot_t prot)
104 : {
105 : return prot;
106 : }
107 : #endif
108 :
109 68999 : pgprot_t vm_get_page_prot(unsigned long vm_flags)
110 : {
111 68999 : pgprot_t ret = __pgprot(pgprot_val(protection_map[vm_flags &
112 : (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) |
113 : pgprot_val(arch_vm_get_page_prot(vm_flags)));
114 :
115 2508 : return arch_filter_pgprot(ret);
116 : }
117 : EXPORT_SYMBOL(vm_get_page_prot);
118 :
119 36848 : static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
120 : {
121 36848 : return pgprot_modify(oldprot, vm_get_page_prot(vm_flags));
122 : }
123 :
124 : /* Update vma->vm_page_prot to reflect vma->vm_flags. */
125 36706 : void vma_set_page_prot(struct vm_area_struct *vma)
126 : {
127 36706 : unsigned long vm_flags = vma->vm_flags;
128 36706 : pgprot_t vm_page_prot;
129 :
130 36706 : vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
131 36706 : if (vma_wants_writenotify(vma, vm_page_prot)) {
132 136 : vm_flags &= ~VM_SHARED;
133 136 : vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags);
134 : }
135 : /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */
136 36706 : WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
137 36706 : }
138 :
139 : /*
140 : * Requires inode->i_mapping->i_mmap_rwsem
141 : */
142 93402 : static void __remove_shared_vm_struct(struct vm_area_struct *vma,
143 : struct file *file, struct address_space *mapping)
144 : {
145 93402 : if (vma->vm_flags & VM_DENYWRITE)
146 23440 : allow_write_access(file);
147 93402 : if (vma->vm_flags & VM_SHARED)
148 259 : mapping_unmap_writable(mapping);
149 :
150 93402 : flush_dcache_mmap_lock(mapping);
151 93402 : vma_interval_tree_remove(vma, &mapping->i_mmap);
152 93412 : flush_dcache_mmap_unlock(mapping);
153 93412 : }
154 :
155 : /*
156 : * Unlink a file-based vm structure from its interval tree, to hide
157 : * vma from rmap and vmtruncate before freeing its page tables.
158 : */
159 114271 : void unlink_file_vma(struct vm_area_struct *vma)
160 : {
161 114271 : struct file *file = vma->vm_file;
162 :
163 114271 : if (file) {
164 93409 : struct address_space *mapping = file->f_mapping;
165 93409 : i_mmap_lock_write(mapping);
166 93404 : __remove_shared_vm_struct(vma, file, mapping);
167 93412 : i_mmap_unlock_write(mapping);
168 : }
169 114277 : }
170 :
171 : /*
172 : * Close a vm structure and free it, returning the next.
173 : */
174 114271 : static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
175 : {
176 114271 : struct vm_area_struct *next = vma->vm_next;
177 :
178 114271 : might_sleep();
179 114272 : if (vma->vm_ops && vma->vm_ops->close)
180 4316 : vma->vm_ops->close(vma);
181 114274 : if (vma->vm_file)
182 93412 : fput(vma->vm_file);
183 114276 : mpol_put(vma_policy(vma));
184 114276 : vm_area_free(vma);
185 114248 : return next;
186 : }
187 :
188 : static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags,
189 : struct list_head *uf);
190 6190 : SYSCALL_DEFINE1(brk, unsigned long, brk)
191 : {
192 3095 : unsigned long newbrk, oldbrk, origbrk;
193 3095 : struct mm_struct *mm = current->mm;
194 3095 : struct vm_area_struct *next;
195 3095 : unsigned long min_brk;
196 3095 : bool populate;
197 3095 : bool downgraded = false;
198 3095 : LIST_HEAD(uf);
199 :
200 3095 : if (mmap_write_lock_killable(mm))
201 : return -EINTR;
202 :
203 3095 : origbrk = mm->brk;
204 :
205 : #ifdef CONFIG_COMPAT_BRK
206 : /*
207 : * CONFIG_COMPAT_BRK can still be overridden by setting
208 : * randomize_va_space to 2, which will still cause mm->start_brk
209 : * to be arbitrarily shifted
210 : */
211 : if (current->brk_randomized)
212 : min_brk = mm->start_brk;
213 : else
214 : min_brk = mm->end_data;
215 : #else
216 3095 : min_brk = mm->start_brk;
217 : #endif
218 3095 : if (brk < min_brk)
219 1860 : goto out;
220 :
221 : /*
222 : * Check against rlimit here. If this check is done later after the test
223 : * of oldbrk with newbrk then it can escape the test and let the data
224 : * segment grow beyond its set limit in the case where the limit is
225 : * not page aligned -Ram Gupta
226 : */
227 1235 : if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
228 : mm->end_data, mm->start_data))
229 0 : goto out;
230 :
231 1235 : newbrk = PAGE_ALIGN(brk);
232 1235 : oldbrk = PAGE_ALIGN(mm->brk);
233 1235 : if (oldbrk == newbrk) {
234 2 : mm->brk = brk;
235 2 : goto success;
236 : }
237 :
238 : /*
239 : * Always allow shrinking brk.
240 : * __do_munmap() may downgrade mmap_lock to read.
241 : */
242 1233 : if (brk <= mm->brk) {
243 50 : int ret;
244 :
245 : /*
246 : * mm->brk must be protected by the write mmap_lock, so update it
247 : * before downgrading mmap_lock. When __do_munmap() fails,
248 : * mm->brk will be restored from origbrk.
249 : */
250 50 : mm->brk = brk;
251 50 : ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
252 50 : if (ret < 0) {
253 0 : mm->brk = origbrk;
254 0 : goto out;
255 50 : } else if (ret == 1) {
256 50 : downgraded = true;
257 : }
258 50 : goto success;
259 : }
260 :
261 : /* Check against existing mmap mappings. */
262 1183 : next = find_vma(mm, oldbrk);
263 1187 : if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
264 0 : goto out;
265 :
266 : /* Ok, looks good - let it rip. */
267 1183 : if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
268 0 : goto out;
269 1183 : mm->brk = brk;
270 :
271 1235 : success:
272 1235 : populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
273 1235 : if (downgraded)
274 50 : mmap_read_unlock(mm);
275 : else
276 1185 : mmap_write_unlock(mm);
277 1235 : userfaultfd_unmap_complete(mm, &uf);
278 1235 : if (populate)
279 0 : mm_populate(oldbrk, newbrk - oldbrk);
280 1235 : return brk;
281 :
282 1860 : out:
283 1860 : mmap_write_unlock(mm);
284 1860 : return origbrk;
285 : }
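
For reference, a minimal userspace view of the contract this syscall implements, using glibc's sbrk() wrapper (sbrk(0) reports the current break without moving it); illustrative only:

#include <unistd.h>

static void grow_and_shrink_heap(void)
{
        char *base = sbrk(0);           /* current program break */

        if (sbrk(4096) != (void *)-1) { /* grow the break by one page */
                base[0] = 0;            /* first byte of the new memory */
                sbrk(-4096);            /* shrink it back */
        }
}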
286 :
287 461024 : static inline unsigned long vma_compute_gap(struct vm_area_struct *vma)
288 : {
289 461024 : unsigned long gap, prev_end;
290 :
291 : /*
292 : * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we
293 : * allow two stack_guard_gaps between them here, and when choosing
294 : * an unmapped area; whereas when expanding we only require one.
295 : * That's a little inconsistent, but keeps the code here simpler.
296 : */
297 461024 : gap = vm_start_gap(vma);
298 461024 : if (vma->vm_prev) {
299 451341 : prev_end = vm_end_gap(vma->vm_prev);
300 451341 : if (gap > prev_end)
301 105273 : gap -= prev_end;
302 : else
303 : gap = 0;
304 : }
305 461024 : return gap;
306 : }
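
A worked example of the computation above, with made-up addresses: if vm_start_gap(vma) is 0x7f0000300000 and vm_end_gap(vma->vm_prev) is 0x7f0000201000, the gap recorded for this vma is 0xff000 bytes. The same arithmetic as a standalone sketch:

/* Free space between this vma (less its guard gap) and the previous
 * vma (plus its guard gap); zero if the two touch or overlap. */
static unsigned long example_gap(unsigned long start_gap,
                                 unsigned long prev_end_gap)
{
        return start_gap > prev_end_gap ? start_gap - prev_end_gap : 0;
}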
307 :
308 : #ifdef CONFIG_DEBUG_VM_RB
309 : static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma)
310 : {
311 : unsigned long max = vma_compute_gap(vma), subtree_gap;
312 : if (vma->vm_rb.rb_left) {
313 : subtree_gap = rb_entry(vma->vm_rb.rb_left,
314 : struct vm_area_struct, vm_rb)->rb_subtree_gap;
315 : if (subtree_gap > max)
316 : max = subtree_gap;
317 : }
318 : if (vma->vm_rb.rb_right) {
319 : subtree_gap = rb_entry(vma->vm_rb.rb_right,
320 : struct vm_area_struct, vm_rb)->rb_subtree_gap;
321 : if (subtree_gap > max)
322 : max = subtree_gap;
323 : }
324 : return max;
325 : }
326 :
327 : static int browse_rb(struct mm_struct *mm)
328 : {
329 : struct rb_root *root = &mm->mm_rb;
330 : int i = 0, j, bug = 0;
331 : struct rb_node *nd, *pn = NULL;
332 : unsigned long prev = 0, pend = 0;
333 :
334 : for (nd = rb_first(root); nd; nd = rb_next(nd)) {
335 : struct vm_area_struct *vma;
336 : vma = rb_entry(nd, struct vm_area_struct, vm_rb);
337 : if (vma->vm_start < prev) {
338 : pr_emerg("vm_start %lx < prev %lx\n",
339 : vma->vm_start, prev);
340 : bug = 1;
341 : }
342 : if (vma->vm_start < pend) {
343 : pr_emerg("vm_start %lx < pend %lx\n",
344 : vma->vm_start, pend);
345 : bug = 1;
346 : }
347 : if (vma->vm_start > vma->vm_end) {
348 : pr_emerg("vm_start %lx > vm_end %lx\n",
349 : vma->vm_start, vma->vm_end);
350 : bug = 1;
351 : }
352 : spin_lock(&mm->page_table_lock);
353 : if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
354 : pr_emerg("free gap %lx, correct %lx\n",
355 : vma->rb_subtree_gap,
356 : vma_compute_subtree_gap(vma));
357 : bug = 1;
358 : }
359 : spin_unlock(&mm->page_table_lock);
360 : i++;
361 : pn = nd;
362 : prev = vma->vm_start;
363 : pend = vma->vm_end;
364 : }
365 : j = 0;
366 : for (nd = pn; nd; nd = rb_prev(nd))
367 : j++;
368 : if (i != j) {
369 : pr_emerg("backwards %d, forwards %d\n", j, i);
370 : bug = 1;
371 : }
372 : return bug ? -1 : i;
373 : }
374 :
375 : static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
376 : {
377 : struct rb_node *nd;
378 :
379 : for (nd = rb_first(root); nd; nd = rb_next(nd)) {
380 : struct vm_area_struct *vma;
381 : vma = rb_entry(nd, struct vm_area_struct, vm_rb);
382 : VM_BUG_ON_VMA(vma != ignore &&
383 : vma->rb_subtree_gap != vma_compute_subtree_gap(vma),
384 : vma);
385 : }
386 : }
387 :
388 : static void validate_mm(struct mm_struct *mm)
389 : {
390 : int bug = 0;
391 : int i = 0;
392 : unsigned long highest_address = 0;
393 : struct vm_area_struct *vma = mm->mmap;
394 :
395 : while (vma) {
396 : struct anon_vma *anon_vma = vma->anon_vma;
397 : struct anon_vma_chain *avc;
398 :
399 : if (anon_vma) {
400 : anon_vma_lock_read(anon_vma);
401 : list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
402 : anon_vma_interval_tree_verify(avc);
403 : anon_vma_unlock_read(anon_vma);
404 : }
405 :
406 : highest_address = vm_end_gap(vma);
407 : vma = vma->vm_next;
408 : i++;
409 : }
410 : if (i != mm->map_count) {
411 : pr_emerg("map_count %d vm_next %d\n", mm->map_count, i);
412 : bug = 1;
413 : }
414 : if (highest_address != mm->highest_vm_end) {
415 : pr_emerg("mm->highest_vm_end %lx, found %lx\n",
416 : mm->highest_vm_end, highest_address);
417 : bug = 1;
418 : }
419 : i = browse_rb(mm);
420 : if (i != mm->map_count) {
421 : if (i != -1)
422 : pr_emerg("map_count %d rb %d\n", mm->map_count, i);
423 : bug = 1;
424 : }
425 : VM_BUG_ON_MM(bug, mm);
426 : }
427 : #else
428 : #define validate_mm_rb(root, ignore) do { } while (0)
429 : #define validate_mm(mm) do { } while (0)
430 : #endif
431 :
432 934302 : RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks,
433 : struct vm_area_struct, vm_rb,
434 : unsigned long, rb_subtree_gap, vma_compute_gap)
435 :
436 : /*
437 : * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
438 : * vma->vm_prev->vm_end values changed, without modifying the vma's position
439 : * in the rbtree.
440 : */
441 193282 : static void vma_gap_update(struct vm_area_struct *vma)
442 : {
443 : /*
444 : * As it turns out, RB_DECLARE_CALLBACKS_MAX() already created
445 : * a callback function that does exactly what we want.
446 : */
447 193282 : vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
448 75333 : }
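
The payoff of caching rb_subtree_gap is that an unmapped-area search can prune entire subtrees with one comparison. A sketch of that pruning test, assuming the invariant maintained by the callbacks above (the helper name is hypothetical):

/* rb_subtree_gap is the maximum of vma_compute_gap() over the whole
 * subtree, so a smaller value means nothing below can fit length. */
static inline bool subtree_may_fit(struct vm_area_struct *vma,
                                   unsigned long length)
{
        return vma->rb_subtree_gap >= length;
}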
449 :
450 116996 : static inline void vma_rb_insert(struct vm_area_struct *vma,
451 : struct rb_root *root)
452 : {
453 : /* All rb_subtree_gap values must be consistent prior to insertion */
454 116996 : validate_mm_rb(root, NULL);
455 :
456 233991 : rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
457 : }
458 :
459 16940 : static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
460 : {
461 : /*
462 : * Note rb_erase_augmented is a fairly large inline function,
463 : * so make sure we instantiate it only once with our desired
464 : * augmented rbtree callbacks.
465 : */
466 16940 : rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
467 16940 : }
468 :
469 16940 : static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
470 : struct rb_root *root,
471 : struct vm_area_struct *ignore)
472 : {
473 : /*
474 : * All rb_subtree_gap values must be consistent prior to erase,
475 : * with the possible exception of
476 : *
477 : * a. the "next" vma being erased if next->vm_start was reduced in
478 : * __vma_adjust() -> __vma_unlink()
479 : * b. the vma being erased in detach_vmas_to_be_unmapped() ->
480 : * vma_rb_erase()
481 : */
482 16940 : validate_mm_rb(root, ignore);
483 :
484 16940 : __vma_rb_erase(vma, root);
485 : }
486 :
487 16940 : static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
488 : struct rb_root *root)
489 : {
490 33880 : vma_rb_erase_ignore(vma, root, vma);
491 : }
492 :
493 : /*
494 : * vma has some anon_vma assigned, and is already inserted on that
495 : * anon_vma's interval trees.
496 : *
497 : * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
498 : * vma must be removed from the anon_vma's interval trees using
499 : * anon_vma_interval_tree_pre_update_vma().
500 : *
501 : * After the update, the vma will be reinserted using
502 : * anon_vma_interval_tree_post_update_vma().
503 : *
504 : * The entire update must be protected by exclusive mmap_lock and by
505 : * the root anon_vma's mutex.
506 : */
507 : static inline void
508 10363 : anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
509 : {
510 10363 : struct anon_vma_chain *avc;
511 :
512 20702 : list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
513 10339 : anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
514 10363 : }
515 :
516 : static inline void
517 10363 : anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
518 : {
519 10363 : struct anon_vma_chain *avc;
520 :
521 20702 : list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
522 10339 : anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
523 10363 : }
524 :
525 74395 : static int find_vma_links(struct mm_struct *mm, unsigned long addr,
526 : unsigned long end, struct vm_area_struct **pprev,
527 : struct rb_node ***rb_link, struct rb_node **rb_parent)
528 : {
529 74395 : struct rb_node **__rb_link, *__rb_parent, *rb_prev;
530 :
531 74395 : __rb_link = &mm->mm_rb.rb_node;
532 74395 : rb_prev = __rb_parent = NULL;
533 :
534 509255 : while (*__rb_link) {
535 448192 : struct vm_area_struct *vma_tmp;
536 :
537 448192 : __rb_parent = *__rb_link;
538 448192 : vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
539 :
540 448192 : if (vma_tmp->vm_end > addr) {
541 : /* Fail if an existing vma overlaps the area */
542 290183 : if (vma_tmp->vm_start < end)
543 : return -ENOMEM;
544 276851 : __rb_link = &__rb_parent->rb_left;
545 : } else {
546 158009 : rb_prev = __rb_parent;
547 158009 : __rb_link = &__rb_parent->rb_right;
548 : }
549 : }
550 :
551 61063 : *pprev = NULL;
552 61063 : if (rb_prev)
553 57617 : *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
554 31266 : *rb_link = __rb_link;
555 31266 : *rb_parent = __rb_parent;
556 31266 : return 0;
557 : }
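
A sketch of the typical caller pattern for find_vma_links(), modeled on insert_vm_struct() and using vma_link() (defined further down in this file); error handling trimmed:

static int example_insert_vma(struct mm_struct *mm,
                              struct vm_area_struct *vma)
{
        struct vm_area_struct *prev;
        struct rb_node **rb_link, *rb_parent;

        /* locate the insertion point; fail if the range is occupied */
        if (find_vma_links(mm, vma->vm_start, vma->vm_end,
                           &prev, &rb_link, &rb_parent))
                return -ENOMEM;

        vma_link(mm, vma, prev, rb_link, rb_parent);
        return 0;
}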
558 :
559 : /*
560 : * vma_next() - Get the next VMA.
561 : * @mm: The mm_struct.
562 : * @vma: The current vma.
563 : *
564 : * If @vma is NULL, return the first vma in the mm.
565 : *
566 : * Returns: The next VMA after @vma.
567 : */
568 72785 : static inline struct vm_area_struct *vma_next(struct mm_struct *mm,
569 : struct vm_area_struct *vma)
570 : {
571 72785 : if (!vma)
572 938 : return mm->mmap;
573 :
574 71847 : return vma->vm_next;
575 : }
576 :
577 : /*
578 : * munmap_vma_range() - munmap VMAs that overlap a range.
579 : * @mm: The mm struct
580 : * @start: The start of the range.
581 : * @len: The length of the range.
582 : * @pprev: pointer to the pointer that will be set to previous vm_area_struct
583 : * @rb_link: the rb_node pointer that will be set to the insertion point
584 : * @rb_parent: the rb_node that will be set to the parent of the insertion point
585 : *
586 : * Find all the vm_area_structs that overlap the range from @start to
587 : * @start + @len and munmap them. Set @pprev to the previous vm_area_struct.
588 : *
589 : * Returns: -ENOMEM on munmap failure or 0 on success.
590 : */
591 : static inline int
592 31268 : munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len,
593 : struct vm_area_struct **pprev, struct rb_node ***link,
594 : struct rb_node **parent, struct list_head *uf)
595 : {
596 :
597 75864 : while (find_vma_links(mm, start, start + len, pprev, link, parent))
598 13330 : if (do_munmap(mm, start, len, uf))
599 : return -ENOMEM;
600 :
601 : return 0;
602 : }
603 0 : static unsigned long count_vma_pages_range(struct mm_struct *mm,
604 : unsigned long addr, unsigned long end)
605 : {
606 0 : unsigned long nr_pages = 0;
607 0 : struct vm_area_struct *vma;
608 :
609 : /* Find the first overlapping mapping */
610 0 : vma = find_vma_intersection(mm, addr, end);
611 0 : if (!vma)
612 : return 0;
613 :
614 0 : nr_pages = (min(end, vma->vm_end) -
615 0 : max(addr, vma->vm_start)) >> PAGE_SHIFT;
616 :
617 : /* Iterate over the rest of the overlaps */
618 0 : for (vma = vma->vm_next; vma; vma = vma->vm_next) {
619 0 : unsigned long overlap_len;
620 :
621 0 : if (vma->vm_start > end)
622 : break;
623 :
624 0 : overlap_len = min(end, vma->vm_end) - vma->vm_start;
625 0 : nr_pages += overlap_len >> PAGE_SHIFT;
626 : }
627 :
628 : return nr_pages;
629 : }
630 :
631 116995 : void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
632 : struct rb_node **rb_link, struct rb_node *rb_parent)
633 : {
634 : /* Update tracking information for the gap following the new vma. */
635 116995 : if (vma->vm_next)
636 54118 : vma_gap_update(vma->vm_next);
637 : else
638 62877 : mm->highest_vm_end = vm_end_gap(vma);
639 :
640 : /*
641 : * vma->vm_prev wasn't known when we followed the rbtree to find the
642 : * correct insertion point for that vma. As a result, we could not
643 : * update the vma vm_rb parents rb_subtree_gap values on the way down.
644 : * So, we first insert the vma with a zero rb_subtree_gap value
645 : * (to be consistent with what we did on the way down), and then
646 : * immediately update the gap to the correct value. Finally we
647 : * rebalance the rbtree after all augmented values have been set.
648 : */
649 116995 : rb_link_node(&vma->vm_rb, rb_parent, rb_link);
650 116995 : vma->rb_subtree_gap = 0;
651 116995 : vma_gap_update(vma);
652 116996 : vma_rb_insert(vma, &mm->mm_rb);
653 116995 : }
654 :
655 57397 : static void __vma_link_file(struct vm_area_struct *vma)
656 : {
657 57397 : struct file *file;
658 :
659 57397 : file = vma->vm_file;
660 57397 : if (file) {
661 47883 : struct address_space *mapping = file->f_mapping;
662 :
663 47883 : if (vma->vm_flags & VM_DENYWRITE)
664 11240 : put_write_access(file_inode(file));
665 47883 : if (vma->vm_flags & VM_SHARED)
666 142 : mapping_allow_writable(mapping);
667 :
668 47883 : flush_dcache_mmap_lock(mapping);
669 47883 : vma_interval_tree_insert(vma, &mapping->i_mmap);
670 47883 : flush_dcache_mmap_unlock(mapping);
671 : }
672 57397 : }
673 :
674 : static void
675 57564 : __vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
676 : struct vm_area_struct *prev, struct rb_node **rb_link,
677 : struct rb_node *rb_parent)
678 : {
679 57564 : __vma_link_list(mm, vma, prev);
680 57564 : __vma_link_rb(mm, vma, rb_link, rb_parent);
681 57564 : }
682 :
683 32151 : static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
684 : struct vm_area_struct *prev, struct rb_node **rb_link,
685 : struct rb_node *rb_parent)
686 : {
687 32151 : struct address_space *mapping = NULL;
688 :
689 32151 : if (vma->vm_file) {
690 22637 : mapping = vma->vm_file->f_mapping;
691 22637 : i_mmap_lock_write(mapping);
692 : }
693 :
694 32151 : __vma_link(mm, vma, prev, rb_link, rb_parent);
695 32151 : __vma_link_file(vma);
696 :
697 32151 : if (mapping)
698 22637 : i_mmap_unlock_write(mapping);
699 :
700 32151 : mm->map_count++;
701 32151 : validate_mm(mm);
702 32151 : }
703 :
704 : /*
705 : * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
706 : * mm's list and rbtree. It has already been inserted into the interval tree.
707 : */
708 25413 : static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
709 : {
710 25413 : struct vm_area_struct *prev;
711 25413 : struct rb_node **rb_link, *rb_parent;
712 :
713 50826 : if (find_vma_links(mm, vma->vm_start, vma->vm_end,
714 : &prev, &rb_link, &rb_parent))
715 0 : BUG();
716 25413 : __vma_link(mm, vma, prev, rb_link, rb_parent);
717 25413 : mm->map_count++;
718 25413 : }
719 :
720 0 : static __always_inline void __vma_unlink(struct mm_struct *mm,
721 : struct vm_area_struct *vma,
722 : struct vm_area_struct *ignore)
723 : {
724 0 : vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
725 0 : __vma_unlink_list(mm, vma);
726 : /* Kill the cache */
727 0 : vmacache_invalidate(mm);
728 : }
729 :
730 : /*
731 : * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
732 : * is already present in an i_mmap tree without adjusting the tree.
733 : * The following helper function should be used when such adjustments
734 : * are necessary. The "insert" vma (if any) is to be inserted
735 : * before we drop the necessary locks.
736 : */
737 30812 : int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
738 : unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
739 : struct vm_area_struct *expand)
740 : {
741 30812 : struct mm_struct *mm = vma->vm_mm;
742 30812 : struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
743 30812 : struct address_space *mapping = NULL;
744 30812 : struct rb_root_cached *root = NULL;
745 30812 : struct anon_vma *anon_vma = NULL;
746 30812 : struct file *file = vma->vm_file;
747 30812 : bool start_changed = false, end_changed = false;
748 30812 : long adjust_next = 0;
749 30812 : int remove_next = 0;
750 :
751 30812 : if (next && !insert) {
752 3523 : struct vm_area_struct *exporter = NULL, *importer = NULL;
753 :
754 3523 : if (end >= next->vm_end) {
755 : /*
756 : * vma expands, overlapping all the next, and
757 : * perhaps the one after too (mprotect case 6).
758 : * The only other cases that gets here are
759 : * case 1, case 7 and case 8.
760 : */
761 0 : if (next == expand) {
762 : /*
763 : * The only case where we don't expand "vma"
764 : * and we expand "next" instead is case 8.
765 : */
766 0 : VM_WARN_ON(end != next->vm_end);
767 : /*
768 : * remove_next == 3 means we're
769 : * removing "vma" and that to do so we
770 : * swapped "vma" and "next".
771 : */
772 0 : remove_next = 3;
773 0 : VM_WARN_ON(file != next->vm_file);
774 0 : swap(vma, next);
775 : } else {
776 0 : VM_WARN_ON(expand != vma);
777 : /*
778 : * case 1, 6, 7, remove_next == 2 is case 6,
779 : * remove_next == 1 is case 1 or 7.
780 : */
781 0 : remove_next = 1 + (end > next->vm_end);
782 0 : VM_WARN_ON(remove_next == 2 &&
783 : end != next->vm_next->vm_end);
784 : /* trim end to next, for case 6 first pass */
785 : end = next->vm_end;
786 : }
787 :
788 0 : exporter = next;
789 0 : importer = vma;
790 :
791 : /*
792 : * If next doesn't have anon_vma, import from vma after
793 : * next, if the vma overlaps with it.
794 : */
795 0 : if (remove_next == 2 && !next->anon_vma)
796 0 : exporter = next->vm_next;
797 :
798 3523 : } else if (end > next->vm_start) {
799 : /*
800 : * vma expands, overlapping part of the next:
801 : * mprotect case 5 shifting the boundary up.
802 : */
803 21 : adjust_next = (end - next->vm_start);
804 21 : exporter = next;
805 21 : importer = vma;
806 21 : VM_WARN_ON(expand != importer);
807 3502 : } else if (end < vma->vm_end) {
808 : /*
809 : * vma shrinks, and !insert tells it's not
810 : * split_vma inserting another: so it must be
811 : * mprotect case 4 shifting the boundary down.
812 : */
813 3 : adjust_next = -(vma->vm_end - end);
814 3 : exporter = vma;
815 3 : importer = next;
816 3 : VM_WARN_ON(expand != importer);
817 : }
818 :
819 : /*
820 : * Easily overlooked: when mprotect shifts the boundary,
821 : * make sure the expanding vma has anon_vma set if the
822 : * shrinking vma had, to cover any anon pages imported.
823 : */
824 3523 : if (exporter && exporter->anon_vma && !importer->anon_vma) {
825 0 : int error;
826 :
827 0 : importer->anon_vma = exporter->anon_vma;
828 0 : error = anon_vma_clone(importer, exporter);
829 0 : if (error)
830 : return error;
831 : }
832 : }
833 30812 : again:
834 30812 : vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
835 :
836 30812 : if (file) {
837 27360 : mapping = file->f_mapping;
838 27360 : root = &mapping->i_mmap;
839 27360 : uprobe_munmap(vma, vma->vm_start, vma->vm_end);
840 :
841 27360 : if (adjust_next)
842 27360 : uprobe_munmap(next, next->vm_start, next->vm_end);
843 :
844 27360 : i_mmap_lock_write(mapping);
845 27360 : if (insert) {
846 : /*
847 : * Put into interval tree now, so instantiated pages
848 : * are visible to arm/parisc __flush_dcache_page
849 : * throughout; but we cannot insert into address
850 : * space until vma start or end is updated.
851 : */
852 25246 : __vma_link_file(insert);
853 : }
854 : }
855 :
856 30812 : anon_vma = vma->anon_vma;
857 30812 : if (!anon_vma && adjust_next)
858 3 : anon_vma = next->anon_vma;
859 30812 : if (anon_vma) {
860 18770 : VM_WARN_ON(adjust_next && next->anon_vma &&
861 : anon_vma != next->anon_vma);
862 9385 : anon_vma_lock_write(anon_vma);
863 9385 : anon_vma_interval_tree_pre_update_vma(vma);
864 9385 : if (adjust_next)
865 24 : anon_vma_interval_tree_pre_update_vma(next);
866 : }
867 :
868 30812 : if (file) {
869 27360 : flush_dcache_mmap_lock(mapping);
870 27360 : vma_interval_tree_remove(vma, root);
871 27360 : if (adjust_next)
872 0 : vma_interval_tree_remove(next, root);
873 : }
874 :
875 30812 : if (start != vma->vm_start) {
876 15445 : vma->vm_start = start;
877 15445 : start_changed = true;
878 : }
879 30812 : if (end != vma->vm_end) {
880 15367 : vma->vm_end = end;
881 15367 : end_changed = true;
882 : }
883 30812 : vma->vm_pgoff = pgoff;
884 30812 : if (adjust_next) {
885 24 : next->vm_start += adjust_next;
886 24 : next->vm_pgoff += adjust_next >> PAGE_SHIFT;
887 : }
888 :
889 30812 : if (file) {
890 27360 : if (adjust_next)
891 0 : vma_interval_tree_insert(next, root);
892 27360 : vma_interval_tree_insert(vma, root);
893 27360 : flush_dcache_mmap_unlock(mapping);
894 : }
895 :
896 30812 : if (remove_next) {
897 : /*
898 : * vma_merge has merged next into vma, and needs
899 : * us to remove next before dropping the locks.
900 : */
901 0 : if (remove_next != 3)
902 0 : __vma_unlink(mm, next, next);
903 : else
904 : /*
905 : * vma is not before next if they've been
906 : * swapped.
907 : *
908 : * pre-swap() next->vm_start was reduced so
909 : * tell validate_mm_rb to ignore pre-swap()
910 : * "next" (which is stored in post-swap()
911 : * "vma").
912 : */
913 0 : __vma_unlink(mm, next, vma);
914 0 : if (file)
915 0 : __remove_shared_vm_struct(next, file, mapping);
916 30812 : } else if (insert) {
917 : /*
918 : * split_vma has split insert from vma, and needs
919 : * us to insert it before dropping the locks
920 : * (it may either follow vma or precede it).
921 : */
922 25413 : __insert_vm_struct(mm, insert);
923 : } else {
924 5399 : if (start_changed)
925 4182 : vma_gap_update(vma);
926 5399 : if (end_changed) {
927 1217 : if (!next)
928 938 : mm->highest_vm_end = vm_end_gap(vma);
929 279 : else if (!adjust_next)
930 255 : vma_gap_update(next);
931 : }
932 : }
933 :
934 30812 : if (anon_vma) {
935 9385 : anon_vma_interval_tree_post_update_vma(vma);
936 9385 : if (adjust_next)
937 24 : anon_vma_interval_tree_post_update_vma(next);
938 9385 : anon_vma_unlock_write(anon_vma);
939 : }
940 :
941 30812 : if (file) {
942 27360 : i_mmap_unlock_write(mapping);
943 27360 : uprobe_mmap(vma);
944 :
945 27360 : if (adjust_next)
946 30812 : uprobe_mmap(next);
947 : }
948 :
949 30812 : if (remove_next) {
950 0 : if (file) {
951 0 : uprobe_munmap(next, next->vm_start, next->vm_end);
952 0 : fput(file);
953 : }
954 0 : if (next->anon_vma)
955 0 : anon_vma_merge(vma, next);
956 0 : mm->map_count--;
957 0 : mpol_put(vma_policy(next));
958 0 : vm_area_free(next);
959 : /*
960 : * In mprotect's case 6 (see comments on vma_merge),
961 : * we must remove another next too. It would clutter
962 : * up the code too much to do both in one go.
963 : */
964 0 : if (remove_next != 3) {
965 : /*
966 : * If "next" was removed and vma->vm_end was
967 : * expanded (up) over it, in turn
968 : * "next->vm_prev->vm_end" changed and the
969 : * "vma->vm_next" gap must be updated.
970 : */
971 0 : next = vma->vm_next;
972 : } else {
973 : /*
974 : * For the scope of the comment "next" and
975 : * "vma" considered pre-swap(): if "vma" was
976 : * removed, next->vm_start was expanded (down)
977 : * over it and the "next" gap must be updated.
978 : * Because of the swap() the post-swap() "vma"
979 : * actually points to pre-swap() "next"
980 : * (post-swap() "next" as opposed is now a
981 : * dangling pointer).
982 : */
983 : next = vma;
984 : }
985 0 : if (remove_next == 2) {
986 0 : remove_next = 1;
987 0 : end = next->vm_end;
988 0 : goto again;
989 : }
990 0 : else if (next)
991 0 : vma_gap_update(next);
992 : else {
993 : /*
994 : * If remove_next == 2 we obviously can't
995 : * reach this path.
996 : *
997 : * If remove_next == 3 we can't reach this
998 : * path because pre-swap() next is always not
999 : * NULL. pre-swap() "next" is not being
1000 : * removed and its next->vm_end is not altered
1001 : * (and furthermore "end" already matches
1002 : * next->vm_end in remove_next == 3).
1003 : *
1004 : * We reach this only in the remove_next == 1
1005 : * case if the "next" vma that was removed was
1006 : * the highest vma of the mm. However in such
1007 : * case next->vm_end == "end" and the extended
1008 : * "vma" has vma->vm_end == next->vm_end so
1009 : * mm->highest_vm_end doesn't need any update
1010 : * in remove_next == 1 case.
1011 : */
1012 0 : VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma));
1013 : }
1014 : }
1015 30812 : if (insert && file)
1016 30812 : uprobe_mmap(insert);
1017 :
1018 30812 : validate_mm(mm);
1019 :
1020 30812 : return 0;
1021 : }
1022 :
1023 : /*
1024 : * If the vma has a ->close operation then the driver probably needs to release
1025 : * per-vma resources, so we don't attempt to merge those.
1026 : */
1027 46882 : static inline int is_mergeable_vma(struct vm_area_struct *vma,
1028 : struct file *file, unsigned long vm_flags,
1029 : struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
1030 : {
1031 : /*
1032 : * VM_SOFTDIRTY should not prevent VMA merging: if we match
1033 : * everything but the dirty bit, the caller should mark the
1034 : * merged VMA as dirty. If the dirty bit were included in the
1035 : * comparison, we would increase pressure on the memory system,
1036 : * forcing the kernel to generate new VMAs where old ones could
1037 : * be extended instead.
1038 : */
1039 46882 : if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
1040 : return 0;
1041 8388 : if (vma->vm_file != file)
1042 : return 0;
1043 3538 : if (vma->vm_ops && vma->vm_ops->close)
1044 : return 0;
1045 3538 : if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
1046 : return 0;
1047 3538 : return 1;
1048 : }
1049 :
1050 3538 : static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
1051 : struct anon_vma *anon_vma2,
1052 : struct vm_area_struct *vma)
1053 : {
1054 : /*
1055 : * The list_is_singular() test is to avoid merging a VMA cloned from
1056 : * its parent. This improves scalability by reducing anon_vma lock contention.
1057 : */
1058 3538 : if ((!anon_vma1 || !anon_vma2) && (!vma ||
1059 3538 : list_is_singular(&vma->anon_vma_chain)))
1060 : return 1;
1061 2133 : return anon_vma1 == anon_vma2;
1062 : }
1063 :
1064 : /*
1065 : * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
1066 : * in front of (at a lower virtual address and file offset than) the vma.
1067 : *
1068 : * We cannot merge two vmas if they have differently assigned (non-NULL)
1069 : * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
1070 : *
1071 : * We don't check here for the merged mmap wrapping around the end of pagecache
1072 : * indices (16TB on ia32) because do_mmap() does not permit mmap's which
1073 : * wrap, nor mmaps which cover the final page at index -1UL.
1074 : */
1075 : static int
1076 21253 : can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
1077 : struct anon_vma *anon_vma, struct file *file,
1078 : pgoff_t vm_pgoff,
1079 : struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
1080 : {
1081 27439 : if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
1082 3248 : is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
1083 3247 : if (vma->vm_pgoff == vm_pgoff)
1084 3247 : return 1;
1085 : }
1086 : return 0;
1087 : }
1088 :
1089 : /*
1090 : * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
1091 : * beyond (at a higher virtual address and file offset than) the vma.
1092 : *
1093 : * We cannot merge two vmas if they have differently assigned (non-NULL)
1094 : * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
1095 : */
1096 : static int
1097 25629 : can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
1098 : struct anon_vma *anon_vma, struct file *file,
1099 : pgoff_t vm_pgoff,
1100 : struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
1101 : {
1102 27831 : if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
1103 290 : is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
1104 285 : pgoff_t vm_pglen;
1105 285 : vm_pglen = vma_pages(vma);
1106 285 : if (vma->vm_pgoff + vm_pglen == vm_pgoff)
1107 276 : return 1;
1108 : }
1109 : return 0;
1110 : }
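
A worked example of the pgoff continuity test above, with made-up numbers: a vma mapping file pages [10, 13) has vm_pgoff == 10 and vma_pages() == 3, so only a request with vm_pgoff == 13 can merge after it and keep the merged vma a single linear window onto the file. Just that test, as a sketch:

/* true iff a request starting at req_pgoff continues vma's file window */
static bool example_pgoff_continues(pgoff_t vma_pgoff,
                                    unsigned long vma_npages,
                                    pgoff_t req_pgoff)
{
        return vma_pgoff + vma_npages == req_pgoff;
}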
1111 :
1112 : /*
1113 : * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
1114 : * whether that can be merged with its predecessor or its successor.
1115 : * Or both (it neatly fills a hole).
1116 : *
1117 : * In most cases - when called for mmap, brk or mremap - [addr,end) is
1118 : * certain not to be mapped by the time vma_merge is called; but when
1119 : * called for mprotect, it is certain to be already mapped (either at
1120 : * an offset within prev, or at the start of next), and the flags of
1121 : * this area are about to be changed to vm_flags - and the no-change
1122 : * case has already been eliminated.
1123 : *
1124 : * The following mprotect cases have to be considered, where AAAA is
1125 : * the area passed down from mprotect_fixup, never extending beyond one
1126 : * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
1127 : *
1128 : * AAAA AAAA AAAA
1129 : * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN
1130 : * cannot merge might become might become
1131 : * PPNNNNNNNNNN PPPPPPPPPPNN
1132 : * mmap, brk or case 4 below case 5 below
1133 : * mremap move:
1134 : * AAAA AAAA
1135 : * PPPP NNNN PPPPNNNNXXXX
1136 : * might become might become
1137 : * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or
1138 : * PPPPPPPPNNNN 2 or PPPPPPPPXXXX 7 or
1139 : * PPPPNNNNNNNN 3 PPPPXXXXXXXX 8
1140 : *
1141 : * It is important for case 8 that the vma NNNN overlapping the
1142 : * region AAAA is never going to be extended over XXXX. Instead XXXX must
1143 : * be extended in region AAAA and NNNN must be removed. This way in
1144 : * all cases where vma_merge succeeds, the moment vma_adjust drops the
1145 : * rmap_locks, the properties of the merged vma will be already
1146 : * correct for the whole merged range. Some of those properties like
1147 : * vm_page_prot/vm_flags may be accessed by rmap_walks and they must
1148 : * be correct for the whole merged range immediately after the
1149 : * rmap_locks are released. Otherwise if XXXX would be removed and
1150 : * NNNN would be extended over the XXXX range, remove_migration_ptes
1151 : * or other rmap walkers (if working on addresses beyond the "end"
1152 : * parameter) may establish ptes with the wrong permissions of NNNN
1153 : * instead of the right permissions of XXXX.
1154 : */
1155 39229 : struct vm_area_struct *vma_merge(struct mm_struct *mm,
1156 : struct vm_area_struct *prev, unsigned long addr,
1157 : unsigned long end, unsigned long vm_flags,
1158 : struct anon_vma *anon_vma, struct file *file,
1159 : pgoff_t pgoff, struct mempolicy *policy,
1160 : struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
1161 : {
1162 39229 : pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
1163 39229 : struct vm_area_struct *area, *next;
1164 39229 : int err;
1165 :
1166 : /*
1167 : * We later require that vma->vm_flags == vm_flags,
1168 : * so this tests vma->vm_flags & VM_SPECIAL, too.
1169 : */
1170 39229 : if (vm_flags & VM_SPECIAL)
1171 : return NULL;
1172 :
1173 39229 : next = vma_next(mm, prev);
1174 39229 : area = next;
1175 39229 : if (area && area->vm_end == end) /* cases 6, 7, 8 */
1176 1 : next = next->vm_next;
1177 :
1178 : /* verify some invariant that must be enforced by the caller */
1179 78455 : VM_WARN_ON(prev && addr <= prev->vm_start);
1180 78457 : VM_WARN_ON(area && end > area->vm_end);
1181 39229 : VM_WARN_ON(addr >= end);
1182 :
1183 : /*
1184 : * Can it merge with the predecessor?
1185 : */
1186 39229 : if (prev && prev->vm_end == addr &&
1187 51255 : mpol_equal(vma_policy(prev), policy) &&
1188 25628 : can_vma_merge_after(prev, vm_flags,
1189 : anon_vma, file, pgoff,
1190 : vm_userfaultfd_ctx)) {
1191 : /*
1192 : * OK, it can. Can we now merge in the successor as well?
1193 : */
1194 276 : if (next && end == next->vm_start &&
1195 2 : mpol_equal(policy, vma_policy(next)) &&
1196 1 : can_vma_merge_before(next, vm_flags,
1197 : anon_vma, file,
1198 : pgoff+pglen,
1199 0 : vm_userfaultfd_ctx) &&
1200 0 : is_mergeable_anon_vma(prev->anon_vma,
1201 : next->anon_vma, NULL)) {
1202 : /* cases 1, 6 */
1203 0 : err = __vma_adjust(prev, prev->vm_start,
1204 : next->vm_end, prev->vm_pgoff, NULL,
1205 : prev);
1206 : } else /* cases 2, 5, 7 */
1207 276 : err = __vma_adjust(prev, prev->vm_start,
1208 : end, prev->vm_pgoff, NULL, prev);
1209 276 : if (err)
1210 : return NULL;
1211 276 : khugepaged_enter_vma_merge(prev, vm_flags);
1212 276 : return prev;
1213 : }
1214 :
1215 : /*
1216 : * Can this new request be merged in front of next?
1217 : */
1218 38952 : if (next && end == next->vm_start &&
1219 42507 : mpol_equal(policy, vma_policy(next)) &&
1220 21255 : can_vma_merge_before(next, vm_flags,
1221 : anon_vma, file, pgoff+pglen,
1222 : vm_userfaultfd_ctx)) {
1223 3247 : if (prev && addr < prev->vm_end) /* case 4 */
1224 3 : err = __vma_adjust(prev, prev->vm_start,
1225 : addr, prev->vm_pgoff, NULL, next);
1226 : else { /* cases 3, 8 */
1227 3244 : err = __vma_adjust(area, addr, next->vm_end,
1228 3244 : next->vm_pgoff - pglen, NULL, next);
1229 : /*
1230 : * In case 3 area is already equal to next and
1231 : * this is a noop, but in case 8 "area" has
1232 : * been removed and next was expanded over it.
1233 : */
1234 3244 : area = next;
1235 : }
1236 3247 : if (err)
1237 : return NULL;
1238 3247 : khugepaged_enter_vma_merge(area, vm_flags);
1239 3247 : return area;
1240 : }
1241 :
1242 : return NULL;
1243 : }
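
The merge cases are easy to observe from userspace: splitting an anonymous region with mprotect() creates extra vmas, and restoring the original protection lets vma_merge() coalesce them back into one line in /proc/self/maps. A minimal sketch, assuming a 4 KiB page size:

#include <sys/mman.h>

static void split_then_remerge(void)
{
        size_t pg = 4096;
        char *p = mmap(NULL, 3 * pg, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return;
        mprotect(p + pg, pg, PROT_READ);                /* now 3 vmas */
        mprotect(p + pg, pg, PROT_READ | PROT_WRITE);   /* merged back to 1 */
        munmap(p, 3 * pg);
}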
1244 :
1245 : /*
1246 : * Rough compatibility check to quickly see if it's even worth looking
1247 : * at sharing an anon_vma.
1248 : *
1249 : * They need to have the same vm_file, and the flags can only differ
1250 : * in things that mprotect may change.
1251 : *
1252 : * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
1253 : * we can merge the two vma's. For example, we refuse to merge a vma if
1254 : * there is a vm_ops->close() function, because that indicates that the
1255 : * driver is doing some kind of reference counting. But that doesn't
1256 : * really matter for the anon_vma sharing case.
1257 : */
1258 21812 : static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
1259 : {
1260 21812 : return a->vm_end == b->vm_start &&
1261 14693 : mpol_equal(vma_policy(a), vma_policy(b)) &&
1262 14693 : a->vm_file == b->vm_file &&
1263 29134 : !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) &&
1264 13 : b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
1265 : }
1266 :
1267 : /*
1268 : * Do some basic sanity checking to see if we can re-use the anon_vma
1269 : * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
1270 : * the same as 'old', the other will be the new one that is trying
1271 : * to share the anon_vma.
1272 : *
1273 : * NOTE! This runs with mm_sem held for reading, so it is possible that
1274 : * the anon_vma of 'old' is concurrently in the process of being set up
1275 : * by another page fault trying to merge _that_. But that's ok: if it
1276 : * is being set up, that automatically means that it will be a singleton
1277 : * acceptable for merging, so we can do all of this optimistically. But
1278 : * we do that READ_ONCE() to make sure that we never re-load the pointer.
1279 : *
1280 : * IOW: that the "list_is_singular()" test on the anon_vma_chain only
1281 : * matters for the 'stable anon_vma' case (ie the thing we want to avoid
1282 : * is to return an anon_vma that is "complex" due to having gone through
1283 : * a fork).
1284 : *
1285 : * We also make sure that the two vma's are compatible (adjacent,
1286 : * and with the same memory policies). That's all stable, even with just
1287 : * a read lock on the mm_sem.
1288 : */
1289 21812 : static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
1290 : {
1291 21812 : if (anon_vma_compatible(a, b)) {
1292 13 : struct anon_vma *anon_vma = READ_ONCE(old->anon_vma);
1293 :
1294 13 : if (anon_vma && list_is_singular(&old->anon_vma_chain))
1295 0 : return anon_vma;
1296 : }
1297 : return NULL;
1298 : }
1299 :
1300 : /*
1301 : * find_mergeable_anon_vma is used by anon_vma_prepare, to check
1302 : * neighbouring vmas for a suitable anon_vma, before it goes off
1303 : * to allocate a new anon_vma. It checks because a repetitive
1304 : * sequence of mprotects and faults may otherwise lead to distinct
1305 : * anon_vmas being allocated, preventing vma merge in subsequent
1306 : * mprotect.
1307 : */
1308 13414 : struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
1309 : {
1310 13414 : struct anon_vma *anon_vma = NULL;
1311 :
1312 : /* Try next first. */
1313 13414 : if (vma->vm_next) {
1314 10906 : anon_vma = reusable_anon_vma(vma->vm_next, vma, vma->vm_next);
1315 10906 : if (anon_vma)
1316 : return anon_vma;
1317 : }
1318 :
1319 : /* Then try prev. */
1320 13414 : if (vma->vm_prev)
1321 10906 : anon_vma = reusable_anon_vma(vma->vm_prev, vma->vm_prev, vma);
1322 :
1323 : /*
1324 : * We might reach here with anon_vma == NULL if we can't find
1325 : * any reusable anon_vma.
1326 : * There's no absolute need to look only at touching neighbours:
1327 : * we could search further afield for "compatible" anon_vmas.
1328 : * But it would probably just be a waste of time searching,
1329 : * or lead to too many vmas hanging off the same anon_vma.
1330 : * We're trying to allow mprotect remerging later on,
1331 : * not trying to minimize memory used for anon_vmas.
1332 : */
1333 : return anon_vma;
1334 : }
1335 :
1336 : /*
1337 : * If a hint addr is less than mmap_min_addr, change the hint to be as
1338 : * low as possible but still greater than mmap_min_addr
1339 : */
1340 8856 : static inline unsigned long round_hint_to_min(unsigned long hint)
1341 : {
1342 8856 : hint &= PAGE_MASK;
1343 8857 : if (((void *)hint != NULL) &&
1344 1 : (hint < mmap_min_addr))
1345 0 : return PAGE_ALIGN(mmap_min_addr);
1346 : return hint;
1347 : }
1348 :
1349 31266 : static inline int mlock_future_check(struct mm_struct *mm,
1350 : unsigned long flags,
1351 : unsigned long len)
1352 : {
1353 31266 : unsigned long locked, lock_limit;
1354 :
1355 : /* mlock MCL_FUTURE? */
1356 31266 : if (flags & VM_LOCKED) {
1357 0 : locked = len >> PAGE_SHIFT;
1358 0 : locked += mm->locked_vm;
1359 0 : lock_limit = rlimit(RLIMIT_MEMLOCK);
1360 0 : lock_limit >>= PAGE_SHIFT;
1361 0 : if (locked > lock_limit && !capable(CAP_IPC_LOCK))
1362 0 : return -EAGAIN;
1363 : }
1364 : return 0;
1365 : }
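
Worked numbers for the check above: with RLIMIT_MEMLOCK at 64 KiB (16 pages) and 10 pages already locked, a request for 8 more pages fails for an unprivileged caller, since 18 > 16; CAP_IPC_LOCK bypasses the limit. The arithmetic as a sketch:

static int example_mlock_check(unsigned long locked_pages,
                               unsigned long request_pages,
                               unsigned long limit_pages,
                               bool has_cap_ipc_lock)
{
        if (locked_pages + request_pages > limit_pages && !has_cap_ipc_lock)
                return -EAGAIN;         /* over the RLIMIT_MEMLOCK budget */
        return 0;
}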
1366 :
1367 24748 : static inline u64 file_mmap_size_max(struct file *file, struct inode *inode)
1368 : {
1369 24748 : if (S_ISREG(inode->i_mode))
1370 : return MAX_LFS_FILESIZE;
1371 :
1372 0 : if (S_ISBLK(inode->i_mode))
1373 : return MAX_LFS_FILESIZE;
1374 :
1375 0 : if (S_ISSOCK(inode->i_mode))
1376 : return MAX_LFS_FILESIZE;
1377 :
1378 : /* Special "we do even unsigned file positions" case */
1379 0 : if (file->f_mode & FMODE_UNSIGNED_OFFSET)
1380 0 : return 0;
1381 :
1382 : /* Yes, random drivers might want more. But I'm tired of buggy drivers */
1383 : return ULONG_MAX;
1384 : }
1385 :
1386 24748 : static inline bool file_mmap_ok(struct file *file, struct inode *inode,
1387 : unsigned long pgoff, unsigned long len)
1388 : {
1389 24748 : u64 maxsize = file_mmap_size_max(file, inode);
1390 :
1391 24748 : if (maxsize && len > maxsize)
1392 : return false;
1393 24748 : maxsize -= len;
1394 24748 : if (pgoff > maxsize >> PAGE_SHIFT)
1395 0 : return false;
1396 : return true;
1397 : }
1398 :
1399 : /*
1400 : * The caller must write-lock current->mm->mmap_lock.
1401 : */
1402 28746 : unsigned long do_mmap(struct file *file, unsigned long addr,
1403 : unsigned long len, unsigned long prot,
1404 : unsigned long flags, unsigned long pgoff,
1405 : unsigned long *populate, struct list_head *uf)
1406 : {
1407 28746 : struct mm_struct *mm = current->mm;
1408 28746 : vm_flags_t vm_flags;
1409 28746 : int pkey = 0;
1410 :
1411 28746 : *populate = 0;
1412 :
1413 28746 : if (!len)
1414 : return -EINVAL;
1415 :
1416 : /*
1417 : * Does the application expect PROT_READ to imply PROT_EXEC?
1418 : *
1419 : * (the exception is when the underlying filesystem is noexec
1420 : * mounted, in which case we don't add PROT_EXEC.)
1421 : */
1422 28746 : if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
1423 0 : if (!(file && path_noexec(&file->f_path)))
1424 0 : prot |= PROT_EXEC;
1425 :
1426 : /* force arch specific MAP_FIXED handling in get_unmapped_area */
1427 28746 : if (flags & MAP_FIXED_NOREPLACE)
1428 2808 : flags |= MAP_FIXED;
1429 :
1430 28746 : if (!(flags & MAP_FIXED))
1431 8856 : addr = round_hint_to_min(addr);
1432 :
1433 : /* Careful about overflows.. */
1434 28746 : len = PAGE_ALIGN(len);
1435 28746 : if (!len)
1436 : return -ENOMEM;
1437 :
1438 : /* offset overflow? */
1439 28746 : if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
1440 : return -EOVERFLOW;
1441 :
1442 : /* Too many mappings? */
1443 28746 : if (mm->map_count > sysctl_max_map_count)
1444 : return -ENOMEM;
1445 :
1446 : /* Obtain the address to map to. We verify (or select) it and ensure
1447 : * that it represents a valid section of the address space.
1448 : */
1449 28746 : addr = get_unmapped_area(file, addr, len, pgoff, flags);
1450 28746 : if (IS_ERR_VALUE(addr))
1451 : return addr;
1452 :
1453 28746 : if (flags & MAP_FIXED_NOREPLACE) {
1454 2808 : struct vm_area_struct *vma = find_vma(mm, addr);
1455 :
1456 2808 : if (vma && vma->vm_start < addr + len)
1457 : return -EEXIST;
1458 : }
1459 :
1460 28746 : if (prot == PROT_EXEC) {
1461 : pkey = execute_only_pkey(mm);
1462 : if (pkey < 0)
1463 : pkey = 0;
1464 : }
1465 :
1466 : /* Do simple checking here so the lower-level routines won't have
1467 : * to. We assume access permissions have been handled by the open
1468 : * of the memory object, so we don't do any here.
1469 : */
1470 28746 : vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
1471 28746 : mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
1472 :
1473 28746 : if (flags & MAP_LOCKED)
1474 0 : if (!can_do_mlock())
1475 : return -EPERM;
1476 :
1477 28746 : if (mlock_future_check(mm, vm_flags, len))
1478 : return -EAGAIN;
1479 :
1480 28746 : if (file) {
1481 24748 : struct inode *inode = file_inode(file);
1482 24748 : unsigned long flags_mask;
1483 :
1484 24748 : if (!file_mmap_ok(file, inode, pgoff, len))
1485 : return -EOVERFLOW;
1486 :
1487 24748 : flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags;
1488 :
1489 24748 : switch (flags & MAP_TYPE) {
1490 219 : case MAP_SHARED:
1491 : /*
1492 : * Force use of MAP_SHARED_VALIDATE with non-legacy
1493 : * flags. E.g. MAP_SYNC is dangerous to use with
1494 : * MAP_SHARED as you don't know which consistency model
1495 : * you will get. We silently ignore unsupported flags
1496 : * with MAP_SHARED to preserve backward compatibility.
1497 : */
1498 219 : flags &= LEGACY_MAP_MASK;
1499 219 : fallthrough;
1500 219 : case MAP_SHARED_VALIDATE:
1501 219 : if (flags & ~flags_mask)
1502 : return -EOPNOTSUPP;
1503 219 : if (prot & PROT_WRITE) {
1504 139 : if (!(file->f_mode & FMODE_WRITE))
1505 : return -EACCES;
1506 139 : if (IS_SWAPFILE(file->f_mapping->host))
1507 : return -ETXTBSY;
1508 : }
1509 :
1510 : /*
1511 : * Make sure we don't allow writing to an append-only
1512 : * file..
1513 : */
1514 219 : if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
1515 : return -EACCES;
1516 :
1517 : /*
1518 : * Make sure there are no mandatory locks on the file.
1519 : */
1520 219 : if (locks_verify_locked(file))
1521 : return -EAGAIN;
1522 :
1523 219 : vm_flags |= VM_SHARED | VM_MAYSHARE;
1524 219 : if (!(file->f_mode & FMODE_WRITE))
1525 80 : vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
1526 24748 : fallthrough;
1527 : case MAP_PRIVATE:
1528 24748 : if (!(file->f_mode & FMODE_READ))
1529 : return -EACCES;
1530 24748 : if (path_noexec(&file->f_path)) {
1531 0 : if (vm_flags & VM_EXEC)
1532 : return -EPERM;
1533 0 : vm_flags &= ~VM_MAYEXEC;
1534 : }
1535 :
1536 24748 : if (!file->f_op->mmap)
1537 : return -ENODEV;
1538 24748 : if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1539 : return -EINVAL;
1540 : break;
1541 :
1542 : default:
1543 : return -EINVAL;
1544 : }
1545 : } else {
1546 3998 : switch (flags & MAP_TYPE) {
1547 3 : case MAP_SHARED:
1548 3 : if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1549 : return -EINVAL;
1550 : /*
1551 : * Ignore pgoff.
1552 : */
1553 3 : pgoff = 0;
1554 3 : vm_flags |= VM_SHARED | VM_MAYSHARE;
1555 3 : break;
1556 3995 : case MAP_PRIVATE:
1557 : /*
1558 : * Set pgoff according to addr for anon_vma.
1559 : */
1560 3995 : pgoff = addr >> PAGE_SHIFT;
1561 3995 : break;
1562 : default:
1563 : return -EINVAL;
1564 : }
1565 : }
1566 :
1567 : /*
1568 : * Set 'VM_NORESERVE' if we should not account for the
1569 : * memory use of this mapping.
1570 : */
1571 28746 : if (flags & MAP_NORESERVE) {
1572 : /* We honor MAP_NORESERVE if allowed to overcommit */
1573 6 : if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
1574 6 : vm_flags |= VM_NORESERVE;
1575 :
1576 : /* hugetlb applies strict overcommit unless MAP_NORESERVE */
1577 : if (file && is_file_hugepages(file))
1578 : vm_flags |= VM_NORESERVE;
1579 : }
1580 :
1581 28746 : addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
1582 28745 : if (!IS_ERR_VALUE(addr) &&
1583 28745 : ((vm_flags & VM_LOCKED) ||
1584 28745 : (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
1585 10 : *populate = len;
1586 : return addr;
1587 : }
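
The MAP_FIXED_NOREPLACE handling above is visible from userspace: unlike MAP_FIXED, it refuses with EEXIST instead of silently unmapping whatever already lives at the requested address. A sketch, assuming a kernel and libc that expose MAP_FIXED_NOREPLACE (Linux 4.17+):

#include <errno.h>
#include <sys/mman.h>

static void *map_at_exactly(void *want, size_t len)
{
        void *p = mmap(want, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE,
                       -1, 0);

        if (p == MAP_FAILED && errno == EEXIST)
                return NULL;            /* the range was already mapped */
        return p;
}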
1588 :
1589 21251 : unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
1590 : unsigned long prot, unsigned long flags,
1591 : unsigned long fd, unsigned long pgoff)
1592 : {
1593 21251 : struct file *file = NULL;
1594 21251 : unsigned long retval;
1595 :
1596 21251 : if (!(flags & MAP_ANONYMOUS)) {
1597 17253 : audit_mmap_fd(fd, flags);
1598 17253 : file = fget(fd);
1599 17254 : if (!file)
1600 : return -EBADF;
1601 17254 : if (is_file_hugepages(file)) {
1602 : len = ALIGN(len, huge_page_size(hstate_file(file)));
1603 17254 : } else if (unlikely(flags & MAP_HUGETLB)) {
1604 0 : retval = -EINVAL;
1605 0 : goto out_fput;
1606 : }
1607 3998 : } else if (flags & MAP_HUGETLB) {
1608 21252 : struct user_struct *user = NULL;
1609 : struct hstate *hs;
1610 :
1611 21252 : hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
1612 : if (!hs)
1613 21252 : return -EINVAL;
1614 :
1615 : len = ALIGN(len, huge_page_size(hs));
1616 : /*
1617 : * VM_NORESERVE is used because the reservations will be
1618 : * taken when vm_ops->mmap() is called
1619 : * A dummy user value is used because we are not locking
1620 : * memory so no accounting is necessary
1621 : */
1622 : file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
1623 : VM_NORESERVE,
1624 : &user, HUGETLB_ANONHUGE_INODE,
1625 : (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
1626 : if (IS_ERR(file))
1627 : return PTR_ERR(file);
1628 : }
1629 :
1630 21252 : flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1631 :
1632 21252 : retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1633 21252 : out_fput:
1634 21252 : if (file)
1635 17254 : fput(file);
1636 : return retval;
1637 : }
1638 :
1639 0 : SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1640 : unsigned long, prot, unsigned long, flags,
1641 : unsigned long, fd, unsigned long, pgoff)
1642 : {
1643 0 : return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
1644 : }
1645 :
1646 : #ifdef __ARCH_WANT_SYS_OLD_MMAP
1647 : struct mmap_arg_struct {
1648 : unsigned long addr;
1649 : unsigned long len;
1650 : unsigned long prot;
1651 : unsigned long flags;
1652 : unsigned long fd;
1653 : unsigned long offset;
1654 : };
1655 :
1656 : SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1657 : {
1658 : struct mmap_arg_struct a;
1659 :
1660 : if (copy_from_user(&a, arg, sizeof(a)))
1661 : return -EFAULT;
1662 : if (offset_in_page(a.offset))
1663 : return -EINVAL;
1664 :
1665 : return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
1666 : a.offset >> PAGE_SHIFT);
1667 : }
1668 : #endif /* __ARCH_WANT_SYS_OLD_MMAP */
1669 :
1670 : /*
1671 : * Some shared mappings will want the pages marked read-only
1672 : * to track write events. If so, we'll downgrade vm_page_prot
1673 : * to the private version (using protection_map[] without the
1674 : * VM_SHARED bit).
1675 : */
1676 44667 : int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
1677 : {
1678 44667 : vm_flags_t vm_flags = vma->vm_flags;
1679 44667 : const struct vm_operations_struct *vm_ops = vma->vm_ops;
1680 :
1681 : /* If it was private or non-writable, the write bit is already clear */
1682 44667 : if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
1683 : return 0;
1684 :
1685 : /* The backer wishes to know when pages are first written to? */
1686 142 : if (vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite))
1687 : return 1;
1688 :
1689 : /* The open routine did something to the protections that pgprot_modify
1690 : * won't preserve? */
1691 6 : if (pgprot_val(vm_page_prot) !=
1692 6 : pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags)))
1693 : return 0;
1694 :
1695 : /* Do we need to track softdirty? */
1696 6 : if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY))
1697 : return 1;
1698 :
1699 : /* Specialty mapping? */
1700 6 : if (vm_flags & VM_PFNMAP)
1701 : return 0;
1702 :
1703 : /* Can the mapping track the dirty pages? */
1704 6 : return vma->vm_file && vma->vm_file->f_mapping &&
1705 6 : mapping_can_writeback(vma->vm_file->f_mapping);
1706 : }
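     : /*
     :  * Illustrative sketch (ex_page_mkwrite is a hypothetical helper, not
     :  * a kernel API): a filesystem that wants write-notification supplies
     :  * page_mkwrite in its vm_operations_struct:
     :  *
     :  *	static const struct vm_operations_struct ex_vm_ops = {
     :  *		.fault		= filemap_fault,
     :  *		.page_mkwrite	= ex_page_mkwrite,
     :  *	};
     :  *
     :  * On a shared writable vma with such ops, the check above returns 1,
     :  * so vma_set_page_prot() write-protects the PTEs and the first store
     :  * to each page faults into page_mkwrite.
     :  */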
1707 :
1708 : /*
1709 : * We account for memory if it's a private writeable mapping,
1710 : * not hugepages and VM_NORESERVE wasn't set.
1711 : */
1712 28745 : static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
1713 : {
1714 : /*
1715 : * hugetlb has its own accounting separate from the core VM
1716 : * VM_HUGETLB may not be set yet so we cannot check for that flag.
1717 : */
1718 28745 : if (file && is_file_hugepages(file))
1719 : return 0;
1720 :
1721 28745 : return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
1722 : }
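     : /*
     :  * Illustrative examples (not from this file): a MAP_PRIVATE,
     :  * PROT_READ|PROT_WRITE anonymous mapping is accounted (only VM_WRITE
     :  * of the three flags is set); a MAP_SHARED mapping is not, since its
     :  * backing object carries the pages; nor is one with VM_NORESERVE set.
     :  */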
1723 :
1724 28746 : unsigned long mmap_region(struct file *file, unsigned long addr,
1725 : unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
1726 : struct list_head *uf)
1727 : {
1728 28746 : struct mm_struct *mm = current->mm;
1729 28746 : struct vm_area_struct *vma, *prev, *merge;
1730 28746 : int error;
1731 28746 : struct rb_node **rb_link, *rb_parent;
1732 28746 : unsigned long charged = 0;
1733 :
1734 : /* Check against address space limit. */
1735 28746 : if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
1736 0 : unsigned long nr_pages;
1737 :
1738 : /*
1739 : 		 * MAP_FIXED may remove pages of mappings that intersect with
1740 : * requested mapping. Account for the pages it would unmap.
1741 : */
1742 0 : nr_pages = count_vma_pages_range(mm, addr, addr + len);
1743 :
1744 0 : if (!may_expand_vm(mm, vm_flags,
1745 : (len >> PAGE_SHIFT) - nr_pages))
1746 : return -ENOMEM;
1747 : }
1748 :
1749 : /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
1750 28746 : if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
1751 : return -ENOMEM;
1752 : /*
1753 : * Private writable mapping: check memory availability
1754 : */
1755 28745 : if (accountable_mapping(file, vm_flags)) {
1756 9943 : charged = len >> PAGE_SHIFT;
1757 9943 : if (security_vm_enough_memory_mm(mm, charged))
1758 : return -ENOMEM;
1759 9942 : vm_flags |= VM_ACCOUNT;
1760 : }
1761 :
1762 : /*
1763 : * Can we just expand an old mapping?
1764 : */
1765 28744 : vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
1766 : NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
1767 28743 : if (vma)
1768 3245 : goto out;
1769 :
1770 : /*
1771 : * Determine the object being mapped and call the appropriate
1772 : * specific mapper. the address has already been validated, but
1773 : 	 * specific mapper. The address has already been validated but
1774 : 	 * not unmapped; any overlapping maps, however, have been removed from the list.
1775 25498 : vma = vm_area_alloc(mm);
1776 25501 : if (!vma) {
1777 0 : error = -ENOMEM;
1778 0 : goto unacct_error;
1779 : }
1780 :
1781 25501 : vma->vm_start = addr;
1782 25501 : vma->vm_end = addr + len;
1783 25501 : vma->vm_flags = vm_flags;
1784 25501 : vma->vm_page_prot = vm_get_page_prot(vm_flags);
1785 25501 : vma->vm_pgoff = pgoff;
1786 :
1787 25501 : if (file) {
1788 22634 : if (vm_flags & VM_DENYWRITE) {
1789 7494 : error = deny_write_access(file);
1790 7494 : if (error)
1791 0 : goto free_vma;
1792 : }
1793 22634 : if (vm_flags & VM_SHARED) {
1794 139 : error = mapping_map_writable(file->f_mapping);
1795 139 : if (error)
1796 0 : goto allow_write_and_free_vma;
1797 : }
1798 :
1799 : /* ->mmap() can change vma->vm_file, but must guarantee that
1800 : * vma_link() below can deny write-access if VM_DENYWRITE is set
1801 : * and map writably if VM_SHARED is set. This usually means the
1802 : * new file must not have been exposed to user-space, yet.
1803 : */
1804 22634 : vma->vm_file = get_file(file);
1805 22634 : error = call_mmap(file, vma);
1806 22634 : if (error)
1807 0 : goto unmap_and_free_vma;
1808 :
1809 : /* Can addr have changed??
1810 : *
1811 : * Answer: Yes, several device drivers can do it in their
1812 : * f_op->mmap method. -DaveM
1813 : * Bug: If addr is changed, prev, rb_link, rb_parent should
1814 : * be updated for vma_link()
1815 : */
1816 22634 : WARN_ON_ONCE(addr != vma->vm_start);
1817 :
1818 22634 : addr = vma->vm_start;
1819 :
1820 : 		/* If vm_flags changed after call_mmap(), we should try to merge the vma again
1821 : * as we may succeed this time.
1822 : */
1823 22634 : if (unlikely(vm_flags != vma->vm_flags && prev)) {
1824 0 : merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
1825 : NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX);
1826 0 : if (merge) {
1827 : /* ->mmap() can change vma->vm_file and fput the original file. So
1828 : 				 * fput vma->vm_file here, or we would add an extra fput for the
1829 : 				 * file and ultimately cause a general protection fault.
1830 : */
1831 0 : fput(vma->vm_file);
1832 0 : vm_area_free(vma);
1833 0 : vma = merge;
1834 : /* Update vm_flags to pick up the change. */
1835 0 : vm_flags = vma->vm_flags;
1836 0 : goto unmap_writable;
1837 : }
1838 : }
1839 :
1840 22634 : vm_flags = vma->vm_flags;
1841 2867 : } else if (vm_flags & VM_SHARED) {
1842 3 : error = shmem_zero_setup(vma);
1843 3 : if (error)
1844 0 : goto free_vma;
1845 : } else {
1846 2864 : vma_set_anonymous(vma);
1847 : }
1848 :
1849 : /* Allow architectures to sanity-check the vm_flags */
1850 25501 : if (!arch_validate_flags(vma->vm_flags)) {
1851 : error = -EINVAL;
1852 : if (file)
1853 : goto unmap_and_free_vma;
1854 : else
1855 : goto free_vma;
1856 : }
1857 :
1858 25501 : vma_link(mm, vma, prev, rb_link, rb_parent);
1859 : /* Once vma denies write, undo our temporary denial count */
1860 25501 : if (file) {
1861 22634 : unmap_writable:
1862 22634 : if (vm_flags & VM_SHARED)
1863 139 : mapping_unmap_writable(file->f_mapping);
1864 22634 : if (vm_flags & VM_DENYWRITE)
1865 7494 : allow_write_access(file);
1866 : }
1867 25501 : file = vma->vm_file;
1868 28746 : out:
1869 28746 : perf_event_mmap(vma);
1870 :
1871 28745 : vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
1872 28745 : if (vm_flags & VM_LOCKED) {
1873 0 : if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
1874 0 : is_vm_hugetlb_page(vma) ||
1875 0 : vma == get_gate_vma(current->mm))
1876 0 : vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
1877 : else
1878 0 : mm->locked_vm += (len >> PAGE_SHIFT);
1879 : }
1880 :
1881 28745 : if (file)
1882 28745 : uprobe_mmap(vma);
1883 :
1884 : /*
1885 : 	 * A new (or expanded) vma always gets soft-dirty status.
1886 : 	 * Otherwise the user-space soft-dirty page tracker won't
1887 : 	 * be able to distinguish the case where a vma area is unmapped
1888 : 	 * and then a new one is mapped in place (which must be treated as
1889 : 	 * a completely new data area).
1890 : */
1891 28745 : vma->vm_flags |= VM_SOFTDIRTY;
1892 :
1893 28745 : vma_set_page_prot(vma);
1894 :
1895 28745 : return addr;
1896 :
1897 0 : unmap_and_free_vma:
1898 0 : fput(vma->vm_file);
1899 0 : vma->vm_file = NULL;
1900 :
1901 : /* Undo any partial mapping done by a device driver. */
1902 0 : unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
1903 0 : charged = 0;
1904 0 : if (vm_flags & VM_SHARED)
1905 0 : mapping_unmap_writable(file->f_mapping);
1906 0 : allow_write_and_free_vma:
1907 0 : if (vm_flags & VM_DENYWRITE)
1908 0 : allow_write_access(file);
1909 0 : free_vma:
1910 0 : vm_area_free(vma);
1911 0 : unacct_error:
1912 0 : if (charged)
1913 0 : vm_unacct_memory(charged);
1914 0 : return error;
1915 : }
1916 :
1917 0 : static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
1918 : {
1919 : /*
1920 : * We implement the search by looking for an rbtree node that
1921 : * immediately follows a suitable gap. That is,
1922 : * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
1923 : * - gap_end = vma->vm_start >= info->low_limit + length;
1924 : * - gap_end - gap_start >= length
1925 : */
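     : /*
     :  * Worked example (illustrative): with VMAs [0x1000,0x3000) and
     :  * [0x8000,0x9000), length 0x2000, low_limit 0x4000 and high_limit
     :  * 0x10000, the gap [0x3000,0x8000) satisfies all three conditions;
     :  * it is clipped to low_limit below and 0x4000 is returned.
     :  */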
1926 :
1927 0 : struct mm_struct *mm = current->mm;
1928 0 : struct vm_area_struct *vma;
1929 0 : unsigned long length, low_limit, high_limit, gap_start, gap_end;
1930 :
1931 : /* Adjust search length to account for worst case alignment overhead */
1932 0 : length = info->length + info->align_mask;
1933 0 : if (length < info->length)
1934 : return -ENOMEM;
1935 :
1936 : /* Adjust search limits by the desired length */
1937 0 : if (info->high_limit < length)
1938 : return -ENOMEM;
1939 0 : high_limit = info->high_limit - length;
1940 :
1941 0 : if (info->low_limit > high_limit)
1942 : return -ENOMEM;
1943 0 : low_limit = info->low_limit + length;
1944 :
1945 : /* Check if rbtree root looks promising */
1946 0 : if (RB_EMPTY_ROOT(&mm->mm_rb))
1947 0 : goto check_highest;
1948 0 : vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1949 0 : if (vma->rb_subtree_gap < length)
1950 0 : goto check_highest;
1951 :
1952 0 : while (true) {
1953 : /* Visit left subtree if it looks promising */
1954 0 : gap_end = vm_start_gap(vma);
1955 0 : if (gap_end >= low_limit && vma->vm_rb.rb_left) {
1956 0 : struct vm_area_struct *left =
1957 0 : rb_entry(vma->vm_rb.rb_left,
1958 : struct vm_area_struct, vm_rb);
1959 0 : if (left->rb_subtree_gap >= length) {
1960 0 : vma = left;
1961 0 : continue;
1962 : }
1963 : }
1964 :
1965 0 : gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
1966 0 : check_current:
1967 : /* Check if current node has a suitable gap */
1968 0 : if (gap_start > high_limit)
1969 : return -ENOMEM;
1970 0 : if (gap_end >= low_limit &&
1971 0 : gap_end > gap_start && gap_end - gap_start >= length)
1972 0 : goto found;
1973 :
1974 : /* Visit right subtree if it looks promising */
1975 0 : if (vma->vm_rb.rb_right) {
1976 0 : struct vm_area_struct *right =
1977 0 : rb_entry(vma->vm_rb.rb_right,
1978 : struct vm_area_struct, vm_rb);
1979 0 : if (right->rb_subtree_gap >= length) {
1980 0 : vma = right;
1981 0 : continue;
1982 : }
1983 : }
1984 :
1985 : /* Go back up the rbtree to find next candidate node */
1986 0 : while (true) {
1987 0 : struct rb_node *prev = &vma->vm_rb;
1988 0 : if (!rb_parent(prev))
1989 0 : goto check_highest;
1990 0 : vma = rb_entry(rb_parent(prev),
1991 : struct vm_area_struct, vm_rb);
1992 0 : if (prev == vma->vm_rb.rb_left) {
1993 0 : gap_start = vm_end_gap(vma->vm_prev);
1994 0 : gap_end = vm_start_gap(vma);
1995 0 : goto check_current;
1996 : }
1997 : }
1998 : }
1999 :
2000 0 : check_highest:
2001 : /* Check highest gap, which does not precede any rbtree node */
2002 0 : gap_start = mm->highest_vm_end;
2003 0 : gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
2004 0 : if (gap_start > high_limit)
2005 : return -ENOMEM;
2006 :
2007 0 : found:
2008 : /* We found a suitable gap. Clip it with the original low_limit. */
2009 0 : if (gap_start < info->low_limit)
2010 : gap_start = info->low_limit;
2011 :
2012 : /* Adjust gap address to the desired alignment */
2013 0 : gap_start += (info->align_offset - gap_start) & info->align_mask;
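     : 	/*
     : 	 * Illustrative note: with align_mask == 0xffff and align_offset == 0,
     : 	 * this rounds gap_start up to the next 64KB boundary; the align_mask
     : 	 * added to the search length above guarantees the gap still fits.
     : 	 */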
2014 :
2015 0 : VM_BUG_ON(gap_start + info->length > info->high_limit);
2016 0 : VM_BUG_ON(gap_start + info->length > gap_end);
2017 : return gap_start;
2018 : }
2019 :
2020 8854 : static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
2021 : {
2022 8854 : struct mm_struct *mm = current->mm;
2023 8854 : struct vm_area_struct *vma;
2024 8854 : unsigned long length, low_limit, high_limit, gap_start, gap_end;
2025 :
2026 : /* Adjust search length to account for worst case alignment overhead */
2027 8854 : length = info->length + info->align_mask;
2028 8854 : if (length < info->length)
2029 : return -ENOMEM;
2030 :
2031 : /*
2032 : * Adjust search limits by the desired length.
2033 : * See implementation comment at top of unmapped_area().
2034 : */
2035 8854 : gap_end = info->high_limit;
2036 8854 : if (gap_end < length)
2037 : return -ENOMEM;
2038 8854 : high_limit = gap_end - length;
2039 :
2040 8854 : if (info->low_limit > high_limit)
2041 : return -ENOMEM;
2042 8854 : low_limit = info->low_limit + length;
2043 :
2044 : /* Check highest gap, which does not precede any rbtree node */
2045 8854 : gap_start = mm->highest_vm_end;
2046 8854 : if (gap_start <= high_limit)
2047 0 : goto found_highest;
2048 :
2049 : /* Check if rbtree root looks promising */
2050 8854 : if (RB_EMPTY_ROOT(&mm->mm_rb))
2051 : return -ENOMEM;
2052 8854 : vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
2053 8854 : if (vma->rb_subtree_gap < length)
2054 : return -ENOMEM;
2055 :
2056 60867 : while (true) {
2057 : /* Visit right subtree if it looks promising */
2058 60867 : gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
2059 60867 : if (gap_start <= high_limit && vma->vm_rb.rb_right) {
2060 41445 : struct vm_area_struct *right =
2061 41445 : rb_entry(vma->vm_rb.rb_right,
2062 : struct vm_area_struct, vm_rb);
2063 41445 : if (right->rb_subtree_gap >= length) {
2064 20720 : vma = right;
2065 20720 : continue;
2066 : }
2067 : }
2068 :
2069 40147 : check_current:
2070 : /* Check if current node has a suitable gap */
2071 49165 : gap_end = vm_start_gap(vma);
2072 49165 : if (gap_end < low_limit)
2073 : return -ENOMEM;
2074 49165 : if (gap_start <= high_limit &&
2075 49165 : gap_end > gap_start && gap_end - gap_start >= length)
2076 8855 : goto found;
2077 :
2078 : /* Visit left subtree if it looks promising */
2079 40310 : if (vma->vm_rb.rb_left) {
2080 40263 : struct vm_area_struct *left =
2081 40263 : rb_entry(vma->vm_rb.rb_left,
2082 : struct vm_area_struct, vm_rb);
2083 40263 : if (left->rb_subtree_gap >= length) {
2084 31293 : vma = left;
2085 31293 : continue;
2086 : }
2087 : }
2088 :
2089 : /* Go back up the rbtree to find next candidate node */
2090 9064 : while (true) {
2091 9064 : struct rb_node *prev = &vma->vm_rb;
2092 9064 : if (!rb_parent(prev))
2093 : return -ENOMEM;
2094 9065 : vma = rb_entry(rb_parent(prev),
2095 : struct vm_area_struct, vm_rb);
2096 9065 : if (prev == vma->vm_rb.rb_right) {
2097 18036 : gap_start = vma->vm_prev ?
2098 9018 : vm_end_gap(vma->vm_prev) : 0;
2099 9018 : goto check_current;
2100 : }
2101 : }
2102 : }
2103 :
2104 8855 : found:
2105 : /* We found a suitable gap. Clip it with the original high_limit. */
2106 8855 : if (gap_end > info->high_limit)
2107 : gap_end = info->high_limit;
2108 :
2109 8855 : found_highest:
2110 : /* Compute highest gap address at the desired alignment */
2111 8855 : gap_end -= info->length;
2112 8855 : gap_end -= (gap_end - info->align_offset) & info->align_mask;
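     : 	/*
     : 	 * Illustrative note: the mirror of the bottom-up case - after
     : 	 * reserving info->length below the gap's end, this rounds the
     : 	 * candidate address down to the previous align_offset-congruent
     : 	 * boundary, so (addr & align_mask) == (align_offset & align_mask).
     : 	 */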
2113 :
2114 8855 : VM_BUG_ON(gap_end < info->low_limit);
2115 8855 : VM_BUG_ON(gap_end < gap_start);
2116 : return gap_end;
2117 : }
2118 :
2119 : /*
2120 : * Search for an unmapped address range.
2121 : *
2122 : * We are looking for a range that:
2123 : * - does not intersect with any VMA;
2124 : * - is contained within the [low_limit, high_limit) interval;
2125 :  * - is at least the desired size;
2126 :  * - satisfies (begin_addr & align_mask) == (align_offset & align_mask).
2127 : */
2128 8855 : unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info)
2129 : {
2130 8855 : unsigned long addr;
2131 :
2132 8855 : if (info->flags & VM_UNMAPPED_AREA_TOPDOWN)
2133 8855 : addr = unmapped_area_topdown(info);
2134 : else
2135 0 : addr = unmapped_area(info);
2136 :
2137 8855 : trace_vm_unmapped_area(addr, info);
2138 8855 : return addr;
2139 : }
2140 :
2141 : #ifndef arch_get_mmap_end
2142 : #define arch_get_mmap_end(addr) (TASK_SIZE)
2143 : #endif
2144 :
2145 : #ifndef arch_get_mmap_base
2146 : #define arch_get_mmap_base(addr, base) (base)
2147 : #endif
2148 :
2149 : /* Get an address range which is currently unmapped.
2150 : * For shmat() with addr=0.
2151 : *
2152 : * Ugly calling convention alert:
2153 :  * A return value with the low bits set means an error value,
2154 :  * i.e.
2155 : * if (ret & ~PAGE_MASK)
2156 : * error = ret;
2157 : *
2158 : * This function "knows" that -ENOMEM has the bits set.
2159 : */
2160 : #ifndef HAVE_ARCH_UNMAPPED_AREA
2161 : unsigned long
2162 : arch_get_unmapped_area(struct file *filp, unsigned long addr,
2163 : unsigned long len, unsigned long pgoff, unsigned long flags)
2164 : {
2165 : struct mm_struct *mm = current->mm;
2166 : struct vm_area_struct *vma, *prev;
2167 : struct vm_unmapped_area_info info;
2168 : const unsigned long mmap_end = arch_get_mmap_end(addr);
2169 :
2170 : if (len > mmap_end - mmap_min_addr)
2171 : return -ENOMEM;
2172 :
2173 : if (flags & MAP_FIXED)
2174 : return addr;
2175 :
2176 : if (addr) {
2177 : addr = PAGE_ALIGN(addr);
2178 : vma = find_vma_prev(mm, addr, &prev);
2179 : if (mmap_end - len >= addr && addr >= mmap_min_addr &&
2180 : (!vma || addr + len <= vm_start_gap(vma)) &&
2181 : (!prev || addr >= vm_end_gap(prev)))
2182 : return addr;
2183 : }
2184 :
2185 : info.flags = 0;
2186 : info.length = len;
2187 : info.low_limit = mm->mmap_base;
2188 : info.high_limit = mmap_end;
2189 : info.align_mask = 0;
2190 : info.align_offset = 0;
2191 : return vm_unmapped_area(&info);
2192 : }
2193 : #endif
2194 :
2195 : /*
2196 : * This mmap-allocator allocates new areas top-down from below the
2197 : * stack's low limit (the base):
2198 : */
2199 : #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
2200 : unsigned long
2201 : arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
2202 : unsigned long len, unsigned long pgoff,
2203 : unsigned long flags)
2204 : {
2205 : struct vm_area_struct *vma, *prev;
2206 : struct mm_struct *mm = current->mm;
2207 : struct vm_unmapped_area_info info;
2208 : const unsigned long mmap_end = arch_get_mmap_end(addr);
2209 :
2210 : /* requested length too big for entire address space */
2211 : if (len > mmap_end - mmap_min_addr)
2212 : return -ENOMEM;
2213 :
2214 : if (flags & MAP_FIXED)
2215 : return addr;
2216 :
2217 : /* requesting a specific address */
2218 : if (addr) {
2219 : addr = PAGE_ALIGN(addr);
2220 : vma = find_vma_prev(mm, addr, &prev);
2221 : if (mmap_end - len >= addr && addr >= mmap_min_addr &&
2222 : (!vma || addr + len <= vm_start_gap(vma)) &&
2223 : (!prev || addr >= vm_end_gap(prev)))
2224 : return addr;
2225 : }
2226 :
2227 : info.flags = VM_UNMAPPED_AREA_TOPDOWN;
2228 : info.length = len;
2229 : info.low_limit = max(PAGE_SIZE, mmap_min_addr);
2230 : info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
2231 : info.align_mask = 0;
2232 : info.align_offset = 0;
2233 : addr = vm_unmapped_area(&info);
2234 :
2235 : /*
2236 : * A failed mmap() very likely causes application failure,
2237 : * so fall back to the bottom-up function here. This scenario
2238 : * can happen with large stack limits and large mmap()
2239 : * allocations.
2240 : */
2241 : if (offset_in_page(addr)) {
2242 : VM_BUG_ON(addr != -ENOMEM);
2243 : info.flags = 0;
2244 : info.low_limit = TASK_UNMAPPED_BASE;
2245 : info.high_limit = mmap_end;
2246 : addr = vm_unmapped_area(&info);
2247 : }
2248 :
2249 : return addr;
2250 : }
2251 : #endif
2252 :
2253 : unsigned long
2254 32203 : get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
2255 : unsigned long pgoff, unsigned long flags)
2256 : {
2257 32203 : unsigned long (*get_area)(struct file *, unsigned long,
2258 : unsigned long, unsigned long, unsigned long);
2259 :
2260 32203 : unsigned long error = arch_mmap_check(addr, len, flags);
2261 32203 : if (error)
2262 : return error;
2263 :
2264 : /* Careful about overflows.. */
2265 32203 : if (len > TASK_SIZE)
2266 : return -ENOMEM;
2267 :
2268 32204 : get_area = current->mm->get_unmapped_area;
2269 32204 : if (file) {
2270 24748 : if (file->f_op->get_unmapped_area)
2271 24748 : get_area = file->f_op->get_unmapped_area;
2272 7456 : } else if (flags & MAP_SHARED) {
2273 : /*
2274 : * mmap_region() will call shmem_zero_setup() to create a file,
2275 : * so use shmem's get_unmapped_area in case it can be huge.
2276 : * do_mmap() will clear pgoff, so match alignment.
2277 : */
2278 3 : pgoff = 0;
2279 3 : get_area = shmem_get_unmapped_area;
2280 : }
2281 :
2282 32204 : addr = get_area(file, addr, len, pgoff, flags);
2283 32204 : if (IS_ERR_VALUE(addr))
2284 : return addr;
2285 :
2286 32204 : if (addr > TASK_SIZE - len)
2287 : return -ENOMEM;
2288 32204 : if (offset_in_page(addr))
2289 : return -EINVAL;
2290 :
2291 32204 : error = security_mmap_addr(addr);
2292 32204 : return error ? error : addr;
2293 : }
2294 :
2295 : EXPORT_SYMBOL(get_unmapped_area);
2296 :
2297 : /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
2298 226397 : struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
2299 : {
2300 226397 : struct rb_node *rb_node;
2301 226397 : struct vm_area_struct *vma;
2302 :
2303 : /* Check the cache first. */
2304 226397 : vma = vmacache_find(mm, addr);
2305 226410 : if (likely(vma))
2306 : return vma;
2307 :
2308 127680 : rb_node = mm->mm_rb.rb_node;
2309 :
2310 656263 : while (rb_node) {
2311 648364 : struct vm_area_struct *tmp;
2312 :
2313 648364 : tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
2314 :
2315 648364 : if (tmp->vm_end > addr) {
2316 413338 : vma = tmp;
2317 413338 : if (tmp->vm_start <= addr)
2318 : break;
2319 293557 : rb_node = rb_node->rb_left;
2320 : } else
2321 235026 : rb_node = rb_node->rb_right;
2322 : }
2323 :
2324 127680 : if (vma)
2325 126742 : vmacache_update(addr, vma);
2326 : return vma;
2327 : }
2328 :
2329 : EXPORT_SYMBOL(find_vma);
2330 :
2331 : /*
2332 : * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
2333 : */
2334 : struct vm_area_struct *
2335 12 : find_vma_prev(struct mm_struct *mm, unsigned long addr,
2336 : struct vm_area_struct **pprev)
2337 : {
2338 12 : struct vm_area_struct *vma;
2339 :
2340 12 : vma = find_vma(mm, addr);
2341 12 : if (vma) {
2342 12 : *pprev = vma->vm_prev;
2343 : } else {
2344 0 : struct rb_node *rb_node = rb_last(&mm->mm_rb);
2345 :
2346 0 : *pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL;
2347 : }
2348 12 : return vma;
2349 : }
2350 :
2351 : /*
2352 : * Verify that the stack growth is acceptable and
2353 : * update accounting. This is shared with both the
2354 : * grow-up and grow-down cases.
2355 : */
2356 954 : static int acct_stack_growth(struct vm_area_struct *vma,
2357 : unsigned long size, unsigned long grow)
2358 : {
2359 954 : struct mm_struct *mm = vma->vm_mm;
2360 954 : unsigned long new_start;
2361 :
2362 : /* address space limit tests */
2363 954 : if (!may_expand_vm(mm, vma->vm_flags, grow))
2364 : return -ENOMEM;
2365 :
2366 : /* Stack limit test */
2367 954 : if (size > rlimit(RLIMIT_STACK))
2368 : return -ENOMEM;
2369 :
2370 : /* mlock limit tests */
2371 954 : if (vma->vm_flags & VM_LOCKED) {
2372 0 : unsigned long locked;
2373 0 : unsigned long limit;
2374 0 : locked = mm->locked_vm + grow;
2375 0 : limit = rlimit(RLIMIT_MEMLOCK);
2376 0 : limit >>= PAGE_SHIFT;
2377 0 : if (locked > limit && !capable(CAP_IPC_LOCK))
2378 : return -ENOMEM;
2379 : }
2380 :
2381 : /* Check to ensure the stack will not grow into a hugetlb-only region */
2382 954 : new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
2383 954 : vma->vm_end - size;
2384 954 : if (is_hugepage_only_range(vma->vm_mm, new_start, size))
2385 : return -EFAULT;
2386 :
2387 : /*
2388 : * Overcommit.. This must be the final test, as it will
2389 : * update security statistics.
2390 : */
2391 954 : if (security_vm_enough_memory_mm(mm, grow))
2392 0 : return -ENOMEM;
2393 :
2394 : return 0;
2395 : }
2396 :
2397 : #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
2398 : /*
2399 : * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
2400 : * vma is the last one with address > vma->vm_end. Have to extend vma.
2401 : */
2402 : int expand_upwards(struct vm_area_struct *vma, unsigned long address)
2403 : {
2404 : struct mm_struct *mm = vma->vm_mm;
2405 : struct vm_area_struct *next;
2406 : unsigned long gap_addr;
2407 : int error = 0;
2408 :
2409 : if (!(vma->vm_flags & VM_GROWSUP))
2410 : return -EFAULT;
2411 :
2412 : /* Guard against exceeding limits of the address space. */
2413 : address &= PAGE_MASK;
2414 : if (address >= (TASK_SIZE & PAGE_MASK))
2415 : return -ENOMEM;
2416 : address += PAGE_SIZE;
2417 :
2418 : /* Enforce stack_guard_gap */
2419 : gap_addr = address + stack_guard_gap;
2420 :
2421 : /* Guard against overflow */
2422 : if (gap_addr < address || gap_addr > TASK_SIZE)
2423 : gap_addr = TASK_SIZE;
2424 :
2425 : next = vma->vm_next;
2426 : if (next && next->vm_start < gap_addr && vma_is_accessible(next)) {
2427 : if (!(next->vm_flags & VM_GROWSUP))
2428 : return -ENOMEM;
2429 : /* Check that both stack segments have the same anon_vma? */
2430 : }
2431 :
2432 : /* We must make sure the anon_vma is allocated. */
2433 : if (unlikely(anon_vma_prepare(vma)))
2434 : return -ENOMEM;
2435 :
2436 : /*
2437 : * vma->vm_start/vm_end cannot change under us because the caller
2438 : * is required to hold the mmap_lock in read mode. We need the
2439 : * anon_vma lock to serialize against concurrent expand_stacks.
2440 : */
2441 : anon_vma_lock_write(vma->anon_vma);
2442 :
2443 : /* Somebody else might have raced and expanded it already */
2444 : if (address > vma->vm_end) {
2445 : unsigned long size, grow;
2446 :
2447 : size = address - vma->vm_start;
2448 : grow = (address - vma->vm_end) >> PAGE_SHIFT;
2449 :
2450 : error = -ENOMEM;
2451 : if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
2452 : error = acct_stack_growth(vma, size, grow);
2453 : if (!error) {
2454 : /*
2455 : * vma_gap_update() doesn't support concurrent
2456 : * updates, but we only hold a shared mmap_lock
2457 : 				 * here, so we need to protect against
2458 : * concurrent vma expansions.
2459 : * anon_vma_lock_write() doesn't help here, as
2460 : * we don't guarantee that all growable vmas
2461 : * in a mm share the same root anon vma.
2462 : * So, we reuse mm->page_table_lock to guard
2463 : * against concurrent vma expansions.
2464 : */
2465 : spin_lock(&mm->page_table_lock);
2466 : if (vma->vm_flags & VM_LOCKED)
2467 : mm->locked_vm += grow;
2468 : vm_stat_account(mm, vma->vm_flags, grow);
2469 : anon_vma_interval_tree_pre_update_vma(vma);
2470 : vma->vm_end = address;
2471 : anon_vma_interval_tree_post_update_vma(vma);
2472 : if (vma->vm_next)
2473 : vma_gap_update(vma->vm_next);
2474 : else
2475 : mm->highest_vm_end = vm_end_gap(vma);
2476 : spin_unlock(&mm->page_table_lock);
2477 :
2478 : perf_event_mmap(vma);
2479 : }
2480 : }
2481 : }
2482 : anon_vma_unlock_write(vma->anon_vma);
2483 : khugepaged_enter_vma_merge(vma, vma->vm_flags);
2484 : validate_mm(mm);
2485 : return error;
2486 : }
2487 : #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
2488 :
2489 : /*
2490 : * vma is the first one with address < vma->vm_start. Have to extend vma.
2491 : */
2492 954 : int expand_downwards(struct vm_area_struct *vma,
2493 : unsigned long address)
2494 : {
2495 954 : struct mm_struct *mm = vma->vm_mm;
2496 954 : struct vm_area_struct *prev;
2497 954 : int error = 0;
2498 :
2499 954 : address &= PAGE_MASK;
2500 954 : if (address < mmap_min_addr)
2501 : return -EPERM;
2502 :
2503 : /* Enforce stack_guard_gap */
2504 954 : prev = vma->vm_prev;
2505 : /* Check that both stack segments have the same anon_vma? */
2506 954 : if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
2507 16 : vma_is_accessible(prev)) {
2508 16 : if (address - prev->vm_end < stack_guard_gap)
2509 : return -ENOMEM;
2510 : }
2511 :
2512 : /* We must make sure the anon_vma is allocated. */
2513 954 : if (unlikely(anon_vma_prepare(vma)))
2514 : return -ENOMEM;
2515 :
2516 : /*
2517 : * vma->vm_start/vm_end cannot change under us because the caller
2518 : * is required to hold the mmap_lock in read mode. We need the
2519 : * anon_vma lock to serialize against concurrent expand_stacks.
2520 : */
2521 954 : anon_vma_lock_write(vma->anon_vma);
2522 :
2523 : /* Somebody else might have raced and expanded it already */
2524 954 : if (address < vma->vm_start) {
2525 954 : unsigned long size, grow;
2526 :
2527 954 : size = vma->vm_end - address;
2528 954 : grow = (vma->vm_start - address) >> PAGE_SHIFT;
2529 :
2530 954 : error = -ENOMEM;
2531 954 : if (grow <= vma->vm_pgoff) {
2532 954 : error = acct_stack_growth(vma, size, grow);
2533 954 : if (!error) {
2534 : /*
2535 : * vma_gap_update() doesn't support concurrent
2536 : * updates, but we only hold a shared mmap_lock
2537 : 				 * here, so we need to protect against
2538 : * concurrent vma expansions.
2539 : * anon_vma_lock_write() doesn't help here, as
2540 : * we don't guarantee that all growable vmas
2541 : * in a mm share the same root anon vma.
2542 : * So, we reuse mm->page_table_lock to guard
2543 : * against concurrent vma expansions.
2544 : */
2545 954 : spin_lock(&mm->page_table_lock);
2546 954 : if (vma->vm_flags & VM_LOCKED)
2547 0 : mm->locked_vm += grow;
2548 954 : vm_stat_account(mm, vma->vm_flags, grow);
2549 954 : anon_vma_interval_tree_pre_update_vma(vma);
2550 954 : vma->vm_start = address;
2551 954 : vma->vm_pgoff -= grow;
2552 954 : anon_vma_interval_tree_post_update_vma(vma);
2553 954 : vma_gap_update(vma);
2554 954 : spin_unlock(&mm->page_table_lock);
2555 :
2556 954 : perf_event_mmap(vma);
2557 : }
2558 : }
2559 : }
2560 954 : anon_vma_unlock_write(vma->anon_vma);
2561 954 : khugepaged_enter_vma_merge(vma, vma->vm_flags);
2562 954 : validate_mm(mm);
2563 954 : return error;
2564 : }
2565 :
2566 : /* enforced gap between the expanding stack and other mappings. */
2567 : unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
2568 :
2569 0 : static int __init cmdline_parse_stack_guard_gap(char *p)
2570 : {
2571 0 : unsigned long val;
2572 0 : char *endptr;
2573 :
2574 0 : val = simple_strtoul(p, &endptr, 10);
2575 0 : if (!*endptr)
2576 0 : stack_guard_gap = val << PAGE_SHIFT;
2577 :
2578 0 : return 0;
2579 : }
2580 : __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
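     : /*
     :  * Illustrative example: the gap defaults to 256 pages (1MB with 4KB
     :  * pages); booting with "stack_guard_gap=512" widens it to 512 pages.
     :  * Note the parameter is in pages, not bytes, as the shift above shows.
     :  */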
2581 :
2582 : #ifdef CONFIG_STACK_GROWSUP
2583 : int expand_stack(struct vm_area_struct *vma, unsigned long address)
2584 : {
2585 : return expand_upwards(vma, address);
2586 : }
2587 :
2588 : struct vm_area_struct *
2589 : find_extend_vma(struct mm_struct *mm, unsigned long addr)
2590 : {
2591 : struct vm_area_struct *vma, *prev;
2592 :
2593 : addr &= PAGE_MASK;
2594 : vma = find_vma_prev(mm, addr, &prev);
2595 : if (vma && (vma->vm_start <= addr))
2596 : return vma;
2597 : /* don't alter vm_end if the coredump is running */
2598 : if (!prev || expand_stack(prev, addr))
2599 : return NULL;
2600 : if (prev->vm_flags & VM_LOCKED)
2601 : populate_vma_page_range(prev, addr, prev->vm_end, NULL);
2602 : return prev;
2603 : }
2604 : #else
2605 954 : int expand_stack(struct vm_area_struct *vma, unsigned long address)
2606 : {
2607 954 : return expand_downwards(vma, address);
2608 : }
2609 :
2610 : struct vm_area_struct *
2611 8834 : find_extend_vma(struct mm_struct *mm, unsigned long addr)
2612 : {
2613 8834 : struct vm_area_struct *vma;
2614 8834 : unsigned long start;
2615 :
2616 8834 : addr &= PAGE_MASK;
2617 8834 : vma = find_vma(mm, addr);
2618 8834 : if (!vma)
2619 : return NULL;
2620 8834 : if (vma->vm_start <= addr)
2621 : return vma;
2622 0 : if (!(vma->vm_flags & VM_GROWSDOWN))
2623 : return NULL;
2624 0 : start = vma->vm_start;
2625 0 : if (expand_stack(vma, addr))
2626 : return NULL;
2627 0 : if (vma->vm_flags & VM_LOCKED)
2628 0 : populate_vma_page_range(vma, addr, start, NULL);
2629 : return vma;
2630 : }
2631 : #endif
2632 :
2633 : EXPORT_SYMBOL_GPL(find_extend_vma);
2634 :
2635 : /*
2636 : * Ok - we have the memory areas we should free on the vma list,
2637 : * so release them, and do the vma updates.
2638 : *
2639 : * Called with the mm semaphore held.
2640 : */
2641 16778 : static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
2642 : {
2643 16778 : unsigned long nr_accounted = 0;
2644 :
2645 : /* Update high watermark before we lower total_vm */
2646 16778 : update_hiwater_vm(mm);
2647 16940 : do {
2648 16940 : long nrpages = vma_pages(vma);
2649 :
2650 16940 : if (vma->vm_flags & VM_ACCOUNT)
2651 454 : nr_accounted += nrpages;
2652 16940 : vm_stat_account(mm, vma->vm_flags, -nrpages);
2653 16940 : vma = remove_vma(vma);
2654 16936 : } while (vma);
2655 16774 : vm_unacct_memory(nr_accounted);
2656 16776 : validate_mm(mm);
2657 16776 : }
2658 :
2659 : /*
2660 : * Get rid of page table information in the indicated region.
2661 : *
2662 : * Called with the mm semaphore held.
2663 : */
2664 16778 : static void unmap_region(struct mm_struct *mm,
2665 : struct vm_area_struct *vma, struct vm_area_struct *prev,
2666 : unsigned long start, unsigned long end)
2667 : {
2668 16778 : struct vm_area_struct *next = vma_next(mm, prev);
2669 16778 : struct mmu_gather tlb;
2670 :
2671 16778 : lru_add_drain();
2672 16778 : tlb_gather_mmu(&tlb, mm);
2673 16778 : update_hiwater_rss(mm);
2674 16776 : unmap_vmas(&tlb, vma, start, end);
2675 16778 : free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
2676 : next ? next->vm_start : USER_PGTABLES_CEILING);
2677 16778 : tlb_finish_mmu(&tlb);
2678 16778 : }
2679 :
2680 : /*
2681 :  * Create a list of vmas touched by the unmap, removing them from the mm's
2682 :  * vma list as we go.
2683 : */
2684 : static bool
2685 16778 : detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
2686 : struct vm_area_struct *prev, unsigned long end)
2687 : {
2688 16778 : struct vm_area_struct **insertion_point;
2689 16778 : struct vm_area_struct *tail_vma = NULL;
2690 :
2691 16778 : insertion_point = (prev ? &prev->vm_next : &mm->mmap);
2692 16778 : vma->vm_prev = NULL;
2693 16940 : do {
2694 16940 : vma_rb_erase(vma, &mm->mm_rb);
2695 16940 : mm->map_count--;
2696 16940 : tail_vma = vma;
2697 16940 : vma = vma->vm_next;
2698 16940 : } while (vma && vma->vm_start < end);
2699 16778 : *insertion_point = vma;
2700 16778 : if (vma) {
2701 16778 : vma->vm_prev = prev;
2702 16778 : vma_gap_update(vma);
2703 : } else
2704 0 : mm->highest_vm_end = prev ? vm_end_gap(prev) : 0;
2705 16778 : tail_vma->vm_next = NULL;
2706 :
2707 : /* Kill the cache */
2708 16778 : vmacache_invalidate(mm);
2709 :
2710 : /*
2711 : 	 * Do not downgrade mmap_lock if we are next to a VM_GROWSDOWN or
2712 : * VM_GROWSUP VMA. Such VMAs can change their size under
2713 : * down_read(mmap_lock) and collide with the VMA we are about to unmap.
2714 : */
2715 16778 : if (vma && (vma->vm_flags & VM_GROWSDOWN))
2716 1872 : return false;
2717 : if (prev && (prev->vm_flags & VM_GROWSUP))
2718 : return false;
2719 : return true;
2720 : }
2721 :
2722 : /*
2723 : * __split_vma() bypasses sysctl_max_map_count checking. We use this where it
2724 : * has already been checked or doesn't make sense to fail.
2725 : */
2726 25413 : int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
2727 : unsigned long addr, int new_below)
2728 : {
2729 25413 : struct vm_area_struct *new;
2730 25413 : int err;
2731 :
2732 25413 : if (vma->vm_ops && vma->vm_ops->may_split) {
2733 0 : err = vma->vm_ops->may_split(vma, addr);
2734 0 : if (err)
2735 : return err;
2736 : }
2737 :
2738 25413 : new = vm_area_dup(vma);
2739 25413 : if (!new)
2740 : return -ENOMEM;
2741 :
2742 25413 : if (new_below)
2743 11263 : new->vm_end = addr;
2744 : else {
2745 14150 : new->vm_start = addr;
2746 14150 : new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
2747 : }
2748 :
2749 25413 : err = vma_dup_policy(vma, new);
2750 25413 : if (err)
2751 0 : goto out_free_vma;
2752 :
2753 25413 : err = anon_vma_clone(new, vma);
2754 25413 : if (err)
2755 0 : goto out_free_mpol;
2756 :
2757 25413 : if (new->vm_file)
2758 25246 : get_file(new->vm_file);
2759 :
2760 25413 : if (new->vm_ops && new->vm_ops->open)
2761 0 : new->vm_ops->open(new);
2762 :
2763 25413 : if (new_below)
2764 11263 : err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
2765 11263 : ((addr - new->vm_start) >> PAGE_SHIFT), new);
2766 : else
2767 14150 : err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
2768 :
2769 : /* Success. */
2770 25413 : if (!err)
2771 : return 0;
2772 :
2773 : /* Clean everything up if vma_adjust failed. */
2774 0 : if (new->vm_ops && new->vm_ops->close)
2775 0 : new->vm_ops->close(new);
2776 0 : if (new->vm_file)
2777 0 : fput(new->vm_file);
2778 0 : unlink_anon_vmas(new);
2779 0 : out_free_mpol:
2780 0 : mpol_put(vma_policy(new));
2781 0 : out_free_vma:
2782 0 : vm_area_free(new);
2783 0 : return err;
2784 : }
2785 :
2786 : /*
2787 : * Split a vma into two pieces at address 'addr', a new vma is allocated
2788 : * either for the first part or the tail.
2789 : */
2790 9911 : int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
2791 : unsigned long addr, int new_below)
2792 : {
2793 9911 : if (mm->map_count >= sysctl_max_map_count)
2794 : return -ENOMEM;
2795 :
2796 9911 : return __split_vma(mm, vma, addr, new_below);
2797 : }
2798 :
2799 : /* Munmap is split into two main parts -- this part, which finds
2800 : * what needs doing, and the areas themselves, which do the
2801 : * work. This now handles partial unmappings.
2802 : * Jeremy Fitzhardinge <jeremy@goop.org>
2803 : */
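     : /*
     :  * Worked example (illustrative): munmap()ing [0x2000,0x3000) out of a
     :  * single VMA spanning [0x1000,0x4000) splits twice below - once at
     :  * 0x2000 and once at 0x3000 - then detaches and unmaps the middle
     :  * piece, leaving [0x1000,0x2000) and [0x3000,0x4000) in place.
     :  */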
2804 16778 : int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
2805 : struct list_head *uf, bool downgrade)
2806 : {
2807 16778 : unsigned long end;
2808 16778 : struct vm_area_struct *vma, *prev, *last;
2809 :
2810 16778 : if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
2811 0 : return -EINVAL;
2812 :
2813 16778 : len = PAGE_ALIGN(len);
2814 16778 : end = start + len;
2815 16778 : if (len == 0)
2816 : return -EINVAL;
2817 :
2818 : /*
2819 : * arch_unmap() might do unmaps itself. It must be called
2820 : 	 * arch_unmap() might do unmaps itself. It must be called, and
2821 : 	 * must finish any rbtree manipulation, before this code runs
2822 : 	 * and starts to manipulate the rbtree itself.
2823 16778 : arch_unmap(mm, start, end);
2824 :
2825 : /* Find the first overlapping VMA */
2826 16778 : vma = find_vma(mm, start);
2827 16778 : if (!vma)
2828 : return 0;
2829 16778 : prev = vma->vm_prev;
2830 : /* we have start < vma->vm_end */
2831 :
2832 : /* if it doesn't overlap, we have nothing.. */
2833 16778 : if (vma->vm_start >= end)
2834 : return 0;
2835 :
2836 : /*
2837 : * If we need to split any vma, do it now to save pain later.
2838 : *
2839 : * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
2840 : * unmapped vm_area_struct will remain in use: so lower split_vma
2841 : 	 * unmapped vm_area_struct will remain in use: so the lower split_vma
2842 : 	 * places the tmp vma above, and the higher split_vma places it below.
2843 16778 : if (start > vma->vm_start) {
2844 6213 : int error;
2845 :
2846 : /*
2847 : * Make sure that map_count on return from munmap() will
2848 : * not exceed its limit; but let map_count go just above
2849 : * its limit temporarily, to help free resources as expected.
2850 : */
2851 6213 : if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
2852 : return -ENOMEM;
2853 :
2854 6213 : error = __split_vma(mm, vma, start, 0);
2855 6213 : if (error)
2856 : return error;
2857 : prev = vma;
2858 : }
2859 :
2860 : /* Does it split the last one? */
2861 16778 : last = find_vma(mm, end);
2862 16778 : if (last && end > last->vm_start) {
2863 9289 : int error = __split_vma(mm, last, end, 1);
2864 9289 : if (error)
2865 : return error;
2866 : }
2867 16778 : vma = vma_next(mm, prev);
2868 :
2869 16778 : if (unlikely(uf)) {
2870 : /*
2871 : 		 * If userfaultfd_unmap_prep returns an error, the vmas
2872 : 		 * will remain split, but userland will get a
2873 : 		 * highly unexpected error anyway. This is no
2874 : 		 * different from the case where the first of the two
2875 : 		 * __split_vma calls fails, but we don't undo the first
2876 : 		 * split even though we could. This failure is unlikely
2877 : 		 * enough that it's not worth optimizing for.
2878 : */
2879 16778 : int error = userfaultfd_unmap_prep(vma, start, end, uf);
2880 : if (error)
2881 : return error;
2882 : }
2883 :
2884 : /*
2885 : * unlock any mlock()ed ranges before detaching vmas
2886 : */
2887 16778 : if (mm->locked_vm) {
2888 : struct vm_area_struct *tmp = vma;
2889 2 : while (tmp && tmp->vm_start < end) {
2890 1 : if (tmp->vm_flags & VM_LOCKED) {
2891 1 : mm->locked_vm -= vma_pages(tmp);
2892 1 : munlock_vma_pages_all(tmp);
2893 : }
2894 :
2895 1 : tmp = tmp->vm_next;
2896 : }
2897 : }
2898 :
2899 : /* Detach vmas from rbtree */
2900 16778 : if (!detach_vmas_to_be_unmapped(mm, vma, prev, end))
2901 : downgrade = false;
2902 :
2903 14906 : if (downgrade)
2904 1574 : mmap_write_downgrade(mm);
2905 :
2906 16778 : unmap_region(mm, vma, prev, start, end);
2907 :
2908 : /* Fix up all other VM information */
2909 16778 : remove_vma_list(mm, vma);
2910 :
2911 16776 : return downgrade ? 1 : 0;
2912 : }
2913 :
2914 13332 : int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
2915 : struct list_head *uf)
2916 : {
2917 13332 : return __do_munmap(mm, start, len, uf, false);
2918 : }
2919 :
2920 3354 : static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
2921 : {
2922 3354 : int ret;
2923 3354 : struct mm_struct *mm = current->mm;
2924 3354 : LIST_HEAD(uf);
2925 :
2926 3354 : if (mmap_write_lock_killable(mm))
2927 : return -EINTR;
2928 :
2929 3354 : ret = __do_munmap(mm, start, len, &uf, downgrade);
2930 : /*
2931 : * Returning 1 indicates mmap_lock is downgraded.
2932 : 	 * But 1 is not a legal return value of vm_munmap() or munmap(), so
2933 : 	 * reset it to 0 before returning.
2934 : */
2935 3354 : if (ret == 1) {
2936 1482 : mmap_read_unlock(mm);
2937 1482 : ret = 0;
2938 : } else
2939 1872 : mmap_write_unlock(mm);
2940 :
2941 3354 : userfaultfd_unmap_complete(mm, &uf);
2942 : return ret;
2943 : }
2944 :
2945 1872 : int vm_munmap(unsigned long start, size_t len)
2946 : {
2947 1872 : return __vm_munmap(start, len, false);
2948 : }
2949 : EXPORT_SYMBOL(vm_munmap);
2950 :
2951 2964 : SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
2952 : {
2953 1482 : addr = untagged_addr(addr);
2954 1482 : profile_munmap(addr);
2955 1482 : return __vm_munmap(addr, len, true);
2956 : }
2957 :
2958 :
2959 : /*
2960 : * Emulation of deprecated remap_file_pages() syscall.
2961 : */
2962 0 : SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
2963 : unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
2964 : {
2965 :
2966 0 : struct mm_struct *mm = current->mm;
2967 0 : struct vm_area_struct *vma;
2968 0 : unsigned long populate = 0;
2969 0 : unsigned long ret = -EINVAL;
2970 0 : struct file *file;
2971 :
2972 0 : pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.rst.\n",
2973 : current->comm, current->pid);
2974 :
2975 0 : if (prot)
2976 : return ret;
2977 0 : start = start & PAGE_MASK;
2978 0 : size = size & PAGE_MASK;
2979 :
2980 0 : if (start + size <= start)
2981 : return ret;
2982 :
2983 : /* Does pgoff wrap? */
2984 0 : if (pgoff + (size >> PAGE_SHIFT) < pgoff)
2985 : return ret;
2986 :
2987 0 : if (mmap_write_lock_killable(mm))
2988 : return -EINTR;
2989 :
2990 0 : vma = find_vma(mm, start);
2991 :
2992 0 : if (!vma || !(vma->vm_flags & VM_SHARED))
2993 0 : goto out;
2994 :
2995 0 : if (start < vma->vm_start)
2996 0 : goto out;
2997 :
2998 0 : if (start + size > vma->vm_end) {
2999 0 : struct vm_area_struct *next;
3000 :
3001 0 : for (next = vma->vm_next; next; next = next->vm_next) {
3002 : /* hole between vmas ? */
3003 0 : if (next->vm_start != next->vm_prev->vm_end)
3004 0 : goto out;
3005 :
3006 0 : if (next->vm_file != vma->vm_file)
3007 0 : goto out;
3008 :
3009 0 : if (next->vm_flags != vma->vm_flags)
3010 0 : goto out;
3011 :
3012 0 : if (start + size <= next->vm_end)
3013 : break;
3014 : }
3015 :
3016 0 : if (!next)
3017 0 : goto out;
3018 : }
3019 :
3020 0 : prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
3021 0 : prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
3022 0 : prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
3023 :
3024 0 : flags &= MAP_NONBLOCK;
3025 0 : flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
3026 0 : if (vma->vm_flags & VM_LOCKED) {
3027 0 : struct vm_area_struct *tmp;
3028 0 : flags |= MAP_LOCKED;
3029 :
3030 : /* drop PG_Mlocked flag for over-mapped range */
3031 0 : for (tmp = vma; tmp->vm_start >= start + size;
3032 0 : tmp = tmp->vm_next) {
3033 : /*
3034 : * Split pmd and munlock page on the border
3035 : * of the range.
3036 : */
3037 0 : vma_adjust_trans_huge(tmp, start, start + size, 0);
3038 :
3039 0 : munlock_vma_pages_range(tmp,
3040 0 : max(tmp->vm_start, start),
3041 0 : min(tmp->vm_end, start + size));
3042 : }
3043 : }
3044 :
3045 0 : file = get_file(vma->vm_file);
3046 0 : ret = do_mmap(vma->vm_file, start, size,
3047 : prot, flags, pgoff, &populate, NULL);
3048 0 : fput(file);
3049 0 : out:
3050 0 : mmap_write_unlock(mm);
3051 0 : if (populate)
3052 0 : mm_populate(ret, populate);
3053 0 : if (!IS_ERR_VALUE(ret))
3054 0 : ret = 0;
3055 0 : return ret;
3056 : }
3057 :
3058 : /*
3059 :  * This is really a simplified "do_mmap": it only handles
3060 :  * anonymous maps. Eventually we may be able to do some
3061 : * brk-specific accounting here.
3062 : */
3063 2520 : static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long flags, struct list_head *uf)
3064 : {
3065 2520 : struct mm_struct *mm = current->mm;
3066 2520 : struct vm_area_struct *vma, *prev;
3067 2520 : struct rb_node **rb_link, *rb_parent;
3068 2520 : pgoff_t pgoff = addr >> PAGE_SHIFT;
3069 2520 : int error;
3070 2520 : unsigned long mapped_addr;
3071 :
3072 : /* Until we need other flags, refuse anything except VM_EXEC. */
3073 2520 : if ((flags & (~VM_EXEC)) != 0)
3074 : return -EINVAL;
3075 2520 : flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
3076 :
3077 2520 : mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
3078 2520 : if (IS_ERR_VALUE(mapped_addr))
3079 0 : return mapped_addr;
3080 :
3081 2520 : error = mlock_future_check(mm, mm->def_flags, len);
3082 2520 : if (error)
3083 : return error;
3084 :
3085 : /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
3086 2520 : if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
3087 : return -ENOMEM;
3088 :
3089 : /* Check against address space limits *after* clearing old maps... */
3090 2520 : if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
3091 : return -ENOMEM;
3092 :
3093 2520 : if (mm->map_count > sysctl_max_map_count)
3094 : return -ENOMEM;
3095 :
3096 2520 : if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
3097 : return -ENOMEM;
3098 :
3099 : /* Can we just expand an old private anonymous mapping? */
3100 2520 : vma = vma_merge(mm, prev, addr, addr + len, flags,
3101 : NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
3102 2520 : if (vma)
3103 254 : goto out;
3104 :
3105 : /*
3106 : * create a vma struct for an anonymous mapping
3107 : */
3108 2266 : vma = vm_area_alloc(mm);
3109 2266 : if (!vma) {
3110 0 : vm_unacct_memory(len >> PAGE_SHIFT);
3111 0 : return -ENOMEM;
3112 : }
3113 :
3114 2266 : vma_set_anonymous(vma);
3115 2266 : vma->vm_start = addr;
3116 2266 : vma->vm_end = addr + len;
3117 2266 : vma->vm_pgoff = pgoff;
3118 2266 : vma->vm_flags = flags;
3119 2266 : vma->vm_page_prot = vm_get_page_prot(flags);
3120 2266 : vma_link(mm, vma, prev, rb_link, rb_parent);
3121 2520 : out:
3122 2520 : perf_event_mmap(vma);
3123 2520 : mm->total_vm += len >> PAGE_SHIFT;
3124 2520 : mm->data_vm += len >> PAGE_SHIFT;
3125 2520 : if (flags & VM_LOCKED)
3126 0 : mm->locked_vm += (len >> PAGE_SHIFT);
3127 2520 : vma->vm_flags |= VM_SOFTDIRTY;
3128 2520 : return 0;
3129 : }
3130 :
3131 1337 : int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
3132 : {
3133 1337 : struct mm_struct *mm = current->mm;
3134 1337 : unsigned long len;
3135 1337 : int ret;
3136 1337 : bool populate;
3137 1337 : LIST_HEAD(uf);
3138 :
3139 1337 : len = PAGE_ALIGN(request);
3140 1337 : if (len < request)
3141 : return -ENOMEM;
3142 1337 : if (!len)
3143 : return 0;
3144 :
3145 1337 : if (mmap_write_lock_killable(mm))
3146 : return -EINTR;
3147 :
3148 1337 : ret = do_brk_flags(addr, len, flags, &uf);
3149 1337 : populate = ((mm->def_flags & VM_LOCKED) != 0);
3150 1337 : mmap_write_unlock(mm);
3151 1337 : userfaultfd_unmap_complete(mm, &uf);
3152 1337 : if (populate && !ret)
3153 0 : mm_populate(addr, len);
3154 : return ret;
3155 : }
3156 : EXPORT_SYMBOL(vm_brk_flags);
3157 :
3158 0 : int vm_brk(unsigned long addr, unsigned long len)
3159 : {
3160 0 : return vm_brk_flags(addr, len, 0);
3161 : }
3162 : EXPORT_SYMBOL(vm_brk);
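     : /*
     :  * Illustrative note: binary loaders use vm_brk()/vm_brk_flags() to map
     :  * zero-filled anonymous regions such as the ELF BSS, roughly:
     :  *
     :  *	error = vm_brk_flags(elf_bss, last_bss - elf_bss, 0);
     :  *
     :  * (variable names are illustrative; see fs/binfmt_elf.c for the real
     :  * call sites).
     :  */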
3163 :
3164 : /* Release all mmaps. */
3165 3728 : void exit_mmap(struct mm_struct *mm)
3166 : {
3167 3728 : struct mmu_gather tlb;
3168 3728 : struct vm_area_struct *vma;
3169 3728 : unsigned long nr_accounted = 0;
3170 :
3171 : /* mm's last user has gone, and its about to be pulled down */
3172 3728 : mmu_notifier_release(mm);
3173 :
3174 3728 : if (unlikely(mm_is_oom_victim(mm))) {
3175 : /*
3176 : * Manually reap the mm to free as much memory as possible.
3177 : * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard
3178 : * this mm from further consideration. Taking mm->mmap_lock for
3179 : * write after setting MMF_OOM_SKIP will guarantee that the oom
3180 : * reaper will not run on this mm again after mmap_lock is
3181 : * dropped.
3182 : *
3183 : * Nothing can be holding mm->mmap_lock here and the above call
3184 : * to mmu_notifier_release(mm) ensures mmu notifier callbacks in
3185 : * __oom_reap_task_mm() will not block.
3186 : *
3187 : * This needs to be done before calling munlock_vma_pages_all(),
3188 : * which clears VM_LOCKED, otherwise the oom reaper cannot
3189 : * reliably test it.
3190 : */
3191 0 : (void)__oom_reap_task_mm(mm);
3192 :
3193 0 : set_bit(MMF_OOM_SKIP, &mm->flags);
3194 0 : mmap_write_lock(mm);
3195 0 : mmap_write_unlock(mm);
3196 : }
3197 :
3198 3728 : if (mm->locked_vm) {
3199 0 : vma = mm->mmap;
3200 0 : while (vma) {
3201 0 : if (vma->vm_flags & VM_LOCKED)
3202 0 : munlock_vma_pages_all(vma);
3203 0 : vma = vma->vm_next;
3204 : }
3205 : }
3206 :
3207 3728 : arch_exit_mmap(mm);
3208 :
3209 3728 : vma = mm->mmap;
3210 3728 : if (!vma) /* Can happen if dup_mmap() received an OOM */
3211 0 : return;
3212 :
3213 3728 : lru_add_drain();
3214 3728 : flush_cache_mm(mm);
3215 3728 : tlb_gather_mmu_fullmm(&tlb, mm);
3216 : /* update_hiwater_rss(mm) here? but nobody should be looking */
3217 : /* Use -1 here to ensure all VMAs in the mm are unmapped */
3218 3728 : unmap_vmas(&tlb, vma, 0, -1);
3219 3728 : free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
3220 3728 : tlb_finish_mmu(&tlb);
3221 :
3222 : /*
3223 : * Walk the list again, actually closing and freeing it,
3224 : * with preemption enabled, without holding any MM locks.
3225 : */
3226 101045 : while (vma) {
3227 97317 : if (vma->vm_flags & VM_ACCOUNT)
3228 45297 : nr_accounted += vma_pages(vma);
3229 97317 : vma = remove_vma(vma);
3230 97311 : cond_resched();
3231 : }
3232 3728 : vm_unacct_memory(nr_accounted);
3233 : }
3234 :
3235 : /* Insert vm structure into process list sorted by address
3236 : * and into the inode's i_mmap tree. If vm_file is non-NULL
3237 : * then i_mmap_rwsem is taken here.
3238 : */
3239 4384 : int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
3240 : {
3241 4384 : struct vm_area_struct *prev;
3242 4384 : struct rb_node **rb_link, *rb_parent;
3243 :
3244 8768 : if (find_vma_links(mm, vma->vm_start, vma->vm_end,
3245 : &prev, &rb_link, &rb_parent))
3246 : return -ENOMEM;
3247 6892 : if ((vma->vm_flags & VM_ACCOUNT) &&
3248 2508 : security_vm_enough_memory_mm(mm, vma_pages(vma)))
3249 : return -ENOMEM;
3250 :
3251 : /*
3252 : * The vm_pgoff of a purely anonymous vma should be irrelevant
3253 : 	 * until its first write fault, when the page's anon_vma and index
3254 : 	 * are set. But now set the vm_pgoff it will almost certainly
3255 : 	 * end up with (unless mremap moves it elsewhere before that
3256 : 	 * first write fault), so /proc/pid/maps tells a consistent story.
3257 : *
3258 : * By setting it to reflect the virtual start address of the
3259 : * vma, merges and splits can happen in a seamless way, just
3260 : * using the existing file pgoff checks and manipulations.
3261 : * Similarly in do_mmap and in do_brk_flags.
3262 : */
3263 4384 : if (vma_is_anonymous(vma)) {
3264 2508 : BUG_ON(vma->anon_vma);
3265 2508 : vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
3266 : }
3267 :
3268 4384 : vma_link(mm, vma, prev, rb_link, rb_parent);
3269 4384 : return 0;
3270 : }
3271 :
3272 : /*
3273 : * Copy the vma structure to a new location in the same mm,
3274 : * prior to moving page table entries, to effect an mremap move.
3275 : */
3276 0 : struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
3277 : unsigned long addr, unsigned long len, pgoff_t pgoff,
3278 : bool *need_rmap_locks)
3279 : {
3280 0 : struct vm_area_struct *vma = *vmap;
3281 0 : unsigned long vma_start = vma->vm_start;
3282 0 : struct mm_struct *mm = vma->vm_mm;
3283 0 : struct vm_area_struct *new_vma, *prev;
3284 0 : struct rb_node **rb_link, *rb_parent;
3285 0 : bool faulted_in_anon_vma = true;
3286 :
3287 : /*
3288 : * If anonymous vma has not yet been faulted, update new pgoff
3289 : * to match new location, to increase its chance of merging.
3290 : */
3291 0 : if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
3292 0 : pgoff = addr >> PAGE_SHIFT;
3293 0 : faulted_in_anon_vma = false;
3294 : }
3295 :
3296 0 : if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
3297 : return NULL; /* should never get here */
3298 0 : new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
3299 : vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
3300 : vma->vm_userfaultfd_ctx);
3301 0 : if (new_vma) {
3302 : /*
3303 : * Source vma may have been merged into new_vma
3304 : */
3305 0 : if (unlikely(vma_start >= new_vma->vm_start &&
3306 : vma_start < new_vma->vm_end)) {
3307 : /*
3308 : * The only way we can get a vma_merge with
3309 : * self during an mremap is if the vma hasn't
3310 : * been faulted in yet and we were allowed to
3311 : * reset the dst vma->vm_pgoff to the
3312 : * destination address of the mremap to allow
3313 : * the merge to happen. mremap must change the
3314 : * vm_pgoff linearity between src and dst vmas
3315 : * (in turn preventing a vma_merge) to be
3316 : * safe. It is only safe to keep the vm_pgoff
3317 : * linear if there are no pages mapped yet.
3318 : */
3319 0 : VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
3320 0 : *vmap = vma = new_vma;
3321 : }
3322 0 : *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
3323 : } else {
3324 0 : new_vma = vm_area_dup(vma);
3325 0 : if (!new_vma)
3326 0 : goto out;
3327 0 : new_vma->vm_start = addr;
3328 0 : new_vma->vm_end = addr + len;
3329 0 : new_vma->vm_pgoff = pgoff;
3330 0 : if (vma_dup_policy(vma, new_vma))
3331 0 : goto out_free_vma;
3332 0 : if (anon_vma_clone(new_vma, vma))
3333 0 : goto out_free_mempol;
3334 0 : if (new_vma->vm_file)
3335 0 : get_file(new_vma->vm_file);
3336 0 : if (new_vma->vm_ops && new_vma->vm_ops->open)
3337 0 : new_vma->vm_ops->open(new_vma);
3338 0 : vma_link(mm, new_vma, prev, rb_link, rb_parent);
3339 0 : *need_rmap_locks = false;
3340 : }
3341 : return new_vma;
3342 :
3343 0 : out_free_mempol:
3344 0 : mpol_put(vma_policy(new_vma));
3345 0 : out_free_vma:
3346 0 : vm_area_free(new_vma);
3347 : out:
3348 : return NULL;
3349 : }
3350 :
3351 : /*
3352 : * Return true if the calling process may expand its vm space by the passed
3353 : * number of pages
3354 : */
3355 32253 : bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
3356 : {
3357 32253 : if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
3358 : return false;
3359 :
3360 32253 : if (is_data_mapping(flags) &&
3361 12496 : mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) {
3362 : /* Workaround for Valgrind */
3363 0 : if (rlimit(RLIMIT_DATA) == 0 &&
3364 0 : mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT)
3365 : return true;
3366 :
3367 0 : pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits%s.\n",
3368 : current->comm, current->pid,
3369 : (mm->data_vm + npages) << PAGE_SHIFT,
3370 : rlimit(RLIMIT_DATA),
3371 : ignore_rlimit_data ? "" : " or use boot option ignore_rlimit_data");
3372 :
3373 0 : if (!ignore_rlimit_data)
3374 0 : return false;
3375 : }
3376 :
3377 : return true;
3378 : }
3379 :
3380 64434 : void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
3381 : {
3382 64434 : mm->total_vm += npages;
3383 :
3384 64434 : if (is_exec_mapping(flags))
3385 8015 : mm->exec_vm += npages;
3386 56419 : else if (is_stack_mapping(flags))
3387 954 : mm->stack_vm += npages;
3388 55465 : else if (is_data_mapping(flags))
3389 16349 : mm->data_vm += npages;
3390 64434 : }
3391 :
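/*
 * Editorial sketch of the typical pairing of the two helpers above,
 * mirroring how the mapping paths in this file use them: check the
 * rlimits before creating the mapping, then account the pages once
 * the mapping is committed. 'example_account_mapping' is hypothetical.
 */
static int example_account_mapping(struct mm_struct *mm,
		vm_flags_t vm_flags, unsigned long len)
{
	if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT))
		return -ENOMEM;
	/* ... establish the mapping here ... */
	vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
	return 0;
}
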
3392 : static vm_fault_t special_mapping_fault(struct vm_fault *vmf);
3393 :
3394 : /*
3395 : * Having a close hook prevents vma merging regardless of flags.
3396 : */
3397 4316 : static void special_mapping_close(struct vm_area_struct *vma)
3398 : {
3399 4316 : }
3400 :
3401 0 : static const char *special_mapping_name(struct vm_area_struct *vma)
3402 : {
3403 0 : return ((struct vm_special_mapping *)vma->vm_private_data)->name;
3404 : }
3405 :
3406 0 : static int special_mapping_mremap(struct vm_area_struct *new_vma,
3407 : unsigned long flags)
3408 : {
3409 0 : struct vm_special_mapping *sm = new_vma->vm_private_data;
3410 :
3411 0 : if (flags & MREMAP_DONTUNMAP)
3412 : return -EINVAL;
3413 :
3414 0 : if (WARN_ON_ONCE(current->mm != new_vma->vm_mm))
3415 : return -EFAULT;
3416 :
3417 0 : if (sm->mremap)
3418 0 : return sm->mremap(sm, new_vma);
3419 :
3420 : return 0;
3421 : }
3422 :
3423 0 : static int special_mapping_split(struct vm_area_struct *vma, unsigned long addr)
3424 : {
3425 : /*
3426 : * Forbid splitting special mappings - the kernel has expectations
3427 : * about the number of pages in the mapping. Together with
3428 : * VM_DONTEXPAND, the size of the vma should stay the same over the
3429 : * special mapping's lifetime.
3430 : */
3431 0 : return -EINVAL;
3432 : }
3433 :
3434 : static const struct vm_operations_struct special_mapping_vmops = {
3435 : .close = special_mapping_close,
3436 : .fault = special_mapping_fault,
3437 : .mremap = special_mapping_mremap,
3438 : .name = special_mapping_name,
3439 : /* vDSO code relies on VVAR not being accessible remotely */
3440 : .access = NULL,
3441 : .may_split = special_mapping_split,
3442 : };
3443 :
3444 : static const struct vm_operations_struct legacy_special_mapping_vmops = {
3445 : .close = special_mapping_close,
3446 : .fault = special_mapping_fault,
3447 : };
3448 :
3449 1096 : static vm_fault_t special_mapping_fault(struct vm_fault *vmf)
3450 : {
3451 1096 : struct vm_area_struct *vma = vmf->vma;
3452 1096 : pgoff_t pgoff;
3453 1096 : struct page **pages;
3454 :
3455 1096 : if (vma->vm_ops == &legacy_special_mapping_vmops) {
3456 0 : pages = vma->vm_private_data;
3457 : } else {
3458 1096 : struct vm_special_mapping *sm = vma->vm_private_data;
3459 :
3460 1096 : if (sm->fault)
3461 1096 : return sm->fault(sm, vmf->vma, vmf);
3462 :
3463 0 : pages = sm->pages;
3464 : }
3465 :
3466 0 : for (pgoff = vmf->pgoff; pgoff && *pages; ++pages)
3467 0 : pgoff--;
3468 :
3469 0 : if (*pages) {
3470 0 : struct page *page = *pages;
3471 0 : get_page(page);
3472 0 : vmf->page = page;
3473 0 : return 0;
3474 : }
3475 :
3476 : return VM_FAULT_SIGBUS;
3477 : }
3478 :
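/*
 * Worked example of the page-array lookup above (illustrative values):
 * with sm->pages = { pageA, pageB, NULL } and vmf->pgoff == 1, the loop
 * stops at pageB, which is pinned via get_page() and returned; with
 * vmf->pgoff == 2 the loop reaches the NULL terminator and the fault
 * returns VM_FAULT_SIGBUS.
 */
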
3479 1876 : static struct vm_area_struct *__install_special_mapping(
3480 : struct mm_struct *mm,
3481 : unsigned long addr, unsigned long len,
3482 : unsigned long vm_flags, void *priv,
3483 : const struct vm_operations_struct *ops)
3484 : {
3485 1876 : int ret;
3486 1876 : struct vm_area_struct *vma;
3487 :
3488 1876 : vma = vm_area_alloc(mm);
3489 1876 : if (unlikely(vma == NULL))
3490 1876 : return ERR_PTR(-ENOMEM);
3491 :
3492 1876 : vma->vm_start = addr;
3493 1876 : vma->vm_end = addr + len;
3494 :
3495 1876 : vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
3496 1876 : vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
3497 :
3498 1876 : vma->vm_ops = ops;
3499 1876 : vma->vm_private_data = priv;
3500 :
3501 1876 : ret = insert_vm_struct(mm, vma);
3502 1876 : if (ret)
3503 0 : goto out;
3504 :
3505 1876 : vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
3506 :
3507 1876 : perf_event_mmap(vma);
3508 :
3509 1876 : return vma;
3510 :
3511 0 : out:
3512 0 : vm_area_free(vma);
3513 0 : return ERR_PTR(ret);
3514 : }
3515 :
3516 0 : bool vma_is_special_mapping(const struct vm_area_struct *vma,
3517 : const struct vm_special_mapping *sm)
3518 : {
3519 0 : return vma->vm_private_data == sm &&
3520 0 : (vma->vm_ops == &special_mapping_vmops ||
3521 : vma->vm_ops == &legacy_special_mapping_vmops);
3522 : }
3523 :
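/*
 * Editorial usage note: arch code typically uses the helper above to
 * recognise its own mapping, along these lines (the 'example_mapping'
 * descriptor is hypothetical; see the sketch that follows
 * _install_special_mapping() below):
 *
 *	if (vma_is_special_mapping(vma, &example_mapping))
 *		return true;
 */
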
3524 : /*
3525 : * Called with mm->mmap_lock held for writing.
3526 : * Insert a new vma covering the given region, with the given flags.
3527 : * Its pages are supplied by the given array of struct page *.
3528 : * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
3529 : * The region past the last page supplied will always produce SIGBUS.
3530 : * The array pointer and the pages it points to are assumed to stay alive
3531 : * for as long as this mapping might exist.
3532 : */
3533 1876 : struct vm_area_struct *_install_special_mapping(
3534 : struct mm_struct *mm,
3535 : unsigned long addr, unsigned long len,
3536 : unsigned long vm_flags, const struct vm_special_mapping *spec)
3537 : {
3538 1876 : return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec,
3539 : &special_mapping_vmops);
3540 : }
3541 :
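/*
 * Editorial sketch (not part of this file) of installing a vDSO-like
 * special mapping; the descriptor, page array, flags and function name
 * are all hypothetical.
 */
static struct page *example_pages[2]; /* [0] set at init; [1] stays NULL */

static const struct vm_special_mapping example_mapping = {
	.name  = "[example]",
	.pages = example_pages,
};

static int example_install(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma;

	vma = _install_special_mapping(mm, addr, PAGE_SIZE,
			VM_READ | VM_MAYREAD, &example_mapping);
	return PTR_ERR_OR_ZERO(vma);
}
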
3542 0 : int install_special_mapping(struct mm_struct *mm,
3543 : unsigned long addr, unsigned long len,
3544 : unsigned long vm_flags, struct page **pages)
3545 : {
3546 0 : struct vm_area_struct *vma = __install_special_mapping(
3547 : mm, addr, len, vm_flags, (void *)pages,
3548 : &legacy_special_mapping_vmops);
3549 :
3550 0 : return PTR_ERR_OR_ZERO(vma);
3551 : }
3552 :
3553 : static DEFINE_MUTEX(mm_all_locks_mutex);
3554 :
3555 0 : static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
3556 : {
3557 0 : if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
3558 : /*
3559 : * The LSB of head.next can't change from under us
3560 : * because we hold the mm_all_locks_mutex.
3561 : */
3562 0 : down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock);
3563 : /*
3564 : * We can safely modify head.next after taking the
3565 : * anon_vma->root->rwsem. If some other vma in this mm shares
3566 : * the same anon_vma we won't take it again.
3567 : *
3568 : * No need for atomic instructions here: head.next
3569 : * can't change from under us thanks to the
3570 : * anon_vma->root->rwsem.
3571 : */
3572 0 : if (__test_and_set_bit(0, (unsigned long *)
3573 0 : &anon_vma->root->rb_root.rb_root.rb_node))
3574 0 : BUG();
3575 : }
3576 0 : }
3577 :
3578 0 : static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
3579 : {
3580 0 : if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
3581 : /*
3582 : * AS_MM_ALL_LOCKS can't change from under us because
3583 : * we hold the mm_all_locks_mutex.
3584 : *
3585 : * Operations on ->flags have to be atomic because
3586 : * even if AS_MM_ALL_LOCKS is stable thanks to the
3587 : * mm_all_locks_mutex, there may be other cpus
3588 : * changing other bitflags in parallel to us.
3589 : */
3590 0 : if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
3591 0 : BUG();
3592 0 : down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock);
3593 : }
3594 0 : }
3595 :
3596 : /*
3597 : * This operation locks against the VM for all pte/vma/mm related
3598 : * operations that could ever happen on a certain mm. This includes
3599 : * vmtruncate, try_to_unmap, and all page faults.
3600 : *
3601 : * The caller must take the mmap_lock in write mode before calling
3602 : * mm_take_all_locks(). The caller isn't allowed to release the
3603 : * mmap_lock until mm_drop_all_locks() returns.
3604 : *
3605 : * mmap_lock in write mode is required in order to block all operations
3606 : * that could modify pagetables and free pages without needing to
3607 : * alter the vma layout. It's also needed in write mode to avoid new
3608 : * anon_vmas being associated with existing vmas.
3609 : *
3610 : * A single task can't take more than one mm_take_all_locks() in a row
3611 : * or it would deadlock.
3612 : *
3613 : * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
3614 : * mapping->flags are used to avoid taking the same lock twice, if more
3615 : * than one vma in this mm is backed by the same anon_vma or address_space.
3616 : *
3617 : * We take locks in the following order, according to the comment at the
3618 : * beginning of mm/rmap.c:
3619 : * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
3620 : * hugetlb mappings);
3621 : * - all i_mmap_rwsem locks;
3622 : * - all anon_vma->rwsem locks.
3623 : *
3624 : * We can take all locks within these types randomly because the VM code
3625 : * doesn't nest them, and we are protected from parallel mm_take_all_locks()
3626 : * by mm_all_locks_mutex.
3627 : *
3628 : * mm_take_all_locks() and mm_drop_all_locks() are expensive operations
3629 : * that may have to take thousands of locks.
3630 : *
3631 : * mm_take_all_locks() can fail if it's interrupted by signals.
3632 : */
3633 0 : int mm_take_all_locks(struct mm_struct *mm)
3634 : {
3635 0 : struct vm_area_struct *vma;
3636 0 : struct anon_vma_chain *avc;
3637 :
3638 0 : BUG_ON(mmap_read_trylock(mm));
3639 :
3640 0 : mutex_lock(&mm_all_locks_mutex);
3641 :
3642 0 : for (vma = mm->mmap; vma; vma = vma->vm_next) {
3643 0 : if (signal_pending(current))
3644 0 : goto out_unlock;
3645 0 : if (vma->vm_file && vma->vm_file->f_mapping &&
3646 0 : is_vm_hugetlb_page(vma))
3647 : vm_lock_mapping(mm, vma->vm_file->f_mapping);
3648 : }
3649 :
3650 0 : for (vma = mm->mmap; vma; vma = vma->vm_next) {
3651 0 : if (signal_pending(current))
3652 0 : goto out_unlock;
3653 0 : if (vma->vm_file && vma->vm_file->f_mapping &&
3654 0 : !is_vm_hugetlb_page(vma))
3655 0 : vm_lock_mapping(mm, vma->vm_file->f_mapping);
3656 : }
3657 :
3658 0 : for (vma = mm->mmap; vma; vma = vma->vm_next) {
3659 0 : if (signal_pending(current))
3660 0 : goto out_unlock;
3661 0 : if (vma->anon_vma)
3662 0 : list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
3663 0 : vm_lock_anon_vma(mm, avc->anon_vma);
3664 : }
3665 :
3666 : return 0;
3667 :
3668 0 : out_unlock:
3669 0 : mm_drop_all_locks(mm);
3670 0 : return -EINTR;
3671 : }
3672 :
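/*
 * Editorial sketch of the calling pattern for mm_take_all_locks()
 * (e.g. mmu notifier registration follows this shape; the function
 * name is hypothetical). On failure, mm_take_all_locks() has already
 * dropped whatever it took, so mm_drop_all_locks() is only called on
 * success.
 */
static int example_all_locks_user(struct mm_struct *mm)
{
	int ret;

	mmap_write_lock(mm);
	ret = mm_take_all_locks(mm);
	if (!ret) {
		/* ... work that must exclude all rmap and fault users ... */
		mm_drop_all_locks(mm);
	}
	mmap_write_unlock(mm);
	return ret;
}
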
3673 0 : static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
3674 : {
3675 0 : if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
3676 : /*
3677 : * The LSB of head.next can't change to 0 from under
3678 : * us because we hold the mm_all_locks_mutex.
3679 : *
3680 : * We must, however, clear the bitflag before unlocking
3681 : * the vma so that users of the anon_vma->rb_root
3682 : * never see our bitflag.
3683 : *
3684 : * No need for atomic instructions here: head.next
3685 : * can't change from under us until we release the
3686 : * anon_vma->root->rwsem.
3687 : */
3688 0 : if (!__test_and_clear_bit(0, (unsigned long *)
3689 0 : &anon_vma->root->rb_root.rb_root.rb_node))
3690 0 : BUG();
3691 0 : anon_vma_unlock_write(anon_vma);
3692 : }
3693 0 : }
3694 :
3695 0 : static void vm_unlock_mapping(struct address_space *mapping)
3696 : {
3697 0 : if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
3698 : /*
3699 : * AS_MM_ALL_LOCKS can't change to 0 from under us
3700 : * because we hold the mm_all_locks_mutex.
3701 : */
3702 0 : i_mmap_unlock_write(mapping);
3703 0 : if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
3704 : &mapping->flags))
3705 0 : BUG();
3706 : }
3707 0 : }
3708 :
3709 : /*
3710 : * The mmap_lock cannot be released by the caller until
3711 : * mm_drop_all_locks() returns.
3712 : */
3713 0 : void mm_drop_all_locks(struct mm_struct *mm)
3714 : {
3715 0 : struct vm_area_struct *vma;
3716 0 : struct anon_vma_chain *avc;
3717 :
3718 0 : BUG_ON(mmap_read_trylock(mm));
3719 0 : BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
3720 :
3721 0 : for (vma = mm->mmap; vma; vma = vma->vm_next) {
3722 0 : if (vma->anon_vma)
3723 0 : list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
3724 0 : vm_unlock_anon_vma(avc->anon_vma);
3725 0 : if (vma->vm_file && vma->vm_file->f_mapping)
3726 0 : vm_unlock_mapping(vma->vm_file->f_mapping);
3727 : }
3728 :
3729 0 : mutex_unlock(&mm_all_locks_mutex);
3730 0 : }
3731 :
3732 : /*
3733 : * Initialise the percpu counter for VM.
3734 : */
3735 1 : void __init mmap_init(void)
3736 : {
3737 1 : int ret;
3738 :
3739 1 : ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
3740 1 : VM_BUG_ON(ret);
3741 1 : }
3742 :
3743 : /*
3744 : * Initialise sysctl_user_reserve_kbytes.
3745 : *
3746 : * This is intended to prevent a user from starting a single memory-hogging
3747 : * process such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
3748 : * mode.
3749 : *
3750 : * The default value is min(3% of free memory, 128MB)
3751 : * 128MB is enough to recover with sshd/login, bash, and top/kill.
3752 : */
3753 1 : static int init_user_reserve(void)
3754 : {
3755 1 : unsigned long free_kbytes;
3756 :
3757 1 : free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3758 :
3759 1 : sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
3760 1 : return 0;
3761 : }
3762 : subsys_initcall(init_user_reserve);
3763 :
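/*
 * Worked numbers for the clamp above: free_kbytes / 32 is roughly
 * 3.125% of free memory, and 1UL << 17 kbytes = 131072 kB = 128 MB,
 * matching the "min(3% of free memory, 128MB)" default described in
 * the comment.
 */
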
3764 : /*
3765 : * Initialise sysctl_admin_reserve_kbytes.
3766 : *
3767 : * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
3768 : * to log in and kill a memory hogging process.
3769 : *
3770 : * Systems with more than 256MB will reserve 8MB, enough to recover
3771 : * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
3772 : * only reserve 3% of free pages by default.
3773 : */
3774 1 : static int init_admin_reserve(void)
3775 : {
3776 1 : unsigned long free_kbytes;
3777 :
3778 1 : free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3779 :
3780 1 : sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
3781 1 : return 0;
3782 : }
3783 : subsys_initcall(init_admin_reserve);
3784 :
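/*
 * Likewise for the admin reserve below: 1UL << 13 kbytes = 8192 kB = 8 MB,
 * so the default is min(~3% of free memory, 8 MB).
 */
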
3785 : /*
3786 : * Reinitialise user and admin reserves if memory is added or removed.
3787 : *
3788 : * The default user reserve max is 128MB, and the default max for the
3789 : * admin reserve is 8MB. These are usually, but not always, enough to
3790 : * enable recovery from a memory-hogging process using login/sshd, a shell,
3791 : * and tools like top. It may make sense to increase or even disable the
3792 : * reserve depending on the existence of swap or variations in the recovery
3793 : * tools. So, the admin may have changed them.
3794 : *
3795 : * If memory is added and the reserves have been eliminated or increased above
3796 : * the default max, then we'll trust the admin.
3797 : *
3798 : * If memory is removed and there isn't enough free memory, then we
3799 : * need to reset the reserves.
3800 : *
3801 : * Otherwise keep the reserve set by the admin.
3802 : */
3803 : static int reserve_mem_notifier(struct notifier_block *nb,
3804 : unsigned long action, void *data)
3805 : {
3806 : unsigned long tmp, free_kbytes;
3807 :
3808 : switch (action) {
3809 : case MEM_ONLINE:
3810 : /* Default max is 128MB. Leave alone if modified by operator. */
3811 : tmp = sysctl_user_reserve_kbytes;
3812 : if (0 < tmp && tmp < (1UL << 17))
3813 : init_user_reserve();
3814 :
3815 : /* Default max is 8MB. Leave alone if modified by operator. */
3816 : tmp = sysctl_admin_reserve_kbytes;
3817 : if (0 < tmp && tmp < (1UL << 13))
3818 : init_admin_reserve();
3819 :
3820 : break;
3821 : case MEM_OFFLINE:
3822 : free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3823 :
3824 : if (sysctl_user_reserve_kbytes > free_kbytes) {
3825 : init_user_reserve();
3826 : pr_info("vm.user_reserve_kbytes reset to %lu\n",
3827 : sysctl_user_reserve_kbytes);
3828 : }
3829 :
3830 : if (sysctl_admin_reserve_kbytes > free_kbytes) {
3831 : init_admin_reserve();
3832 : pr_info("vm.admin_reserve_kbytes reset to %lu\n",
3833 : sysctl_admin_reserve_kbytes);
3834 : }
3835 : break;
3836 : default:
3837 : break;
3838 : }
3839 : return NOTIFY_OK;
3840 : }
3841 :
3842 : static struct notifier_block reserve_mem_nb = {
3843 : .notifier_call = reserve_mem_notifier,
3844 : };
3845 :
3846 1 : static int __meminit init_reserve_notifier(void)
3847 : {
3848 1 : if (register_hotmemory_notifier(&reserve_mem_nb))
3849 : pr_err("Failed registering memory add/remove notifier for admin reserve\n");
3850 :
3851 1 : return 0;
3852 : }
3853 : subsys_initcall(init_reserve_notifier);