LCOV - code coverage report
Current view: top level - mm - mremap.c (source / functions)
Test:         landlock.info
Date:         2021-04-22 12:43:58
                             Hit    Total    Coverage
Lines:                       138    423      32.6 %
Functions:                   8      18       44.4 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  *      mm/mremap.c
       4             :  *
       5             :  *      (C) Copyright 1996 Linus Torvalds
       6             :  *
       7             :  *      Address space accounting code   <alan@lxorguk.ukuu.org.uk>
       8             :  *      (C) Copyright 2002 Red Hat Inc, All Rights Reserved
       9             :  */
      10             : 
      11             : #include <linux/mm.h>
      12             : #include <linux/hugetlb.h>
      13             : #include <linux/shm.h>
      14             : #include <linux/ksm.h>
      15             : #include <linux/mman.h>
      16             : #include <linux/swap.h>
      17             : #include <linux/capability.h>
      18             : #include <linux/fs.h>
      19             : #include <linux/swapops.h>
      20             : #include <linux/highmem.h>
      21             : #include <linux/security.h>
      22             : #include <linux/syscalls.h>
      23             : #include <linux/mmu_notifier.h>
      24             : #include <linux/uaccess.h>
      25             : #include <linux/userfaultfd_k.h>
      26             : 
      27             : #include <asm/cacheflush.h>
      28             : #include <asm/tlbflush.h>
      29             : 
      30             : #include "internal.h"
      31             : 
      32         944 : static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
      33             : {
      34         944 :         pgd_t *pgd;
      35         944 :         p4d_t *p4d;
      36         944 :         pud_t *pud;
      37             : 
      38         944 :         pgd = pgd_offset(mm, addr);
      39         944 :         if (pgd_none_or_clear_bad(pgd))
      40             :                 return NULL;
      41             : 
      42         944 :         p4d = p4d_offset(pgd, addr);
      43         944 :         if (p4d_none_or_clear_bad(p4d))
      44             :                 return NULL;
      45             : 
      46         944 :         pud = pud_offset(p4d, addr);
      47         944 :         if (pud_none_or_clear_bad(pud))
      48           0 :                 return NULL;
      49             : 
      50             :         return pud;
      51             : }
      52             : 
      53         944 : static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
      54             : {
      55         944 :         pud_t *pud;
      56         944 :         pmd_t *pmd;
      57             : 
      58         944 :         pud = get_old_pud(mm, addr);
      59         944 :         if (!pud)
      60             :                 return NULL;
      61             : 
      62         944 :         pmd = pmd_offset(pud, addr);
      63         944 :         if (pmd_none(*pmd))
      64           0 :                 return NULL;
      65             : 
      66             :         return pmd;
      67             : }
      68             : 
      69         944 : static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma,
      70             :                             unsigned long addr)
      71             : {
      72         944 :         pgd_t *pgd;
      73         944 :         p4d_t *p4d;
      74             : 
      75         944 :         pgd = pgd_offset(mm, addr);
      76         944 :         p4d = p4d_alloc(mm, pgd, addr);
      77         944 :         if (!p4d)
      78             :                 return NULL;
      79             : 
      80         944 :         return pud_alloc(mm, p4d, addr);
      81             : }
      82             : 
      83         944 : static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
      84             :                             unsigned long addr)
      85             : {
      86         944 :         pud_t *pud;
      87         944 :         pmd_t *pmd;
      88             : 
      89         944 :         pud = alloc_new_pud(mm, vma, addr);
      90         944 :         if (!pud)
      91             :                 return NULL;
      92             : 
      93         944 :         pmd = pmd_alloc(mm, pud, addr);
      94         944 :         if (!pmd)
      95             :                 return NULL;
      96             : 
      97         944 :         VM_BUG_ON(pmd_trans_huge(*pmd));
      98             : 
      99             :         return pmd;
     100             : }
     101             : 
     102           0 : static void take_rmap_locks(struct vm_area_struct *vma)
     103             : {
     104           0 :         if (vma->vm_file)
     105           0 :                 i_mmap_lock_write(vma->vm_file->f_mapping);
     106           0 :         if (vma->anon_vma)
     107           0 :                 anon_vma_lock_write(vma->anon_vma);
     108           0 : }
     109             : 
     110           0 : static void drop_rmap_locks(struct vm_area_struct *vma)
     111             : {
     112           0 :         if (vma->anon_vma)
     113           0 :                 anon_vma_unlock_write(vma->anon_vma);
     114           0 :         if (vma->vm_file)
     115           0 :                 i_mmap_unlock_write(vma->vm_file->f_mapping);
     116           0 : }
     117             : 
     118         944 : static pte_t move_soft_dirty_pte(pte_t pte)
     119             : {
     120             :         /*
      121             :          * Set the soft-dirty bit so userspace can notice
      122             :          * that the ptes were moved.
     123             :          */
     124             : #ifdef CONFIG_MEM_SOFT_DIRTY
     125             :         if (pte_present(pte))
     126             :                 pte = pte_mksoft_dirty(pte);
     127             :         else if (is_swap_pte(pte))
     128             :                 pte = pte_swp_mksoft_dirty(pte);
     129             : #endif
     130         944 :         return pte;
     131             : }
     132             : 
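
As the comment in move_soft_dirty_pte() notes, the soft-dirty bit exists so userspace can notice that ptes were moved. Below is a minimal userspace sketch of how that bit can be read back, assuming the soft-dirty interface described in Documentation/admin-guide/mm/soft-dirty.rst (one 64-bit entry per page in /proc/<pid>/pagemap, with bit 55 being the soft-dirty flag); page_soft_dirty() is an illustrative helper name, not something defined in this file.

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

/* Return 1 if the page containing addr is soft-dirty, 0 if not, -1 on error. */
static int page_soft_dirty(const void *addr)
{
        uint64_t entry = 0;
        long page_size = sysconf(_SC_PAGESIZE);
        int fd;

        if (page_size <= 0)
                return -1;
        fd = open("/proc/self/pagemap", O_RDONLY);
        if (fd < 0)
                return -1;
        /* One 64-bit entry per page; bit 55 is the soft-dirty flag. */
        if (pread(fd, &entry, sizeof(entry),
                  ((uintptr_t)addr / page_size) * sizeof(entry)) != sizeof(entry)) {
                close(fd);
                return -1;
        }
        close(fd);
        return (int)((entry >> 55) & 1);
}
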
     133         944 : static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
     134             :                 unsigned long old_addr, unsigned long old_end,
     135             :                 struct vm_area_struct *new_vma, pmd_t *new_pmd,
     136             :                 unsigned long new_addr, bool need_rmap_locks)
     137             : {
     138         944 :         struct mm_struct *mm = vma->vm_mm;
     139         944 :         pte_t *old_pte, *new_pte, pte;
     140         944 :         spinlock_t *old_ptl, *new_ptl;
     141         944 :         bool force_flush = false;
     142         944 :         unsigned long len = old_end - old_addr;
     143             : 
     144             :         /*
     145             :          * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
     146             :          * locks to ensure that rmap will always observe either the old or the
     147             :          * new ptes. This is the easiest way to avoid races with
     148             :          * truncate_pagecache(), page migration, etc...
     149             :          *
     150             :          * When need_rmap_locks is false, we use other ways to avoid
     151             :          * such races:
     152             :          *
     153             :          * - During exec() shift_arg_pages(), we use a specially tagged vma
     154             :          *   which rmap call sites look for using vma_is_temporary_stack().
     155             :          *
     156             :          * - During mremap(), new_vma is often known to be placed after vma
     157             :          *   in rmap traversal order. This ensures rmap will always observe
     158             :          *   either the old pte, or the new pte, or both (the page table locks
     159             :          *   serialize access to individual ptes, but only rmap traversal
     160             :          *   order guarantees that we won't miss both the old and new ptes).
     161             :          */
     162         944 :         if (need_rmap_locks)
     163           0 :                 take_rmap_locks(vma);
     164             : 
     165             :         /*
     166             :          * We don't have to worry about the ordering of src and dst
     167             :          * pte locks because exclusive mmap_lock prevents deadlock.
     168             :          */
     169        1888 :         old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
     170         944 :         new_pte = pte_offset_map(new_pmd, new_addr);
     171         944 :         new_ptl = pte_lockptr(mm, new_pmd);
     172         944 :         if (new_ptl != old_ptl)
     173         944 :                 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
     174         944 :         flush_tlb_batched_pending(vma->vm_mm);
     175         944 :         arch_enter_lazy_mmu_mode();
     176             : 
     177        2832 :         for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
     178         944 :                                    new_pte++, new_addr += PAGE_SIZE) {
     179         944 :                 if (pte_none(*old_pte))
     180           0 :                         continue;
     181             : 
     182         944 :                 pte = ptep_get_and_clear(mm, old_addr, old_pte);
     183             :                 /*
     184             :                  * If we are remapping a valid PTE, make sure
     185             :                  * to flush TLB before we drop the PTL for the
     186             :                  * PTE.
     187             :                  *
     188             :                  * NOTE! Both old and new PTL matter: the old one
     189             :                  * for racing with page_mkclean(), the new one to
     190             :                  * make sure the physical page stays valid until
     191             :                  * the TLB entry for the old mapping has been
     192             :                  * flushed.
     193             :                  */
     194         944 :                 if (pte_present(pte))
     195         944 :                         force_flush = true;
     196         944 :                 pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
     197         944 :                 pte = move_soft_dirty_pte(pte);
     198         944 :                 set_pte_at(mm, new_addr, new_pte, pte);
     199             :         }
     200             : 
     201         944 :         arch_leave_lazy_mmu_mode();
     202         944 :         if (force_flush)
     203         944 :                 flush_tlb_range(vma, old_end - len, old_end);
     204         944 :         if (new_ptl != old_ptl)
     205         944 :                 spin_unlock(new_ptl);
     206         944 :         pte_unmap(new_pte - 1);
     207         944 :         pte_unmap_unlock(old_pte - 1, old_ptl);
     208         944 :         if (need_rmap_locks)
     209           0 :                 drop_rmap_locks(vma);
     210         944 : }
     211             : 
     212             : #ifdef CONFIG_HAVE_MOVE_PMD
     213           0 : static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
     214             :                   unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
     215             : {
     216           0 :         spinlock_t *old_ptl, *new_ptl;
     217           0 :         struct mm_struct *mm = vma->vm_mm;
     218           0 :         pmd_t pmd;
     219             : 
     220             :         /*
     221             :          * The destination pmd shouldn't be established, free_pgtables()
     222             :          * should have released it.
     223             :          *
     224             :          * However, there's a case during execve() where we use mremap
     225             :          * to move the initial stack, and in that case the target area
     226             :          * may overlap the source area (always moving down).
     227             :          *
     228             :          * If everything is PMD-aligned, that works fine, as moving
     229             :          * each pmd down will clear the source pmd. But if we first
     230             :          * have a few 4kB-only pages that get moved down, and then
     231             :          * hit the "now the rest is PMD-aligned, let's do everything
     232             :          * one pmd at a time", we will still have the old (now empty
     233             :          * of any 4kB pages, but still there) PMD in the page table
     234             :          * tree.
     235             :          *
     236             :          * Warn on it once - because we really should try to figure
     237             :          * out how to do this better - but then say "I won't move
     238             :          * this pmd".
     239             :          *
     240             :          * One alternative might be to just unmap the target pmd at
     241             :          * this point, and verify that it really is empty. We'll see.
     242             :          */
     243           0 :         if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
     244             :                 return false;
     245             : 
     246             :         /*
     247             :          * We don't have to worry about the ordering of src and dst
     248             :          * ptlocks because exclusive mmap_lock prevents deadlock.
     249             :          */
     250           0 :         old_ptl = pmd_lock(vma->vm_mm, old_pmd);
     251           0 :         new_ptl = pmd_lockptr(mm, new_pmd);
     252           0 :         if (new_ptl != old_ptl)
     253           0 :                 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
     254             : 
     255             :         /* Clear the pmd */
     256           0 :         pmd = *old_pmd;
     257           0 :         pmd_clear(old_pmd);
     258             : 
     259           0 :         VM_BUG_ON(!pmd_none(*new_pmd));
     260             : 
     261             :         /* Set the new pmd */
     262           0 :         set_pmd_at(mm, new_addr, new_pmd, pmd);
     263           0 :         flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
     264           0 :         if (new_ptl != old_ptl)
     265           0 :                 spin_unlock(new_ptl);
     266           0 :         spin_unlock(old_ptl);
     267             : 
     268           0 :         return true;
     269             : }
     270             : #else
     271             : static inline bool move_normal_pmd(struct vm_area_struct *vma,
     272             :                 unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd,
     273             :                 pmd_t *new_pmd)
     274             : {
     275             :         return false;
     276             : }
     277             : #endif
     278             : 
     279             : #ifdef CONFIG_HAVE_MOVE_PUD
     280           0 : static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
     281             :                   unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
     282             : {
     283           0 :         spinlock_t *old_ptl, *new_ptl;
     284           0 :         struct mm_struct *mm = vma->vm_mm;
     285           0 :         pud_t pud;
     286             : 
     287             :         /*
     288             :          * The destination pud shouldn't be established, free_pgtables()
     289             :          * should have released it.
     290             :          */
     291           0 :         if (WARN_ON_ONCE(!pud_none(*new_pud)))
     292             :                 return false;
     293             : 
     294             :         /*
     295             :          * We don't have to worry about the ordering of src and dst
     296             :          * ptlocks because exclusive mmap_lock prevents deadlock.
     297             :          */
     298           0 :         old_ptl = pud_lock(vma->vm_mm, old_pud);
     299           0 :         new_ptl = pud_lockptr(mm, new_pud);
     300           0 :         if (new_ptl != old_ptl)
     301           0 :                 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
     302             : 
     303             :         /* Clear the pud */
     304           0 :         pud = *old_pud;
     305           0 :         pud_clear(old_pud);
     306             : 
     307           0 :         VM_BUG_ON(!pud_none(*new_pud));
     308             : 
     309             :         /* Set the new pud */
     310           0 :         set_pud_at(mm, new_addr, new_pud, pud);
     311           0 :         flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
     312           0 :         if (new_ptl != old_ptl)
     313           0 :                 spin_unlock(new_ptl);
     314           0 :         spin_unlock(old_ptl);
     315             : 
     316           0 :         return true;
     317             : }
     318             : #else
     319             : static inline bool move_normal_pud(struct vm_area_struct *vma,
     320             :                 unsigned long old_addr, unsigned long new_addr, pud_t *old_pud,
     321             :                 pud_t *new_pud)
     322             : {
     323             :         return false;
     324             : }
     325             : #endif
     326             : 
     327             : enum pgt_entry {
     328             :         NORMAL_PMD,
     329             :         HPAGE_PMD,
     330             :         NORMAL_PUD,
     331             : };
     332             : 
     333             : /*
      334             :  * Returns an extent of the corresponding size for the specified pgt_entry if
      335             :  * the addresses allow it. Otherwise returns a smaller extent bounded by the
      336             :  * end of the source range and the source/destination pgt_entry boundaries.
     337             :  */
     338        1888 : static __always_inline unsigned long get_extent(enum pgt_entry entry,
     339             :                         unsigned long old_addr, unsigned long old_end,
     340             :                         unsigned long new_addr)
     341             : {
     342        1888 :         unsigned long next, extent, mask, size;
     343             : 
     344        1888 :         switch (entry) {
     345             :         case HPAGE_PMD:
     346             :         case NORMAL_PMD:
     347             :                 mask = PMD_MASK;
     348             :                 size = PMD_SIZE;
     349             :                 break;
     350             :         case NORMAL_PUD:
     351         944 :                 mask = PUD_MASK;
     352         944 :                 size = PUD_SIZE;
     353         944 :                 break;
     354             :         default:
     355             :                 BUILD_BUG();
     356             :                 break;
     357             :         }
     358             : 
     359        1888 :         next = (old_addr + size) & mask;
     360             :         /* even if next overflowed, extent below will be ok */
     361        1888 :         extent = next - old_addr;
     362        1888 :         if (extent > old_end - old_addr)
     363             :                 extent = old_end - old_addr;
     364        1888 :         next = (new_addr + size) & mask;
     365        1888 :         if (extent > next - new_addr)
     366             :                 extent = next - new_addr;
     367        1888 :         return extent;
     368             : }
     369             : 
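
To make the boundary arithmetic in get_extent() concrete, here is a small standalone sketch of the same calculation, assuming the x86-64 defaults of 4 KiB pages and a 2 MiB PMD_SIZE; extent_for() is an illustrative stand-in for get_extent(NORMAL_PMD, ...), not part of this file.

#include <stdio.h>

#define PMD_SIZE (1UL << 21)            /* assumed 2 MiB */
#define PMD_MASK (~(PMD_SIZE - 1))

/* The extent is clipped to the next PMD boundary of the source, the end of
 * the source range, and the next PMD boundary of the destination. */
static unsigned long extent_for(unsigned long old_addr, unsigned long old_end,
                                unsigned long new_addr)
{
        unsigned long next, extent;

        next = (old_addr + PMD_SIZE) & PMD_MASK;
        extent = next - old_addr;
        if (extent > old_end - old_addr)
                extent = old_end - old_addr;
        next = (new_addr + PMD_SIZE) & PMD_MASK;
        if (extent > next - new_addr)
                extent = next - new_addr;
        return extent;
}

int main(void)
{
        /* Source is 16 KiB below a PMD boundary, destination 8 KiB below one,
         * so only 8 KiB can be moved before a boundary is hit: prints 8192. */
        printf("%lu\n", extent_for(0x3fc000, 0x600000, 0x7fe000));
        return 0;
}
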
     370             : /*
      371             :  * Attempts to speed up the move by moving the entry at the level corresponding
      372             :  * to pgt_entry. Returns true if the move was successful, else false.
     373             :  */
     374           0 : static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
     375             :                         unsigned long old_addr, unsigned long new_addr,
     376             :                         void *old_entry, void *new_entry, bool need_rmap_locks)
     377             : {
     378           0 :         bool moved = false;
     379             : 
     380             :         /* See comment in move_ptes() */
     381           0 :         if (need_rmap_locks)
     382           0 :                 take_rmap_locks(vma);
     383             : 
     384           0 :         switch (entry) {
     385           0 :         case NORMAL_PMD:
     386           0 :                 moved = move_normal_pmd(vma, old_addr, new_addr, old_entry,
     387             :                                         new_entry);
     388           0 :                 break;
     389           0 :         case NORMAL_PUD:
     390           0 :                 moved = move_normal_pud(vma, old_addr, new_addr, old_entry,
     391             :                                         new_entry);
     392           0 :                 break;
     393           0 :         case HPAGE_PMD:
     394           0 :                 moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
     395           0 :                         move_huge_pmd(vma, old_addr, new_addr, old_entry,
     396             :                                       new_entry);
     397           0 :                 break;
     398             :         default:
     399           0 :                 WARN_ON_ONCE(1);
     400           0 :                 break;
     401             :         }
     402             : 
     403           0 :         if (need_rmap_locks)
     404           0 :                 drop_rmap_locks(vma);
     405             : 
     406           0 :         return moved;
     407             : }
     408             : 
     409         944 : unsigned long move_page_tables(struct vm_area_struct *vma,
     410             :                 unsigned long old_addr, struct vm_area_struct *new_vma,
     411             :                 unsigned long new_addr, unsigned long len,
     412             :                 bool need_rmap_locks)
     413             : {
     414         944 :         unsigned long extent, old_end;
     415         944 :         struct mmu_notifier_range range;
     416         944 :         pmd_t *old_pmd, *new_pmd;
     417             : 
     418         944 :         old_end = old_addr + len;
     419         944 :         flush_cache_range(vma, old_addr, old_end);
     420             : 
     421         944 :         mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
     422             :                                 old_addr, old_end);
     423        1888 :         mmu_notifier_invalidate_range_start(&range);
     424             : 
     425        1888 :         for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
     426         944 :                 cond_resched();
     427             :                 /*
      428             :                  * If the extent is PUD-sized, try to speed up the move by
      429             :                  * moving at the PUD level if possible.
     430             :                  */
     431         944 :                 extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);
     432         944 :                 if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
     433           0 :                         pud_t *old_pud, *new_pud;
     434             : 
     435           0 :                         old_pud = get_old_pud(vma->vm_mm, old_addr);
     436           0 :                         if (!old_pud)
     437           0 :                                 continue;
     438           0 :                         new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
     439           0 :                         if (!new_pud)
     440             :                                 break;
     441           0 :                         if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
     442             :                                            old_pud, new_pud, need_rmap_locks))
     443           0 :                                 continue;
     444             :                 }
     445             : 
     446         944 :                 extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr);
     447         944 :                 old_pmd = get_old_pmd(vma->vm_mm, old_addr);
     448         944 :                 if (!old_pmd)
     449           0 :                         continue;
     450         944 :                 new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
     451         944 :                 if (!new_pmd)
     452             :                         break;
     453         944 :                 if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
     454         944 :                     pmd_devmap(*old_pmd)) {
     455           0 :                         if (extent == HPAGE_PMD_SIZE &&
     456           0 :                             move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
     457             :                                            old_pmd, new_pmd, need_rmap_locks))
     458           0 :                                 continue;
     459           0 :                         split_huge_pmd(vma, old_pmd, old_addr);
     460           0 :                         if (pmd_trans_unstable(old_pmd))
     461           0 :                                 continue;
     462         944 :                 } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
     463             :                            extent == PMD_SIZE) {
     464             :                         /*
      465             :                          * If the extent is PMD-sized, try to speed up the move by
     466             :                          * moving at the PMD level if possible.
     467             :                          */
     468           0 :                         if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
     469             :                                            old_pmd, new_pmd, need_rmap_locks))
     470           0 :                                 continue;
     471             :                 }
     472             : 
     473         944 :                 if (pte_alloc(new_vma->vm_mm, new_pmd))
     474             :                         break;
     475         944 :                 move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
     476             :                           new_pmd, new_addr, need_rmap_locks);
     477             :         }
     478             : 
     479         944 :         mmu_notifier_invalidate_range_end(&range);
     480             : 
     481         944 :         return len + old_addr - old_end;        /* how much done */
     482             : }
     483             : 
     484           0 : static unsigned long move_vma(struct vm_area_struct *vma,
     485             :                 unsigned long old_addr, unsigned long old_len,
     486             :                 unsigned long new_len, unsigned long new_addr,
     487             :                 bool *locked, unsigned long flags,
     488             :                 struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
     489             : {
     490           0 :         struct mm_struct *mm = vma->vm_mm;
     491           0 :         struct vm_area_struct *new_vma;
     492           0 :         unsigned long vm_flags = vma->vm_flags;
     493           0 :         unsigned long new_pgoff;
     494           0 :         unsigned long moved_len;
     495           0 :         unsigned long excess = 0;
     496           0 :         unsigned long hiwater_vm;
     497           0 :         int split = 0;
     498           0 :         int err = 0;
     499           0 :         bool need_rmap_locks;
     500             : 
     501             :         /*
      502             :          * We'd prefer to avoid failure later on in do_munmap(),
      503             :          * which may split one vma into three before unmapping.
     504             :          */
     505           0 :         if (mm->map_count >= sysctl_max_map_count - 3)
     506             :                 return -ENOMEM;
     507             : 
     508           0 :         if (vma->vm_ops && vma->vm_ops->may_split) {
     509           0 :                 if (vma->vm_start != old_addr)
     510           0 :                         err = vma->vm_ops->may_split(vma, old_addr);
     511           0 :                 if (!err && vma->vm_end != old_addr + old_len)
     512           0 :                         err = vma->vm_ops->may_split(vma, old_addr + old_len);
     513           0 :                 if (err)
     514           0 :                         return err;
     515             :         }
     516             : 
     517             :         /*
     518             :          * Advise KSM to break any KSM pages in the area to be moved:
     519             :          * it would be confusing if they were to turn up at the new
     520             :          * location, where they happen to coincide with different KSM
     521             :          * pages recently unmapped.  But leave vma->vm_flags as it was,
     522             :          * so KSM can come around to merge on vma and new_vma afterwards.
     523             :          */
     524           0 :         err = ksm_madvise(vma, old_addr, old_addr + old_len,
     525             :                                                 MADV_UNMERGEABLE, &vm_flags);
     526           0 :         if (err)
     527           0 :                 return err;
     528             : 
     529           0 :         if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT)) {
     530           0 :                 if (security_vm_enough_memory_mm(mm, new_len >> PAGE_SHIFT))
     531             :                         return -ENOMEM;
     532             :         }
     533             : 
     534           0 :         new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
     535           0 :         new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
     536             :                            &need_rmap_locks);
     537           0 :         if (!new_vma) {
     538           0 :                 if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT))
     539           0 :                         vm_unacct_memory(new_len >> PAGE_SHIFT);
     540           0 :                 return -ENOMEM;
     541             :         }
     542             : 
     543           0 :         moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
     544             :                                      need_rmap_locks);
     545           0 :         if (moved_len < old_len) {
     546             :                 err = -ENOMEM;
     547           0 :         } else if (vma->vm_ops && vma->vm_ops->mremap) {
     548           0 :                 err = vma->vm_ops->mremap(new_vma, flags);
     549             :         }
     550             : 
     551           0 :         if (unlikely(err)) {
     552             :                 /*
     553             :                  * On error, move entries back from new area to old,
      554             :                  * which will succeed since the page tables are still there,
     555             :                  * and then proceed to unmap new area instead of old.
     556             :                  */
     557           0 :                 move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
     558             :                                  true);
     559           0 :                 vma = new_vma;
     560           0 :                 old_len = new_len;
     561           0 :                 old_addr = new_addr;
     562           0 :                 new_addr = err;
     563             :         } else {
     564           0 :                 mremap_userfaultfd_prep(new_vma, uf);
     565             :         }
     566             : 
     567             :         /* Conceal VM_ACCOUNT so old reservation is not undone */
     568           0 :         if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
     569           0 :                 vma->vm_flags &= ~VM_ACCOUNT;
     570           0 :                 excess = vma->vm_end - vma->vm_start - old_len;
     571           0 :                 if (old_addr > vma->vm_start &&
     572           0 :                     old_addr + old_len < vma->vm_end)
     573           0 :                         split = 1;
     574             :         }
     575             : 
     576             :         /*
     577             :          * If we failed to move page tables we still do total_vm increment
     578             :          * since do_munmap() will decrement it by old_len == new_len.
     579             :          *
     580             :          * Since total_vm is about to be raised artificially high for a
     581             :          * moment, we need to restore high watermark afterwards: if stats
     582             :          * are taken meanwhile, total_vm and hiwater_vm appear too high.
     583             :          * If this were a serious issue, we'd add a flag to do_munmap().
     584             :          */
     585           0 :         hiwater_vm = mm->hiwater_vm;
     586           0 :         vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);
     587             : 
      588             :         /* Tell the pfn-tracking code that the pfnmap has moved from this vma */
     589           0 :         if (unlikely(vma->vm_flags & VM_PFNMAP))
     590           0 :                 untrack_pfn_moved(vma);
     591             : 
     592           0 :         if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
     593             :                 /* We always clear VM_LOCKED[ONFAULT] on the old vma */
     594           0 :                 vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
     595             : 
     596             :                 /*
      597             :                  * anon_vma links of the old vma are no longer needed after its page
     598             :                  * table has been moved.
     599             :                  */
     600           0 :                 if (new_vma != vma && vma->vm_start == old_addr &&
     601           0 :                         vma->vm_end == (old_addr + old_len))
     602           0 :                         unlink_anon_vmas(vma);
     603             : 
     604             :                 /* Because we won't unmap we don't need to touch locked_vm */
     605           0 :                 return new_addr;
     606             :         }
     607             : 
     608           0 :         if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
     609             :                 /* OOM: unable to split vma, just get accounts right */
     610           0 :                 if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
     611           0 :                         vm_acct_memory(new_len >> PAGE_SHIFT);
     612             :                 excess = 0;
     613             :         }
     614             : 
     615           0 :         if (vm_flags & VM_LOCKED) {
     616           0 :                 mm->locked_vm += new_len >> PAGE_SHIFT;
     617           0 :                 *locked = true;
     618             :         }
     619             : 
     620           0 :         mm->hiwater_vm = hiwater_vm;
     621             : 
     622             :         /* Restore VM_ACCOUNT if one or two pieces of vma left */
     623           0 :         if (excess) {
     624           0 :                 vma->vm_flags |= VM_ACCOUNT;
     625           0 :                 if (split)
     626           0 :                         vma->vm_next->vm_flags |= VM_ACCOUNT;
     627             :         }
     628             : 
     629             :         return new_addr;
     630             : }
     631             : 
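
move_vma() is also where the MREMAP_DONTUNMAP bookkeeping happens: the page tables move to the new address while the old range stays mapped but empty, which schemes such as userfaultfd-based snapshotting can rely on. A hedged userspace sketch of that call follows, assuming a private anonymous mapping (vma_to_resize() below rejects anything else) and equal old/new lengths (enforced by the syscall); steal_pages() is an illustrative name, and the fallback #define is only for older libc headers.

#define _GNU_SOURCE
#include <stddef.h>
#include <sys/mman.h>

#ifndef MREMAP_DONTUNMAP
#define MREMAP_DONTUNMAP 4      /* uapi value; define only as a fallback */
#endif

/* Move len bytes of private anonymous memory to a kernel-chosen address,
 * leaving the old range mapped but empty. Returns the new address, or
 * MAP_FAILED on error. old_len must equal new_len for MREMAP_DONTUNMAP. */
static void *steal_pages(void *old, size_t len)
{
        return mremap(old, len, len, MREMAP_MAYMOVE | MREMAP_DONTUNMAP);
}
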
     632           0 : static struct vm_area_struct *vma_to_resize(unsigned long addr,
     633             :         unsigned long old_len, unsigned long new_len, unsigned long flags,
     634             :         unsigned long *p)
     635             : {
     636           0 :         struct mm_struct *mm = current->mm;
     637           0 :         struct vm_area_struct *vma = find_vma(mm, addr);
     638           0 :         unsigned long pgoff;
     639             : 
     640           0 :         if (!vma || vma->vm_start > addr)
     641           0 :                 return ERR_PTR(-EFAULT);
     642             : 
     643             :         /*
     644             :          * !old_len is a special case where an attempt is made to 'duplicate'
     645             :          * a mapping.  This makes no sense for private mappings as it will
     646             :          * instead create a fresh/new mapping unrelated to the original.  This
     647             :          * is contrary to the basic idea of mremap which creates new mappings
     648             :          * based on the original.  There are no known use cases for this
     649             :          * behavior.  As a result, fail such attempts.
     650             :          */
     651           0 :         if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
     652           0 :                 pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap.  This is not supported.\n", current->comm, current->pid);
     653           0 :                 return ERR_PTR(-EINVAL);
     654             :         }
     655             : 
     656           0 :         if (flags & MREMAP_DONTUNMAP && (!vma_is_anonymous(vma) ||
     657           0 :                         vma->vm_flags & VM_SHARED))
     658           0 :                 return ERR_PTR(-EINVAL);
     659             : 
     660           0 :         if (is_vm_hugetlb_page(vma))
     661             :                 return ERR_PTR(-EINVAL);
     662             : 
     663             :         /* We can't remap across vm area boundaries */
     664           0 :         if (old_len > vma->vm_end - addr)
     665           0 :                 return ERR_PTR(-EFAULT);
     666             : 
     667           0 :         if (new_len == old_len)
     668             :                 return vma;
     669             : 
     670             :         /* Need to be careful about a growing mapping */
     671           0 :         pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
     672           0 :         pgoff += vma->vm_pgoff;
     673           0 :         if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
     674           0 :                 return ERR_PTR(-EINVAL);
     675             : 
     676           0 :         if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
     677           0 :                 return ERR_PTR(-EFAULT);
     678             : 
     679           0 :         if (vma->vm_flags & VM_LOCKED) {
     680           0 :                 unsigned long locked, lock_limit;
     681           0 :                 locked = mm->locked_vm << PAGE_SHIFT;
     682           0 :                 lock_limit = rlimit(RLIMIT_MEMLOCK);
     683           0 :                 locked += new_len - old_len;
     684           0 :                 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
     685           0 :                         return ERR_PTR(-EAGAIN);
     686             :         }
     687             : 
     688           0 :         if (!may_expand_vm(mm, vma->vm_flags,
     689           0 :                                 (new_len - old_len) >> PAGE_SHIFT))
     690           0 :                 return ERR_PTR(-ENOMEM);
     691             : 
     692           0 :         if (vma->vm_flags & VM_ACCOUNT) {
     693           0 :                 unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
     694           0 :                 if (security_vm_enough_memory_mm(mm, charged))
     695           0 :                         return ERR_PTR(-ENOMEM);
     696           0 :                 *p = charged;
     697             :         }
     698             : 
     699             :         return vma;
     700             : }
     701             : 
     702           0 : static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
     703             :                 unsigned long new_addr, unsigned long new_len, bool *locked,
     704             :                 unsigned long flags, struct vm_userfaultfd_ctx *uf,
     705             :                 struct list_head *uf_unmap_early,
     706             :                 struct list_head *uf_unmap)
     707             : {
     708           0 :         struct mm_struct *mm = current->mm;
     709           0 :         struct vm_area_struct *vma;
     710           0 :         unsigned long ret = -EINVAL;
     711           0 :         unsigned long charged = 0;
     712           0 :         unsigned long map_flags = 0;
     713             : 
     714           0 :         if (offset_in_page(new_addr))
     715           0 :                 goto out;
     716             : 
     717           0 :         if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
     718           0 :                 goto out;
     719             : 
     720             :         /* Ensure the old/new locations do not overlap */
     721           0 :         if (addr + old_len > new_addr && new_addr + new_len > addr)
     722           0 :                 goto out;
     723             : 
     724             :         /*
      725             :          * move_vma() needs us to stay 4 maps below the threshold, otherwise
      726             :          * it will bail out at the very beginning.
      727             :          * That is a problem if we have already unmapped the regions here
      728             :          * (new_addr and old_addr), because userspace will not know the
      729             :          * state of the vmas after it gets -ENOMEM.
      730             :          * So, to avoid such a scenario, we can pre-compute whether the whole
      731             :          * operation has a high chance of succeeding map-wise.
      732             :          * The worst case is when both vmas (new_addr and old_addr) get
      733             :          * split in 3 before unmapping them.
      734             :          * That means 2 more maps (1 for each) in addition to those we already hold.
      735             :          * Check whether the current map count plus 2 still leaves us 4 maps below
      736             :          * the threshold, otherwise return -ENOMEM here to be safe.
     737             :          */
     738           0 :         if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
     739             :                 return -ENOMEM;
     740             : 
     741           0 :         if (flags & MREMAP_FIXED) {
     742           0 :                 ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
     743           0 :                 if (ret)
     744           0 :                         goto out;
     745             :         }
     746             : 
     747           0 :         if (old_len >= new_len) {
     748           0 :                 ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
     749           0 :                 if (ret && old_len != new_len)
     750           0 :                         goto out;
     751             :                 old_len = new_len;
     752             :         }
     753             : 
     754           0 :         vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
     755           0 :         if (IS_ERR(vma)) {
     756           0 :                 ret = PTR_ERR(vma);
     757           0 :                 goto out;
     758             :         }
     759             : 
     760             :         /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
     761           0 :         if (flags & MREMAP_DONTUNMAP &&
     762           0 :                 !may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) {
     763           0 :                 ret = -ENOMEM;
     764           0 :                 goto out;
     765             :         }
     766             : 
     767           0 :         if (flags & MREMAP_FIXED)
     768           0 :                 map_flags |= MAP_FIXED;
     769             : 
     770           0 :         if (vma->vm_flags & VM_MAYSHARE)
     771           0 :                 map_flags |= MAP_SHARED;
     772             : 
     773           0 :         ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
     774           0 :                                 ((addr - vma->vm_start) >> PAGE_SHIFT),
     775             :                                 map_flags);
     776           0 :         if (IS_ERR_VALUE(ret))
     777           0 :                 goto out1;
     778             : 
     779             :         /* We got a new mapping */
     780           0 :         if (!(flags & MREMAP_FIXED))
     781           0 :                 new_addr = ret;
     782             : 
     783           0 :         ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
     784             :                        uf_unmap);
     785             : 
     786           0 :         if (!(offset_in_page(ret)))
     787           0 :                 goto out;
     788             : 
     789           0 : out1:
     790           0 :         vm_unacct_memory(charged);
     791             : 
     792             : out:
     793             :         return ret;
     794             : }
     795             : 
     796           0 : static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
     797             : {
     798           0 :         unsigned long end = vma->vm_end + delta;
     799           0 :         if (end < vma->vm_end) /* overflow */
     800             :                 return 0;
     801           0 :         if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */
     802             :                 return 0;
     803           0 :         if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
     804           0 :                               0, MAP_FIXED) & ~PAGE_MASK)
     805           0 :                 return 0;
     806             :         return 1;
     807             : }
     808             : 
     809             : /*
     810             :  * Expand (or shrink) an existing mapping, potentially moving it at the
     811             :  * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
     812             :  *
     813             :  * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
     814             :  * This option implies MREMAP_MAYMOVE.
     815             :  */
     816          84 : SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
     817             :                 unsigned long, new_len, unsigned long, flags,
     818             :                 unsigned long, new_addr)
     819             : {
     820          42 :         struct mm_struct *mm = current->mm;
     821          42 :         struct vm_area_struct *vma;
     822          42 :         unsigned long ret = -EINVAL;
     823          42 :         unsigned long charged = 0;
     824          42 :         bool locked = false;
     825          42 :         bool downgraded = false;
     826          42 :         struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
     827          42 :         LIST_HEAD(uf_unmap_early);
     828          42 :         LIST_HEAD(uf_unmap);
     829             : 
     830             :         /*
     831             :          * There is a deliberate asymmetry here: we strip the pointer tag
     832             :          * from the old address but leave the new address alone. This is
     833             :          * for consistency with mmap(), where we prevent the creation of
     834             :          * aliasing mappings in userspace by leaving the tag bits of the
     835             :          * mapping address intact. A non-zero tag will cause the subsequent
     836             :          * range checks to reject the address as invalid.
     837             :          *
     838             :          * See Documentation/arm64/tagged-address-abi.rst for more information.
     839             :          */
     840          42 :         addr = untagged_addr(addr);
     841             : 
     842          42 :         if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
     843             :                 return ret;
     844             : 
     845          42 :         if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
     846             :                 return ret;
     847             : 
     848             :         /*
     849             :          * MREMAP_DONTUNMAP is always a move and it does not allow resizing
     850             :          * in the process.
     851             :          */
     852          42 :         if (flags & MREMAP_DONTUNMAP &&
     853           0 :                         (!(flags & MREMAP_MAYMOVE) || old_len != new_len))
     854             :                 return ret;
     855             : 
     856             : 
     857          42 :         if (offset_in_page(addr))
     858             :                 return ret;
     859             : 
     860          42 :         old_len = PAGE_ALIGN(old_len);
     861          42 :         new_len = PAGE_ALIGN(new_len);
     862             : 
     863             :         /*
     864             :          * We allow a zero old-len as a special case
      865             :          * for the DOS-emu "duplicate shm area" thing. But
     866             :          * a zero new-len is nonsensical.
     867             :          */
     868          42 :         if (!new_len)
     869             :                 return ret;
     870             : 
     871          42 :         if (mmap_write_lock_killable(current->mm))
     872             :                 return -EINTR;
     873             : 
     874          42 :         if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
     875           0 :                 ret = mremap_to(addr, old_len, new_addr, new_len,
     876             :                                 &locked, flags, &uf, &uf_unmap_early,
     877             :                                 &uf_unmap);
     878           0 :                 goto out;
     879             :         }
     880             : 
     881             :         /*
     882             :          * Always allow a shrinking remap: that just unmaps
     883             :          * the unnecessary pages..
     884             :          * __do_munmap does all the needed commit accounting, and
     885             :          * downgrades mmap_lock to read if so directed.
     886             :          */
     887          42 :         if (old_len >= new_len) {
     888          42 :                 int retval;
     889             : 
     890          42 :                 retval = __do_munmap(mm, addr+new_len, old_len - new_len,
     891             :                                   &uf_unmap, true);
     892          42 :                 if (retval < 0 && old_len != new_len) {
     893           0 :                         ret = retval;
     894           0 :                         goto out;
     895             :                 /* Returning 1 indicates mmap_lock is downgraded to read. */
     896          42 :                 } else if (retval == 1)
     897          42 :                         downgraded = true;
     898          42 :                 ret = addr;
     899          42 :                 goto out;
     900             :         }
     901             : 
     902             :         /*
     903             :          * Ok, we need to grow..
     904             :          */
     905           0 :         vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
     906           0 :         if (IS_ERR(vma)) {
     907           0 :                 ret = PTR_ERR(vma);
     908           0 :                 goto out;
     909             :         }
     910             : 
      911             :         /* old_len reaches exactly to the end of the area..
     912             :          */
     913           0 :         if (old_len == vma->vm_end - addr) {
     914             :                 /* can we just expand the current mapping? */
     915           0 :                 if (vma_expandable(vma, new_len - old_len)) {
     916           0 :                         int pages = (new_len - old_len) >> PAGE_SHIFT;
     917             : 
     918           0 :                         if (vma_adjust(vma, vma->vm_start, addr + new_len,
     919             :                                        vma->vm_pgoff, NULL)) {
     920           0 :                                 ret = -ENOMEM;
     921           0 :                                 goto out;
     922             :                         }
     923             : 
     924           0 :                         vm_stat_account(mm, vma->vm_flags, pages);
     925           0 :                         if (vma->vm_flags & VM_LOCKED) {
     926           0 :                                 mm->locked_vm += pages;
     927           0 :                                 locked = true;
     928           0 :                                 new_addr = addr;
     929             :                         }
     930           0 :                         ret = addr;
     931           0 :                         goto out;
     932             :                 }
     933             :         }
     934             : 
     935             :         /*
      936             :          * We weren't able to just expand or shrink the area,
      937             :          * so we need to create a new one and move it..
     938             :          */
     939           0 :         ret = -ENOMEM;
     940           0 :         if (flags & MREMAP_MAYMOVE) {
     941           0 :                 unsigned long map_flags = 0;
     942           0 :                 if (vma->vm_flags & VM_MAYSHARE)
     943           0 :                         map_flags |= MAP_SHARED;
     944             : 
     945           0 :                 new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
     946           0 :                                         vma->vm_pgoff +
     947           0 :                                         ((addr - vma->vm_start) >> PAGE_SHIFT),
     948             :                                         map_flags);
     949           0 :                 if (IS_ERR_VALUE(new_addr)) {
     950           0 :                         ret = new_addr;
     951           0 :                         goto out;
     952             :                 }
     953             : 
     954           0 :                 ret = move_vma(vma, addr, old_len, new_len, new_addr,
     955             :                                &locked, flags, &uf, &uf_unmap);
     956             :         }
     957           0 : out:
     958          42 :         if (offset_in_page(ret)) {
     959           0 :                 vm_unacct_memory(charged);
     960           0 :                 locked = false;
     961             :         }
     962          42 :         if (downgraded)
     963          42 :                 mmap_read_unlock(current->mm);
     964             :         else
     965           0 :                 mmap_write_unlock(current->mm);
     966          42 :         if (locked && new_len > old_len)
     967           0 :                 mm_populate(new_addr + old_len, new_len - old_len);
     968          42 :         userfaultfd_unmap_complete(mm, &uf_unmap_early);
     969          42 :         mremap_userfaultfd_complete(&uf, addr, ret, old_len);
     970          42 :         userfaultfd_unmap_complete(mm, &uf_unmap);
     971          42 :         return ret;
     972             : }
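
For completeness, a minimal userspace sketch of the grow-with-move path exercised by the syscall above (MREMAP_MAYMOVE without MREMAP_FIXED); error handling is abbreviated and the sizes are arbitrary.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
        size_t old_len = 4096, new_len = 4 * 4096;
        void *p, *q;

        p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
                return 1;

        /* Grow the mapping; the kernel may expand it in place or, failing
         * that, move it (the move_vma()/move_page_tables() path above). */
        q = mremap(p, old_len, new_len, MREMAP_MAYMOVE);
        if (q == MAP_FAILED) {
                perror("mremap");
                return 1;
        }
        printf("mapping now at %p (%zu bytes)\n", q, new_len);
        munmap(q, new_len);
        return 0;
}
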

Generated by: LCOV version 1.14