LCOV - code coverage report
Current view:  top level - mm - madvise.c (source / functions)
Test:          landlock.info
Date:          2021-04-22 12:43:58

                     Hit      Total    Coverage
  Lines:              81        504      16.1 %
  Functions:            5        19      26.3 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  *      linux/mm/madvise.c
       4             :  *
       5             :  * Copyright (C) 1999  Linus Torvalds
       6             :  * Copyright (C) 2002  Christoph Hellwig
       7             :  */
       8             : 
       9             : #include <linux/mman.h>
      10             : #include <linux/pagemap.h>
      11             : #include <linux/syscalls.h>
      12             : #include <linux/mempolicy.h>
      13             : #include <linux/page-isolation.h>
      14             : #include <linux/page_idle.h>
      15             : #include <linux/userfaultfd_k.h>
      16             : #include <linux/hugetlb.h>
      17             : #include <linux/falloc.h>
      18             : #include <linux/fadvise.h>
      19             : #include <linux/sched.h>
      20             : #include <linux/sched/mm.h>
      21             : #include <linux/uio.h>
      22             : #include <linux/ksm.h>
      23             : #include <linux/fs.h>
      24             : #include <linux/file.h>
      25             : #include <linux/blkdev.h>
      26             : #include <linux/backing-dev.h>
      27             : #include <linux/pagewalk.h>
      28             : #include <linux/swap.h>
      29             : #include <linux/swapops.h>
      30             : #include <linux/shmem_fs.h>
      31             : #include <linux/mmu_notifier.h>
      32             : 
      33             : #include <asm/tlb.h>
      34             : 
      35             : #include "internal.h"
      36             : 
      37             : struct madvise_walk_private {
      38             :         struct mmu_gather *tlb;
      39             :         bool pageout;
      40             : };
      41             : 
      42             : /*
      43             :  * Any behaviour which results in changes to the vma->vm_flags needs to
       44             :  * take mmap_lock for writing. Others, which simply traverse vmas, need
       45             :  * only take it for reading.
      46             :  */
      47          12 : static int madvise_need_mmap_write(int behavior)
      48             : {
      49          12 :         switch (behavior) {
      50             :         case MADV_REMOVE:
      51             :         case MADV_WILLNEED:
      52             :         case MADV_DONTNEED:
      53             :         case MADV_COLD:
      54             :         case MADV_PAGEOUT:
      55             :         case MADV_FREE:
      56             :                 return 0;
      57             :         default:
      58             :                 /* be safe, default to 1. list exceptions explicitly */
      59          10 :                 return 1;
      60             :         }
      61             : }
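
A minimal userspace sketch of the read-lock path above: MADV_DONTNEED only zaps
page tables and never touches vma->vm_flags, so madvise_need_mmap_write()
returns 0 for it. Standard POSIX/Linux API only; the sizes are arbitrary.

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    int main(void)
    {
            size_t len = 4 * 4096;
            char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (p == MAP_FAILED)
                    return 1;
            memset(p, 0xaa, len);
            /* Read-lock behavior: zaps the range, leaves vm_flags alone. */
            if (madvise(p, len, MADV_DONTNEED))
                    perror("madvise");
            /* Anonymous private memory reads back zero-filled afterwards. */
            printf("byte after MADV_DONTNEED: %#x\n", p[0]);
            munmap(p, len);
            return 0;
    }
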
      62             : 
      63             : /*
      64             :  * We can potentially split a vm area into separate
      65             :  * areas, each area with its own behavior.
      66             :  */
      67          10 : static long madvise_behavior(struct vm_area_struct *vma,
      68             :                      struct vm_area_struct **prev,
      69             :                      unsigned long start, unsigned long end, int behavior)
      70             : {
      71          10 :         struct mm_struct *mm = vma->vm_mm;
      72          10 :         int error = 0;
      73          10 :         pgoff_t pgoff;
      74          10 :         unsigned long new_flags = vma->vm_flags;
      75             : 
      76          10 :         switch (behavior) {
      77           0 :         case MADV_NORMAL:
      78           0 :                 new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
      79           0 :                 break;
      80           0 :         case MADV_SEQUENTIAL:
      81           0 :                 new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
      82           0 :                 break;
      83           0 :         case MADV_RANDOM:
      84           0 :                 new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
      85           0 :                 break;
      86           0 :         case MADV_DONTFORK:
      87           0 :                 new_flags |= VM_DONTCOPY;
      88           0 :                 break;
      89          10 :         case MADV_DOFORK:
      90          10 :                 if (vma->vm_flags & VM_IO) {
      91           0 :                         error = -EINVAL;
      92           0 :                         goto out;
      93             :                 }
      94          10 :                 new_flags &= ~VM_DONTCOPY;
      95          10 :                 break;
      96           0 :         case MADV_WIPEONFORK:
      97             :                 /* MADV_WIPEONFORK is only supported on anonymous memory. */
      98           0 :                 if (vma->vm_file || vma->vm_flags & VM_SHARED) {
      99           0 :                         error = -EINVAL;
     100           0 :                         goto out;
     101             :                 }
     102           0 :                 new_flags |= VM_WIPEONFORK;
     103           0 :                 break;
     104           0 :         case MADV_KEEPONFORK:
     105           0 :                 new_flags &= ~VM_WIPEONFORK;
     106           0 :                 break;
     107           0 :         case MADV_DONTDUMP:
     108           0 :                 new_flags |= VM_DONTDUMP;
     109           0 :                 break;
     110             :         case MADV_DODUMP:
     111           0 :                 if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
     112           0 :                         error = -EINVAL;
     113           0 :                         goto out;
     114             :                 }
     115           0 :                 new_flags &= ~VM_DONTDUMP;
     116           0 :                 break;
     117           0 :         case MADV_MERGEABLE:
     118             :         case MADV_UNMERGEABLE:
     119           0 :                 error = ksm_madvise(vma, start, end, behavior, &new_flags);
     120           0 :                 if (error)
     121           0 :                         goto out_convert_errno;
     122             :                 break;
     123           0 :         case MADV_HUGEPAGE:
     124             :         case MADV_NOHUGEPAGE:
     125           0 :                 error = hugepage_madvise(vma, &new_flags, behavior);
     126           0 :                 if (error)
     127           0 :                         goto out_convert_errno;
     128             :                 break;
     129             :         }
     130             : 
     131          10 :         if (new_flags == vma->vm_flags) {
     132          10 :                 *prev = vma;
     133          10 :                 goto out;
     134             :         }
     135             : 
     136           0 :         pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
     137           0 :         *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
     138             :                           vma->vm_file, pgoff, vma_policy(vma),
     139             :                           vma->vm_userfaultfd_ctx);
     140           0 :         if (*prev) {
     141           0 :                 vma = *prev;
     142           0 :                 goto success;
     143             :         }
     144             : 
     145           0 :         *prev = vma;
     146             : 
     147           0 :         if (start != vma->vm_start) {
     148           0 :                 if (unlikely(mm->map_count >= sysctl_max_map_count)) {
     149           0 :                         error = -ENOMEM;
     150           0 :                         goto out;
     151             :                 }
     152           0 :                 error = __split_vma(mm, vma, start, 1);
     153           0 :                 if (error)
     154           0 :                         goto out_convert_errno;
     155             :         }
     156             : 
     157           0 :         if (end != vma->vm_end) {
     158           0 :                 if (unlikely(mm->map_count >= sysctl_max_map_count)) {
     159           0 :                         error = -ENOMEM;
     160           0 :                         goto out;
     161             :                 }
     162           0 :                 error = __split_vma(mm, vma, end, 0);
     163           0 :                 if (error)
     164           0 :                         goto out_convert_errno;
     165             :         }
     166             : 
     167           0 : success:
     168             :         /*
     169             :          * vm_flags is protected by the mmap_lock held in write mode.
     170             :          */
     171           0 :         vma->vm_flags = new_flags;
     172             : 
     173           0 : out_convert_errno:
     174             :         /*
     175             :          * madvise() returns EAGAIN if kernel resources, such as
     176             :          * slab, are temporarily unavailable.
     177             :          */
     178           0 :         if (error == -ENOMEM)
     179           0 :                 error = -EAGAIN;
     180           0 : out:
     181          10 :         return error;
     182             : }
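
The split logic above is observable from userspace: advising only the middle
of a mapping with a flag-changing behavior forces __split_vma() at both
boundaries, so /proc/self/maps grows by two lines. A rough sketch (the wc -l
output varies by process; the delta is what matters):

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            long pg = sysconf(_SC_PAGESIZE);
            char *p = mmap(NULL, 3 * pg, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (p == MAP_FAILED)
                    return 1;
            system("wc -l < /proc/self/maps");
            /* MADV_DONTFORK sets VM_DONTCOPY on the middle page only, so the
             * vma must be split at start and end before vm_flags changes. */
            if (madvise(p + pg, pg, MADV_DONTFORK))
                    perror("madvise");
            system("wc -l < /proc/self/maps");  /* two more lines than above */
            return 0;
    }
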
     183             : 
     184             : #ifdef CONFIG_SWAP
     185             : static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
     186             :         unsigned long end, struct mm_walk *walk)
     187             : {
     188             :         pte_t *orig_pte;
     189             :         struct vm_area_struct *vma = walk->private;
     190             :         unsigned long index;
     191             : 
     192             :         if (pmd_none_or_trans_huge_or_clear_bad(pmd))
     193             :                 return 0;
     194             : 
     195             :         for (index = start; index != end; index += PAGE_SIZE) {
     196             :                 pte_t pte;
     197             :                 swp_entry_t entry;
     198             :                 struct page *page;
     199             :                 spinlock_t *ptl;
     200             : 
     201             :                 orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
     202             :                 pte = *(orig_pte + ((index - start) / PAGE_SIZE));
     203             :                 pte_unmap_unlock(orig_pte, ptl);
     204             : 
     205             :                 if (pte_present(pte) || pte_none(pte))
     206             :                         continue;
     207             :                 entry = pte_to_swp_entry(pte);
     208             :                 if (unlikely(non_swap_entry(entry)))
     209             :                         continue;
     210             : 
     211             :                 page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
     212             :                                                         vma, index, false);
     213             :                 if (page)
     214             :                         put_page(page);
     215             :         }
     216             : 
     217             :         return 0;
     218             : }
     219             : 
     220             : static const struct mm_walk_ops swapin_walk_ops = {
     221             :         .pmd_entry              = swapin_walk_pmd_entry,
     222             : };
     223             : 
     224             : static void force_shm_swapin_readahead(struct vm_area_struct *vma,
     225             :                 unsigned long start, unsigned long end,
     226             :                 struct address_space *mapping)
     227             : {
     228             :         XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
     229             :         pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
     230             :         struct page *page;
     231             : 
     232             :         rcu_read_lock();
     233             :         xas_for_each(&xas, page, end_index) {
     234             :                 swp_entry_t swap;
     235             : 
     236             :                 if (!xa_is_value(page))
     237             :                         continue;
     238             :                 xas_pause(&xas);
     239             :                 rcu_read_unlock();
     240             : 
     241             :                 swap = radix_to_swp_entry(page);
     242             :                 page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
     243             :                                                         NULL, 0, false);
     244             :                 if (page)
     245             :                         put_page(page);
     246             : 
     247             :                 rcu_read_lock();
     248             :         }
     249             :         rcu_read_unlock();
     250             : 
     251             :         lru_add_drain();        /* Push any new pages onto the LRU now */
     252             : }
     253             : #endif          /* CONFIG_SWAP */
     254             : 
     255             : /*
     256             :  * Schedule all required I/O operations.  Do not wait for completion.
     257             :  */
     258           0 : static long madvise_willneed(struct vm_area_struct *vma,
     259             :                              struct vm_area_struct **prev,
     260             :                              unsigned long start, unsigned long end)
     261             : {
     262           0 :         struct mm_struct *mm = vma->vm_mm;
     263           0 :         struct file *file = vma->vm_file;
     264           0 :         loff_t offset;
     265             : 
     266           0 :         *prev = vma;
     267             : #ifdef CONFIG_SWAP
     268             :         if (!file) {
     269             :                 walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
     270             :                 lru_add_drain(); /* Push any new pages onto the LRU now */
     271             :                 return 0;
     272             :         }
     273             : 
     274             :         if (shmem_mapping(file->f_mapping)) {
     275             :                 force_shm_swapin_readahead(vma, start, end,
     276             :                                         file->f_mapping);
     277             :                 return 0;
     278             :         }
     279             : #else
     280           0 :         if (!file)
     281             :                 return -EBADF;
     282             : #endif
     283             : 
     284           0 :         if (IS_DAX(file_inode(file))) {
     285             :                 /* no bad return value, but ignore advice */
     286             :                 return 0;
     287             :         }
     288             : 
     289             :         /*
     290             :          * Filesystem's fadvise may need to take various locks.  We need to
     291             :          * explicitly grab a reference because the vma (and hence the
     292             :          * vma's reference to the file) can go away as soon as we drop
     293             :          * mmap_lock.
     294             :          */
     295           0 :         *prev = NULL;   /* tell sys_madvise we drop mmap_lock */
     296           0 :         get_file(file);
     297           0 :         offset = (loff_t)(start - vma->vm_start)
     298           0 :                         + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
     299           0 :         mmap_read_unlock(mm);
     300           0 :         vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
     301           0 :         fput(file);
     302           0 :         mmap_read_lock(mm);
     303           0 :         return 0;
     304             : }
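
From userspace the file-backed path reduces to an asynchronous readahead hint:
madvise() schedules the I/O via vfs_fadvise() and returns without waiting. A
sketch; the file path is an arbitrary stand-in for any readable file:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/etc/hosts", O_RDONLY);  /* any readable file */
            struct stat st;
            if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0)
                    return 1;
            char *p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
            if (p == MAP_FAILED)
                    return 1;
            /* Ends up in vfs_fadvise(..., POSIX_FADV_WILLNEED): readahead is
             * scheduled, but madvise() does not wait for the I/O. */
            if (madvise(p, st.st_size, MADV_WILLNEED))
                    perror("madvise");
            munmap(p, st.st_size);
            close(fd);
            return 0;
    }
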
     305             : 
     306           0 : static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
     307             :                                 unsigned long addr, unsigned long end,
     308             :                                 struct mm_walk *walk)
     309             : {
     310           0 :         struct madvise_walk_private *private = walk->private;
     311           0 :         struct mmu_gather *tlb = private->tlb;
     312           0 :         bool pageout = private->pageout;
     313           0 :         struct mm_struct *mm = tlb->mm;
     314           0 :         struct vm_area_struct *vma = walk->vma;
     315           0 :         pte_t *orig_pte, *pte, ptent;
     316           0 :         spinlock_t *ptl;
     317           0 :         struct page *page = NULL;
     318           0 :         LIST_HEAD(page_list);
     319             : 
     320           0 :         if (fatal_signal_pending(current))
     321             :                 return -EINTR;
     322             : 
     323             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
     324           0 :         if (pmd_trans_huge(*pmd)) {
     325           0 :                 pmd_t orig_pmd;
     326           0 :                 unsigned long next = pmd_addr_end(addr, end);
     327             : 
     328           0 :                 tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
     329           0 :                 ptl = pmd_trans_huge_lock(pmd, vma);
     330           0 :                 if (!ptl)
     331           0 :                         return 0;
     332             : 
     333           0 :                 orig_pmd = *pmd;
     334           0 :                 if (is_huge_zero_pmd(orig_pmd))
     335           0 :                         goto huge_unlock;
     336             : 
     337           0 :                 if (unlikely(!pmd_present(orig_pmd))) {
     338           0 :                         VM_BUG_ON(thp_migration_supported() &&
     339             :                                         !is_pmd_migration_entry(orig_pmd));
     340           0 :                         goto huge_unlock;
     341             :                 }
     342             : 
     343           0 :                 page = pmd_page(orig_pmd);
     344             : 
     345             :                 /* Do not interfere with other mappings of this page */
     346           0 :                 if (page_mapcount(page) != 1)
     347           0 :                         goto huge_unlock;
     348             : 
     349           0 :                 if (next - addr != HPAGE_PMD_SIZE) {
     350           0 :                         int err;
     351             : 
     352           0 :                         get_page(page);
     353           0 :                         spin_unlock(ptl);
     354           0 :                         lock_page(page);
     355           0 :                         err = split_huge_page(page);
     356           0 :                         unlock_page(page);
     357           0 :                         put_page(page);
     358           0 :                         if (!err)
     359           0 :                                 goto regular_page;
     360             :                         return 0;
     361             :                 }
     362             : 
     363           0 :                 if (pmd_young(orig_pmd)) {
     364           0 :                         pmdp_invalidate(vma, addr, pmd);
     365           0 :                         orig_pmd = pmd_mkold(orig_pmd);
     366             : 
     367           0 :                         set_pmd_at(mm, addr, pmd, orig_pmd);
     368           0 :                         tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
     369             :                 }
     370             : 
     371           0 :                 ClearPageReferenced(page);
     372           0 :                 test_and_clear_page_young(page);
     373           0 :                 if (pageout) {
     374           0 :                         if (!isolate_lru_page(page)) {
     375           0 :                                 if (PageUnevictable(page))
     376           0 :                                         putback_lru_page(page);
     377             :                                 else
     378           0 :                                         list_add(&page->lru, &page_list);
     379             :                         }
     380             :                 } else
     381           0 :                         deactivate_page(page);
     382           0 : huge_unlock:
     383           0 :                 spin_unlock(ptl);
     384           0 :                 if (pageout)
     385           0 :                         reclaim_pages(&page_list);
     386           0 :                 return 0;
     387             :         }
     388             : 
     389           0 : regular_page:
     390           0 :         if (pmd_trans_unstable(pmd))
     391             :                 return 0;
     392             : #endif
     393           0 :         tlb_change_page_size(tlb, PAGE_SIZE);
     394           0 :         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
     395           0 :         flush_tlb_batched_pending(mm);
     396           0 :         arch_enter_lazy_mmu_mode();
     397           0 :         for (; addr < end; pte++, addr += PAGE_SIZE) {
     398           0 :                 ptent = *pte;
     399             : 
     400           0 :                 if (pte_none(ptent))
     401           0 :                         continue;
     402             : 
     403           0 :                 if (!pte_present(ptent))
     404           0 :                         continue;
     405             : 
     406           0 :                 page = vm_normal_page(vma, addr, ptent);
     407           0 :                 if (!page)
     408           0 :                         continue;
     409             : 
     410             :                 /*
      411             :                  * Creating a THP page is expensive, so split it only if we
      412             :                  * are sure it's worth it. Split it if we are the only owner.
     413             :                  */
     414           0 :                 if (PageTransCompound(page)) {
     415           0 :                         if (page_mapcount(page) != 1)
     416             :                                 break;
     417           0 :                         get_page(page);
     418           0 :                         if (!trylock_page(page)) {
     419           0 :                                 put_page(page);
     420           0 :                                 break;
     421             :                         }
     422           0 :                         pte_unmap_unlock(orig_pte, ptl);
     423           0 :                         if (split_huge_page(page)) {
     424           0 :                                 unlock_page(page);
     425           0 :                                 put_page(page);
     426           0 :                                 pte_offset_map_lock(mm, pmd, addr, &ptl);
     427           0 :                                 break;
     428             :                         }
     429           0 :                         unlock_page(page);
     430           0 :                         put_page(page);
     431           0 :                         pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
     432           0 :                         pte--;
     433           0 :                         addr -= PAGE_SIZE;
     434           0 :                         continue;
     435             :                 }
     436             : 
     437             :                 /* Do not interfere with other mappings of this page */
     438           0 :                 if (page_mapcount(page) != 1)
     439           0 :                         continue;
     440             : 
     441           0 :                 VM_BUG_ON_PAGE(PageTransCompound(page), page);
     442             : 
     443           0 :                 if (pte_young(ptent)) {
     444           0 :                         ptent = ptep_get_and_clear_full(mm, addr, pte,
     445           0 :                                                         tlb->fullmm);
     446           0 :                         ptent = pte_mkold(ptent);
     447           0 :                         set_pte_at(mm, addr, pte, ptent);
     448           0 :                         tlb_remove_tlb_entry(tlb, pte, addr);
     449             :                 }
     450             : 
     451             :                 /*
      452             :                  * We are deactivating a page to accelerate its reclaim.
      453             :                  * The VM cannot reclaim the page unless we clear PG_young.
      454             :                  * As a side effect, this confuses idle-page tracking,
      455             :                  * which will miss the page's recent reference history.
     456             :                  */
     457           0 :                 ClearPageReferenced(page);
     458           0 :                 test_and_clear_page_young(page);
     459           0 :                 if (pageout) {
     460           0 :                         if (!isolate_lru_page(page)) {
     461           0 :                                 if (PageUnevictable(page))
     462           0 :                                         putback_lru_page(page);
     463             :                                 else
     464           0 :                                         list_add(&page->lru, &page_list);
     465             :                         }
     466             :                 } else
     467           0 :                         deactivate_page(page);
     468             :         }
     469             : 
     470           0 :         arch_leave_lazy_mmu_mode();
     471           0 :         pte_unmap_unlock(orig_pte, ptl);
     472           0 :         if (pageout)
     473           0 :                 reclaim_pages(&page_list);
     474           0 :         cond_resched();
     475             : 
     476           0 :         return 0;
     477             : }
     478             : 
     479             : static const struct mm_walk_ops cold_walk_ops = {
     480             :         .pmd_entry = madvise_cold_or_pageout_pte_range,
     481             : };
     482             : 
     483           0 : static void madvise_cold_page_range(struct mmu_gather *tlb,
     484             :                              struct vm_area_struct *vma,
     485             :                              unsigned long addr, unsigned long end)
     486             : {
     487           0 :         struct madvise_walk_private walk_private = {
     488             :                 .pageout = false,
     489             :                 .tlb = tlb,
     490             :         };
     491             : 
     492           0 :         tlb_start_vma(tlb, vma);
     493           0 :         walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
     494           0 :         tlb_end_vma(tlb, vma);
     495           0 : }
     496             : 
     497           0 : static long madvise_cold(struct vm_area_struct *vma,
     498             :                         struct vm_area_struct **prev,
     499             :                         unsigned long start_addr, unsigned long end_addr)
     500             : {
     501           0 :         struct mm_struct *mm = vma->vm_mm;
     502           0 :         struct mmu_gather tlb;
     503             : 
     504           0 :         *prev = vma;
     505           0 :         if (!can_madv_lru_vma(vma))
     506             :                 return -EINVAL;
     507             : 
     508           0 :         lru_add_drain();
     509           0 :         tlb_gather_mmu(&tlb, mm);
     510           0 :         madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
     511           0 :         tlb_finish_mmu(&tlb);
     512             : 
     513           0 :         return 0;
     514             : }
     515             : 
     516           0 : static void madvise_pageout_page_range(struct mmu_gather *tlb,
     517             :                              struct vm_area_struct *vma,
     518             :                              unsigned long addr, unsigned long end)
     519             : {
     520           0 :         struct madvise_walk_private walk_private = {
     521             :                 .pageout = true,
     522             :                 .tlb = tlb,
     523             :         };
     524             : 
     525           0 :         tlb_start_vma(tlb, vma);
     526           0 :         walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
     527           0 :         tlb_end_vma(tlb, vma);
     528           0 : }
     529             : 
     530           0 : static inline bool can_do_pageout(struct vm_area_struct *vma)
     531             : {
     532           0 :         if (vma_is_anonymous(vma))
     533             :                 return true;
     534           0 :         if (!vma->vm_file)
     535             :                 return false;
     536             :         /*
     537             :          * paging out pagecache only for non-anonymous mappings that correspond
      538             :          * to the files the calling process could (if it tried) open for writing;
     539             :          * otherwise we'd be including shared non-exclusive mappings, which
     540             :          * opens a side channel.
     541             :          */
     542           0 :         return inode_owner_or_capable(&init_user_ns,
     543           0 :                                       file_inode(vma->vm_file)) ||
     544           0 :                file_permission(vma->vm_file, MAY_WRITE) == 0;
     545             : }
     546             : 
     547           0 : static long madvise_pageout(struct vm_area_struct *vma,
     548             :                         struct vm_area_struct **prev,
     549             :                         unsigned long start_addr, unsigned long end_addr)
     550             : {
     551           0 :         struct mm_struct *mm = vma->vm_mm;
     552           0 :         struct mmu_gather tlb;
     553             : 
     554           0 :         *prev = vma;
     555           0 :         if (!can_madv_lru_vma(vma))
     556             :                 return -EINVAL;
     557             : 
     558           0 :         if (!can_do_pageout(vma))
     559             :                 return 0;
     560             : 
     561           0 :         lru_add_drain();
     562           0 :         tlb_gather_mmu(&tlb, mm);
     563           0 :         madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
     564           0 :         tlb_finish_mmu(&tlb);
     565             : 
     566           0 :         return 0;
     567             : }
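
MADV_COLD and MADV_PAGEOUT (Linux 5.4+) drive the same cold_walk_ops; only
walk_private.pageout differs, deactivating pages versus reclaiming them
immediately. A sketch of the latter; the fallback #define assumes libc headers
older than the kernel:

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    #ifndef MADV_PAGEOUT
    #define MADV_PAGEOUT 21     /* from uapi mman-common.h, Linux 5.4+ */
    #endif

    int main(void)
    {
            size_t len = 1 << 20;
            char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (p == MAP_FAILED)
                    return 1;
            memset(p, 1, len);
            /* Hint the kernel to reclaim these pages now; the next access
             * faults them back in (possibly from swap). */
            if (madvise(p, len, MADV_PAGEOUT))
                    perror("madvise(MADV_PAGEOUT)");
            printf("%d\n", p[0]);  /* still 1: pageout does not discard data */
            return 0;
    }
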
     568             : 
     569           0 : static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
     570             :                                 unsigned long end, struct mm_walk *walk)
     571             : 
     572             : {
     573           0 :         struct mmu_gather *tlb = walk->private;
     574           0 :         struct mm_struct *mm = tlb->mm;
     575           0 :         struct vm_area_struct *vma = walk->vma;
     576           0 :         spinlock_t *ptl;
     577           0 :         pte_t *orig_pte, *pte, ptent;
     578           0 :         struct page *page;
     579           0 :         int nr_swap = 0;
     580           0 :         unsigned long next;
     581             : 
     582           0 :         next = pmd_addr_end(addr, end);
     583           0 :         if (pmd_trans_huge(*pmd))
     584           0 :                 if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
     585           0 :                         goto next;
     586             : 
     587           0 :         if (pmd_trans_unstable(pmd))
     588             :                 return 0;
     589             : 
     590           0 :         tlb_change_page_size(tlb, PAGE_SIZE);
     591           0 :         orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
     592           0 :         flush_tlb_batched_pending(mm);
     593           0 :         arch_enter_lazy_mmu_mode();
     594           0 :         for (; addr != end; pte++, addr += PAGE_SIZE) {
     595           0 :                 ptent = *pte;
     596             : 
     597           0 :                 if (pte_none(ptent))
     598           0 :                         continue;
     599             :                 /*
      600             :                  * If the pte holds a swap entry, just clear the page table
      601             :                  * entry to prevent a swap-in, which is more expensive than
      602             :                  * (page allocation + zeroing).
     603             :                  */
     604           0 :                 if (!pte_present(ptent)) {
     605           0 :                         swp_entry_t entry;
     606             : 
     607           0 :                         entry = pte_to_swp_entry(ptent);
     608           0 :                         if (non_swap_entry(entry))
     609           0 :                                 continue;
     610           0 :                         nr_swap--;
     611           0 :                         free_swap_and_cache(entry);
     612           0 :                         pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
     613           0 :                         continue;
     614             :                 }
     615             : 
     616           0 :                 page = vm_normal_page(vma, addr, ptent);
     617           0 :                 if (!page)
     618           0 :                         continue;
     619             : 
     620             :                 /*
     621             :                  * If pmd isn't transhuge but the page is THP and
     622             :                  * is owned by only this process, split it and
     623             :                  * deactivate all pages.
     624             :                  */
     625           0 :                 if (PageTransCompound(page)) {
     626           0 :                         if (page_mapcount(page) != 1)
     627           0 :                                 goto out;
     628           0 :                         get_page(page);
     629           0 :                         if (!trylock_page(page)) {
     630           0 :                                 put_page(page);
     631           0 :                                 goto out;
     632             :                         }
     633           0 :                         pte_unmap_unlock(orig_pte, ptl);
     634           0 :                         if (split_huge_page(page)) {
     635           0 :                                 unlock_page(page);
     636           0 :                                 put_page(page);
     637           0 :                                 pte_offset_map_lock(mm, pmd, addr, &ptl);
     638           0 :                                 goto out;
     639             :                         }
     640           0 :                         unlock_page(page);
     641           0 :                         put_page(page);
     642           0 :                         pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
     643           0 :                         pte--;
     644           0 :                         addr -= PAGE_SIZE;
     645           0 :                         continue;
     646             :                 }
     647             : 
     648           0 :                 VM_BUG_ON_PAGE(PageTransCompound(page), page);
     649             : 
     650           0 :                 if (PageSwapCache(page) || PageDirty(page)) {
     651           0 :                         if (!trylock_page(page))
     652           0 :                                 continue;
     653             :                         /*
      654             :                          * If the page is shared with others, we cannot
      655             :                          * clear its PG_dirty bit.
     656             :                          */
     657           0 :                         if (page_mapcount(page) != 1) {
     658           0 :                                 unlock_page(page);
     659           0 :                                 continue;
     660             :                         }
     661             : 
     662           0 :                         if (PageSwapCache(page) && !try_to_free_swap(page)) {
     663             :                                 unlock_page(page);
     664             :                                 continue;
     665             :                         }
     666             : 
     667           0 :                         ClearPageDirty(page);
     668           0 :                         unlock_page(page);
     669             :                 }
     670             : 
     671           0 :                 if (pte_young(ptent) || pte_dirty(ptent)) {
     672             :                         /*
      673             :                          * Some architectures (e.g. PPC) don't update the TLB
      674             :                          * with set_pte_at and tlb_remove_tlb_entry, so for
      675             :                          * portability, remap the pte as old|clean after
      676             :                          * clearing it.
     677             :                          */
     678           0 :                         ptent = ptep_get_and_clear_full(mm, addr, pte,
     679           0 :                                                         tlb->fullmm);
     680             : 
     681           0 :                         ptent = pte_mkold(ptent);
     682           0 :                         ptent = pte_mkclean(ptent);
     683           0 :                         set_pte_at(mm, addr, pte, ptent);
     684           0 :                         tlb_remove_tlb_entry(tlb, pte, addr);
     685             :                 }
     686           0 :                 mark_page_lazyfree(page);
     687             :         }
     688           0 : out:
     689           0 :         if (nr_swap) {
     690           0 :                 if (current->mm == mm)
     691           0 :                         sync_mm_rss(mm);
     692             : 
     693           0 :                 add_mm_counter(mm, MM_SWAPENTS, nr_swap);
     694             :         }
     695           0 :         arch_leave_lazy_mmu_mode();
     696           0 :         pte_unmap_unlock(orig_pte, ptl);
     697           0 :         cond_resched();
     698             : next:
     699             :         return 0;
     700             : }
     701             : 
     702             : static const struct mm_walk_ops madvise_free_walk_ops = {
     703             :         .pmd_entry              = madvise_free_pte_range,
     704             : };
     705             : 
     706           0 : static int madvise_free_single_vma(struct vm_area_struct *vma,
     707             :                         unsigned long start_addr, unsigned long end_addr)
     708             : {
     709           0 :         struct mm_struct *mm = vma->vm_mm;
     710           0 :         struct mmu_notifier_range range;
     711           0 :         struct mmu_gather tlb;
     712             : 
     713             :         /* MADV_FREE works for only anon vma at the moment */
     714           0 :         if (!vma_is_anonymous(vma))
     715             :                 return -EINVAL;
     716             : 
     717           0 :         range.start = max(vma->vm_start, start_addr);
     718           0 :         if (range.start >= vma->vm_end)
     719             :                 return -EINVAL;
     720           0 :         range.end = min(vma->vm_end, end_addr);
     721           0 :         if (range.end <= vma->vm_start)
     722             :                 return -EINVAL;
     723           0 :         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
     724             :                                 range.start, range.end);
     725             : 
     726           0 :         lru_add_drain();
     727           0 :         tlb_gather_mmu(&tlb, mm);
     728           0 :         update_hiwater_rss(mm);
     729             : 
     730           0 :         mmu_notifier_invalidate_range_start(&range);
     731           0 :         tlb_start_vma(&tlb, vma);
     732           0 :         walk_page_range(vma->vm_mm, range.start, range.end,
     733             :                         &madvise_free_walk_ops, &tlb);
     734           0 :         tlb_end_vma(&tlb, vma);
     735           0 :         mmu_notifier_invalidate_range_end(&range);
     736           0 :         tlb_finish_mmu(&tlb);
     737             : 
     738           0 :         return 0;
     739             : }
     740             : 
     741             : /*
     742             :  * Application no longer needs these pages.  If the pages are dirty,
     743             :  * it's OK to just throw them away.  The app will be more careful about
     744             :  * data it wants to keep.  Be sure to free swap resources too.  The
     745             :  * zap_page_range call sets things up for shrink_active_list to actually free
     746             :  * these pages later if no one else has touched them in the meantime,
     747             :  * although we could add these pages to a global reuse list for
     748             :  * shrink_active_list to pick up before reclaiming other pages.
     749             :  *
     750             :  * NB: This interface discards data rather than pushes it out to swap,
     751             :  * as some implementations do.  This has performance implications for
     752             :  * applications like large transactional databases which want to discard
     753             :  * pages in anonymous maps after committing to backing store the data
     754             :  * that was kept in them.  There is no reason to write this data out to
     755             :  * the swap area if the application is discarding it.
     756             :  *
     757             :  * An interface that causes the system to free clean pages and flush
     758             :  * dirty pages is already available as msync(MS_INVALIDATE).
     759             :  */
     760           4 : static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
     761             :                                         unsigned long start, unsigned long end)
     762             : {
     763           4 :         zap_page_range(vma, start, end - start);
     764           4 :         return 0;
     765             : }
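
By contrast with MADV_DONTNEED above, MADV_FREE (via madvise_free_single_vma())
discards nothing up front: pages are only marked lazily freeable and are
dropped by reclaim unless re-dirtied first. A sketch of the user-visible
difference:

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    int main(void)
    {
            size_t len = 1 << 20;
            char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (p == MAP_FAILED)
                    return 1;
            memset(p, 7, len);
            /* Mark lazily freeable; a later write cancels the mark per page. */
            if (madvise(p, len, MADV_FREE))
                    perror("madvise(MADV_FREE)");
            /* MADV_DONTNEED would guarantee 0 here; after MADV_FREE this may
             * still read 7 if reclaim has not processed the page yet. */
            printf("%d\n", p[0]);
            return 0;
    }
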
     766             : 
     767           4 : static long madvise_dontneed_free(struct vm_area_struct *vma,
     768             :                                   struct vm_area_struct **prev,
     769             :                                   unsigned long start, unsigned long end,
     770             :                                   int behavior)
     771             : {
     772           4 :         struct mm_struct *mm = vma->vm_mm;
     773             : 
     774           4 :         *prev = vma;
     775           4 :         if (!can_madv_lru_vma(vma))
     776             :                 return -EINVAL;
     777             : 
     778           4 :         if (!userfaultfd_remove(vma, start, end)) {
     779             :                 *prev = NULL; /* mmap_lock has been dropped, prev is stale */
     780             : 
     781             :                 mmap_read_lock(mm);
     782             :                 vma = find_vma(mm, start);
     783             :                 if (!vma)
     784             :                         return -ENOMEM;
     785             :                 if (start < vma->vm_start) {
     786             :                         /*
     787             :                          * This "vma" under revalidation is the one
     788             :                          * with the lowest vma->vm_start where start
     789             :                          * is also < vma->vm_end. If start <
      790             :                          * vma->vm_start it means a hole materialized
     791             :                          * in the user address space within the
     792             :                          * virtual range passed to MADV_DONTNEED
     793             :                          * or MADV_FREE.
     794             :                          */
     795             :                         return -ENOMEM;
     796             :                 }
     797             :                 if (!can_madv_lru_vma(vma))
     798             :                         return -EINVAL;
     799             :                 if (end > vma->vm_end) {
     800             :                         /*
     801             :                          * Don't fail if end > vma->vm_end. If the old
      802             :                          * vma was split while the mmap_lock was
     803             :                          * released the effect of the concurrent
     804             :                          * operation may not cause madvise() to
     805             :                          * have an undefined result. There may be an
     806             :                          * adjacent next vma that we'll walk
     807             :                          * next. userfaultfd_remove() will generate an
     808             :                          * UFFD_EVENT_REMOVE repetition on the
     809             :                          * end-vma->vm_end range, but the manager can
     810             :                          * handle a repetition fine.
     811             :                          */
     812             :                         end = vma->vm_end;
     813             :                 }
     814             :                 VM_WARN_ON(start >= end);
     815             :         }
     816             : 
     817           4 :         if (behavior == MADV_DONTNEED)
     818           4 :                 return madvise_dontneed_single_vma(vma, start, end);
     819           0 :         else if (behavior == MADV_FREE)
     820           0 :                 return madvise_free_single_vma(vma, start, end);
     821             :         else
     822             :                 return -EINVAL;
     823             : }
     824             : 
     825             : /*
     826             :  * Application wants to free up the pages and associated backing store.
     827             :  * This is effectively punching a hole into the middle of a file.
     828             :  */
     829           0 : static long madvise_remove(struct vm_area_struct *vma,
     830             :                                 struct vm_area_struct **prev,
     831             :                                 unsigned long start, unsigned long end)
     832             : {
     833           0 :         loff_t offset;
     834           0 :         int error;
     835           0 :         struct file *f;
     836           0 :         struct mm_struct *mm = vma->vm_mm;
     837             : 
     838           0 :         *prev = NULL;   /* tell sys_madvise we drop mmap_lock */
     839             : 
     840           0 :         if (vma->vm_flags & VM_LOCKED)
     841             :                 return -EINVAL;
     842             : 
     843           0 :         f = vma->vm_file;
     844             : 
     845           0 :         if (!f || !f->f_mapping || !f->f_mapping->host) {
      846             :                 return -EINVAL;
     847             :         }
     848             : 
     849           0 :         if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
     850             :                 return -EACCES;
     851             : 
     852           0 :         offset = (loff_t)(start - vma->vm_start)
     853           0 :                         + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
     854             : 
     855             :         /*
     856             :          * Filesystem's fallocate may need to take i_mutex.  We need to
     857             :          * explicitly grab a reference because the vma (and hence the
     858             :          * vma's reference to the file) can go away as soon as we drop
     859             :          * mmap_lock.
     860             :          */
     861           0 :         get_file(f);
     862           0 :         if (userfaultfd_remove(vma, start, end)) {
     863             :                 /* mmap_lock was not released by userfaultfd_remove() */
     864           0 :                 mmap_read_unlock(mm);
     865             :         }
     866           0 :         error = vfs_fallocate(f,
     867             :                                 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
     868           0 :                                 offset, end - start);
     869           0 :         fput(f);
     870           0 :         mmap_read_lock(mm);
     871           0 :         return error;
     872             : }
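
A sketch of the hole-punching path, assuming a tmpfs-backed mapping created
with memfd_create (glibc 2.27+, hence _GNU_SOURCE); MADV_REMOVE requires a
writable shared mapping, matching the VM_SHARED|VM_WRITE check above:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            long pg = sysconf(_SC_PAGESIZE);
            int fd = memfd_create("madv_remove_demo", 0);
            if (fd < 0 || ftruncate(fd, 4 * pg))
                    return 1;
            char *p = mmap(NULL, 4 * pg, PROT_READ | PROT_WRITE, MAP_SHARED,
                           fd, 0);
            if (p == MAP_FAILED)
                    return 1;
            memset(p, 0xff, 4 * pg);
            /* Punches a hole in the backing file itself (vfs_fallocate with
             * FALLOC_FL_PUNCH_HOLE); reads in the hole then return zeroes. */
            if (madvise(p + pg, 2 * pg, MADV_REMOVE))
                    perror("madvise(MADV_REMOVE)");
            printf("%#x %#x\n", (unsigned char)p[0], (unsigned char)p[pg]);
            return 0;   /* expected output: 0xff 0 */
    }
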
     873             : 
     874             : #ifdef CONFIG_MEMORY_FAILURE
     875             : /*
     876             :  * Error injection support for memory error handling.
     877             :  */
     878             : static int madvise_inject_error(int behavior,
     879             :                 unsigned long start, unsigned long end)
     880             : {
     881             :         unsigned long size;
     882             : 
     883             :         if (!capable(CAP_SYS_ADMIN))
     884             :                 return -EPERM;
     885             : 
     886             : 
     887             :         for (; start < end; start += size) {
     888             :                 unsigned long pfn;
     889             :                 struct page *page;
     890             :                 int ret;
     891             : 
     892             :                 ret = get_user_pages_fast(start, 1, 0, &page);
     893             :                 if (ret != 1)
     894             :                         return ret;
     895             :                 pfn = page_to_pfn(page);
     896             : 
     897             :                 /*
     898             :                  * When soft offlining hugepages, after migrating the page
     899             :                  * we dissolve it, therefore in the second loop "page" will
     900             :                  * no longer be a compound page.
     901             :                  */
     902             :                 size = page_size(compound_head(page));
     903             : 
     904             :                 if (behavior == MADV_SOFT_OFFLINE) {
     905             :                         pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
     906             :                                  pfn, start);
     907             :                         ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
     908             :                 } else {
     909             :                         pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
     910             :                                  pfn, start);
     911             :                         ret = memory_failure(pfn, MF_COUNT_INCREASED);
     912             :                 }
     913             : 
     914             :                 if (ret)
     915             :                         return ret;
     916             :         }
     917             : 
     918             :         return 0;
     919             : }
     920             : #endif
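
A hedged sketch of the injection interface: MADV_HWPOISON needs CAP_SYS_ADMIN
and CONFIG_MEMORY_FAILURE, and the page is then treated as if hardware had
reported an uncorrected error, so a later access raises SIGBUS. Run only in a
disposable VM; the fallback #define covers libcs that do not expose the
constant:

    #include <stdio.h>
    #include <sys/mman.h>

    #ifndef MADV_HWPOISON
    #define MADV_HWPOISON 100   /* from uapi asm-generic/mman-common.h */
    #endif

    int main(void)
    {
            char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (p == MAP_FAILED)
                    return 1;
            p[0] = 1;           /* populate the page first */
            /* Fails with EPERM without CAP_SYS_ADMIN. */
            if (madvise(p, 4096, MADV_HWPOISON))
                    perror("madvise(MADV_HWPOISON)");
            return 0;
    }
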
     921             : 
     922             : static long
     923          14 : madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
     924             :                 unsigned long start, unsigned long end, int behavior)
     925             : {
     926          14 :         switch (behavior) {
     927           0 :         case MADV_REMOVE:
     928           0 :                 return madvise_remove(vma, prev, start, end);
     929           0 :         case MADV_WILLNEED:
     930           0 :                 return madvise_willneed(vma, prev, start, end);
     931           0 :         case MADV_COLD:
     932           0 :                 return madvise_cold(vma, prev, start, end);
     933           0 :         case MADV_PAGEOUT:
     934           0 :                 return madvise_pageout(vma, prev, start, end);
     935           4 :         case MADV_FREE:
     936             :         case MADV_DONTNEED:
     937           4 :                 return madvise_dontneed_free(vma, prev, start, end, behavior);
     938          10 :         default:
     939          10 :                 return madvise_behavior(vma, prev, start, end, behavior);
     940             :         }
     941             : }
     942             : 
     943             : static bool
     944          12 : madvise_behavior_valid(int behavior)
     945             : {
     946          12 :         switch (behavior) {
     947             :         case MADV_DOFORK:
     948             :         case MADV_DONTFORK:
     949             :         case MADV_NORMAL:
     950             :         case MADV_SEQUENTIAL:
     951             :         case MADV_RANDOM:
     952             :         case MADV_REMOVE:
     953             :         case MADV_WILLNEED:
     954             :         case MADV_DONTNEED:
     955             :         case MADV_FREE:
     956             :         case MADV_COLD:
     957             :         case MADV_PAGEOUT:
     958             : #ifdef CONFIG_KSM
     959             :         case MADV_MERGEABLE:
     960             :         case MADV_UNMERGEABLE:
     961             : #endif
     962             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
     963             :         case MADV_HUGEPAGE:
     964             :         case MADV_NOHUGEPAGE:
     965             : #endif
     966             :         case MADV_DONTDUMP:
     967             :         case MADV_DODUMP:
     968             :         case MADV_WIPEONFORK:
     969             :         case MADV_KEEPONFORK:
     970             : #ifdef CONFIG_MEMORY_FAILURE
     971             :         case MADV_SOFT_OFFLINE:
     972             :         case MADV_HWPOISON:
     973             : #endif
     974             :                 return true;
     975             : 
     976             :         default:
     977             :                 return false;
     978             :         }
     979             : }
     980             : 
     981             : static bool
     982           0 : process_madvise_behavior_valid(int behavior)
     983             : {
     984           0 :         switch (behavior) {
     985             :         case MADV_COLD:
     986             :         case MADV_PAGEOUT:
     987             :                 return true;
     988             :         default:
     989           0 :                 return false;
     990             :         }
     991             : }
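
These two behaviors are reachable for another process through
process_madvise(2) (Linux 5.10+). A self-targeting sketch using raw syscall
numbers, since libc wrappers may be missing; the __NR fallbacks are the
arch-uniform numbers assigned to these syscalls:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <sys/uio.h>
    #include <unistd.h>

    #ifndef __NR_pidfd_open
    #define __NR_pidfd_open 434             /* Linux 5.3+ */
    #endif
    #ifndef __NR_process_madvise
    #define __NR_process_madvise 440        /* Linux 5.10+ */
    #endif
    #ifndef MADV_COLD
    #define MADV_COLD 20                    /* Linux 5.4+ */
    #endif

    int main(void)
    {
            size_t len = 1 << 20;
            char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (p == MAP_FAILED)
                    return 1;
            memset(p, 1, len);

            int pidfd = (int)syscall(__NR_pidfd_open, getpid(), 0);
            struct iovec iov = { .iov_base = p, .iov_len = len };
            /* Deactivate the range in the (here: our own) target process. */
            long ret = syscall(__NR_process_madvise, pidfd, &iov, 1,
                               MADV_COLD, 0);
            if (ret < 0)
                    perror("process_madvise");
            else
                    printf("advised %ld bytes\n", ret);
            return 0;
    }
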
     992             : 
     993             : /*
     994             :  * The madvise(2) system call.
     995             :  *
     996             :  * Applications can use madvise() to advise the kernel how it should
     997             :  * handle paging I/O in this VM area.  The idea is to help the kernel
     998             :  * use appropriate read-ahead and caching techniques.  The information
     999             :  * provided is advisory only, and can be safely disregarded by the
    1000             :  * kernel without affecting the correct operation of the application.
    1001             :  *
    1002             :  * behavior values:
    1003             :  *  MADV_NORMAL - the default behavior is to read clusters.  This
    1004             :  *              results in some read-ahead and read-behind.
    1005             :  *  MADV_RANDOM - the system should read the minimum amount of data
     1006             :  *              on any access, since it is unlikely that the
     1007             :  *              application will need more than what it asks for.
    1008             :  *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
    1009             :  *              once, so they can be aggressively read ahead, and
    1010             :  *              can be freed soon after they are accessed.
    1011             :  *  MADV_WILLNEED - the application is notifying the system to read
    1012             :  *              some pages ahead.
    1013             :  *  MADV_DONTNEED - the application is finished with the given range,
    1014             :  *              so the kernel can free resources associated with it.
    1015             :  *  MADV_FREE - the application marks pages in the given range as lazy free,
    1016             :  *              where actual purges are postponed until memory pressure happens.
    1017             :  *  MADV_REMOVE - the application wants to free up the given range of
    1018             :  *              pages and associated backing store.
    1019             :  *  MADV_DONTFORK - omit this area from child's address space when forking:
    1020             :  *              typically, to avoid COWing pages pinned by get_user_pages().
    1021             :  *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
    1022             :  *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
    1023             :  *              range after a fork.
     1024             :  *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK.
    1025             :  *  MADV_HWPOISON - trigger memory error handler as if the given memory range
    1026             :  *              were corrupted by unrecoverable hardware memory failure.
    1027             :  *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
    1028             :  *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
    1029             :  *              this area with pages of identical content from other such areas.
     1030             :  *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
    1031             :  *  MADV_HUGEPAGE - the application wants to back the given range by transparent
    1032             :  *              huge pages in the future. Existing pages might be coalesced and
    1033             :  *              new pages might be allocated as THP.
    1034             :  *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
    1035             :  *              transparent huge pages so the existing pages will not be
    1036             :  *              coalesced into THP and new pages will not be allocated as THP.
    1037             :  *  MADV_DONTDUMP - the application wants to prevent pages in the given range
    1038             :  *              from being included in its core dump.
    1039             :  *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
    1040             :  *  MADV_COLD - the application is not expected to use this memory soon,
    1041             :  *              deactivate pages in this range so that they can be reclaimed
     1042             :  *              easily if memory pressure happens.
    1043             :  *  MADV_PAGEOUT - the application is not expected to use this memory soon,
    1044             :  *              page out the pages in this range immediately.
    1045             :  *
    1046             :  * return values:
    1047             :  *  zero    - success
     1048             :  *  -EINVAL - start + len overflows, start is not page-aligned,
    1049             :  *              "behavior" is not a valid value, or application
    1050             :  *              is attempting to release locked or shared pages,
    1051             :  *              or the specified address range includes file, Huge TLB,
     1052             :  *              MAP_SHARED or VM_PFNMAP range.
    1053             :  *  -ENOMEM - addresses in the specified range are not currently
    1054             :  *              mapped, or are outside the AS of the process.
    1055             :  *  -EIO    - an I/O error occurred while paging in data.
    1056             :  *  -EBADF  - map exists, but area maps something that isn't a file.
    1057             :  *  -EAGAIN - a kernel resource was temporarily unavailable.
    1058             :  */
    1059          12 : int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
    1060             : {
    1061          12 :         unsigned long end, tmp;
    1062          12 :         struct vm_area_struct *vma, *prev;
    1063          12 :         int unmapped_error = 0;
    1064          12 :         int error = -EINVAL;
    1065          12 :         int write;
    1066          12 :         size_t len;
    1067          12 :         struct blk_plug plug;
    1068             : 
    1069          12 :         start = untagged_addr(start);
    1070             : 
    1071          12 :         if (!madvise_behavior_valid(behavior))
    1072             :                 return error;
    1073             : 
    1074          12 :         if (!PAGE_ALIGNED(start))
    1075             :                 return error;
    1076          12 :         len = PAGE_ALIGN(len_in);
    1077             : 
     1078             :         /* Check whether len was rounded up from a small negative value to zero */
    1079          12 :         if (len_in && !len)
    1080             :                 return error;
    1081             : 
    1082          12 :         end = start + len;
    1083          12 :         if (end < start)
    1084             :                 return error;
    1085             : 
    1086          12 :         error = 0;
    1087          12 :         if (end == start)
    1088             :                 return error;
    1089             : 
    1090             : #ifdef CONFIG_MEMORY_FAILURE
    1091             :         if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
    1092             :                 return madvise_inject_error(behavior, start, start + len_in);
    1093             : #endif
    1094             : 
    1095          12 :         write = madvise_need_mmap_write(behavior);
    1096          12 :         if (write) {
    1097          10 :                 if (mmap_write_lock_killable(mm))
    1098             :                         return -EINTR;
    1099             :         } else {
    1100           2 :                 mmap_read_lock(mm);
    1101             :         }
    1102             : 
    1103             :         /*
    1104             :          * If the interval [start,end) covers some unmapped address
    1105             :          * ranges, just ignore them, but return -ENOMEM at the end.
     1106             :          * This differs from the handling in mlock and similar calls.
    1107             :          */
    1108          12 :         vma = find_vma_prev(mm, start, &prev);
    1109          12 :         if (vma && start > vma->vm_start)
    1110           0 :                 prev = vma;
    1111             : 
    1112          12 :         blk_start_plug(&plug);
    1113          14 :         for (;;) {
    1114             :                 /* Still start < end. */
    1115          14 :                 error = -ENOMEM;
    1116          14 :                 if (!vma)
    1117           0 :                         goto out;
    1118             : 
    1119             :                 /* Here start < (end|vma->vm_end). */
    1120          14 :                 if (start < vma->vm_start) {
    1121           0 :                         unmapped_error = -ENOMEM;
    1122           0 :                         start = vma->vm_start;
    1123           0 :                         if (start >= end)
    1124           0 :                                 goto out;
    1125             :                 }
    1126             : 
    1127             :                 /* Here vma->vm_start <= start < (end|vma->vm_end) */
    1128          14 :                 tmp = vma->vm_end;
    1129          14 :                 if (end < tmp)
    1130             :                         tmp = end;
    1131             : 
    1132             :                 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
    1133          14 :                 error = madvise_vma(vma, &prev, start, tmp, behavior);
    1134          14 :                 if (error)
    1135           0 :                         goto out;
    1136          14 :                 start = tmp;
    1137          14 :                 if (prev && start < prev->vm_end)
    1138             :                         start = prev->vm_end;
    1139          14 :                 error = unmapped_error;
    1140          14 :                 if (start >= end)
    1141          12 :                         goto out;
    1142           2 :                 if (prev)
    1143           2 :                         vma = prev->vm_next;
    1144             :                 else    /* madvise_remove dropped mmap_lock */
    1145           0 :                         vma = find_vma(mm, start);
    1146             :         }
    1147          12 : out:
    1148          12 :         blk_finish_plug(&plug);
    1149          12 :         if (write)
    1150          10 :                 mmap_write_unlock(mm);
    1151             :         else
    1152           2 :                 mmap_read_unlock(mm);
    1153             : 
    1154             :         return error;
    1155             : }
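
A minimal userspace sketch, not part of this file, grounding two of the semantics documented above: start must be page-aligned, and MADV_DONTNEED on private anonymous memory drops the advised pages, which read back zero-filled on the next touch while neighboring pages keep their contents:

#include <assert.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long page = sysconf(_SC_PAGESIZE);
        char *buf = mmap(NULL, 4 * page, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        assert(buf != MAP_FAILED);
        memset(buf, 0xaa, 4 * page);

        /* start must be page-aligned; len is rounded up to a page multiple */
        if (madvise(buf + page, 2 * page, MADV_DONTNEED) != 0)
                return 1;

        /* Private anonymous pages discarded by MADV_DONTNEED read back as
         * zero-filled on the next access; untouched pages keep their data. */
        assert(buf[page] == 0);
        assert(buf[0] == (char)0xaa);

        munmap(buf, 4 * page);
        return 0;
}
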
    1156             : 
    1157          24 : SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
    1158             : {
    1159          12 :         return do_madvise(current->mm, start, len_in, behavior);
    1160             : }
    1161             : 
    1162           0 : SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
    1163             :                 size_t, vlen, int, behavior, unsigned int, flags)
    1164             : {
    1165           0 :         ssize_t ret;
    1166           0 :         struct iovec iovstack[UIO_FASTIOV], iovec;
    1167           0 :         struct iovec *iov = iovstack;
    1168           0 :         struct iov_iter iter;
    1169           0 :         struct pid *pid;
    1170           0 :         struct task_struct *task;
    1171           0 :         struct mm_struct *mm;
    1172           0 :         size_t total_len;
    1173           0 :         unsigned int f_flags;
    1174             : 
    1175           0 :         if (flags != 0) {
    1176           0 :                 ret = -EINVAL;
    1177           0 :                 goto out;
    1178             :         }
    1179             : 
    1180           0 :         ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
    1181           0 :         if (ret < 0)
    1182           0 :                 goto out;
    1183             : 
    1184           0 :         pid = pidfd_get_pid(pidfd, &f_flags);
    1185           0 :         if (IS_ERR(pid)) {
    1186           0 :                 ret = PTR_ERR(pid);
    1187           0 :                 goto free_iov;
    1188             :         }
    1189             : 
    1190           0 :         task = get_pid_task(pid, PIDTYPE_PID);
    1191           0 :         if (!task) {
    1192           0 :                 ret = -ESRCH;
    1193           0 :                 goto put_pid;
    1194             :         }
    1195             : 
    1196           0 :         if (!process_madvise_behavior_valid(behavior)) {
    1197           0 :                 ret = -EINVAL;
    1198           0 :                 goto release_task;
    1199             :         }
    1200             : 
    1201             :         /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
    1202           0 :         mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
    1203           0 :         if (IS_ERR_OR_NULL(mm)) {
    1204           0 :                 ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
    1205           0 :                 goto release_task;
    1206             :         }
    1207             : 
    1208             :         /*
    1209             :          * Require CAP_SYS_NICE for influencing process performance. Note that
    1210             :          * only non-destructive hints are currently supported.
    1211             :          */
    1212           0 :         if (!capable(CAP_SYS_NICE)) {
    1213           0 :                 ret = -EPERM;
    1214           0 :                 goto release_mm;
    1215             :         }
    1216             : 
    1217           0 :         total_len = iov_iter_count(&iter);
    1218             : 
    1219           0 :         while (iov_iter_count(&iter)) {
    1220           0 :                 iovec = iov_iter_iovec(&iter);
    1221           0 :                 ret = do_madvise(mm, (unsigned long)iovec.iov_base,
    1222             :                                         iovec.iov_len, behavior);
    1223           0 :                 if (ret < 0)
    1224             :                         break;
    1225           0 :                 iov_iter_advance(&iter, iovec.iov_len);
    1226             :         }
    1227             : 
    1228           0 :         if (ret == 0)
    1229           0 :                 ret = total_len - iov_iter_count(&iter);
    1230             : 
    1231           0 : release_mm:
    1232           0 :         mmput(mm);
    1233           0 : release_task:
    1234           0 :         put_task_struct(task);
    1235           0 : put_pid:
    1236           0 :         put_pid(pid);
    1237           0 : free_iov:
    1238           0 :         kfree(iov);
    1239           0 : out:
    1240           0 :         return ret;
    1241             : }
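
A minimal userspace sketch of driving the syscall above, not part of this file. It uses raw syscall numbers since libc wrappers for pidfd_open() and process_madvise() may be absent (the fallbacks shown are the asm-generic values), and the target address and length are hypothetical placeholders. Per the code above, the caller needs CAP_SYS_NICE and PTRACE_MODE_READ access to the target mm, flags must be 0, and only MADV_COLD and MADV_PAGEOUT pass process_madvise_behavior_valid():

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef __NR_pidfd_open
#define __NR_pidfd_open 434             /* asm-generic syscall number */
#endif
#ifndef __NR_process_madvise
#define __NR_process_madvise 440        /* asm-generic syscall number */
#endif
#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT 21                 /* uapi value from asm-generic/mman-common.h */
#endif

int main(int argc, char **argv)
{
        pid_t target = argc > 1 ? (pid_t)atoi(argv[1]) : getpid();
        int pidfd = (int)syscall(__NR_pidfd_open, target, 0);
        struct iovec vec = {
                .iov_base = (void *)0x7f0000000000UL,   /* hypothetical address */
                .iov_len  = 2 * 1024 * 1024,            /* hypothetical length */
        };
        ssize_t ret;

        if (pidfd < 0) {
                perror("pidfd_open");
                return 1;
        }
        /* On success the return value is the number of bytes advised
         * (total_len minus what remained in the iterator); any failure
         * mid-list returns the error instead. */
        ret = syscall(__NR_process_madvise, pidfd, &vec, 1, MADV_PAGEOUT, 0);
        if (ret < 0)
                perror("process_madvise");
        else
                printf("advised %zd bytes\n", ret);
        close(pidfd);
        return 0;
}
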

Generated by: LCOV version 1.14