Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * linux/mm/memory.c
4 : *
5 : * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
6 : */
7 :
8 : /*
9 : * demand-loading started 01.12.91 - seems it is high on the list of
10 : * things wanted, and it should be easy to implement. - Linus
11 : */
12 :
13 : /*
14 : * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
15 : * pages started 02.12.91, seems to work. - Linus.
16 : *
17 : * Tested sharing by executing about 30 /bin/sh: under the old kernel it
18 : * would have taken more than the 6M I have free, but it worked well as
19 : * far as I could see.
20 : *
21 : * Also corrected some "invalidate()"s - I wasn't doing enough of them.
22 : */
23 :
24 : /*
25 : * Real VM (paging to/from disk) started 18.12.91. Much more work and
26 : * thought has to go into this. Oh, well..
27 : * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.
28 : * Found it. Everything seems to work now.
29 : * 20.12.91 - Ok, making the swap-device changeable like the root.
30 : */
31 :
32 : /*
33 : * 05.04.94 - Multi-page memory management added for v1.1.
34 : * Idea by Alex Bligh (alex@cconcepts.co.uk)
35 : *
36 : * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
37 : * (Gerhard.Wichert@pdb.siemens.de)
38 : *
39 : * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
40 : */
41 :
42 : #include <linux/kernel_stat.h>
43 : #include <linux/mm.h>
44 : #include <linux/sched/mm.h>
45 : #include <linux/sched/coredump.h>
46 : #include <linux/sched/numa_balancing.h>
47 : #include <linux/sched/task.h>
48 : #include <linux/hugetlb.h>
49 : #include <linux/mman.h>
50 : #include <linux/swap.h>
51 : #include <linux/highmem.h>
52 : #include <linux/pagemap.h>
53 : #include <linux/memremap.h>
54 : #include <linux/ksm.h>
55 : #include <linux/rmap.h>
56 : #include <linux/export.h>
57 : #include <linux/delayacct.h>
58 : #include <linux/init.h>
59 : #include <linux/pfn_t.h>
60 : #include <linux/writeback.h>
61 : #include <linux/memcontrol.h>
62 : #include <linux/mmu_notifier.h>
63 : #include <linux/swapops.h>
64 : #include <linux/elf.h>
65 : #include <linux/gfp.h>
66 : #include <linux/migrate.h>
67 : #include <linux/string.h>
68 : #include <linux/debugfs.h>
69 : #include <linux/userfaultfd_k.h>
70 : #include <linux/dax.h>
71 : #include <linux/oom.h>
72 : #include <linux/numa.h>
73 : #include <linux/perf_event.h>
74 : #include <linux/ptrace.h>
75 : #include <linux/vmalloc.h>
76 :
77 : #include <trace/events/kmem.h>
78 :
79 : #include <asm/io.h>
80 : #include <asm/mmu_context.h>
81 : #include <asm/pgalloc.h>
82 : #include <linux/uaccess.h>
83 : #include <asm/tlb.h>
84 : #include <asm/tlbflush.h>
85 :
86 : #include "pgalloc-track.h"
87 : #include "internal.h"
88 :
89 : #if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
90 : #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
91 : #endif
92 :
93 : #ifndef CONFIG_NEED_MULTIPLE_NODES
94 : /* use the per-pgdat data instead for discontigmem - mbligh */
95 : unsigned long max_mapnr;
96 : EXPORT_SYMBOL(max_mapnr);
97 :
98 : struct page *mem_map;
99 : EXPORT_SYMBOL(mem_map);
100 : #endif
101 :
102 : /*
103 : * A number of key systems in x86 including ioremap() rely on the assumption
104 : * that high_memory defines the upper bound on direct map memory, the end
105 : * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and
106 : * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
107 : * and ZONE_HIGHMEM.
108 : */
109 : void *high_memory;
110 : EXPORT_SYMBOL(high_memory);
111 :
112 : /*
113 : * Randomize the address space (stacks, mmaps, brk, etc.).
114 : *
115 : * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
116 : * as ancient (libc5 based) binaries can segfault. )
117 : */
118 : int randomize_va_space __read_mostly =
119 : #ifdef CONFIG_COMPAT_BRK
120 : 1;
121 : #else
122 : 2;
123 : #endif
124 :
125 : #ifndef arch_faults_on_old_pte
126 : static inline bool arch_faults_on_old_pte(void)
127 : {
128 : /*
129 : * Architectures that don't have a hardware access-flag feature need to
130 : * implement their own helper. By default, "true" means a page fault
131 : * will be taken on an old pte.
132 : */
133 : return true;
134 : }
135 : #endif
136 :
137 : #ifndef arch_wants_old_prefaulted_pte
138 : static inline bool arch_wants_old_prefaulted_pte(void)
139 : {
140 : /*
141 : * Transitioning a PTE from 'old' to 'young' can be expensive on
142 : * some architectures, even if it's performed in hardware. By
143 : * default, "false" means prefaulted entries will be 'young'.
144 : */
145 : return false;
146 : }
147 : #endif
148 :
149 0 : static int __init disable_randmaps(char *s)
150 : {
151 0 : randomize_va_space = 0;
152 0 : return 1;
153 : }
154 : __setup("norandmaps", disable_randmaps);
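/*
 * The same setting is also exposed at run time as the
 * kernel.randomize_va_space sysctl (/proc/sys/kernel/randomize_va_space):
 * 0 disables randomization, 1 randomizes stack/mmap/VDSO placement, and
 * 2 additionally randomizes the brk base (the default unless
 * CONFIG_COMPAT_BRK is set, as the #ifdef above shows).
 */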
155 :
156 : unsigned long zero_pfn __read_mostly;
157 : EXPORT_SYMBOL(zero_pfn);
158 :
159 : unsigned long highest_memmap_pfn __read_mostly;
160 :
161 : /*
162 : * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
163 : */
164 1 : static int __init init_zero_pfn(void)
165 : {
166 1 : zero_pfn = page_to_pfn(ZERO_PAGE(0));
167 1 : return 0;
168 : }
169 : core_initcall(init_zero_pfn);
170 :
171 124668 : void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
172 : {
173 124668 : trace_rss_stat(mm, member, count);
174 6020 : }
175 :
176 : #if defined(SPLIT_RSS_COUNTING)
177 :
178 19122 : void sync_mm_rss(struct mm_struct *mm)
179 : {
180 19122 : int i;
181 :
182 95610 : for (i = 0; i < NR_MM_COUNTERS; i++) {
183 76487 : if (current->rss_stat.count[i]) {
184 16614 : add_mm_counter(mm, i, current->rss_stat.count[i]);
185 16615 : current->rss_stat.count[i] = 0;
186 : }
187 : }
188 19123 : current->rss_stat.events = 0;
189 19123 : }
190 :
191 850245 : static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
192 : {
193 850245 : struct task_struct *task = current;
194 :
195 850245 : if (likely(task->mm == mm))
196 847758 : task->rss_stat.count[member] += val;
197 : else
198 2487 : add_mm_counter(mm, member, val);
199 850245 : }
200 : #define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
201 : #define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
202 :
203 : /* sync counter once per 64 page faults */
204 : #define TASK_RSS_EVENTS_THRESH (64)
205 171934 : static void check_sync_rss_stat(struct task_struct *task)
206 : {
207 171934 : if (unlikely(task != current))
208 : return;
209 171934 : if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
210 956 : sync_mm_rss(task->mm);
211 : }
212 : #else /* SPLIT_RSS_COUNTING */
213 :
214 : #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
215 : #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
216 :
217 : static void check_sync_rss_stat(struct task_struct *task)
218 : {
219 : }
220 :
221 : #endif /* SPLIT_RSS_COUNTING */
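/*
 * Rough summary of the SPLIT_RSS_COUNTING scheme above: each task batches
 * RSS updates in its private current->rss_stat until sync_mm_rss() folds
 * them into the shared mm counters with add_mm_counter();
 * check_sync_rss_stat() triggers that roughly every TASK_RSS_EVENTS_THRESH
 * page faults.  This avoids touching the shared atomic counters on every
 * fault, at the cost that readers such as get_mm_counter() can lag
 * slightly behind the per-thread batches.
 */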
222 :
223 : /*
224 : * Note: this doesn't free the actual pages themselves. That
225 : * has been handled earlier when unmapping all the memory regions.
226 : */
227 16925 : static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
228 : unsigned long addr)
229 : {
230 16925 : pgtable_t token = pmd_pgtable(*pmd);
231 16925 : pmd_clear(pmd);
232 16925 : pte_free_tlb(tlb, token, addr);
233 16925 : mm_dec_nr_ptes(tlb->mm);
234 16925 : }
235 :
236 11081 : static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
237 : unsigned long addr, unsigned long end,
238 : unsigned long floor, unsigned long ceiling)
239 : {
240 11081 : pmd_t *pmd;
241 11081 : unsigned long next;
242 11081 : unsigned long start;
243 :
244 11081 : start = addr;
245 22162 : pmd = pmd_offset(pud, addr);
246 18119 : do {
247 18119 : next = pmd_addr_end(addr, end);
248 18119 : if (pmd_none_or_clear_bad(pmd))
249 1194 : continue;
250 16925 : free_pte_range(tlb, pmd, addr);
251 18119 : } while (pmd++, addr = next, addr != end);
252 :
253 11081 : start &= PUD_MASK;
254 11081 : if (start < floor)
255 : return;
256 10981 : if (ceiling) {
257 6405 : ceiling &= PUD_MASK;
258 6405 : if (!ceiling)
259 : return;
260 : }
261 10979 : if (end - 1 > ceiling - 1)
262 : return;
263 :
264 8893 : pmd = pmd_offset(pud, start);
265 8893 : pud_clear(pud);
266 8893 : pmd_free_tlb(tlb, pmd, start);
267 8893 : mm_dec_nr_pmds(tlb->mm);
268 : }
269 :
270 11103 : static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
271 : unsigned long addr, unsigned long end,
272 : unsigned long floor, unsigned long ceiling)
273 : {
274 11103 : pud_t *pud;
275 11103 : unsigned long next;
276 11103 : unsigned long start;
277 :
278 11103 : start = addr;
279 11103 : pud = pud_offset(p4d, addr);
280 11112 : do {
281 11112 : next = pud_addr_end(addr, end);
282 11112 : if (pud_none_or_clear_bad(pud))
283 31 : continue;
284 11081 : free_pmd_range(tlb, pud, addr, next, floor, ceiling);
285 11112 : } while (pud++, addr = next, addr != end);
286 :
287 11103 : start &= P4D_MASK;
288 11103 : if (start < floor)
289 : return;
290 10097 : if (ceiling) {
291 6396 : ceiling &= P4D_MASK;
292 6396 : if (!ceiling)
293 : return;
294 : }
295 10095 : if (end - 1 > ceiling - 1)
296 : return;
297 :
298 7011 : pud = pud_offset(p4d, start);
299 7011 : p4d_clear(p4d);
300 7011 : pud_free_tlb(tlb, pud, start);
301 7011 : mm_dec_nr_puds(tlb->mm);
302 : }
303 :
304 11238 : static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
305 : unsigned long addr, unsigned long end,
306 : unsigned long floor, unsigned long ceiling)
307 : {
308 11238 : p4d_t *p4d;
309 11238 : unsigned long next;
310 11238 : unsigned long start;
311 :
312 11238 : start = addr;
313 11238 : p4d = p4d_offset(pgd, addr);
314 11238 : do {
315 11238 : next = p4d_addr_end(addr, end);
316 11238 : if (p4d_none_or_clear_bad(p4d))
317 135 : continue;
318 11103 : free_pud_range(tlb, p4d, addr, next, floor, ceiling);
319 11238 : } while (p4d++, addr = next, addr != end);
320 :
321 11238 : start &= PGDIR_MASK;
322 11238 : if (start < floor)
323 : return;
324 11238 : if (ceiling) {
325 : ceiling &= PGDIR_MASK;
326 : if (!ceiling)
327 : return;
328 : }
329 11238 : if (end - 1 > ceiling - 1)
330 : return;
331 :
332 11238 : p4d = p4d_offset(pgd, start);
333 11238 : pgd_clear(pgd);
334 11238 : p4d_free_tlb(tlb, p4d, start);
335 : }
336 :
337 : /*
338 : * This function frees user-level page tables of a process.
339 : */
340 27724 : void free_pgd_range(struct mmu_gather *tlb,
341 : unsigned long addr, unsigned long end,
342 : unsigned long floor, unsigned long ceiling)
343 : {
344 27724 : pgd_t *pgd;
345 27724 : unsigned long next;
346 :
347 : /*
348 : * The next few lines have given us lots of grief...
349 : *
350 : * Why are we testing PMD* at this top level? Because often
351 : * there will be no work to do at all, and we'd prefer not to
352 : * go all the way down to the bottom just to discover that.
353 : *
354 : * Why all these "- 1"s? Because 0 represents both the bottom
355 : * of the address space and the top of it (using -1 for the
356 : * top wouldn't help much: the masks would do the wrong thing).
357 : * The rule is that addr 0 and floor 0 refer to the bottom of
358 : * the address space, but end 0 and ceiling 0 refer to the top.
359 : * Comparisons need to use "end - 1" and "ceiling - 1" (though
360 : * that end 0 case should be mythical).
361 : *
362 : * Wherever addr is brought up or ceiling brought down, we must
363 : * be careful to reject "the opposite 0" before it confuses the
364 : * subsequent tests. But what about where end is brought down
365 : * by PMD_SIZE below? no, end can't go down to 0 there.
366 : *
367 : * Whereas we round start (addr) and ceiling down, by different
368 : * masks at different levels, in order to test whether a table
369 : * now has no other vmas using it, so can be freed, we don't
370 : * bother to round floor or end up - the tests don't need that.
371 : */
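/*
 * A concrete illustration of the "- 1" comparisons: ceiling == 0 means
 * "top of the address space".  A plain "end > ceiling" test would then
 * treat every nonzero end as being above the ceiling and wrongly shrink
 * the range.  Because unsigned long arithmetic wraps, "ceiling - 1"
 * becomes ULONG_MAX in that case, so "end - 1 > ceiling - 1" can never
 * be true, which is exactly what "ceiling is the very top" should mean.
 */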
372 :
373 27724 : addr &= PMD_MASK;
374 27724 : if (addr < floor) {
375 16390 : addr += PMD_SIZE;
376 16390 : if (!addr)
377 : return;
378 : }
379 27724 : if (ceiling) {
380 23091 : ceiling &= PMD_MASK;
381 23091 : if (!ceiling)
382 : return;
383 : }
384 27724 : if (end - 1 > ceiling - 1)
385 14750 : end -= PMD_SIZE;
386 27724 : if (addr > end - 1)
387 : return;
388 : /*
389 : * We add page table cache pages with PAGE_SIZE
390 : * (see pte_free_tlb()); flush the TLB if we need to.
391 : */
392 11238 : tlb_change_page_size(tlb, PAGE_SIZE);
393 11238 : pgd = pgd_offset(tlb->mm, addr);
394 11238 : do {
395 11238 : next = pgd_addr_end(addr, end);
396 11238 : if (pgd_none_or_clear_bad(pgd))
397 : continue;
398 11238 : free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
399 11238 : } while (pgd++, addr = next, addr != end);
400 : }
401 :
402 20434 : void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
403 : unsigned long floor, unsigned long ceiling)
404 : {
405 47225 : while (vma) {
406 26791 : struct vm_area_struct *next = vma->vm_next;
407 26791 : unsigned long addr = vma->vm_start;
408 :
409 : /*
410 : * Hide vma from rmap and truncate_pagecache before freeing
411 : * pgtables
412 : */
413 26791 : unlink_anon_vmas(vma);
414 26792 : unlink_file_vma(vma);
415 :
416 26792 : if (is_vm_hugetlb_page(vma)) {
417 : hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
418 : floor, next ? next->vm_start : ceiling);
419 : } else {
420 : /*
421 : * Optimization: gather nearby vmas into one call down
422 : */
423 113853 : while (next && next->vm_start <= vma->vm_end + PMD_SIZE
424 87061 : && !is_vm_hugetlb_page(next)) {
425 87061 : vma = next;
426 87061 : next = vma->vm_next;
427 87061 : unlink_anon_vmas(vma);
428 87053 : unlink_file_vma(vma);
429 : }
430 26792 : free_pgd_range(tlb, addr, vma->vm_end,
431 : floor, next ? next->vm_start : ceiling);
432 : }
433 26792 : vma = next;
434 : }
435 20434 : }
436 :
437 14036 : int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
438 : {
439 14036 : spinlock_t *ptl;
440 14036 : pgtable_t new = pte_alloc_one(mm);
441 14036 : if (!new)
442 : return -ENOMEM;
443 :
444 : /*
445 : * Ensure all pte setup (eg. pte page lock and page clearing) is
446 : * visible before the pte is made visible to other CPUs by being
447 : * put into page tables.
448 : *
449 : * The other side of the story is the pointer chasing in the page
450 : * table walking code (when walking the page table without locking;
451 : * ie. most of the time). Fortunately, these data accesses consist
452 : * of a chain of data-dependent loads, meaning most CPUs (alpha
453 : * being the notable exception) will already guarantee loads are
454 : * seen in-order. See the alpha page table accessors for the
455 : * smp_rmb() barriers in page table walking code.
456 : */
457 14036 : smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
458 :
459 14036 : ptl = pmd_lock(mm, pmd);
460 14036 : if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
461 14036 : mm_inc_nr_ptes(mm);
462 14036 : pmd_populate(mm, pmd, new);
463 14036 : new = NULL;
464 : }
465 14036 : spin_unlock(ptl);
466 14036 : if (new)
467 0 : pte_free(mm, new);
468 : return 0;
469 : }
470 :
471 46 : int __pte_alloc_kernel(pmd_t *pmd)
472 : {
473 46 : pte_t *new = pte_alloc_one_kernel(&init_mm);
474 46 : if (!new)
475 : return -ENOMEM;
476 :
477 46 : smp_wmb(); /* See comment in __pte_alloc */
478 :
479 46 : spin_lock(&init_mm.page_table_lock);
480 46 : if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
481 46 : pmd_populate_kernel(&init_mm, pmd, new);
482 46 : new = NULL;
483 : }
484 46 : spin_unlock(&init_mm.page_table_lock);
485 46 : if (new)
486 0 : pte_free_kernel(&init_mm, new);
487 : return 0;
488 : }
489 :
490 145676 : static inline void init_rss_vec(int *rss)
491 : {
492 145676 : memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
493 : }
494 :
495 145668 : static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
496 : {
497 145668 : int i;
498 :
499 145668 : if (current->mm == mm)
500 14801 : sync_mm_rss(mm);
501 728306 : for (i = 0; i < NR_MM_COUNTERS; i++)
502 582640 : if (rss[i])
503 99546 : add_mm_counter(mm, i, rss[i]);
504 145666 : }
505 :
506 : /*
507 : * This function is called to print an error when a bad pte
508 : * is found. For example, we might have a PFN-mapped pte in
509 : * a region that doesn't allow it.
510 : *
511 : * The calling function must still handle the error.
512 : */
513 0 : static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
514 : pte_t pte, struct page *page)
515 : {
516 0 : pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
517 0 : p4d_t *p4d = p4d_offset(pgd, addr);
518 0 : pud_t *pud = pud_offset(p4d, addr);
519 0 : pmd_t *pmd = pmd_offset(pud, addr);
520 0 : struct address_space *mapping;
521 0 : pgoff_t index;
522 0 : static unsigned long resume;
523 0 : static unsigned long nr_shown;
524 0 : static unsigned long nr_unshown;
525 :
526 : /*
527 : * Allow a burst of 60 reports, then keep quiet for that minute;
528 : * or allow a steady drip of one report per second.
529 : */
530 0 : if (nr_shown == 60) {
531 0 : if (time_before(jiffies, resume)) {
532 0 : nr_unshown++;
533 0 : return;
534 : }
535 0 : if (nr_unshown) {
536 0 : pr_alert("BUG: Bad page map: %lu messages suppressed\n",
537 : nr_unshown);
538 0 : nr_unshown = 0;
539 : }
540 0 : nr_shown = 0;
541 : }
542 0 : if (nr_shown++ == 0)
543 0 : resume = jiffies + 60 * HZ;
544 :
545 0 : mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
546 0 : index = linear_page_index(vma, addr);
547 :
548 0 : pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
549 : current->comm,
550 : (long long)pte_val(pte), (long long)pmd_val(*pmd));
551 0 : if (page)
552 0 : dump_page(page, "bad pte");
553 0 : pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
554 : (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
555 0 : pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
556 : vma->vm_file,
557 : vma->vm_ops ? vma->vm_ops->fault : NULL,
558 : vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
559 : mapping ? mapping->a_ops->readpage : NULL);
560 0 : dump_stack();
561 0 : add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
562 : }
563 :
564 : /*
565 : * vm_normal_page -- This function gets the "struct page" associated with a pte.
566 : *
567 : * "Special" mappings do not wish to be associated with a "struct page" (either
568 : * it doesn't exist, or it exists but they don't want to touch it). In this
569 : * case, NULL is returned here. "Normal" mappings do have a struct page.
570 : *
571 : * There are 2 broad cases. Firstly, an architecture may define a pte_special()
572 : * pte bit, in which case this function is trivial. Secondly, an architecture
573 : * may not have a spare pte bit, which requires a more complicated scheme,
574 : * described below.
575 : *
576 : * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
577 : * special mapping (even if there are underlying and valid "struct pages").
578 : * COWed pages of a VM_PFNMAP are always normal.
579 : *
580 : * The way we recognize COWed pages within VM_PFNMAP mappings is through the
581 : * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
582 : * set, and the vm_pgoff will point to the first PFN mapped: thus every special
583 : * mapping will always honor the rule
584 : *
585 : * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
586 : *
587 : * And for normal mappings this is false.
588 : *
589 : * This restricts such mappings to be a linear translation from virtual address
590 : * to pfn. To get around this restriction, we allow arbitrary mappings so long
591 : * as the vma is not a COW mapping; in that case, we know that all ptes are
592 : * special (because none can have been COWed).
593 : *
594 : *
595 : * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
596 : *
597 : * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
598 : * page" backing, however the difference is that _all_ pages with a struct
599 : * page (that is, those where pfn_valid is true) are refcounted and considered
600 : * normal pages by the VM. The disadvantage is that pages are refcounted
601 : * (which can be slower and simply not an option for some PFNMAP users). The
602 : * advantage is that we don't have to follow the strict linearity rule of
603 : * PFNMAP mappings in order to support COWable mappings.
604 : *
605 : */
606 1030716 : struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
607 : pte_t pte)
608 : {
609 1030716 : unsigned long pfn = pte_pfn(pte);
610 :
611 1030716 : if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
612 1030716 : if (likely(!pte_special(pte)))
613 1014261 : goto check_pfn;
614 16455 : if (vma->vm_ops && vma->vm_ops->find_special_page)
615 0 : return vma->vm_ops->find_special_page(vma, addr);
616 16455 : if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
617 : return NULL;
618 14675 : if (is_zero_pfn(pfn))
619 : return NULL;
620 0 : if (pte_devmap(pte))
621 : return NULL;
622 :
623 0 : print_bad_pte(vma, addr, pte, NULL);
624 0 : return NULL;
625 : }
626 :
627 : /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */
628 :
629 : if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
630 : if (vma->vm_flags & VM_MIXEDMAP) {
631 : if (!pfn_valid(pfn))
632 : return NULL;
633 : goto out;
634 : } else {
635 : unsigned long off;
636 : off = (addr - vma->vm_start) >> PAGE_SHIFT;
637 : if (pfn == vma->vm_pgoff + off)
638 : return NULL;
639 : if (!is_cow_mapping(vma->vm_flags))
640 : return NULL;
641 : }
642 : }
643 :
644 : if (is_zero_pfn(pfn))
645 : return NULL;
646 :
647 1014261 : check_pfn:
648 1014261 : if (unlikely(pfn > highest_memmap_pfn)) {
649 0 : print_bad_pte(vma, addr, pte, NULL);
650 0 : return NULL;
651 : }
652 :
653 : /*
654 : * NOTE! We still have PageReserved() pages in the page tables.
655 : * eg. VDSO mappings can cause them to exist.
656 : */
657 1014261 : out:
658 1014261 : return pfn_to_page(pfn);
659 : }
660 :
661 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
662 0 : struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
663 : pmd_t pmd)
664 : {
665 0 : unsigned long pfn = pmd_pfn(pmd);
666 :
667 : /*
668 : * There is no pmd_special() but there may be special pmds, e.g.
669 : * in a direct-access (dax) mapping, so let's just replicate the
670 : * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here.
671 : */
672 0 : if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
673 0 : if (vma->vm_flags & VM_MIXEDMAP) {
674 0 : if (!pfn_valid(pfn))
675 : return NULL;
676 0 : goto out;
677 : } else {
678 0 : unsigned long off;
679 0 : off = (addr - vma->vm_start) >> PAGE_SHIFT;
680 0 : if (pfn == vma->vm_pgoff + off)
681 : return NULL;
682 0 : if (!is_cow_mapping(vma->vm_flags))
683 : return NULL;
684 : }
685 : }
686 :
687 0 : if (pmd_devmap(pmd))
688 : return NULL;
689 0 : if (is_huge_zero_pmd(pmd))
690 : return NULL;
691 0 : if (unlikely(pfn > highest_memmap_pfn))
692 : return NULL;
693 :
694 : /*
695 : * NOTE! We still have PageReserved() pages in the page tables.
696 : * eg. VDSO mappings can cause them to exist.
697 : */
698 0 : out:
699 0 : return pfn_to_page(pfn);
700 : }
701 : #endif
702 :
703 : /*
704 : * copy one vm_area from one task to the other. Assumes the page tables
705 : * already present in the new task to be cleared in the whole range
706 : * covered by this vma.
707 : */
708 :
709 : static unsigned long
710 0 : copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
711 : pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
712 : unsigned long addr, int *rss)
713 : {
714 0 : unsigned long vm_flags = vma->vm_flags;
715 0 : pte_t pte = *src_pte;
716 0 : struct page *page;
717 0 : swp_entry_t entry = pte_to_swp_entry(pte);
718 :
719 0 : if (likely(!non_swap_entry(entry))) {
720 0 : if (swap_duplicate(entry) < 0)
721 : return entry.val;
722 :
723 : /* make sure dst_mm is on swapoff's mmlist. */
724 0 : if (unlikely(list_empty(&dst_mm->mmlist))) {
725 0 : spin_lock(&mmlist_lock);
726 0 : if (list_empty(&dst_mm->mmlist))
727 0 : list_add(&dst_mm->mmlist,
728 : &src_mm->mmlist);
729 0 : spin_unlock(&mmlist_lock);
730 : }
731 0 : rss[MM_SWAPENTS]++;
732 0 : } else if (is_migration_entry(entry)) {
733 0 : page = migration_entry_to_page(entry);
734 :
735 0 : rss[mm_counter(page)]++;
736 :
737 0 : if (is_write_migration_entry(entry) &&
738 0 : is_cow_mapping(vm_flags)) {
739 : /*
740 : * COW mappings require pages in both
741 : * parent and child to be set to read.
742 : */
743 0 : make_migration_entry_read(&entry);
744 0 : pte = swp_entry_to_pte(entry);
745 0 : if (pte_swp_soft_dirty(*src_pte))
746 : pte = pte_swp_mksoft_dirty(pte);
747 0 : if (pte_swp_uffd_wp(*src_pte))
748 : pte = pte_swp_mkuffd_wp(pte);
749 0 : set_pte_at(src_mm, addr, src_pte, pte);
750 : }
751 : } else if (is_device_private_entry(entry)) {
752 : page = device_private_entry_to_page(entry);
753 :
754 : /*
755 : * Update rss count even for unaddressable pages, as
756 : * they should be treated just like normal pages in this
757 : * respect.
758 : *
759 : * We will likely want to have some new rss counters
760 : * for unaddressable pages, at some point. But for now
761 : * keep things as they are.
762 : */
763 : get_page(page);
764 : rss[mm_counter(page)]++;
765 : page_dup_rmap(page, false);
766 :
767 : /*
768 : * We do not preserve soft-dirty information, because so
769 : * far, checkpoint/restore is the only feature that
770 : * requires that. And checkpoint/restore does not work
771 : * when a device driver is involved (you cannot easily
772 : * save and restore device driver state).
773 : */
774 : if (is_write_device_private_entry(entry) &&
775 : is_cow_mapping(vm_flags)) {
776 : make_device_private_entry_read(&entry);
777 : pte = swp_entry_to_pte(entry);
778 : if (pte_swp_uffd_wp(*src_pte))
779 : pte = pte_swp_mkuffd_wp(pte);
780 : set_pte_at(src_mm, addr, src_pte, pte);
781 : }
782 : }
783 0 : set_pte_at(dst_mm, addr, dst_pte, pte);
784 0 : return 0;
785 : }
786 :
787 : /*
788 : * Copy a present and normal page if necessary.
789 : *
790 : * NOTE! The usual case is that this doesn't need to do
791 : * anything, and can just return a positive value. That
792 : * will let the caller know that it can just increase
793 : * the page refcount and re-use the pte the traditional
794 : * way.
795 : *
796 : * But _if_ we need to copy it because it needs to be
797 : * pinned in the parent (and the child should get its own
798 : * copy rather than just a reference to the same page),
799 : * we'll do that here and return zero to let the caller
800 : * know we're done.
801 : *
802 : * And if we need a pre-allocated page but don't yet have
803 : * one, return a negative error to let the preallocation
804 : * code know so that it can do so outside the page table
805 : * lock.
806 : */
807 : static inline int
808 81815 : copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
809 : pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
810 : struct page **prealloc, pte_t pte, struct page *page)
811 : {
812 81815 : struct page *new_page;
813 :
814 : /*
815 : * What we want to do is to check whether this page may
816 : * have been pinned by the parent process. If so,
817 : * instead of wrprotect the pte on both sides, we copy
818 : * the page immediately so that we'll always guarantee
819 : * the pinned page won't be randomly replaced in the
820 : * future.
821 : *
822 : * The page pinning checks are just "has this mm ever
823 : * seen pinning", along with the (inexact) check of
824 : * the page count. That might give false positives for
825 : * pinning, but it will work correctly.
826 : */
827 81815 : if (likely(!page_needs_cow_for_dma(src_vma, page)))
828 : return 1;
829 :
830 0 : new_page = *prealloc;
831 0 : if (!new_page)
832 : return -EAGAIN;
833 :
834 : /*
835 : * We have a prealloc page, all good! Take it
836 : * over and copy the page & arm it.
837 : */
838 0 : *prealloc = NULL;
839 0 : copy_user_highpage(new_page, page, addr, src_vma);
840 0 : __SetPageUptodate(new_page);
841 0 : page_add_new_anon_rmap(new_page, dst_vma, addr, false);
842 0 : lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
843 0 : rss[mm_counter(new_page)]++;
844 :
845 : /* All done, just insert the new page copy in the child */
846 0 : pte = mk_pte(new_page, dst_vma->vm_page_prot);
847 0 : pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
848 0 : set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
849 0 : return 0;
850 : }
851 :
852 : /*
853 : * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page
854 : * is required to copy this pte.
855 : */
856 : static inline int
857 82657 : copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
858 : pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
859 : struct page **prealloc)
860 : {
861 82657 : struct mm_struct *src_mm = src_vma->vm_mm;
862 82657 : unsigned long vm_flags = src_vma->vm_flags;
863 82657 : pte_t pte = *src_pte;
864 82657 : struct page *page;
865 :
866 82657 : page = vm_normal_page(src_vma, addr, pte);
867 82657 : if (page) {
868 81815 : int retval;
869 :
870 81815 : retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
871 : addr, rss, prealloc, pte, page);
872 81814 : if (retval <= 0)
873 : return retval;
874 :
875 81814 : get_page(page);
876 81817 : page_dup_rmap(page, false);
877 81816 : rss[mm_counter(page)]++;
878 : }
879 :
880 : /*
881 : * If it's a COW mapping, write protect it both
882 : * in the parent and the child
883 : */
884 82659 : if (is_cow_mapping(vm_flags) && pte_write(pte)) {
885 26081 : ptep_set_wrprotect(src_mm, addr, src_pte);
886 26081 : pte = pte_wrprotect(pte);
887 : }
888 :
889 : /*
890 : * If it's a shared mapping, mark it clean in
891 : * the child
892 : */
893 82659 : if (vm_flags & VM_SHARED)
894 0 : pte = pte_mkclean(pte);
895 82659 : pte = pte_mkold(pte);
896 :
897 : /*
898 : * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA
899 : * does not have the VM_UFFD_WP, which means that the uffd
900 : * fork event is not enabled.
901 : */
902 82659 : if (!(vm_flags & VM_UFFD_WP))
903 82659 : pte = pte_clear_uffd_wp(pte);
904 :
905 82659 : set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
906 82659 : return 0;
907 : }
908 :
909 : static inline struct page *
910 0 : page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
911 : unsigned long addr)
912 : {
913 0 : struct page *new_page;
914 :
915 0 : new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
916 0 : if (!new_page)
917 0 : return NULL;
918 :
919 0 : if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) {
920 : put_page(new_page);
921 : return NULL;
922 : }
923 0 : cgroup_throttle_swaprate(new_page, GFP_KERNEL);
924 :
925 : return new_page;
926 : }
927 :
928 : static int
929 28261 : copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
930 : pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
931 : unsigned long end)
932 : {
933 28261 : struct mm_struct *dst_mm = dst_vma->vm_mm;
934 28261 : struct mm_struct *src_mm = src_vma->vm_mm;
935 28261 : pte_t *orig_src_pte, *orig_dst_pte;
936 28261 : pte_t *src_pte, *dst_pte;
937 28261 : spinlock_t *src_ptl, *dst_ptl;
938 28261 : int progress, ret = 0;
939 28261 : int rss[NR_MM_COUNTERS];
940 28261 : swp_entry_t entry = (swp_entry_t){0};
941 28261 : struct page *prealloc = NULL;
942 :
943 28334 : again:
944 28334 : progress = 0;
945 28334 : init_rss_vec(rss);
946 :
947 56668 : dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
948 28334 : if (!dst_pte) {
949 0 : ret = -ENOMEM;
950 0 : goto out;
951 : }
952 28334 : src_pte = pte_offset_map(src_pmd, addr);
953 28334 : src_ptl = pte_lockptr(src_mm, src_pmd);
954 28334 : spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
955 28334 : orig_src_pte = src_pte;
956 28334 : orig_dst_pte = dst_pte;
957 191119 : arch_enter_lazy_mmu_mode();
958 :
959 191119 : do {
960 : /*
961 : * We are holding two locks at this point - either of them
962 : * could generate latencies in another task on another CPU.
963 : */
964 191119 : if (progress >= 32) {
965 12691 : progress = 0;
966 12691 : if (need_resched() ||
967 191046 : spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
968 : break;
969 : }
970 191046 : if (pte_none(*src_pte)) {
971 108387 : progress++;
972 108387 : continue;
973 : }
974 82659 : if (unlikely(!pte_present(*src_pte))) {
975 0 : entry.val = copy_nonpresent_pte(dst_mm, src_mm,
976 : dst_pte, src_pte,
977 : src_vma, addr, rss);
978 0 : if (entry.val)
979 : break;
980 0 : progress += 8;
981 0 : continue;
982 : }
983 : /* copy_present_pte() will clear `*prealloc' if consumed */
984 82659 : ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
985 : addr, rss, &prealloc);
986 : /*
987 : * If we need a pre-allocated page for this pte, drop the
988 : * locks, allocate, and try again.
989 : */
990 82659 : if (unlikely(ret == -EAGAIN))
991 : break;
992 82659 : if (unlikely(prealloc)) {
993 : /*
994 : * The preallocated page cannot be reused for the next pte, so
995 : * that mempolicy is strictly followed (alloc_page_vma() allocates
996 : * the page according to the address). This can only happen if a
997 : * pinned pte changed.
998 : */
999 0 : put_page(prealloc);
1000 0 : prealloc = NULL;
1001 : }
1002 82659 : progress += 8;
1003 191046 : } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
1004 :
1005 28334 : arch_leave_lazy_mmu_mode();
1006 28334 : spin_unlock(src_ptl);
1007 28334 : pte_unmap(orig_src_pte);
1008 28334 : add_mm_rss_vec(dst_mm, rss);
1009 28334 : pte_unmap_unlock(orig_dst_pte, dst_ptl);
1010 28334 : cond_resched();
1011 :
1012 28334 : if (entry.val) {
1013 28334 : if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
1014 : ret = -ENOMEM;
1015 : goto out;
1016 : }
1017 28334 : entry.val = 0;
1018 28334 : } else if (ret) {
1019 0 : WARN_ON_ONCE(ret != -EAGAIN);
1020 0 : prealloc = page_copy_prealloc(src_mm, src_vma, addr);
1021 0 : if (!prealloc)
1022 : return -ENOMEM;
1023 : /* We've captured and resolved the error. Reset, try again. */
1024 : ret = 0;
1025 : }
1026 28334 : if (addr != end)
1027 73 : goto again;
1028 28261 : out:
1029 28261 : if (unlikely(prealloc))
1030 0 : put_page(prealloc);
1031 : return ret;
1032 : }
1033 :
1034 : static inline int
1035 28031 : copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1036 : pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
1037 : unsigned long end)
1038 : {
1039 28031 : struct mm_struct *dst_mm = dst_vma->vm_mm;
1040 28031 : struct mm_struct *src_mm = src_vma->vm_mm;
1041 28031 : pmd_t *src_pmd, *dst_pmd;
1042 28031 : unsigned long next;
1043 :
1044 28031 : dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
1045 28031 : if (!dst_pmd)
1046 : return -ENOMEM;
1047 56062 : src_pmd = pmd_offset(src_pud, addr);
1048 28360 : do {
1049 28360 : next = pmd_addr_end(addr, end);
1050 28360 : if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
1051 28360 : || pmd_devmap(*src_pmd)) {
1052 0 : int err;
1053 0 : VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
1054 0 : err = copy_huge_pmd(dst_mm, src_mm,
1055 : dst_pmd, src_pmd, addr, src_vma);
1056 0 : if (err == -ENOMEM)
1057 : return -ENOMEM;
1058 0 : if (!err)
1059 0 : continue;
1060 : /* fall through */
1061 : }
1062 28360 : if (pmd_none_or_clear_bad(src_pmd))
1063 99 : continue;
1064 28261 : if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
1065 : addr, next))
1066 : return -ENOMEM;
1067 28360 : } while (dst_pmd++, src_pmd++, addr = next, addr != end);
1068 : return 0;
1069 : }
1070 :
1071 : static inline int
1072 28029 : copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1073 : p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr,
1074 : unsigned long end)
1075 : {
1076 28029 : struct mm_struct *dst_mm = dst_vma->vm_mm;
1077 28029 : struct mm_struct *src_mm = src_vma->vm_mm;
1078 28029 : pud_t *src_pud, *dst_pud;
1079 28029 : unsigned long next;
1080 :
1081 28029 : dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
1082 28030 : if (!dst_pud)
1083 : return -ENOMEM;
1084 28030 : src_pud = pud_offset(src_p4d, addr);
1085 28030 : do {
1086 28030 : next = pud_addr_end(addr, end);
1087 28030 : if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
1088 0 : int err;
1089 :
1090 0 : VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
1091 0 : err = copy_huge_pud(dst_mm, src_mm,
1092 : dst_pud, src_pud, addr, src_vma);
1093 0 : if (err == -ENOMEM)
1094 : return -ENOMEM;
1095 0 : if (!err)
1096 0 : continue;
1097 : /* fall through */
1098 : }
1099 28030 : if (pud_none_or_clear_bad(src_pud))
1100 0 : continue;
1101 28031 : if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
1102 : addr, next))
1103 : return -ENOMEM;
1104 28031 : } while (dst_pud++, src_pud++, addr = next, addr != end);
1105 : return 0;
1106 : }
1107 :
1108 : static inline int
1109 28029 : copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1110 : pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr,
1111 : unsigned long end)
1112 : {
1113 28029 : struct mm_struct *dst_mm = dst_vma->vm_mm;
1114 28029 : p4d_t *src_p4d, *dst_p4d;
1115 28029 : unsigned long next;
1116 :
1117 28029 : dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
1118 28029 : if (!dst_p4d)
1119 : return -ENOMEM;
1120 28029 : src_p4d = p4d_offset(src_pgd, addr);
1121 28029 : do {
1122 28029 : next = p4d_addr_end(addr, end);
1123 28029 : if (p4d_none_or_clear_bad(src_p4d))
1124 0 : continue;
1125 28029 : if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d,
1126 : addr, next))
1127 : return -ENOMEM;
1128 28031 : } while (dst_p4d++, src_p4d++, addr = next, addr != end);
1129 28031 : return 0;
1130 : }
1131 :
1132 : int
1133 59208 : copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
1134 : {
1135 59208 : pgd_t *src_pgd, *dst_pgd;
1136 59208 : unsigned long next;
1137 59208 : unsigned long addr = src_vma->vm_start;
1138 59208 : unsigned long end = src_vma->vm_end;
1139 59208 : struct mm_struct *dst_mm = dst_vma->vm_mm;
1140 59208 : struct mm_struct *src_mm = src_vma->vm_mm;
1141 59208 : struct mmu_notifier_range range;
1142 59208 : bool is_cow;
1143 59208 : int ret;
1144 :
1145 : /*
1146 : * Don't copy ptes where a page fault will fill them correctly.
1147 : * Fork becomes much lighter when there are big shared or private
1148 : * readonly mappings. The tradeoff is that copy_page_range is more
1149 : * efficient than faulting.
1150 : */
1151 59208 : if (!(src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
1152 57971 : !src_vma->anon_vma)
1153 : return 0;
1154 :
1155 28030 : if (is_vm_hugetlb_page(src_vma))
1156 : return copy_hugetlb_page_range(dst_mm, src_mm, src_vma);
1157 :
1158 28030 : if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
1159 : /*
1160 : * We do not free on error cases below as remove_vma
1161 : * gets called on error from the higher level routine
1162 : */
1163 1237 : ret = track_pfn_copy(src_vma);
1164 1237 : if (ret)
1165 : return ret;
1166 : }
1167 :
1168 : /*
1169 : * We need to invalidate the secondary MMU mappings only when
1170 : * there could be a permission downgrade on the ptes of the
1171 : * parent mm. And a permission downgrade will only happen if
1172 : * is_cow_mapping() returns true.
1173 : */
1174 28030 : is_cow = is_cow_mapping(src_vma->vm_flags);
1175 :
1176 28030 : if (is_cow) {
1177 26793 : mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
1178 : 0, src_vma, src_mm, addr, end);
1179 26793 : mmu_notifier_invalidate_range_start(&range);
1180 : /*
1181 : * Disabling preemption is not needed for the write side, as
1182 : * the read side doesn't spin, but goes to the mmap_lock.
1183 : *
1184 : * Use the raw variant of the seqcount_t write API to avoid
1185 : * lockdep complaining about preemptibility.
1186 : */
1187 26793 : mmap_assert_write_locked(src_mm);
1188 26793 : raw_write_seqcount_begin(&src_mm->write_protect_seq);
1189 : }
1190 :
1191 28030 : ret = 0;
1192 28030 : dst_pgd = pgd_offset(dst_mm, addr);
1193 28030 : src_pgd = pgd_offset(src_mm, addr);
1194 28030 : do {
1195 28030 : next = pgd_addr_end(addr, end);
1196 28030 : if (pgd_none_or_clear_bad(src_pgd))
1197 : continue;
1198 28030 : if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
1199 : addr, next))) {
1200 : ret = -ENOMEM;
1201 : break;
1202 : }
1203 28031 : } while (dst_pgd++, src_pgd++, addr = next, addr != end);
1204 :
1205 28031 : if (is_cow) {
1206 26794 : raw_write_seqcount_end(&src_mm->write_protect_seq);
1207 26794 : mmu_notifier_invalidate_range_end(&range);
1208 : }
1209 : return ret;
1210 : }
1211 :
1212 117289 : static unsigned long zap_pte_range(struct mmu_gather *tlb,
1213 : struct vm_area_struct *vma, pmd_t *pmd,
1214 : unsigned long addr, unsigned long end,
1215 : struct zap_details *details)
1216 : {
1217 117289 : struct mm_struct *mm = tlb->mm;
1218 117289 : int force_flush = 0;
1219 117289 : int rss[NR_MM_COUNTERS];
1220 117289 : spinlock_t *ptl;
1221 117289 : pte_t *start_pte;
1222 117289 : pte_t *pte;
1223 117289 : swp_entry_t entry;
1224 :
1225 117289 : tlb_change_page_size(tlb, PAGE_SIZE);
1226 117342 : again:
1227 117342 : init_rss_vec(rss);
1228 234683 : start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
1229 117341 : pte = start_pte;
1230 117341 : flush_tlb_batched_pending(mm);
1231 3556751 : arch_enter_lazy_mmu_mode();
1232 3556751 : do {
1233 3556751 : pte_t ptent = *pte;
1234 3556751 : if (pte_none(ptent))
1235 2663773 : continue;
1236 :
1237 892978 : if (need_resched())
1238 : break;
1239 :
1240 892813 : if (pte_present(ptent)) {
1241 892813 : struct page *page;
1242 :
1243 892813 : page = vm_normal_page(vma, addr, ptent);
1244 892932 : if (unlikely(details) && page) {
1245 : /*
1246 : * unmap_shared_mapping_pages() wants to
1247 : * invalidate cache without truncating:
1248 : * unmap shared but keep private pages.
1249 : */
1250 0 : if (details->check_mapping &&
1251 0 : details->check_mapping != page_rmapping(page))
1252 0 : continue;
1253 : }
1254 1785864 : ptent = ptep_get_and_clear_full(mm, addr, pte,
1255 892932 : tlb->fullmm);
1256 892932 : tlb_remove_tlb_entry(tlb, pte, addr);
1257 892932 : if (unlikely(!page))
1258 14323 : continue;
1259 :
1260 878609 : if (!PageAnon(page)) {
1261 749783 : if (pte_dirty(ptent)) {
1262 239 : force_flush = 1;
1263 239 : set_page_dirty(page);
1264 : }
1265 749783 : if (pte_young(ptent) &&
1266 748877 : likely(!(vma->vm_flags & VM_SEQ_READ)))
1267 748877 : mark_page_accessed(page);
1268 : }
1269 878441 : rss[mm_counter(page)]--;
1270 878285 : page_remove_rmap(page, false);
1271 878704 : if (unlikely(page_mapcount(page) < 0))
1272 0 : print_bad_pte(vma, addr, ptent, page);
1273 878584 : if (unlikely(__tlb_remove_page(tlb, page))) {
1274 : force_flush = 1;
1275 : addr += PAGE_SIZE;
1276 : break;
1277 : }
1278 878599 : continue;
1279 : }
1280 :
1281 0 : entry = pte_to_swp_entry(ptent);
1282 0 : if (is_device_private_entry(entry)) {
1283 : struct page *page = device_private_entry_to_page(entry);
1284 :
1285 : if (unlikely(details && details->check_mapping)) {
1286 : /*
1287 : * unmap_shared_mapping_pages() wants to
1288 : * invalidate cache without truncating:
1289 : * unmap shared but keep private pages.
1290 : */
1291 : if (details->check_mapping !=
1292 : page_rmapping(page))
1293 : continue;
1294 : }
1295 :
1296 : pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1297 : rss[mm_counter(page)]--;
1298 : page_remove_rmap(page, false);
1299 : put_page(page);
1300 : continue;
1301 : }
1302 :
1303 : /* If details->check_mapping, we leave swap entries. */
1304 0 : if (unlikely(details))
1305 0 : continue;
1306 :
1307 0 : if (!non_swap_entry(entry))
1308 0 : rss[MM_SWAPENTS]--;
1309 0 : else if (is_migration_entry(entry)) {
1310 0 : struct page *page;
1311 :
1312 0 : page = migration_entry_to_page(entry);
1313 0 : rss[mm_counter(page)]--;
1314 : }
1315 0 : if (unlikely(!free_swap_and_cache(entry)))
1316 0 : print_bad_pte(vma, addr, ptent, NULL);
1317 0 : pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1318 3556695 : } while (pte++, addr += PAGE_SIZE, addr != end);
1319 :
1320 117338 : add_mm_rss_vec(mm, rss);
1321 117335 : arch_leave_lazy_mmu_mode();
1322 :
1323 : /* Do the actual TLB flush before dropping ptl */
1324 117335 : if (force_flush)
1325 70 : tlb_flush_mmu_tlbonly(tlb);
1326 117335 : pte_unmap_unlock(start_pte, ptl);
1327 :
1328 : /*
1329 : * If we forced a TLB flush (either due to running out of
1330 : * batch buffers or because we needed to flush dirty TLB
1331 : * entries before releasing the ptl), free the batched
1332 : * memory too. Restart if we didn't do everything.
1333 : */
1334 117343 : if (force_flush) {
1335 70 : force_flush = 0;
1336 70 : tlb_flush_mmu(tlb);
1337 : }
1338 :
1339 117343 : if (addr != end) {
1340 53 : cond_resched();
1341 53 : goto again;
1342 : }
1343 :
1344 117290 : return addr;
1345 : }
1346 :
1347 112006 : static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1348 : struct vm_area_struct *vma, pud_t *pud,
1349 : unsigned long addr, unsigned long end,
1350 : struct zap_details *details)
1351 : {
1352 112006 : pmd_t *pmd;
1353 112006 : unsigned long next;
1354 :
1355 224013 : pmd = pmd_offset(pud, addr);
1356 120148 : do {
1357 120148 : next = pmd_addr_end(addr, end);
1358 120148 : if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
1359 17 : if (next - addr != HPAGE_PMD_SIZE)
1360 0 : __split_huge_pmd(vma, pmd, addr, false, NULL);
1361 17 : else if (zap_huge_pmd(tlb, vma, pmd, addr))
1362 17 : goto next;
1363 : /* fall through */
1364 : }
1365 : /*
1366 : * Here there can be other concurrent MADV_DONTNEED or
1367 : * trans huge page faults running, and if the pmd is
1368 : * none or trans huge it can change under us. This is
1369 : * because MADV_DONTNEED holds the mmap_lock in read
1370 : * mode.
1371 : */
1372 120131 : if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1373 2842 : goto next;
1374 117289 : next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1375 120149 : next:
1376 120149 : cond_resched();
1377 120144 : } while (pmd++, addr = next, addr != end);
1378 :
1379 112002 : return addr;
1380 : }
1381 :
1382 112466 : static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1383 : struct vm_area_struct *vma, p4d_t *p4d,
1384 : unsigned long addr, unsigned long end,
1385 : struct zap_details *details)
1386 : {
1387 112466 : pud_t *pud;
1388 112466 : unsigned long next;
1389 :
1390 112466 : pud = pud_offset(p4d, addr);
1391 112475 : do {
1392 112475 : next = pud_addr_end(addr, end);
1393 112475 : if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
1394 0 : if (next - addr != HPAGE_PUD_SIZE) {
1395 0 : mmap_assert_locked(tlb->mm);
1396 0 : split_huge_pud(vma, pud, addr);
1397 0 : } else if (zap_huge_pud(tlb, vma, pud, addr))
1398 0 : goto next;
1399 : /* fall through */
1400 : }
1401 112475 : if (pud_none_or_clear_bad(pud))
1402 469 : continue;
1403 112007 : next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1404 112002 : next:
1405 112002 : cond_resched();
1406 112473 : } while (pud++, addr = next, addr != end);
1407 :
1408 112464 : return addr;
1409 : }
1410 :
1411 113858 : static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
1412 : struct vm_area_struct *vma, pgd_t *pgd,
1413 : unsigned long addr, unsigned long end,
1414 : struct zap_details *details)
1415 : {
1416 113858 : p4d_t *p4d;
1417 113858 : unsigned long next;
1418 :
1419 113858 : p4d = p4d_offset(pgd, addr);
1420 113858 : do {
1421 113858 : next = p4d_addr_end(addr, end);
1422 113858 : if (p4d_none_or_clear_bad(p4d))
1423 1392 : continue;
1424 112466 : next = zap_pud_range(tlb, vma, p4d, addr, next, details);
1425 113855 : } while (p4d++, addr = next, addr != end);
1426 :
1427 113855 : return addr;
1428 : }
1429 :
1430 113858 : void unmap_page_range(struct mmu_gather *tlb,
1431 : struct vm_area_struct *vma,
1432 : unsigned long addr, unsigned long end,
1433 : struct zap_details *details)
1434 : {
1435 113858 : pgd_t *pgd;
1436 113858 : unsigned long next;
1437 :
1438 113858 : BUG_ON(addr >= end);
1439 113858 : tlb_start_vma(tlb, vma);
1440 113858 : pgd = pgd_offset(vma->vm_mm, addr);
1441 113858 : do {
1442 113858 : next = pgd_addr_end(addr, end);
1443 113858 : if (pgd_none_or_clear_bad(pgd))
1444 : continue;
1445 113858 : next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
1446 113854 : } while (pgd++, addr = next, addr != end);
1447 113854 : tlb_end_vma(tlb, vma);
1448 113854 : }
1449 :
1450 :
1451 113856 : static void unmap_single_vma(struct mmu_gather *tlb,
1452 : struct vm_area_struct *vma, unsigned long start_addr,
1453 : unsigned long end_addr,
1454 : struct zap_details *details)
1455 : {
1456 113856 : unsigned long start = max(vma->vm_start, start_addr);
1457 113856 : unsigned long end;
1458 :
1459 113856 : if (start >= vma->vm_end)
1460 : return;
1461 113856 : end = min(vma->vm_end, end_addr);
1462 113856 : if (end <= vma->vm_start)
1463 : return;
1464 :
1465 113856 : if (vma->vm_file)
1466 113856 : uprobe_munmap(vma, start, end);
1467 :
1468 113856 : if (unlikely(vma->vm_flags & VM_PFNMAP))
1469 2146 : untrack_pfn(vma, 0, 0);
1470 :
1471 113856 : if (start != end) {
1472 113857 : if (unlikely(is_vm_hugetlb_page(vma))) {
1473 : /*
1474 : * It is undesirable to test vma->vm_file as it
1475 : * should be non-null for valid hugetlb area.
1476 : * However, vm_file will be NULL in the error
1477 : * cleanup path of mmap_region. When
1478 : * hugetlbfs ->mmap method fails,
1479 : * mmap_region() nullifies vma->vm_file
1480 : * before calling this function to clean up.
1481 : * Since no pte has actually been setup, it is
1482 : * safe to do nothing in this case.
1483 : */
1484 : if (vma->vm_file) {
1485 : i_mmap_lock_write(vma->vm_file->f_mapping);
1486 : __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1487 : i_mmap_unlock_write(vma->vm_file->f_mapping);
1488 : }
1489 : } else
1490 113857 : unmap_page_range(tlb, vma, start, end, details);
1491 : }
1492 : }
1493 :
1494 : /**
1495 : * unmap_vmas - unmap a range of memory covered by a list of vma's
1496 : * @tlb: address of the caller's struct mmu_gather
1497 : * @vma: the starting vma
1498 : * @start_addr: virtual address at which to start unmapping
1499 : * @end_addr: virtual address at which to end unmapping
1500 : *
1501 : * Unmap all pages in the vma list.
1502 : *
1503 : * Only addresses between `start' and `end' will be unmapped.
1504 : *
1505 : * The VMA list must be sorted in ascending virtual address order.
1506 : *
1507 : * unmap_vmas() assumes that the caller will flush the whole unmapped address
1508 : * range after unmap_vmas() returns. So the only responsibility here is to
1509 : * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
1510 : * drops the lock and schedules.
1511 : */
1512 20432 : void unmap_vmas(struct mmu_gather *tlb,
1513 : struct vm_area_struct *vma, unsigned long start_addr,
1514 : unsigned long end_addr)
1515 : {
1516 20432 : struct mmu_notifier_range range;
1517 :
1518 20432 : mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
1519 : start_addr, end_addr);
1520 134283 : mmu_notifier_invalidate_range_start(&range);
1521 134283 : for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1522 113849 : unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
1523 20434 : mmu_notifier_invalidate_range_end(&range);
1524 20434 : }
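/*
 * A minimal sketch of the typical caller sequence (roughly what
 * unmap_region() in mm/mmap.c does); this helper is hypothetical and only
 * illustrates the ordering: drop the pages first, then the page tables,
 * then flush and free the gathered batches.
 */
static inline void example_teardown_range(struct mm_struct *mm,
					  struct vm_area_struct *vma,
					  unsigned long start,
					  unsigned long end)
{
	struct mmu_gather tlb;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	update_hiwater_rss(mm);
	unmap_vmas(&tlb, vma, start, end);		/* unmap the pages */
	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS,	/* then the page tables */
		      USER_PGTABLES_CEILING);
	tlb_finish_mmu(&tlb);				/* flush TLB, free batches */
}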
1525 :
1526 : /**
1527 : * zap_page_range - remove user pages in a given range
1528 : * @vma: vm_area_struct holding the applicable pages
1529 : * @start: starting address of pages to zap
1530 : * @size: number of bytes to zap
1531 : *
1532 : * Caller must protect the VMA list
1533 : */
1534 4 : void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1535 : unsigned long size)
1536 : {
1537 4 : struct mmu_notifier_range range;
1538 4 : struct mmu_gather tlb;
1539 :
1540 4 : lru_add_drain();
1541 4 : mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1542 : start, start + size);
1543 4 : tlb_gather_mmu(&tlb, vma->vm_mm);
1544 4 : update_hiwater_rss(vma->vm_mm);
1545 4 : mmu_notifier_invalidate_range_start(&range);
1546 8 : for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
1547 4 : unmap_single_vma(&tlb, vma, start, range.end, NULL);
1548 4 : mmu_notifier_invalidate_range_end(&range);
1549 4 : tlb_finish_mmu(&tlb);
1550 4 : }
1551 :
1552 : /**
1553 : * zap_page_range_single - remove user pages in a given range
1554 : * @vma: vm_area_struct holding the applicable pages
1555 : * @address: starting address of pages to zap
1556 : * @size: number of bytes to zap
1557 : * @details: details of shared cache invalidation
1558 : *
1559 : * The range must fit into one VMA.
1560 : */
1561 0 : static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1562 : unsigned long size, struct zap_details *details)
1563 : {
1564 0 : struct mmu_notifier_range range;
1565 0 : struct mmu_gather tlb;
1566 :
1567 0 : lru_add_drain();
1568 0 : mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1569 : address, address + size);
1570 0 : tlb_gather_mmu(&tlb, vma->vm_mm);
1571 0 : update_hiwater_rss(vma->vm_mm);
1572 0 : mmu_notifier_invalidate_range_start(&range);
1573 0 : unmap_single_vma(&tlb, vma, address, range.end, details);
1574 0 : mmu_notifier_invalidate_range_end(&range);
1575 0 : tlb_finish_mmu(&tlb);
1576 0 : }
1577 :
1578 : /**
1579 : * zap_vma_ptes - remove ptes mapping the vma
1580 : * @vma: vm_area_struct holding ptes to be zapped
1581 : * @address: starting address of pages to zap
1582 : * @size: number of bytes to zap
1583 : *
1584 : * This function only unmaps ptes assigned to VM_PFNMAP vmas.
1585 : *
1586 : * The entire address range must be fully contained within the vma.
1587 : *
1588 : */
1589 0 : void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1590 : unsigned long size)
1591 : {
1592 0 : if (address < vma->vm_start || address + size > vma->vm_end ||
1593 0 : !(vma->vm_flags & VM_PFNMAP))
1594 : return;
1595 :
1596 0 : zap_page_range_single(vma, address, size, NULL);
1597 : }
1598 : EXPORT_SYMBOL_GPL(zap_vma_ptes);
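/*
 * A minimal usage sketch (hypothetical driver code, not from this file):
 * a driver that established a VM_PFNMAP mapping with remap_pfn_range()
 * can revoke it later by zapping the ptes; subsequent accesses then fault
 * back into the driver (or SIGBUS if no fault handler re-establishes the
 * mapping).
 */
static inline void example_revoke_pfnmap(struct vm_area_struct *vma)
{
	zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
}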
1599 :
1600 250 : static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
1601 : {
1602 250 : pgd_t *pgd;
1603 250 : p4d_t *p4d;
1604 250 : pud_t *pud;
1605 250 : pmd_t *pmd;
1606 :
1607 250 : pgd = pgd_offset(mm, addr);
1608 250 : p4d = p4d_alloc(mm, pgd, addr);
1609 250 : if (!p4d)
1610 : return NULL;
1611 250 : pud = pud_alloc(mm, p4d, addr);
1612 250 : if (!pud)
1613 : return NULL;
1614 250 : pmd = pmd_alloc(mm, pud, addr);
1615 250 : if (!pmd)
1616 : return NULL;
1617 :
1618 250 : VM_BUG_ON(pmd_trans_huge(*pmd));
1619 : return pmd;
1620 : }
1621 :
1622 250 : pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1623 : spinlock_t **ptl)
1624 : {
1625 250 : pmd_t *pmd = walk_to_pmd(mm, addr);
1626 :
1627 250 : if (!pmd)
1628 : return NULL;
1629 500 : return pte_alloc_map_lock(mm, pmd, addr, ptl);
1630 : }
1631 :
1632 0 : static int validate_page_before_insert(struct page *page)
1633 : {
1634 0 : if (PageAnon(page) || PageSlab(page) || page_has_type(page))
1635 0 : return -EINVAL;
1636 0 : flush_dcache_page(page);
1637 : return 0;
1638 : }
1639 :
1640 0 : static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
1641 : unsigned long addr, struct page *page, pgprot_t prot)
1642 : {
1643 0 : if (!pte_none(*pte))
1644 : return -EBUSY;
1645 : /* Ok, finally just insert the thing.. */
1646 0 : get_page(page);
1647 0 : inc_mm_counter_fast(mm, mm_counter_file(page));
1648 0 : page_add_file_rmap(page, false);
1649 0 : set_pte_at(mm, addr, pte, mk_pte(page, prot));
1650 0 : return 0;
1651 : }
1652 :
1653 : /*
1654 : * This is the old fallback for page remapping.
1655 : *
1656 : * For historical reasons, it only allows reserved pages. Only
1657 : * old drivers should use this, and they needed to mark their
1658 : * pages reserved for the old functions anyway.
1659 : */
1660 0 : static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1661 : struct page *page, pgprot_t prot)
1662 : {
1663 0 : struct mm_struct *mm = vma->vm_mm;
1664 0 : int retval;
1665 0 : pte_t *pte;
1666 0 : spinlock_t *ptl;
1667 :
1668 0 : retval = validate_page_before_insert(page);
1669 0 : if (retval)
1670 0 : goto out;
1671 0 : retval = -ENOMEM;
1672 0 : pte = get_locked_pte(mm, addr, &ptl);
1673 0 : if (!pte)
1674 0 : goto out;
1675 0 : retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
1676 0 : pte_unmap_unlock(pte, ptl);
1677 0 : out:
1678 0 : return retval;
1679 : }
1680 :
1681 : #ifdef pte_index
1682 : static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
1683 : unsigned long addr, struct page *page, pgprot_t prot)
1684 : {
1685 : int err;
1686 :
1687 : if (!page_count(page))
1688 : return -EINVAL;
1689 : err = validate_page_before_insert(page);
1690 : if (err)
1691 : return err;
1692 : return insert_page_into_pte_locked(mm, pte, addr, page, prot);
1693 : }
1694 :
1695 : /* insert_pages() amortizes the cost of spinlock operations
1696 : * when inserting pages in a loop. Arch *must* define pte_index.
1697 : */
1698 : static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
1699 : struct page **pages, unsigned long *num, pgprot_t prot)
1700 : {
1701 : pmd_t *pmd = NULL;
1702 : pte_t *start_pte, *pte;
1703 : spinlock_t *pte_lock;
1704 : struct mm_struct *const mm = vma->vm_mm;
1705 : unsigned long curr_page_idx = 0;
1706 : unsigned long remaining_pages_total = *num;
1707 : unsigned long pages_to_write_in_pmd;
1708 : int ret;
1709 : more:
1710 : ret = -EFAULT;
1711 : pmd = walk_to_pmd(mm, addr);
1712 : if (!pmd)
1713 : goto out;
1714 :
1715 : pages_to_write_in_pmd = min_t(unsigned long,
1716 : remaining_pages_total, PTRS_PER_PTE - pte_index(addr));
1717 :
1718 : /* Allocate the PTE if necessary; takes PMD lock once only. */
1719 : ret = -ENOMEM;
1720 : if (pte_alloc(mm, pmd))
1721 : goto out;
1722 :
1723 : while (pages_to_write_in_pmd) {
1724 : int pte_idx = 0;
1725 : const int batch_size = min_t(int, pages_to_write_in_pmd, 8);
1726 :
1727 : start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
1728 : for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
1729 : int err = insert_page_in_batch_locked(mm, pte,
1730 : addr, pages[curr_page_idx], prot);
1731 : if (unlikely(err)) {
1732 : pte_unmap_unlock(start_pte, pte_lock);
1733 : ret = err;
1734 : remaining_pages_total -= pte_idx;
1735 : goto out;
1736 : }
1737 : addr += PAGE_SIZE;
1738 : ++curr_page_idx;
1739 : }
1740 : pte_unmap_unlock(start_pte, pte_lock);
1741 : pages_to_write_in_pmd -= batch_size;
1742 : remaining_pages_total -= batch_size;
1743 : }
1744 : if (remaining_pages_total)
1745 : goto more;
1746 : ret = 0;
1747 : out:
1748 : *num = remaining_pages_total;
1749 : return ret;
1750 : }
1751 : #endif /* ifdef pte_index */
1752 :
1753 : /**
1754 : * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
1755 : * @vma: user vma to map to
1756 : * @addr: target start user address of these pages
1757 : * @pages: source kernel pages
1758 : * @num: in: number of pages to map. out: number of pages that were *not*
1759 : * mapped. (0 means all pages were successfully mapped).
1760 : *
1761 : * Preferred over vm_insert_page() when inserting multiple pages.
1762 : *
1763 : * In case of error, we may have mapped a subset of the provided
1764 : * pages. It is the caller's responsibility to account for this case.
1765 : *
1766 : * The same restrictions apply as in vm_insert_page().
1767 : */
1768 0 : int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
1769 : struct page **pages, unsigned long *num)
1770 : {
1771 : #ifdef pte_index
1772 : const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1;
1773 :
1774 : if (addr < vma->vm_start || end_addr >= vma->vm_end)
1775 : return -EFAULT;
1776 : if (!(vma->vm_flags & VM_MIXEDMAP)) {
1777 : BUG_ON(mmap_read_trylock(vma->vm_mm));
1778 : BUG_ON(vma->vm_flags & VM_PFNMAP);
1779 : vma->vm_flags |= VM_MIXEDMAP;
1780 : }
1781 : /* Defer page refcount checking till we're about to map that page. */
1782 : return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
1783 : #else
1784 0 : unsigned long idx = 0, pgcount = *num;
1785 0 : int err = -EINVAL;
1786 :
1787 0 : for (; idx < pgcount; ++idx) {
1788 0 : err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]);
1789 0 : if (err)
1790 : break;
1791 : }
1792 0 : *num = pgcount - idx;
1793 0 : return err;
1794 : #endif /* ifdef pte_index */
1795 : }
1796 : EXPORT_SYMBOL(vm_insert_pages);
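A hedged sketch of the intended batched use, assuming a hypothetical driver that keeps an array of kernel pages (pages, npages) backing one object; on failure *num reports how many pages were left unmapped.

static int my_drv_map_object(struct vm_area_struct *vma,
			     struct page **pages, unsigned long npages)
{
	unsigned long not_mapped = npages;	/* in: to map, out: not mapped */
	int err;

	err = vm_insert_pages(vma, vma->vm_start, pages, &not_mapped);
	if (err)
		pr_warn("mapped only %lu of %lu pages (%d)\n",
			npages - not_mapped, npages, err);
	return err;
}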
1797 :
1798 : /**
1799 : * vm_insert_page - insert single page into user vma
1800 : * @vma: user vma to map to
1801 : * @addr: target user address of this page
1802 : * @page: source kernel page
1803 : *
1804 : * This allows drivers to insert individual pages they've allocated
1805 : * into a user vma.
1806 : *
1807 : * The page has to be a nice clean _individual_ kernel allocation.
1808 : * If you allocate a compound page, you need to have marked it as
1809 : * such (__GFP_COMP), or manually just split the page up yourself
1810 : * (see split_page()).
1811 : *
1812 : * NOTE! Traditionally this was done with "remap_pfn_range()" which
1813 : * took an arbitrary page protection parameter. This doesn't allow
1814 : * that. Your vma protection will have to be set up correctly, which
1815 : * means that if you want a shared writable mapping, you'd better
1816 : * ask for a shared writable mapping!
1817 : *
1818 : * The page does not need to be reserved.
1819 : *
1820 : * Usually this function is called from f_op->mmap() handler
1821 : * under mm->mmap_lock write-lock, so it can change vma->vm_flags.
1822 : * Caller must set VM_MIXEDMAP on vma if it wants to call this
1823 : * function from other places, for example from page-fault handler.
1824 : *
1825 : * Return: %0 on success, negative error code otherwise.
1826 : */
1827 0 : int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1828 : struct page *page)
1829 : {
1830 0 : if (addr < vma->vm_start || addr >= vma->vm_end)
1831 : return -EFAULT;
1832 0 : if (!page_count(page))
1833 : return -EINVAL;
1834 0 : if (!(vma->vm_flags & VM_MIXEDMAP)) {
1835 0 : BUG_ON(mmap_read_trylock(vma->vm_mm));
1836 0 : BUG_ON(vma->vm_flags & VM_PFNMAP);
1837 0 : vma->vm_flags |= VM_MIXEDMAP;
1838 : }
1839 0 : return insert_page(vma, addr, page, vma->vm_page_prot);
1840 : }
1841 : EXPORT_SYMBOL(vm_insert_page);
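A rough illustration of the mmap-handler pattern described above; struct my_drv_buf and its single alloc_page()-backed page are hypothetical driver state.

static int my_drv_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct my_drv_buf *buf = file->private_data;	/* hypothetical */

	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
		return -EINVAL;
	/* called from f_op->mmap under mmap_lock, so vm_flags may be changed */
	return vm_insert_page(vma, vma->vm_start, buf->page);
}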
1842 :
1843 : /*
1844 : * __vm_map_pages - maps a range of kernel pages into a user vma
1845 : * @vma: user vma to map to
1846 : * @pages: pointer to array of source kernel pages
1847 : * @num: number of pages in page array
1848 : * @offset: user's requested vm_pgoff
1849 : *
1850 : * This allows drivers to map a range of kernel pages into a user vma.
1851 : *
1852 : * Return: 0 on success and error code otherwise.
1853 : */
1854 0 : static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
1855 : unsigned long num, unsigned long offset)
1856 : {
1857 0 : unsigned long count = vma_pages(vma);
1858 0 : unsigned long uaddr = vma->vm_start;
1859 0 : int ret, i;
1860 :
1861 : /* Fail if the user requested offset is beyond the end of the object */
1862 0 : if (offset >= num)
1863 : return -ENXIO;
1864 :
1865 : /* Fail if the user requested size exceeds available object size */
1866 0 : if (count > num - offset)
1867 : return -ENXIO;
1868 :
1869 0 : for (i = 0; i < count; i++) {
1870 0 : ret = vm_insert_page(vma, uaddr, pages[offset + i]);
1871 0 : if (ret < 0)
1872 0 : return ret;
1873 0 : uaddr += PAGE_SIZE;
1874 : }
1875 :
1876 : return 0;
1877 : }
1878 :
1879 : /**
1880 : * vm_map_pages - map a range of kernel pages starting at a non-zero offset
1881 : * @vma: user vma to map to
1882 : * @pages: pointer to array of source kernel pages
1883 : * @num: number of pages in page array
1884 : *
1885 : * Maps an object consisting of @num pages, catering for the user's
1886 : * requested vm_pgoff
1887 : *
1888 : * If we fail to insert any page into the vma, the function will return
1889 : * immediately leaving any previously inserted pages present. Callers
1890 : * from the mmap handler may immediately return the error as their caller
1891 : * will destroy the vma, removing any successfully inserted pages. Other
1892 : * callers should make their own arrangements for calling unmap_region().
1893 : *
1894 : * Context: Process context. Called by mmap handlers.
1895 : * Return: 0 on success and error code otherwise.
1896 : */
1897 0 : int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
1898 : unsigned long num)
1899 : {
1900 0 : return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
1901 : }
1902 : EXPORT_SYMBOL(vm_map_pages);
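A sketch of a typical caller, assuming a hypothetical driver object backed by buf->pages[0..buf->npages); vm_map_pages() applies the user's vm_pgoff and bounds-checks the request as described above.

static int my_drv_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct my_drv_buf *buf = file->private_data;	/* hypothetical */

	return vm_map_pages(vma, buf->pages, buf->npages);
}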
1903 :
1904 : /**
1905 : * vm_map_pages_zero - map a range of kernel pages starting at offset 0
1906 : * @vma: user vma to map to
1907 : * @pages: pointer to array of source kernel pages
1908 : * @num: number of pages in page array
1909 : *
1910 : * Similar to vm_map_pages(), except that it explicitly sets the offset
1911 : * to 0. This function is intended for drivers that do not take
1912 : * vm_pgoff into account.
1913 : *
1914 : * Context: Process context. Called by mmap handlers.
1915 : * Return: 0 on success and error code otherwise.
1916 : */
1917 0 : int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
1918 : unsigned long num)
1919 : {
1920 0 : return __vm_map_pages(vma, pages, num, 0);
1921 : }
1922 : EXPORT_SYMBOL(vm_map_pages_zero);
1923 :
1924 133 : static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1925 : pfn_t pfn, pgprot_t prot, bool mkwrite)
1926 : {
1927 133 : struct mm_struct *mm = vma->vm_mm;
1928 133 : pte_t *pte, entry;
1929 133 : spinlock_t *ptl;
1930 :
1931 133 : pte = get_locked_pte(mm, addr, &ptl);
1932 133 : if (!pte)
1933 : return VM_FAULT_OOM;
1934 133 : if (!pte_none(*pte)) {
1935 0 : if (mkwrite) {
1936 : /*
1937 : * For read faults on private mappings the PFN passed
1938 : * in may not match the PFN we have mapped if the
1939 : * mapped PFN is a writeable COW page. In the mkwrite
1940 : * case we are creating a writable PTE for a shared
1941 : * mapping and we expect the PFNs to match. If they
1942 : * don't match, we are likely racing with block
1943 : * allocation and mapping invalidation so just skip the
1944 : * update.
1945 : */
1946 0 : if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
1947 0 : WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
1948 0 : goto out_unlock;
1949 : }
1950 0 : entry = pte_mkyoung(*pte);
1951 0 : entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1952 0 : if (ptep_set_access_flags(vma, addr, pte, entry, 1))
1953 0 : update_mmu_cache(vma, addr, pte);
1954 : }
1955 0 : goto out_unlock;
1956 : }
1957 :
1958 : /* Ok, finally just insert the thing.. */
1959 133 : if (pfn_t_devmap(pfn))
1960 0 : entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
1961 : else
1962 133 : entry = pte_mkspecial(pfn_t_pte(pfn, prot));
1963 :
1964 133 : if (mkwrite) {
1965 0 : entry = pte_mkyoung(entry);
1966 0 : entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1967 : }
1968 :
1969 133 : set_pte_at(mm, addr, pte, entry);
1970 133 : update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
1971 :
1972 133 : out_unlock:
1973 133 : pte_unmap_unlock(pte, ptl);
1974 133 : return VM_FAULT_NOPAGE;
1975 : }
1976 :
1977 : /**
1978 : * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
1979 : * @vma: user vma to map to
1980 : * @addr: target user address of this page
1981 : * @pfn: source kernel pfn
1982 : * @pgprot: pgprot flags for the inserted page
1983 : *
1984 : * This is exactly like vmf_insert_pfn(), except that it allows drivers
1985 : * to override pgprot on a per-page basis.
1986 : *
1987 : * This only makes sense for IO mappings, and it makes no sense for
1988 : * COW mappings. In general, using multiple vmas is preferable;
1989 : * vmf_insert_pfn_prot should only be used if using multiple VMAs is
1990 : * impractical.
1991 : *
1992 : * See vmf_insert_mixed_prot() for a discussion of the implication of using
1993 : * a value of @pgprot different from that of @vma->vm_page_prot.
1994 : *
1995 : * Context: Process context. May allocate using %GFP_KERNEL.
1996 : * Return: vm_fault_t value.
1997 : */
1998 133 : vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
1999 : unsigned long pfn, pgprot_t pgprot)
2000 : {
2001 : /*
2002 : * Technically, architectures with pte_special can avoid all these
2003 : * restrictions (same for remap_pfn_range). However we would like
2004 : * consistency in testing and feature parity among all, so we should
2005 : * try to keep these invariants in place for everybody.
2006 : */
2007 133 : BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
2008 133 : BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
2009 : (VM_PFNMAP|VM_MIXEDMAP));
2010 133 : BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
2011 133 : BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
2012 :
2013 133 : if (addr < vma->vm_start || addr >= vma->vm_end)
2014 : return VM_FAULT_SIGBUS;
2015 :
2016 133 : if (!pfn_modify_allowed(pfn, pgprot))
2017 : return VM_FAULT_SIGBUS;
2018 :
2019 133 : track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
2020 :
2021 133 : return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
2022 : false);
2023 : }
2024 : EXPORT_SYMBOL(vmf_insert_pfn_prot);
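One plausible per-page pgprot override, sketched for a hypothetical fault handler that wants this particular page mapped write-combined; my_drv_buf and base_pfn are assumed driver state.

static vm_fault_t my_drv_fault_wc(struct vm_fault *vmf)
{
	struct my_drv_buf *buf = vmf->vma->vm_private_data;	/* hypothetical */
	unsigned long pfn = buf->base_pfn + vmf->pgoff;

	/* override the vma-wide protections for just this page */
	return vmf_insert_pfn_prot(vmf->vma, vmf->address, pfn,
				   pgprot_writecombine(vmf->vma->vm_page_prot));
}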
2025 :
2026 : /**
2027 : * vmf_insert_pfn - insert single pfn into user vma
2028 : * @vma: user vma to map to
2029 : * @addr: target user address of this page
2030 : * @pfn: source kernel pfn
2031 : *
2032 : * Similar to vm_insert_page, this allows drivers to insert individual pages
2033 : * they've allocated into a user vma. Same comments apply.
2034 : *
2035 : * This function should only be called from a vm_ops->fault handler, and
2036 : * in that case the handler should return the result of this function.
2037 : *
2038 : * vma cannot be a COW mapping.
2039 : *
2040 : * As this is called only for pages that do not currently exist, we
2041 : * do not need to flush old virtual caches or the TLB.
2042 : *
2043 : * Context: Process context. May allocate using %GFP_KERNEL.
2044 : * Return: vm_fault_t value.
2045 : */
2046 76 : vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2047 : unsigned long pfn)
2048 : {
2049 76 : return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
2050 : }
2051 : EXPORT_SYMBOL(vmf_insert_pfn);
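The common shape of a vm_ops->fault handler built on this, again with hypothetical driver state providing the backing PFN; the handler simply returns the vm_fault_t from vmf_insert_pfn().

static vm_fault_t my_drv_fault(struct vm_fault *vmf)
{
	struct my_drv_buf *buf = vmf->vma->vm_private_data;	/* hypothetical */

	return vmf_insert_pfn(vmf->vma, vmf->address,
			      buf->base_pfn + vmf->pgoff);
}

static const struct vm_operations_struct my_drv_vm_ops = {
	.fault = my_drv_fault,
};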
2052 :
2053 0 : static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
2054 : {
2055 : /* these checks mirror the abort conditions in vm_normal_page */
2056 0 : if (vma->vm_flags & VM_MIXEDMAP)
2057 : return true;
2058 0 : if (pfn_t_devmap(pfn))
2059 : return true;
2060 0 : if (pfn_t_special(pfn))
2061 : return true;
2062 0 : if (is_zero_pfn(pfn_t_to_pfn(pfn)))
2063 0 : return true;
2064 : return false;
2065 : }
2066 :
2067 0 : static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
2068 : unsigned long addr, pfn_t pfn, pgprot_t pgprot,
2069 : bool mkwrite)
2070 : {
2071 0 : int err;
2072 :
2073 0 : BUG_ON(!vm_mixed_ok(vma, pfn));
2074 :
2075 0 : if (addr < vma->vm_start || addr >= vma->vm_end)
2076 : return VM_FAULT_SIGBUS;
2077 :
2078 0 : track_pfn_insert(vma, &pgprot, pfn);
2079 :
2080 0 : if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
2081 : return VM_FAULT_SIGBUS;
2082 :
2083 : /*
2084 : * If we don't have pte special, then we have to use the pfn_valid()
2085 : * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
2086 : * refcount the page if pfn_valid is true (hence insert_page rather
2087 : * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
2088 : * without pte special, it would then be refcounted as a normal page.
2089 : */
2090 0 : if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
2091 : !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
2092 : struct page *page;
2093 :
2094 : /*
2095 : * At this point we are committed to insert_page()
2096 : * regardless of whether the caller specified flags that
2097 : * result in pfn_t_has_page() == false.
2098 : */
2099 : page = pfn_to_page(pfn_t_to_pfn(pfn));
2100 : err = insert_page(vma, addr, page, pgprot);
2101 : } else {
2102 0 : return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
2103 : }
2104 :
2105 : if (err == -ENOMEM)
2106 : return VM_FAULT_OOM;
2107 : if (err < 0 && err != -EBUSY)
2108 : return VM_FAULT_SIGBUS;
2109 :
2110 : return VM_FAULT_NOPAGE;
2111 : }
2112 :
2113 : /**
2114 : * vmf_insert_mixed_prot - insert single pfn into user vma with specified pgprot
2115 : * @vma: user vma to map to
2116 : * @addr: target user address of this page
2117 : * @pfn: source kernel pfn
2118 : * @pgprot: pgprot flags for the inserted page
2119 : *
2120 : * This is exactly like vmf_insert_mixed(), except that it allows drivers
2121 : * to override pgprot on a per-page basis.
2122 : *
2123 : * Typically this function should be used by drivers to set caching- and
2124 : * encryption bits different than those of @vma->vm_page_prot, because
2125 : * the caching- or encryption mode may not be known at mmap() time.
2126 : * This is ok as long as @vma->vm_page_prot is not used by the core vm
2127 : * to set caching and encryption bits for those vmas (except for COW pages).
2128 : * This is ensured by core vm only modifying these page table entries using
2129 : * functions that don't touch caching- or encryption bits, using pte_modify()
2130 : * if needed. (See for example mprotect()).
2131 : * Also when new page-table entries are created, this is only done using the
2132 : * fault() callback, and never using the value of vma->vm_page_prot,
2133 : * except for page-table entries that point to anonymous pages as the result
2134 : * of COW.
2135 : *
2136 : * Context: Process context. May allocate using %GFP_KERNEL.
2137 : * Return: vm_fault_t value.
2138 : */
2139 0 : vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
2140 : pfn_t pfn, pgprot_t pgprot)
2141 : {
2142 0 : return __vm_insert_mixed(vma, addr, pfn, pgprot, false);
2143 : }
2144 : EXPORT_SYMBOL(vmf_insert_mixed_prot);
2145 :
2146 0 : vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
2147 : pfn_t pfn)
2148 : {
2149 0 : return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, false);
2150 : }
2151 : EXPORT_SYMBOL(vmf_insert_mixed);
2152 :
2153 : /*
2154 : * If the insertion of PTE failed because someone else already added a
2155 : * different entry in the meantime, we treat that as success as we assume
2156 : * the same entry was actually inserted.
2157 : */
2158 0 : vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
2159 : unsigned long addr, pfn_t pfn)
2160 : {
2161 0 : return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, true);
2162 : }
2163 : EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
2164 :
2165 : /*
2166 : * Maps a range of physical memory into the requested pages. The old
2167 : * mappings are removed. Any references to nonexistent pages result
2168 : * in null mappings (currently treated as "copy-on-access").
2169 : */
2170 0 : static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
2171 : unsigned long addr, unsigned long end,
2172 : unsigned long pfn, pgprot_t prot)
2173 : {
2174 0 : pte_t *pte, *mapped_pte;
2175 0 : spinlock_t *ptl;
2176 0 : int err = 0;
2177 :
2178 0 : mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
2179 0 : if (!pte)
2180 0 : return -ENOMEM;
2181 0 : arch_enter_lazy_mmu_mode();
2182 0 : do {
2183 0 : BUG_ON(!pte_none(*pte));
2184 0 : if (!pfn_modify_allowed(pfn, prot)) {
2185 : err = -EACCES;
2186 : break;
2187 : }
2188 0 : set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
2189 0 : pfn++;
2190 0 : } while (pte++, addr += PAGE_SIZE, addr != end);
2191 0 : arch_leave_lazy_mmu_mode();
2192 0 : pte_unmap_unlock(mapped_pte, ptl);
2193 0 : return err;
2194 : }
2195 :
2196 0 : static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
2197 : unsigned long addr, unsigned long end,
2198 : unsigned long pfn, pgprot_t prot)
2199 : {
2200 0 : pmd_t *pmd;
2201 0 : unsigned long next;
2202 0 : int err;
2203 :
2204 0 : pfn -= addr >> PAGE_SHIFT;
2205 0 : pmd = pmd_alloc(mm, pud, addr);
2206 0 : if (!pmd)
2207 : return -ENOMEM;
2208 0 : VM_BUG_ON(pmd_trans_huge(*pmd));
2209 0 : do {
2210 0 : next = pmd_addr_end(addr, end);
2211 0 : err = remap_pte_range(mm, pmd, addr, next,
2212 0 : pfn + (addr >> PAGE_SHIFT), prot);
2213 0 : if (err)
2214 0 : return err;
2215 0 : } while (pmd++, addr = next, addr != end);
2216 : return 0;
2217 : }
2218 :
2219 0 : static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
2220 : unsigned long addr, unsigned long end,
2221 : unsigned long pfn, pgprot_t prot)
2222 : {
2223 0 : pud_t *pud;
2224 0 : unsigned long next;
2225 0 : int err;
2226 :
2227 0 : pfn -= addr >> PAGE_SHIFT;
2228 0 : pud = pud_alloc(mm, p4d, addr);
2229 0 : if (!pud)
2230 : return -ENOMEM;
2231 0 : do {
2232 0 : next = pud_addr_end(addr, end);
2233 0 : err = remap_pmd_range(mm, pud, addr, next,
2234 0 : pfn + (addr >> PAGE_SHIFT), prot);
2235 0 : if (err)
2236 0 : return err;
2237 0 : } while (pud++, addr = next, addr != end);
2238 : return 0;
2239 : }
2240 :
2241 0 : static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
2242 : unsigned long addr, unsigned long end,
2243 : unsigned long pfn, pgprot_t prot)
2244 : {
2245 0 : p4d_t *p4d;
2246 0 : unsigned long next;
2247 0 : int err;
2248 :
2249 0 : pfn -= addr >> PAGE_SHIFT;
2250 0 : p4d = p4d_alloc(mm, pgd, addr);
2251 0 : if (!p4d)
2252 : return -ENOMEM;
2253 0 : do {
2254 0 : next = p4d_addr_end(addr, end);
2255 0 : err = remap_pud_range(mm, p4d, addr, next,
2256 : pfn + (addr >> PAGE_SHIFT), prot);
2257 0 : if (err)
2258 0 : return err;
2259 0 : } while (p4d++, addr = next, addr != end);
2260 : return 0;
2261 : }
2262 :
2263 : /**
2264 : * remap_pfn_range - remap kernel memory to userspace
2265 : * @vma: user vma to map to
2266 : * @addr: target page aligned user address to start at
2267 : * @pfn: page frame number of kernel physical memory address
2268 : * @size: size of mapping area
2269 : * @prot: page protection flags for this mapping
2270 : *
2271 : * Note: this is only safe if the mmap_lock is held when called.
2272 : *
2273 : * Return: %0 on success, negative error code otherwise.
2274 : */
2275 0 : int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2276 : unsigned long pfn, unsigned long size, pgprot_t prot)
2277 : {
2278 0 : pgd_t *pgd;
2279 0 : unsigned long next;
2280 0 : unsigned long end = addr + PAGE_ALIGN(size);
2281 0 : struct mm_struct *mm = vma->vm_mm;
2282 0 : unsigned long remap_pfn = pfn;
2283 0 : int err;
2284 :
2285 0 : if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
2286 : return -EINVAL;
2287 :
2288 : /*
2289 : * Physically remapped pages are special. Tell the
2290 : * rest of the world about it:
2291 : * VM_IO tells people not to look at these pages
2292 : * (accesses can have side effects).
2293 : * VM_PFNMAP tells the core MM that the base pages are just
2294 : * raw PFN mappings, and do not have a "struct page" associated
2295 : * with them.
2296 : * VM_DONTEXPAND
2297 : * Disable vma merging and expanding with mremap().
2298 : * VM_DONTDUMP
2299 : * Omit vma from core dump, even when VM_IO turned off.
2300 : *
2301 : * There's a horrible special case to handle copy-on-write
2302 : * behaviour that some programs depend on. We mark the "original"
2303 : * un-COW'ed pages by matching them up with "vma->vm_pgoff".
2304 : * See vm_normal_page() for details.
2305 : */
2306 0 : if (is_cow_mapping(vma->vm_flags)) {
2307 0 : if (addr != vma->vm_start || end != vma->vm_end)
2308 : return -EINVAL;
2309 0 : vma->vm_pgoff = pfn;
2310 : }
2311 :
2312 0 : err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
2313 0 : if (err)
2314 : return -EINVAL;
2315 :
2316 0 : vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
2317 :
2318 0 : BUG_ON(addr >= end);
2319 0 : pfn -= addr >> PAGE_SHIFT;
2320 0 : pgd = pgd_offset(mm, addr);
2321 0 : flush_cache_range(vma, addr, end);
2322 0 : do {
2323 0 : next = pgd_addr_end(addr, end);
2324 0 : err = remap_p4d_range(mm, pgd, addr, next,
2325 0 : pfn + (addr >> PAGE_SHIFT), prot);
2326 0 : if (err)
2327 : break;
2328 0 : } while (pgd++, addr = next, addr != end);
2329 :
2330 0 : if (err)
2331 0 : untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));
2332 :
2333 : return err;
2334 : }
2335 : EXPORT_SYMBOL(remap_pfn_range);
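The classic whole-vma use from an mmap handler, sketched here with a hypothetical, page-aligned physical base my_drv_phys_base.

static int my_drv_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;
	unsigned long pfn = (my_drv_phys_base >> PAGE_SHIFT) + vma->vm_pgoff;

	/* mmap_lock is held for write by the mmap() path */
	return remap_pfn_range(vma, vma->vm_start, pfn, size,
			       vma->vm_page_prot);
}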
2336 :
2337 : /**
2338 : * vm_iomap_memory - remap memory to userspace
2339 : * @vma: user vma to map to
2340 : * @start: start of the physical memory to be mapped
2341 : * @len: size of area
2342 : *
2343 : * This is a simplified io_remap_pfn_range() for common driver use. The
2344 : * driver just needs to give us the physical memory range to be mapped,
2345 : * we'll figure out the rest from the vma information.
2346 : *
2347 : * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
2348 : * write-combining or similar behaviour.
2349 : *
2350 : * Return: %0 on success, negative error code otherwise.
2351 : */
2352 0 : int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
2353 : {
2354 0 : unsigned long vm_len, pfn, pages;
2355 :
2356 : /* Check that the physical memory area passed in looks valid */
2357 0 : if (start + len < start)
2358 : return -EINVAL;
2359 : /*
2360 : * You *really* shouldn't map things that aren't page-aligned,
2361 : * but we've historically allowed it because IO memory might
2362 : * just have smaller alignment.
2363 : */
2364 0 : len += start & ~PAGE_MASK;
2365 0 : pfn = start >> PAGE_SHIFT;
2366 0 : pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
2367 0 : if (pfn + pages < pfn)
2368 : return -EINVAL;
2369 :
2370 : /* We start the mapping 'vm_pgoff' pages into the area */
2371 0 : if (vma->vm_pgoff > pages)
2372 : return -EINVAL;
2373 0 : pfn += vma->vm_pgoff;
2374 0 : pages -= vma->vm_pgoff;
2375 :
2376 : /* Can we fit all of the mapping? */
2377 0 : vm_len = vma->vm_end - vma->vm_start;
2378 0 : if (vm_len >> PAGE_SHIFT > pages)
2379 : return -EINVAL;
2380 :
2381 : /* Ok, let it rip */
2382 0 : return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
2383 : }
2384 : EXPORT_SYMBOL(vm_iomap_memory);
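A sketch of the simplified driver side, assuming a hypothetical MMIO window described by bar_start/bar_len; the offset and length checks above are then handled on the driver's behalf.

static int my_drv_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* bar_start/bar_len are hypothetical; e.g. a PCI BAR */
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	return vm_iomap_memory(vma, bar_start, bar_len);
}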
2385 :
2386 9984 : static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
2387 : unsigned long addr, unsigned long end,
2388 : pte_fn_t fn, void *data, bool create,
2389 : pgtbl_mod_mask *mask)
2390 : {
2391 9984 : pte_t *pte, *mapped_pte;
2392 9984 : int err = 0;
2393 9984 : spinlock_t *ptl;
2394 :
2395 9984 : if (create) {
2396 19954 : mapped_pte = pte = (mm == &init_mm) ?
2397 19954 : pte_alloc_kernel_track(pmd, addr, mask) :
2398 0 : pte_alloc_map_lock(mm, pmd, addr, &ptl);
2399 9977 : if (!pte)
2400 : return -ENOMEM;
2401 : } else {
2402 9991 : mapped_pte = pte = (mm == &init_mm) ?
2403 14 : pte_offset_kernel(pmd, addr) :
2404 0 : pte_offset_map_lock(mm, pmd, addr, &ptl);
2405 : }
2406 :
2407 9984 : BUG_ON(pmd_huge(*pmd));
2408 :
2409 9984 : arch_enter_lazy_mmu_mode();
2410 :
2411 9984 : if (fn) {
2412 11063 : do {
2413 11063 : if (create || !pte_none(*pte)) {
2414 11063 : err = fn(pte++, addr, data);
2415 11063 : if (err)
2416 : break;
2417 : }
2418 11063 : } while (addr += PAGE_SIZE, addr != end);
2419 : }
2420 9984 : *mask |= PGTBL_PTE_MODIFIED;
2421 :
2422 9984 : arch_leave_lazy_mmu_mode();
2423 :
2424 9984 : if (mm != &init_mm)
2425 0 : pte_unmap_unlock(mapped_pte, ptl);
2426 : return err;
2427 : }
2428 :
2429 9983 : static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
2430 : unsigned long addr, unsigned long end,
2431 : pte_fn_t fn, void *data, bool create,
2432 : pgtbl_mod_mask *mask)
2433 : {
2434 9983 : pmd_t *pmd;
2435 9983 : unsigned long next;
2436 9983 : int err = 0;
2437 :
2438 9983 : BUG_ON(pud_huge(*pud));
2439 :
2440 9983 : if (create) {
2441 9976 : pmd = pmd_alloc_track(mm, pud, addr, mask);
2442 9976 : if (!pmd)
2443 : return -ENOMEM;
2444 : } else {
2445 14 : pmd = pmd_offset(pud, addr);
2446 : }
2447 9983 : do {
2448 9983 : next = pmd_addr_end(addr, end);
2449 9983 : if (create || !pmd_none_or_clear_bad(pmd)) {
2450 9983 : err = apply_to_pte_range(mm, pmd, addr, next, fn, data,
2451 : create, mask);
2452 9983 : if (err)
2453 : break;
2454 : }
2455 9983 : } while (pmd++, addr = next, addr != end);
2456 : return err;
2457 : }
2458 :
2459 9982 : static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
2460 : unsigned long addr, unsigned long end,
2461 : pte_fn_t fn, void *data, bool create,
2462 : pgtbl_mod_mask *mask)
2463 : {
2464 9982 : pud_t *pud;
2465 9982 : unsigned long next;
2466 9982 : int err = 0;
2467 :
2468 9982 : if (create) {
2469 9975 : pud = pud_alloc_track(mm, p4d, addr, mask);
2470 9975 : if (!pud)
2471 : return -ENOMEM;
2472 : } else {
2473 7 : pud = pud_offset(p4d, addr);
2474 : }
2475 9982 : do {
2476 9982 : next = pud_addr_end(addr, end);
2477 9982 : if (create || !pud_none_or_clear_bad(pud)) {
2478 9982 : err = apply_to_pmd_range(mm, pud, addr, next, fn, data,
2479 : create, mask);
2480 9982 : if (err)
2481 : break;
2482 : }
2483 9982 : } while (pud++, addr = next, addr != end);
2484 : return err;
2485 : }
2486 :
2487 9981 : static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
2488 : unsigned long addr, unsigned long end,
2489 : pte_fn_t fn, void *data, bool create,
2490 : pgtbl_mod_mask *mask)
2491 : {
2492 9981 : p4d_t *p4d;
2493 9981 : unsigned long next;
2494 9981 : int err = 0;
2495 :
2496 9981 : if (create) {
2497 9974 : p4d = p4d_alloc_track(mm, pgd, addr, mask);
2498 9974 : if (!p4d)
2499 : return -ENOMEM;
2500 : } else {
2501 9981 : p4d = p4d_offset(pgd, addr);
2502 : }
2503 9981 : do {
2504 9981 : next = p4d_addr_end(addr, end);
2505 9981 : if (create || !p4d_none_or_clear_bad(p4d)) {
2506 9981 : err = apply_to_pud_range(mm, p4d, addr, next, fn, data,
2507 : create, mask);
2508 9981 : if (err)
2509 : break;
2510 : }
2511 9981 : } while (p4d++, addr = next, addr != end);
2512 : return err;
2513 : }
2514 :
2515 9980 : static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2516 : unsigned long size, pte_fn_t fn,
2517 : void *data, bool create)
2518 : {
2519 9980 : pgd_t *pgd;
2520 9980 : unsigned long start = addr, next;
2521 9980 : unsigned long end = addr + size;
2522 9980 : pgtbl_mod_mask mask = 0;
2523 9980 : int err = 0;
2524 :
2525 9980 : if (WARN_ON(addr >= end))
2526 : return -EINVAL;
2527 :
2528 9980 : pgd = pgd_offset(mm, addr);
2529 9980 : do {
2530 9980 : next = pgd_addr_end(addr, end);
2531 9980 : if (!create && pgd_none_or_clear_bad(pgd))
2532 : continue;
2533 9980 : err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask);
2534 9980 : if (err)
2535 : break;
2536 9980 : } while (pgd++, addr = next, addr != end);
2537 :
2538 : if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
2539 : arch_sync_kernel_mappings(start, start + size);
2540 :
2541 : return err;
2542 : }
2543 :
2544 : /*
2545 : * Scan a region of virtual memory, filling in page tables as necessary
2546 : * and calling a provided function on each leaf page table.
2547 : */
2548 9972 : int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2549 : unsigned long size, pte_fn_t fn, void *data)
2550 : {
2551 9972 : return __apply_to_page_range(mm, addr, size, fn, data, true);
2552 : }
2553 : EXPORT_SYMBOL_GPL(apply_to_page_range);
2554 :
2555 : /*
2556 : * Scan a region of virtual memory, calling a provided function on
2557 : * each leaf page table where it exists.
2558 : *
2559 : * Unlike apply_to_page_range, this does _not_ fill in page tables
2560 : * where they are absent.
2561 : */
2562 7 : int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
2563 : unsigned long size, pte_fn_t fn, void *data)
2564 : {
2565 7 : return __apply_to_page_range(mm, addr, size, fn, data, false);
2566 : }
2567 : EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
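A minimal sketch of the callback-driven walk: counting populated PTEs over a kernel virtual range with apply_to_existing_page_range(). The counting helper is purely illustrative; apply_to_page_range() would additionally allocate any missing page tables.

static int count_present_pte(pte_t *pte, unsigned long addr, void *data)
{
	unsigned long *count = data;

	if (!pte_none(*pte))
		(*count)++;
	return 0;
}

static unsigned long my_count_mapped(unsigned long start, unsigned long size)
{
	unsigned long count = 0;

	apply_to_existing_page_range(&init_mm, start, size,
				     count_present_pte, &count);
	return count;
}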
2568 :
2569 : /*
2570 : * handle_pte_fault chooses page fault handler according to an entry which was
2571 : * read non-atomically. Before making any commitment, on those architectures
2572 : * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
2573 : * parts, do_swap_page must check under lock before unmapping the pte and
2574 : * proceeding (but do_wp_page is only called after already making such a check;
2575 : * and do_anonymous_page can safely check later on).
2576 : */
2577 0 : static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2578 : pte_t *page_table, pte_t orig_pte)
2579 : {
2580 0 : int same = 1;
2581 : #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
2582 0 : if (sizeof(pte_t) > sizeof(unsigned long)) {
2583 : spinlock_t *ptl = pte_lockptr(mm, pmd);
2584 : spin_lock(ptl);
2585 : same = pte_same(*page_table, orig_pte);
2586 : spin_unlock(ptl);
2587 : }
2588 : #endif
2589 0 : pte_unmap(page_table);
2590 0 : return same;
2591 : }
2592 :
2593 28758 : static inline bool cow_user_page(struct page *dst, struct page *src,
2594 : struct vm_fault *vmf)
2595 : {
2596 28758 : bool ret;
2597 28758 : void *kaddr;
2598 28758 : void __user *uaddr;
2599 28758 : bool locked = false;
2600 28758 : struct vm_area_struct *vma = vmf->vma;
2601 28758 : struct mm_struct *mm = vma->vm_mm;
2602 28758 : unsigned long addr = vmf->address;
2603 :
2604 28758 : if (likely(src)) {
2605 28758 : copy_user_highpage(dst, src, addr, vma);
2606 28758 : return true;
2607 : }
2608 :
2609 : /*
2610 : * If the source page was a PFN mapping, we don't have
2611 : * a "struct page" for it. We do a best-effort copy by
2612 : * just copying from the original user address. If that
2613 : * fails, we just zero-fill it. Live with it.
2614 : */
2615 0 : kaddr = kmap_atomic(dst);
2616 0 : uaddr = (void __user *)(addr & PAGE_MASK);
2617 :
2618 : /*
2619 : * On architectures with software "accessed" bits, we would
2620 : * take a double page fault, so mark it accessed here.
2621 : */
2622 0 : if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
2623 : pte_t entry;
2624 :
2625 : vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
2626 : locked = true;
2627 : if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
2628 : /*
2629 : * Another thread has already handled the fault;
2630 : * just update the local TLB.
2631 : */
2632 : update_mmu_tlb(vma, addr, vmf->pte);
2633 : ret = false;
2634 : goto pte_unlock;
2635 : }
2636 :
2637 : entry = pte_mkyoung(vmf->orig_pte);
2638 : if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
2639 : update_mmu_cache(vma, addr, vmf->pte);
2640 : }
2641 :
2642 : /*
2643 : * This really shouldn't fail, because the page is there
2644 : * in the page tables. But it might just be unreadable,
2645 : * in which case we just give up and fill the result with
2646 : * zeroes.
2647 : */
2648 0 : if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
2649 0 : if (locked)
2650 : goto warn;
2651 :
2652 : /* Re-validate under PTL if the page is still mapped */
2653 0 : vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
2654 0 : locked = true;
2655 0 : if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
2656 : /* The PTE changed under us, update local tlb */
2657 0 : update_mmu_tlb(vma, addr, vmf->pte);
2658 0 : ret = false;
2659 0 : goto pte_unlock;
2660 : }
2661 :
2662 : /*
2663 : * The same page may have been mapped back since the last copy attempt.
2664 : * Try to copy again under PTL.
2665 : */
2666 0 : if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
2667 : /*
2668 : * Warn in case some obscure
2669 : * use-case ever hits this.
2670 : */
2671 0 : warn:
2672 0 : WARN_ON_ONCE(1);
2673 0 : clear_page(kaddr);
2674 : }
2675 : }
2676 :
2677 0 : ret = true;
2678 :
2679 : pte_unlock:
2680 0 : if (locked)
2681 0 : pte_unmap_unlock(vmf->pte, vmf->ptl);
2682 0 : kunmap_atomic(kaddr);
2683 0 : flush_dcache_page(dst);
2684 :
2685 0 : return ret;
2686 : }
2687 :
2688 171939 : static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
2689 : {
2690 171939 : struct file *vm_file = vma->vm_file;
2691 :
2692 171939 : if (vm_file)
2693 99599 : return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;
2694 :
2695 : /*
2696 : * Special mappings (e.g. VDSO) do not have any file so fake
2697 : * a default GFP_KERNEL for them.
2698 : */
2699 : return GFP_KERNEL;
2700 : }
2701 :
2702 : /*
2703 : * Notify the address space that the page is about to become writable so that
2704 : * it can prohibit this or wait for the page to get into an appropriate state.
2705 : *
2706 : * We do this without the lock held, so that it can sleep if it needs to.
2707 : */
2708 304 : static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
2709 : {
2710 304 : vm_fault_t ret;
2711 304 : struct page *page = vmf->page;
2712 304 : unsigned int old_flags = vmf->flags;
2713 :
2714 304 : vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2715 :
2716 304 : if (vmf->vma->vm_file &&
2717 304 : IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
2718 : return VM_FAULT_SIGBUS;
2719 :
2720 304 : ret = vmf->vma->vm_ops->page_mkwrite(vmf);
2721 : /* Restore original flags so that caller is not surprised */
2722 304 : vmf->flags = old_flags;
2723 304 : if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2724 : return ret;
2725 304 : if (unlikely(!(ret & VM_FAULT_LOCKED))) {
2726 0 : lock_page(page);
2727 0 : if (!page->mapping) {
2728 0 : unlock_page(page);
2729 0 : return 0; /* retry */
2730 : }
2731 0 : ret |= VM_FAULT_LOCKED;
2732 : } else
2733 608 : VM_BUG_ON_PAGE(!PageLocked(page), page);
2734 : return ret;
2735 : }
2736 :
2737 : /*
2738 : * Handle dirtying of a page in shared file mapping on a write fault.
2739 : *
2740 : * The function expects the page to be locked and unlocks it.
2741 : */
2742 458 : static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
2743 : {
2744 458 : struct vm_area_struct *vma = vmf->vma;
2745 458 : struct address_space *mapping;
2746 458 : struct page *page = vmf->page;
2747 458 : bool dirtied;
2748 458 : bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
2749 :
2750 458 : dirtied = set_page_dirty(page);
2751 458 : VM_BUG_ON_PAGE(PageAnon(page), page);
2752 : /*
2753 : * Take a local copy of the address_space - page.mapping may be zeroed
2754 : * by truncate after unlock_page(). The address_space itself remains
2755 : * pinned by vma->vm_file's reference. We rely on unlock_page()'s
2756 : * release semantics to prevent the compiler from undoing this copying.
2757 : */
2758 458 : mapping = page_rmapping(page);
2759 458 : unlock_page(page);
2760 :
2761 458 : if (!page_mkwrite)
2762 154 : file_update_time(vma->vm_file);
2763 :
2764 : /*
2765 : * Throttle page dirtying rate down to writeback speed.
2766 : *
2767 : * mapping may be NULL here because some device drivers do not
2768 : * set page.mapping but still dirty their pages
2769 : *
2770 : * Drop the mmap_lock before waiting on IO, if we can. The file
2771 : * is pinning the mapping, as per above.
2772 : */
2773 458 : if ((dirtied || page_mkwrite) && mapping) {
2774 307 : struct file *fpin;
2775 :
2776 307 : fpin = maybe_unlock_mmap_for_io(vmf, NULL);
2777 307 : balance_dirty_pages_ratelimited(mapping);
2778 307 : if (fpin) {
2779 307 : fput(fpin);
2780 307 : return VM_FAULT_RETRY;
2781 : }
2782 : }
2783 :
2784 : return 0;
2785 : }
2786 :
2787 : /*
2788 : * Handle write page faults for pages that can be reused in the current vma
2789 : *
2790 : * This can happen either because the mapping was created with the VM_SHARED
2791 : * flag, or because we hold the last remaining reference to the page. In either
2792 : * case, all we need to do here is to mark the page as writable and update
2793 : * any related book-keeping.
2794 : */
2795 17381 : static inline void wp_page_reuse(struct vm_fault *vmf)
2796 : __releases(vmf->ptl)
2797 : {
2798 17381 : struct vm_area_struct *vma = vmf->vma;
2799 17381 : struct page *page = vmf->page;
2800 17381 : pte_t entry;
2801 : /*
2802 : * Clear the page's cpupid information as the existing
2803 : * information potentially belongs to a now completely
2804 : * unrelated process.
2805 : */
2806 17381 : if (page)
2807 17381 : page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
2808 :
2809 17381 : flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
2810 17381 : entry = pte_mkyoung(vmf->orig_pte);
2811 17381 : entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2812 17381 : if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
2813 17381 : update_mmu_cache(vma, vmf->address, vmf->pte);
2814 17381 : pte_unmap_unlock(vmf->pte, vmf->ptl);
2815 17381 : count_vm_event(PGREUSE);
2816 17381 : }
2817 :
2818 : /*
2819 : * Handle the case of a page which we actually need to copy to a new page.
2820 : *
2821 : * Called with mmap_lock locked and the old page referenced, but
2822 : * without the ptl held.
2823 : *
2824 : * High level logic flow:
2825 : *
2826 : * - Allocate a page, copy the content of the old page to the new one.
2827 : * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
2828 : * - Take the PTL. If the pte changed, bail out and release the allocated page
2829 : * - If the pte is still the way we remember it, update the page table and all
2830 : * relevant references. This includes dropping the reference the page-table
2831 : * held to the old page, as well as updating the rmap.
2832 : * - In any case, unlock the PTL and drop the reference we took to the old page.
2833 : */
2834 30052 : static vm_fault_t wp_page_copy(struct vm_fault *vmf)
2835 : {
2836 30052 : struct vm_area_struct *vma = vmf->vma;
2837 30052 : struct mm_struct *mm = vma->vm_mm;
2838 30052 : struct page *old_page = vmf->page;
2839 30052 : struct page *new_page = NULL;
2840 30052 : pte_t entry;
2841 30052 : int page_copied = 0;
2842 30052 : struct mmu_notifier_range range;
2843 :
2844 30320 : if (unlikely(anon_vma_prepare(vma)))
2845 0 : goto oom;
2846 :
2847 30052 : if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
2848 1290 : new_page = alloc_zeroed_user_highpage_movable(vma,
2849 : vmf->address);
2850 1290 : if (!new_page)
2851 0 : goto oom;
2852 : } else {
2853 28762 : new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
2854 : vmf->address);
2855 28759 : if (!new_page)
2856 0 : goto oom;
2857 :
2858 28759 : if (!cow_user_page(new_page, old_page, vmf)) {
2859 : /*
2860 : * COW failed; if the fault was resolved by someone
2861 : * else in the meantime, that's fine. If not, userspace
2862 : * will re-fault at the same address and we will handle
2863 : * the fault on the second attempt.
2864 : */
2865 0 : put_page(new_page);
2866 0 : if (old_page)
2867 0 : put_page(old_page);
2868 0 : return 0;
2869 : }
2870 : }
2871 :
2872 30048 : if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
2873 : goto oom_free_new;
2874 30048 : cgroup_throttle_swaprate(new_page, GFP_KERNEL);
2875 :
2876 30048 : __SetPageUptodate(new_page);
2877 :
2878 30048 : mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
2879 : vmf->address & PAGE_MASK,
2880 : (vmf->address & PAGE_MASK) + PAGE_SIZE);
2881 30048 : mmu_notifier_invalidate_range_start(&range);
2882 :
2883 : /*
2884 : * Re-check the pte - we dropped the lock
2885 : */
2886 60097 : vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
2887 30051 : if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
2888 30051 : if (old_page) {
2889 28761 : if (!PageAnon(old_page)) {
2890 12876 : dec_mm_counter_fast(mm,
2891 : mm_counter_file(old_page));
2892 12876 : inc_mm_counter_fast(mm, MM_ANONPAGES);
2893 : }
2894 : } else {
2895 1290 : inc_mm_counter_fast(mm, MM_ANONPAGES);
2896 : }
2897 30051 : flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
2898 30051 : entry = mk_pte(new_page, vma->vm_page_prot);
2899 30049 : entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2900 :
2901 : /*
2902 : * Clear the pte entry and flush it first, before updating the
2903 : * pte with the new entry, to keep TLBs on different CPUs in
2904 : * sync. This code used to set the new PTE then flush TLBs, but
2905 : * that left a window where the new PTE could be loaded into
2906 : * some TLBs while the old PTE remains in others.
2907 : */
2908 30049 : ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
2909 30052 : page_add_new_anon_rmap(new_page, vma, vmf->address, false);
2910 30052 : lru_cache_add_inactive_or_unevictable(new_page, vma);
2911 : /*
2912 : * We call the notify macro here because, when using secondary
2913 : * mmu page tables (such as kvm shadow page tables), we want the
2914 : * new page to be mapped directly into the secondary page table.
2915 : */
2916 30048 : set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
2917 30048 : update_mmu_cache(vma, vmf->address, vmf->pte);
2918 30048 : if (old_page) {
2919 : /*
2920 : * Only after switching the pte to the new page may
2921 : * we remove the mapcount here. Otherwise another
2922 : * process may come and find the rmap count decremented
2923 : * before the pte is switched to the new page, and
2924 : * "reuse" the old page writing into it while our pte
2925 : * here still points into it and can be read by other
2926 : * threads.
2927 : *
2928 : * The critical issue is to order this
2929 : * page_remove_rmap with the ptep_clear_flush above.
2930 : * Those stores are ordered by (if nothing else,)
2931 : * the barrier present in the atomic_add_negative
2932 : * in page_remove_rmap.
2933 : *
2934 : * Then the TLB flush in ptep_clear_flush ensures that
2935 : * no process can access the old page before the
2936 : * decremented mapcount is visible. And the old page
2937 : * cannot be reused until after the decremented
2938 : * mapcount is visible. So transitively, TLBs to
2939 : * old page will be flushed before it can be reused.
2940 : */
2941 28758 : page_remove_rmap(old_page, false);
2942 : }
2943 :
2944 : /* Free the old page.. */
2945 : new_page = old_page;
2946 : page_copied = 1;
2947 : } else {
2948 30052 : update_mmu_tlb(vma, vmf->address, vmf->pte);
2949 : }
2950 :
2951 30052 : if (new_page)
2952 28762 : put_page(new_page);
2953 :
2954 30053 : pte_unmap_unlock(vmf->pte, vmf->ptl);
2955 : /*
2956 : * No need to double call mmu_notifier->invalidate_range() callback as
2957 : * the above ptep_clear_flush_notify() did already call it.
2958 : */
2959 30053 : mmu_notifier_invalidate_range_only_end(&range);
2960 30053 : if (old_page) {
2961 : /*
2962 : * Don't let another task, with possibly unlocked vma,
2963 : * keep the mlocked page.
2964 : */
2965 28763 : if (page_copied && (vma->vm_flags & VM_LOCKED)) {
2966 0 : lock_page(old_page); /* LRU manipulation */
2967 0 : if (PageMlocked(old_page))
2968 0 : munlock_vma_page(old_page);
2969 0 : unlock_page(old_page);
2970 : }
2971 28763 : put_page(old_page);
2972 : }
2973 30053 : return page_copied ? VM_FAULT_WRITE : 0;
2974 : oom_free_new:
2975 : put_page(new_page);
2976 0 : oom:
2977 0 : if (old_page)
2978 0 : put_page(old_page);
2979 : return VM_FAULT_OOM;
2980 : }
2981 :
2982 : /**
2983 : * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
2984 : * writeable once the page is prepared
2985 : *
2986 : * @vmf: structure describing the fault
2987 : *
2988 : * This function handles all that is needed to finish a write page fault in a
2989 : * shared mapping due to PTE being read-only once the mapped page is prepared.
2990 : * It handles locking of PTE and modifying it.
2991 : *
2992 : * The function expects the page to be locked or other protection against
2993 : * concurrent faults / writeback (such as DAX radix tree locks).
2994 : *
2995 : * Return: %VM_FAULT_WRITE on success, %0 when PTE got changed before
2996 : * we acquired PTE lock.
2997 : */
2998 57 : vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
2999 : {
3000 57 : WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
3001 114 : vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
3002 : &vmf->ptl);
3003 : /*
3004 : * We might have raced with another page fault while we released the
3005 : * pte_offset_map_lock.
3006 : */
3007 57 : if (!pte_same(*vmf->pte, vmf->orig_pte)) {
3008 0 : update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
3009 0 : pte_unmap_unlock(vmf->pte, vmf->ptl);
3010 0 : return VM_FAULT_NOPAGE;
3011 : }
3012 57 : wp_page_reuse(vmf);
3013 57 : return 0;
3014 : }
3015 :
3016 : /*
3017 : * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
3018 : * mapping
3019 : */
3020 0 : static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
3021 : {
3022 0 : struct vm_area_struct *vma = vmf->vma;
3023 :
3024 0 : if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
3025 0 : vm_fault_t ret;
3026 :
3027 0 : pte_unmap_unlock(vmf->pte, vmf->ptl);
3028 0 : vmf->flags |= FAULT_FLAG_MKWRITE;
3029 0 : ret = vma->vm_ops->pfn_mkwrite(vmf);
3030 0 : if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
3031 : return ret;
3032 0 : return finish_mkwrite_fault(vmf);
3033 : }
3034 0 : wp_page_reuse(vmf);
3035 0 : return VM_FAULT_WRITE;
3036 : }
3037 :
3038 57 : static vm_fault_t wp_page_shared(struct vm_fault *vmf)
3039 : __releases(vmf->ptl)
3040 : {
3041 57 : struct vm_area_struct *vma = vmf->vma;
3042 57 : vm_fault_t ret = VM_FAULT_WRITE;
3043 :
3044 57 : get_page(vmf->page);
3045 :
3046 57 : if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
3047 57 : vm_fault_t tmp;
3048 :
3049 57 : pte_unmap_unlock(vmf->pte, vmf->ptl);
3050 57 : tmp = do_page_mkwrite(vmf);
3051 57 : if (unlikely(!tmp || (tmp &
3052 : (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
3053 0 : put_page(vmf->page);
3054 0 : return tmp;
3055 : }
3056 57 : tmp = finish_mkwrite_fault(vmf);
3057 57 : if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
3058 0 : unlock_page(vmf->page);
3059 0 : put_page(vmf->page);
3060 0 : return tmp;
3061 : }
3062 : } else {
3063 0 : wp_page_reuse(vmf);
3064 0 : lock_page(vmf->page);
3065 : }
3066 57 : ret |= fault_dirty_shared_page(vmf);
3067 57 : put_page(vmf->page);
3068 :
3069 57 : return ret;
3070 : }
3071 :
3072 : /*
3073 : * This routine handles present pages, when users try to write
3074 : * to a shared page. It is done by copying the page to a new address
3075 : * and decrementing the shared-page counter for the old page.
3076 : *
3077 : * Note that this routine assumes that the protection checks have been
3078 : * done by the caller (the low-level page fault routine in most cases).
3079 : * Thus we can safely just mark it writable once we've done any necessary
3080 : * COW.
3081 : *
3082 : * We also mark the page dirty at this point even though the page will
3083 : * change only once the write actually happens. This avoids a few races,
3084 : * and potentially makes it more efficient.
3085 : *
3086 : * We enter with non-exclusive mmap_lock (to exclude vma changes,
3087 : * but allow concurrent faults), with pte both mapped and locked.
3088 : * We return with mmap_lock still held, but pte unmapped and unlocked.
3089 : */
3090 47433 : static vm_fault_t do_wp_page(struct vm_fault *vmf)
3091 : __releases(vmf->ptl)
3092 : {
3093 47433 : struct vm_area_struct *vma = vmf->vma;
3094 :
3095 47433 : if (userfaultfd_pte_wp(vma, *vmf->pte)) {
3096 : pte_unmap_unlock(vmf->pte, vmf->ptl);
3097 : return handle_userfault(vmf, VM_UFFD_WP);
3098 : }
3099 :
3100 : /*
3101 : * Userfaultfd write-protect can defer flushes. Ensure the TLB
3102 : * is flushed in this case before copying.
3103 : */
3104 47433 : if (unlikely(userfaultfd_wp(vmf->vma) &&
3105 : mm_tlb_flush_pending(vmf->vma->vm_mm)))
3106 : flush_tlb_page(vmf->vma, vmf->address);
3107 :
3108 47433 : vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
3109 47431 : if (!vmf->page) {
3110 : /*
3111 : * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
3112 : * VM_PFNMAP VMA.
3113 : *
3114 : * We should not cow pages in a shared writeable mapping.
3115 : * Just mark the pages writable and/or call ops->pfn_mkwrite.
3116 : */
3117 1290 : if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
3118 : (VM_WRITE|VM_SHARED))
3119 0 : return wp_pfn_shared(vmf);
3120 :
3121 1290 : pte_unmap_unlock(vmf->pte, vmf->ptl);
3122 1290 : return wp_page_copy(vmf);
3123 : }
3124 :
3125 : /*
3126 : * Take out anonymous pages first; anonymous shared vmas are
3127 : * not dirty-accountable.
3128 : */
3129 46141 : if (PageAnon(vmf->page)) {
3130 33208 : struct page *page = vmf->page;
3131 :
3132 : /* PageKsm() doesn't necessarily raise the page refcount */
3133 33208 : if (PageKsm(page) || page_count(page) != 1)
3134 15882 : goto copy;
3135 17324 : if (!trylock_page(page))
3136 0 : goto copy;
3137 17324 : if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) {
3138 0 : unlock_page(page);
3139 0 : goto copy;
3140 : }
3141 : /*
3142 : * Ok, we've got the only map reference, and the only
3143 : * page count reference, and the page is locked,
3144 : * it's dark out, and we're wearing sunglasses. Hit it.
3145 : */
3146 17324 : unlock_page(page);
3147 17324 : wp_page_reuse(vmf);
3148 17324 : return VM_FAULT_WRITE;
3149 12933 : } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
3150 : (VM_WRITE|VM_SHARED))) {
3151 57 : return wp_page_shared(vmf);
3152 : }
3153 12876 : copy:
3154 : /*
3155 : * Ok, we need to copy. Oh, well..
3156 : */
3157 28758 : get_page(vmf->page);
3158 :
3159 28763 : pte_unmap_unlock(vmf->pte, vmf->ptl);
3160 28763 : return wp_page_copy(vmf);
3161 : }
3162 :
3163 0 : static void unmap_mapping_range_vma(struct vm_area_struct *vma,
3164 : unsigned long start_addr, unsigned long end_addr,
3165 : struct zap_details *details)
3166 : {
3167 0 : zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
3168 : }
3169 :
3170 0 : static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
3171 : struct zap_details *details)
3172 : {
3173 0 : struct vm_area_struct *vma;
3174 0 : pgoff_t vba, vea, zba, zea;
3175 :
3176 0 : vma_interval_tree_foreach(vma, root,
3177 : details->first_index, details->last_index) {
3178 :
3179 0 : vba = vma->vm_pgoff;
3180 0 : vea = vba + vma_pages(vma) - 1;
3181 0 : zba = details->first_index;
3182 0 : if (zba < vba)
3183 : zba = vba;
3184 0 : zea = details->last_index;
3185 0 : if (zea > vea)
3186 : zea = vea;
3187 :
3188 0 : unmap_mapping_range_vma(vma,
3189 0 : ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
3190 0 : ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
3191 : details);
3192 : }
3193 0 : }
3194 :
3195 : /**
3196 : * unmap_mapping_pages() - Unmap pages from processes.
3197 : * @mapping: The address space containing pages to be unmapped.
3198 : * @start: Index of first page to be unmapped.
3199 : * @nr: Number of pages to be unmapped. 0 to unmap to end of file.
3200 : * @even_cows: Whether to unmap even private COWed pages.
3201 : *
3202 : * Unmap the pages in this address space from any userspace process which
3203 : * has them mmaped. Generally, you want to remove COWed pages as well when
3204 : * a file is being truncated, but not when invalidating pages from the page
3205 : * cache.
3206 : */
3207 14 : void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
3208 : pgoff_t nr, bool even_cows)
3209 : {
3210 14 : struct zap_details details = { };
3211 :
3212 14 : details.check_mapping = even_cows ? NULL : mapping;
3213 14 : details.first_index = start;
3214 14 : details.last_index = start + nr - 1;
3215 14 : if (details.last_index < details.first_index)
3216 4 : details.last_index = ULONG_MAX;
3217 :
3218 14 : i_mmap_lock_write(mapping);
3219 14 : if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
3220 0 : unmap_mapping_range_tree(&mapping->i_mmap, &details);
3221 14 : i_mmap_unlock_write(mapping);
3222 14 : }
3223 :
3224 : /**
3225 : * unmap_mapping_range - unmap the portion of all mmaps in the specified
3226 : * address_space corresponding to the specified byte range in the underlying
3227 : * file.
3228 : *
3229 : * @mapping: the address space containing mmaps to be unmapped.
3230 : * @holebegin: byte in first page to unmap, relative to the start of
3231 : * the underlying file. This will be rounded down to a PAGE_SIZE
3232 : * boundary. Note that this is different from truncate_pagecache(), which
3233 : * must keep the partial page. In contrast, we must get rid of
3234 : * partial pages.
3235 : * @holelen: size of prospective hole in bytes. This will be rounded
3236 : * up to a PAGE_SIZE boundary. A holelen of zero truncates to the
3237 : * end of the file.
3238 : * @even_cows: 1 when truncating a file, unmap even private COWed pages;
3239 : * but 0 when invalidating pagecache, don't throw away private data.
3240 : */
3241 14 : void unmap_mapping_range(struct address_space *mapping,
3242 : loff_t const holebegin, loff_t const holelen, int even_cows)
3243 : {
3244 14 : pgoff_t hba = holebegin >> PAGE_SHIFT;
3245 14 : pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
3246 :
3247 : /* Check for overflow. */
3248 14 : if (sizeof(holelen) > sizeof(hlen)) {
3249 : long long holeend =
3250 : (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
3251 : if (holeend & ~(long long)ULONG_MAX)
3252 : hlen = ULONG_MAX - hba + 1;
3253 : }
3254 :
3255 14 : unmap_mapping_pages(mapping, hba, hlen, even_cows);
3256 14 : }
3257 : EXPORT_SYMBOL(unmap_mapping_range);
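For illustration only: a hypothetical filesystem's hole-punch path would typically drop user mappings of the hole (including private COWed copies) before releasing the underlying blocks.

static void my_fs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	/* holebegin is rounded down and holelen rounded up to page size */
	unmap_mapping_range(inode->i_mapping, offset, len, 1);
	/* ... then invalidate the page cache and free blocks (omitted) ... */
}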
3258 :
3259 : /*
3260 : * We enter with non-exclusive mmap_lock (to exclude vma changes,
3261 : * but allow concurrent faults), and pte mapped but not yet locked.
3262 : * We return with pte unmapped and unlocked.
3263 : *
3264 : * We return with the mmap_lock locked or unlocked in the same cases
3265 : * as does filemap_fault().
3266 : */
3267 0 : vm_fault_t do_swap_page(struct vm_fault *vmf)
3268 : {
3269 0 : struct vm_area_struct *vma = vmf->vma;
3270 0 : struct page *page = NULL, *swapcache;
3271 0 : swp_entry_t entry;
3272 0 : pte_t pte;
3273 0 : int locked;
3274 0 : int exclusive = 0;
3275 0 : vm_fault_t ret = 0;
3276 0 : void *shadow = NULL;
3277 :
3278 0 : if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
3279 : goto out;
3280 :
3281 0 : entry = pte_to_swp_entry(vmf->orig_pte);
3282 0 : if (unlikely(non_swap_entry(entry))) {
3283 0 : if (is_migration_entry(entry)) {
3284 0 : migration_entry_wait(vma->vm_mm, vmf->pmd,
3285 : vmf->address);
3286 0 : } else if (is_device_private_entry(entry)) {
3287 : vmf->page = device_private_entry_to_page(entry);
3288 : ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
3289 0 : } else if (is_hwpoison_entry(entry)) {
3290 : ret = VM_FAULT_HWPOISON;
3291 : } else {
3292 0 : print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
3293 0 : ret = VM_FAULT_SIGBUS;
3294 : }
3295 0 : goto out;
3296 : }
3297 :
3298 :
3299 0 : delayacct_set_flag(DELAYACCT_PF_SWAPIN);
3300 0 : page = lookup_swap_cache(entry, vma, vmf->address);
3301 0 : swapcache = page;
3302 :
3303 0 : if (!page) {
3304 0 : struct swap_info_struct *si = swp_swap_info(entry);
3305 :
3306 0 : if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
3307 : __swap_count(entry) == 1) {
3308 : /* skip swapcache */
3309 : page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
3310 : vmf->address);
3311 : if (page) {
3312 : int err;
3313 :
3314 : __SetPageLocked(page);
3315 : __SetPageSwapBacked(page);
3316 : set_page_private(page, entry.val);
3317 :
3318 : /* Tell memcg to use swap ownership records */
3319 : SetPageSwapCache(page);
3320 : err = mem_cgroup_charge(page, vma->vm_mm,
3321 : GFP_KERNEL);
3322 : ClearPageSwapCache(page);
3323 : if (err) {
3324 : ret = VM_FAULT_OOM;
3325 : goto out_page;
3326 : }
3327 :
3328 : shadow = get_shadow_from_swap_cache(entry);
3329 : if (shadow)
3330 : workingset_refault(page, shadow);
3331 :
3332 : lru_cache_add(page);
3333 0 : swap_readpage(page, true);
3334 : }
3335 : } else {
3336 0 : page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
3337 : vmf);
3338 0 : swapcache = page;
3339 : }
3340 :
3341 0 : if (!page) {
3342 : /*
3343 : * Back out if somebody else faulted in this pte
3344 : * while we released the pte lock.
3345 : */
3346 0 : vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
3347 : vmf->address, &vmf->ptl);
3348 0 : if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
3349 0 : ret = VM_FAULT_OOM;
3350 0 : delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
3351 0 : goto unlock;
3352 : }
3353 :
3354 : /* Had to read the page from swap area: Major fault */
3355 : ret = VM_FAULT_MAJOR;
3356 : count_vm_event(PGMAJFAULT);
3357 : count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
3358 : } else if (PageHWPoison(page)) {
3359 : /*
3360 : * hwpoisoned dirty swapcache pages are kept for killing
3361 : * owner processes (which may be unknown at hwpoison time)
3362 : */
3363 : ret = VM_FAULT_HWPOISON;
3364 : delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
3365 : goto out_release;
3366 : }
3367 :
3368 : locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
3369 :
3370 : delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
3371 : if (!locked) {
3372 : ret |= VM_FAULT_RETRY;
3373 : goto out_release;
3374 : }
3375 :
3376 : /*
3377 : * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
3378 : * release the swapcache from under us. The page pin, and pte_same
3379 : * test below, are not enough to exclude that. Even if it is still
3380 : * swapcache, we need to check that the page's swap has not changed.
3381 : */
3382 : if (unlikely((!PageSwapCache(page) ||
3383 : page_private(page) != entry.val)) && swapcache)
3384 : goto out_page;
3385 :
3386 : page = ksm_might_need_to_copy(page, vma, vmf->address);
3387 : if (unlikely(!page)) {
3388 : ret = VM_FAULT_OOM;
3389 : page = swapcache;
3390 : goto out_page;
3391 : }
3392 :
3393 : cgroup_throttle_swaprate(page, GFP_KERNEL);
3394 :
3395 : /*
3396 : * Back out if somebody else already faulted in this pte.
3397 : */
3398 : vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3399 : &vmf->ptl);
3400 : if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
3401 : goto out_nomap;
3402 :
3403 : if (unlikely(!PageUptodate(page))) {
3404 : ret = VM_FAULT_SIGBUS;
3405 : goto out_nomap;
3406 : }
3407 :
3408 : /*
3409 : * The page isn't present yet, go ahead with the fault.
3410 : *
3411 : * Be careful about the sequence of operations here.
3412 : * To get its accounting right, reuse_swap_page() must be called
3413 : * while the page is counted on swap but not yet in mapcount i.e.
3414 : * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
3415 : * must be called after the swap_free(), or it will never succeed.
3416 : */
3417 :
3418 : inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3419 : dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
3420 : pte = mk_pte(page, vma->vm_page_prot);
3421 : if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
3422 : pte = maybe_mkwrite(pte_mkdirty(pte), vma);
3423 : vmf->flags &= ~FAULT_FLAG_WRITE;
3424 : ret |= VM_FAULT_WRITE;
3425 : exclusive = RMAP_EXCLUSIVE;
3426 : }
3427 : flush_icache_page(vma, page);
3428 : if (pte_swp_soft_dirty(vmf->orig_pte))
3429 : pte = pte_mksoft_dirty(pte);
3430 : if (pte_swp_uffd_wp(vmf->orig_pte)) {
3431 : pte = pte_mkuffd_wp(pte);
3432 : pte = pte_wrprotect(pte);
3433 : }
3434 : set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
3435 : arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
3436 : vmf->orig_pte = pte;
3437 :
3438 : /* ksm created a completely new copy */
3439 : if (unlikely(page != swapcache && swapcache)) {
3440 : page_add_new_anon_rmap(page, vma, vmf->address, false);
3441 : lru_cache_add_inactive_or_unevictable(page, vma);
3442 : } else {
3443 : do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
3444 : }
3445 :
3446 : swap_free(entry);
3447 : if (mem_cgroup_swap_full(page) ||
3448 : (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
3449 : try_to_free_swap(page);
3450 : unlock_page(page);
3451 : if (page != swapcache && swapcache) {
3452 : /*
3453 : * Hold the lock to avoid the swap entry being reused
3454 : * until we take the PT lock for the pte_same() check
3455 : * (to avoid false positives from pte_same). For
3456 : * further safety release the lock after the swap_free
3457 : * so that the swap count won't change under a
3458 : * parallel locked swapcache.
3459 : */
3460 : unlock_page(swapcache);
3461 : put_page(swapcache);
3462 : }
3463 :
3464 : if (vmf->flags & FAULT_FLAG_WRITE) {
3465 : ret |= do_wp_page(vmf);
3466 : if (ret & VM_FAULT_ERROR)
3467 : ret &= VM_FAULT_ERROR;
3468 : goto out;
3469 : }
3470 :
3471 : /* No need to invalidate - it was non-present before */
3472 0 : update_mmu_cache(vma, vmf->address, vmf->pte);
3473 0 : unlock:
3474 0 : pte_unmap_unlock(vmf->pte, vmf->ptl);
3475 0 : out:
3476 0 : return ret;
3477 : out_nomap:
3478 : pte_unmap_unlock(vmf->pte, vmf->ptl);
3479 : out_page:
3480 : unlock_page(page);
3481 : out_release:
3482 : put_page(page);
3483 : if (page != swapcache && swapcache) {
3484 : unlock_page(swapcache);
3485 : put_page(swapcache);
3486 : }
3487 : return ret;
3488 : }
3489 :
3490 : /*
3491 : * We enter with non-exclusive mmap_lock (to exclude vma changes,
3492 : * but allow concurrent faults), and pte mapped but not yet locked.
3493 : * We return with mmap_lock still held, but pte unmapped and unlocked.
3494 : */
3495 43703 : static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
3496 : {
3497 43703 : struct vm_area_struct *vma = vmf->vma;
3498 43703 : struct page *page;
3499 43703 : vm_fault_t ret = 0;
3500 43703 : pte_t entry;
3501 :
3502 : /* File mapping without ->vm_ops ? */
3503 43703 : if (vma->vm_flags & VM_SHARED)
3504 : return VM_FAULT_SIGBUS;
3505 :
3506 : /*
3507 : * Use pte_alloc() instead of pte_alloc_map(). We can't run
3508 : * pte_offset_map() on pmds where a huge pmd might be created
3509 : * from a different thread.
3510 : *
3511 : * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
3512 : * parallel threads are excluded by other means.
3513 : *
3514 : * Here we only have mmap_read_lock(mm).
3515 : */
3516 43703 : if (pte_alloc(vma->vm_mm, vmf->pmd))
3517 : return VM_FAULT_OOM;
3518 :
3519 : /* See comment in handle_pte_fault() */
3520 43703 : if (unlikely(pmd_trans_unstable(vmf->pmd)))
3521 : return 0;
3522 :
3523 : /* Use the zero-page for reads */
3524 43703 : if (!(vmf->flags & FAULT_FLAG_WRITE) &&
3525 : !mm_forbids_zeropage(vma->vm_mm)) {
3526 14675 : entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
3527 : vma->vm_page_prot));
3528 29350 : vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
3529 : vmf->address, &vmf->ptl);
3530 14675 : if (!pte_none(*vmf->pte)) {
3531 0 : update_mmu_tlb(vma, vmf->address, vmf->pte);
3532 0 : goto unlock;
3533 : }
3534 14675 : ret = check_stable_address_space(vma->vm_mm);
3535 14675 : if (ret)
3536 0 : goto unlock;
3537 : /* Deliver the page fault to userland, check inside PT lock */
3538 14675 : if (userfaultfd_missing(vma)) {
3539 : pte_unmap_unlock(vmf->pte, vmf->ptl);
3540 : return handle_userfault(vmf, VM_UFFD_MISSING);
3541 : }
3542 14675 : goto setpte;
3543 : }
3544 :
3545 : /* Allocate our own private page. */
3546 36171 : if (unlikely(anon_vma_prepare(vma)))
3547 0 : goto oom;
3548 29028 : page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
3549 29028 : if (!page)
3550 0 : goto oom;
3551 :
3552 29028 : if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
3553 : goto oom_free_page;
3554 29028 : cgroup_throttle_swaprate(page, GFP_KERNEL);
3555 :
3556 : /*
3557 : * The memory barrier inside __SetPageUptodate makes sure that
3558 : * preceding stores to the page contents become visible before
3559 : * the set_pte_at() write.
3560 : */
3561 29028 : __SetPageUptodate(page);
3562 :
3563 29028 : entry = mk_pte(page, vma->vm_page_prot);
3564 29028 : if (vma->vm_flags & VM_WRITE)
3565 29028 : entry = pte_mkwrite(pte_mkdirty(entry));
3566 :
3567 58056 : vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3568 : &vmf->ptl);
3569 29027 : if (!pte_none(*vmf->pte)) {
3570 0 : update_mmu_cache(vma, vmf->address, vmf->pte);
3571 0 : goto release;
3572 : }
3573 :
3574 29027 : ret = check_stable_address_space(vma->vm_mm);
3575 29027 : if (ret)
3576 0 : goto release;
3577 :
3578 : /* Deliver the page fault to userland, check inside PT lock */
3579 29027 : if (userfaultfd_missing(vma)) {
3580 : pte_unmap_unlock(vmf->pte, vmf->ptl);
3581 : put_page(page);
3582 : return handle_userfault(vmf, VM_UFFD_MISSING);
3583 : }
3584 :
3585 29027 : inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3586 29028 : page_add_new_anon_rmap(page, vma, vmf->address, false);
3587 29028 : lru_cache_add_inactive_or_unevictable(page, vma);
3588 43703 : setpte:
3589 43703 : set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
3590 :
3591 : /* No need to invalidate - it was non-present before */
3592 43703 : update_mmu_cache(vma, vmf->address, vmf->pte);
3593 43703 : unlock:
3594 43703 : pte_unmap_unlock(vmf->pte, vmf->ptl);
3595 43703 : return ret;
3596 0 : release:
3597 0 : put_page(page);
3598 0 : goto unlock;
3599 : oom_free_page:
3600 : put_page(page);
3601 : oom:
3602 : return VM_FAULT_OOM;
3603 : }
3604 :
3605 : /*
3606 : * The mmap_lock must have been held on entry, and may have been
3607 : * released depending on flags and vma->vm_ops->fault() return value.
3608 : * See filemap_fault() and __lock_page_retry().
3609 : */
3610 11614 : static vm_fault_t __do_fault(struct vm_fault *vmf)
3611 : {
3612 11614 : struct vm_area_struct *vma = vmf->vma;
3613 11614 : vm_fault_t ret;
3614 :
3615 : /*
3616 : * Preallocate pte before we take page_lock because this might lead to
3617 : * deadlocks for memcg reclaim which waits for pages under writeback:
3618 : * lock_page(A)
3619 : * SetPageWriteback(A)
3620 : * unlock_page(A)
3621 : * lock_page(B)
3622 : * lock_page(B)
3623 : * pte_alloc_one
3624 : * shrink_page_list
3625 : * wait_on_page_writeback(A)
3626 : * SetPageWriteback(B)
3627 : * unlock_page(B)
3628 : * # flush A, B to clear the writeback
3629 : */
3630 11614 : if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
3631 1968 : vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
3632 1968 : if (!vmf->prealloc_pte)
3633 : return VM_FAULT_OOM;
3634 1968 : smp_wmb(); /* See comment in __pte_alloc() */
3635 : }
3636 :
3637 11614 : ret = vma->vm_ops->fault(vmf);
3638 11614 : if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
3639 : VM_FAULT_DONE_COW)))
3640 : return ret;
3641 :
3642 10583 : if (unlikely(PageHWPoison(vmf->page))) {
3643 : if (ret & VM_FAULT_LOCKED)
3644 : unlock_page(vmf->page);
3645 : put_page(vmf->page);
3646 : vmf->page = NULL;
3647 : return VM_FAULT_HWPOISON;
3648 : }
3649 :
3650 10583 : if (unlikely(!(ret & VM_FAULT_LOCKED)))
3651 957 : lock_page(vmf->page);
3652 : else
3653 19252 : VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
3654 :
3655 : return ret;
3656 : }
3657 :
3658 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
3659 : static void deposit_prealloc_pte(struct vm_fault *vmf)
3660 : {
3661 : struct vm_area_struct *vma = vmf->vma;
3662 :
3663 : pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
3664 : /*
3665 : * We are going to consume the prealloc table,
3666 : * count that as nr_ptes.
3667 : */
3668 : mm_inc_nr_ptes(vma->vm_mm);
3669 : vmf->prealloc_pte = NULL;
3670 : }
3671 :
3672 0 : vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
3673 : {
3674 0 : struct vm_area_struct *vma = vmf->vma;
3675 0 : bool write = vmf->flags & FAULT_FLAG_WRITE;
3676 0 : unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
3677 0 : pmd_t entry;
3678 0 : int i;
3679 0 : vm_fault_t ret = VM_FAULT_FALLBACK;
3680 :
3681 0 : if (!transhuge_vma_suitable(vma, haddr))
3682 : return ret;
3683 :
3684 0 : page = compound_head(page);
3685 0 : if (compound_order(page) != HPAGE_PMD_ORDER)
3686 : return ret;
3687 :
3688 : /*
3689 : * Archs like ppc64 need additional space to store information
3690 : * related to pte entry. Use the preallocated table for that.
3691 : */
3692 0 : if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
3693 : vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
3694 : if (!vmf->prealloc_pte)
3695 : return VM_FAULT_OOM;
3696 : smp_wmb(); /* See comment in __pte_alloc() */
3697 : }
3698 :
3699 0 : vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
3700 0 : if (unlikely(!pmd_none(*vmf->pmd)))
3701 0 : goto out;
3702 :
3703 0 : for (i = 0; i < HPAGE_PMD_NR; i++)
3704 : flush_icache_page(vma, page + i);
3705 :
3706 0 : entry = mk_huge_pmd(page, vma->vm_page_prot);
3707 0 : if (write)
3708 0 : entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
3709 :
3710 0 : add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
3711 0 : page_add_file_rmap(page, true);
3712 : /*
3713 : * deposit and withdraw with pmd lock held
3714 : */
3715 0 : if (arch_needs_pgtable_deposit())
3716 : deposit_prealloc_pte(vmf);
3717 :
3718 0 : set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
3719 :
3720 0 : update_mmu_cache_pmd(vma, haddr, vmf->pmd);
3721 :
3722 : /* fault is handled */
3723 0 : ret = 0;
3724 0 : count_vm_event(THP_FILE_MAPPED);
3725 0 : out:
3726 0 : spin_unlock(vmf->ptl);
3727 0 : return ret;
3728 : }
3729 : #else
3730 : vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
3731 : {
3732 : return VM_FAULT_FALLBACK;
3733 : }
3734 : #endif
3735 :
3736 794210 : void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
3737 : {
3738 794210 : struct vm_area_struct *vma = vmf->vma;
3739 794210 : bool write = vmf->flags & FAULT_FLAG_WRITE;
3740 794210 : bool prefault = vmf->address != addr;
3741 794210 : pte_t entry;
3742 :
3743 794210 : flush_icache_page(vma, page);
3744 794210 : entry = mk_pte(page, vma->vm_page_prot);
3745 :
3746 794262 : if (prefault && arch_wants_old_prefaulted_pte())
3747 : entry = pte_mkold(entry);
3748 :
3749 794262 : if (write)
3750 9623 : entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3751 : /* copy-on-write page */
3752 794262 : if (write && !(vma->vm_flags & VM_SHARED)) {
3753 9222 : inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3754 9222 : page_add_new_anon_rmap(page, vma, addr, false);
3755 9222 : lru_cache_add_inactive_or_unevictable(page, vma);
3756 : } else {
3757 785040 : inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
3758 785018 : page_add_file_rmap(page, false);
3759 : }
3760 794341 : set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
3761 794341 : }
3762 :
3763 : /**
3764 : * finish_fault - finish page fault once we have prepared the page to fault
3765 : *
3766 : * @vmf: structure describing the fault
3767 : *
3768 : * This function handles all that is needed to finish a page fault once the
3769 : * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
3770 : * given page, adds reverse page mapping, handles memcg charges and LRU
3771 : * addition.
3772 : *
3773 : * The function expects the page to be locked and on success it consumes a
3774 : * reference of a page being mapped (for the PTE which maps it).
3775 : *
3776 : * Return: %0 on success, %VM_FAULT_ code in case of error.
3777 : */
3778 10583 : vm_fault_t finish_fault(struct vm_fault *vmf)
3779 : {
3780 10583 : struct vm_area_struct *vma = vmf->vma;
3781 10583 : struct page *page;
3782 10583 : vm_fault_t ret;
3783 :
3784 : /* Did we COW the page? */
3785 10583 : if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
3786 9222 : page = vmf->cow_page;
3787 : else
3788 1361 : page = vmf->page;
3789 :
3790 : /*
3791 : * check even for read faults because we might have lost our CoWed
3792 : * page
3793 : */
3794 10583 : if (!(vma->vm_flags & VM_SHARED)) {
3795 10181 : ret = check_stable_address_space(vma->vm_mm);
3796 10181 : if (ret)
3797 : return ret;
3798 : }
3799 :
3800 10583 : if (pmd_none(*vmf->pmd)) {
3801 1915 : if (PageTransCompound(page)) {
3802 0 : ret = do_set_pmd(vmf, page);
3803 0 : if (ret != VM_FAULT_FALLBACK)
3804 : return ret;
3805 : }
3806 :
3807 1915 : if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
3808 : return VM_FAULT_OOM;
3809 : }
3810 :
3811 : /* See comment in handle_pte_fault() */
3812 10583 : if (pmd_devmap_trans_unstable(vmf->pmd))
3813 : return 0;
3814 :
3815 21166 : vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
3816 : vmf->address, &vmf->ptl);
3817 10583 : ret = 0;
3818 : /* Re-check under ptl */
3819 10583 : if (likely(pte_none(*vmf->pte)))
3820 10583 : do_set_pte(vmf, page, vmf->address);
3821 : else
3822 : ret = VM_FAULT_NOPAGE;
3823 :
3824 10583 : update_mmu_tlb(vma, vmf->address, vmf->pte);
3825 10583 : pte_unmap_unlock(vmf->pte, vmf->ptl);
3826 10583 : return ret;
3827 : }
3828 :
3829 : static unsigned long fault_around_bytes __read_mostly =
3830 : rounddown_pow_of_two(65536);
3831 :
3832 : #ifdef CONFIG_DEBUG_FS
3833 0 : static int fault_around_bytes_get(void *data, u64 *val)
3834 : {
3835 0 : *val = fault_around_bytes;
3836 0 : return 0;
3837 : }
3838 :
3839 : /*
3840 : * fault_around_bytes must be rounded down to the nearest page order as it's
3841 : * what do_fault_around() expects to see.
3842 : */
3843 0 : static int fault_around_bytes_set(void *data, u64 val)
3844 : {
3845 0 : if (val / PAGE_SIZE > PTRS_PER_PTE)
3846 : return -EINVAL;
3847 0 : if (val > PAGE_SIZE)
3848 0 : fault_around_bytes = rounddown_pow_of_two(val);
3849 : else
3850 0 : fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */
3851 : return 0;
3852 : }
3853 0 : DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
3854 : fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
3855 :
3856 1 : static int __init fault_around_debugfs(void)
3857 : {
3858 1 : debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
3859 : &fault_around_bytes_fops);
3860 1 : return 0;
3861 : }
3862 : late_initcall(fault_around_debugfs);
3863 : #endif
3864 :
3865 : /*
3866 : * do_fault_around() tries to map a few pages around the fault address. The hope
3867 : * is that the pages will be needed soon and this will lower the number of
3868 : * faults to handle.
3869 : *
3870 : * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
3871 : * not ready to be mapped: not up-to-date, locked, etc.
3872 : *
3873 : * This function is called with the page table lock taken. In the split ptlock
3874 : * case the page table lock protects only those entries which belong to
3875 : * the page table corresponding to the fault address.
3876 : *
3877 : * This function doesn't cross the VMA boundaries, in order to call map_pages()
3878 : * only once.
3879 : *
3880 : * fault_around_bytes defines how many bytes we'll try to map.
3881 : * do_fault_around() expects it to be set to a power of two less than or equal
3882 : * to PTRS_PER_PTE.
3883 : *
3884 : * The virtual address of the area that we map is naturally aligned to
3885 : * fault_around_bytes rounded down to the machine page size
3886 : * (and therefore to page order). This way it's easier to guarantee
3887 : * that we don't cross page table boundaries.
3888 : */
3889 69617 : static vm_fault_t do_fault_around(struct vm_fault *vmf)
3890 : {
3891 69617 : unsigned long address = vmf->address, nr_pages, mask;
3892 69617 : pgoff_t start_pgoff = vmf->pgoff;
3893 69617 : pgoff_t end_pgoff;
3894 69617 : int off;
3895 :
3896 69617 : nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
3897 69617 : mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
3898 :
3899 69617 : address = max(address & mask, vmf->vma->vm_start);
3900 69617 : off = ((vmf->address - address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
3901 69617 : start_pgoff -= off;
3902 :
3903 : /*
3904 : * end_pgoff is either the end of the page table, the end of
3905 : * the vma or nr_pages from start_pgoff, depending what is nearest.
3906 : */
3907 69617 : end_pgoff = start_pgoff -
3908 69617 : ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
3909 : PTRS_PER_PTE - 1;
3910 69617 : end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
3911 : start_pgoff + nr_pages - 1);
3912 :
3913 69617 : if (pmd_none(*vmf->pmd)) {
3914 3151 : vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
3915 3151 : if (!vmf->prealloc_pte)
3916 : return VM_FAULT_OOM;
3917 3151 : smp_wmb(); /* See comment in __pte_alloc() */
3918 : }
3919 :
3920 69617 : return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
3921 : }
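/*
 * Worked example of the window arithmetic above (editorial note, assuming
 * 4K pages, PTRS_PER_PTE == 512 and the default fault_around_bytes of 65536;
 * the knob is typically exposed as /sys/kernel/debug/fault_around_bytes):
 *
 *   vmf->address = 0x40013000, vma->vm_start = 0x40000000, vm_pgoff = 0
 *   vmf->pgoff   = (0x40013000 - 0x40000000) >> 12          = 19
 *   nr_pages     = 65536 >> 12                               = 16
 *   mask         = ~(16 * 4096 - 1)                          (64K alignment)
 *   address      = max(0x40013000 & mask, vm_start)          = 0x40010000
 *   off          = (0x40013000 - 0x40010000) >> 12           = 3
 *   start_pgoff  = 19 - 3                                    = 16
 *   end_pgoff    = min3(16 - 16 + 511, <end of vma>, 16 + 16 - 1) = 31
 *                                                              (large vma)
 *
 * map_pages() is therefore asked to cover pgoff 16..31: the 64K-aligned
 * window containing the fault, clipped to the page table and the VMA.
 */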
3922 :
3923 70707 : static vm_fault_t do_read_fault(struct vm_fault *vmf)
3924 : {
3925 70707 : struct vm_area_struct *vma = vmf->vma;
3926 70707 : vm_fault_t ret = 0;
3927 :
3928 : /*
3929 : * Let's call ->map_pages() first and use ->fault() as fallback
3930 : * if the page at that offset is not ready to be mapped (cold cache or
3931 : * something).
3932 : */
3933 70707 : if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
3934 69617 : ret = do_fault_around(vmf);
3935 69613 : if (ret)
3936 : return ret;
3937 : }
3938 :
3939 1838 : ret = __do_fault(vmf);
3940 1838 : if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3941 : return ret;
3942 :
3943 960 : ret |= finish_fault(vmf);
3944 960 : unlock_page(vmf->page);
3945 960 : if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3946 0 : put_page(vmf->page);
3947 : return ret;
3948 : }
3949 :
3950 9374 : static vm_fault_t do_cow_fault(struct vm_fault *vmf)
3951 : {
3952 9374 : struct vm_area_struct *vma = vmf->vma;
3953 9374 : vm_fault_t ret;
3954 :
3955 15314 : if (unlikely(anon_vma_prepare(vma)))
3956 : return VM_FAULT_OOM;
3957 :
3958 9375 : vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
3959 9375 : if (!vmf->cow_page)
3960 : return VM_FAULT_OOM;
3961 :
3962 9375 : if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) {
3963 : put_page(vmf->cow_page);
3964 : return VM_FAULT_OOM;
3965 : }
3966 9375 : cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL);
3967 :
3968 9375 : ret = __do_fault(vmf);
3969 9375 : if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3970 153 : goto uncharge_out;
3971 9222 : if (ret & VM_FAULT_DONE_COW)
3972 : return ret;
3973 :
3974 9222 : copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
3975 9221 : __SetPageUptodate(vmf->cow_page);
3976 :
3977 9221 : ret |= finish_fault(vmf);
3978 9222 : unlock_page(vmf->page);
3979 9222 : put_page(vmf->page);
3980 9222 : if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3981 0 : goto uncharge_out;
3982 : return ret;
3983 153 : uncharge_out:
3984 153 : put_page(vmf->cow_page);
3985 153 : return ret;
3986 : }
3987 :
3988 401 : static vm_fault_t do_shared_fault(struct vm_fault *vmf)
3989 : {
3990 401 : struct vm_area_struct *vma = vmf->vma;
3991 401 : vm_fault_t ret, tmp;
3992 :
3993 401 : ret = __do_fault(vmf);
3994 401 : if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3995 : return ret;
3996 :
3997 : /*
3998 : * Check if the backing address space wants to know that the page is
3999 : * about to become writable
4000 : */
4001 401 : if (vma->vm_ops->page_mkwrite) {
4002 247 : unlock_page(vmf->page);
4003 247 : tmp = do_page_mkwrite(vmf);
4004 247 : if (unlikely(!tmp ||
4005 : (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
4006 0 : put_page(vmf->page);
4007 0 : return tmp;
4008 : }
4009 : }
4010 :
4011 401 : ret |= finish_fault(vmf);
4012 401 : if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
4013 : VM_FAULT_RETRY))) {
4014 0 : unlock_page(vmf->page);
4015 0 : put_page(vmf->page);
4016 0 : return ret;
4017 : }
4018 :
4019 401 : ret |= fault_dirty_shared_page(vmf);
4020 401 : return ret;
4021 : }
4022 :
4023 : /*
4024 : * We enter with non-exclusive mmap_lock (to exclude vma changes,
4025 : * but allow concurrent faults).
4026 : * The mmap_lock may have been released depending on flags and our
4027 : * return value. See filemap_fault() and __lock_page_or_retry().
4028 : * If mmap_lock is released, vma may become invalid (for example
4029 : * by other thread calling munmap()).
4030 : */
4031 80482 : static vm_fault_t do_fault(struct vm_fault *vmf)
4032 : {
4033 80482 : struct vm_area_struct *vma = vmf->vma;
4034 80482 : struct mm_struct *vm_mm = vma->vm_mm;
4035 80482 : vm_fault_t ret;
4036 :
4037 : /*
4038 : * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
4039 : */
4040 80482 : if (!vma->vm_ops->fault) {
4041 : /*
4042 : * If we find a migration pmd entry or a none pmd entry, which
4043 : * should never happen, return SIGBUS
4044 : */
4045 0 : if (unlikely(!pmd_present(*vmf->pmd)))
4046 : ret = VM_FAULT_SIGBUS;
4047 : else {
4048 0 : vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
4049 : vmf->pmd,
4050 : vmf->address,
4051 : &vmf->ptl);
4052 : /*
4053 : * Make sure this is not a temporary clearing of pte
4054 : * by holding ptl and checking again. A R/M/W update
4055 : * of pte involves taking ptl, clearing the pte so that
4056 : * we don't have concurrent modification by hardware
4057 : * followed by an update.
4058 : */
4059 0 : if (unlikely(pte_none(*vmf->pte)))
4060 : ret = VM_FAULT_SIGBUS;
4061 : else
4062 0 : ret = VM_FAULT_NOPAGE;
4063 :
4064 0 : pte_unmap_unlock(vmf->pte, vmf->ptl);
4065 : }
4066 80482 : } else if (!(vmf->flags & FAULT_FLAG_WRITE))
4067 70707 : ret = do_read_fault(vmf);
4068 9775 : else if (!(vma->vm_flags & VM_SHARED))
4069 9374 : ret = do_cow_fault(vmf);
4070 : else
4071 401 : ret = do_shared_fault(vmf);
4072 :
4073 : /* preallocated pagetable is unused: free it */
4074 80479 : if (vmf->prealloc_pte) {
4075 1975 : pte_free(vm_mm, vmf->prealloc_pte);
4076 1975 : vmf->prealloc_pte = NULL;
4077 : }
4078 80479 : return ret;
4079 : }
4080 :
4081 : static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
4082 : unsigned long addr, int page_nid,
4083 : int *flags)
4084 : {
4085 : get_page(page);
4086 :
4087 : count_vm_numa_event(NUMA_HINT_FAULTS);
4088 : if (page_nid == numa_node_id()) {
4089 : count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
4090 : *flags |= TNF_FAULT_LOCAL;
4091 : }
4092 :
4093 : return mpol_misplaced(page, vma, addr);
4094 : }
4095 :
4096 : static vm_fault_t do_numa_page(struct vm_fault *vmf)
4097 : {
4098 : struct vm_area_struct *vma = vmf->vma;
4099 : struct page *page = NULL;
4100 : int page_nid = NUMA_NO_NODE;
4101 : int last_cpupid;
4102 : int target_nid;
4103 : bool migrated = false;
4104 : pte_t pte, old_pte;
4105 : bool was_writable = pte_savedwrite(vmf->orig_pte);
4106 : int flags = 0;
4107 :
4108 : /*
4109 : * The "pte" at this point cannot be used safely without
4110 : * validation through pte_unmap_same(). It's of NUMA type but
4111 : * the pfn may be screwed if the read is non atomic.
4112 : */
4113 : vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
4114 : spin_lock(vmf->ptl);
4115 : if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
4116 : pte_unmap_unlock(vmf->pte, vmf->ptl);
4117 : goto out;
4118 : }
4119 :
4120 : /*
4121 : * Make it present again. Depending on how the arch implements
4122 : * non-accessible ptes, some can allow access by kernel mode.
4123 : */
4124 : old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
4125 : pte = pte_modify(old_pte, vma->vm_page_prot);
4126 : pte = pte_mkyoung(pte);
4127 : if (was_writable)
4128 : pte = pte_mkwrite(pte);
4129 : ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
4130 : update_mmu_cache(vma, vmf->address, vmf->pte);
4131 :
4132 : page = vm_normal_page(vma, vmf->address, pte);
4133 : if (!page) {
4134 : pte_unmap_unlock(vmf->pte, vmf->ptl);
4135 : return 0;
4136 : }
4137 :
4138 : /* TODO: handle PTE-mapped THP */
4139 : if (PageCompound(page)) {
4140 : pte_unmap_unlock(vmf->pte, vmf->ptl);
4141 : return 0;
4142 : }
4143 :
4144 : /*
4145 : * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
4146 : * much anyway since they can be in shared cache state. This misses
4147 : * the case where a mapping is writable but the process never writes
4148 : * to it, yet pte_write gets cleared during protection updates and
4149 : * pte_dirty has unpredictable behaviour between PTE scan updates,
4150 : * background writeback, dirty balancing and application behaviour.
4151 : */
4152 : if (!pte_write(pte))
4153 : flags |= TNF_NO_GROUP;
4154 :
4155 : /*
4156 : * Flag if the page is shared between multiple address spaces. This
4157 : * is later used when determining whether to group tasks together
4158 : */
4159 : if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
4160 : flags |= TNF_SHARED;
4161 :
4162 : last_cpupid = page_cpupid_last(page);
4163 : page_nid = page_to_nid(page);
4164 : target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
4165 : &flags);
4166 : pte_unmap_unlock(vmf->pte, vmf->ptl);
4167 : if (target_nid == NUMA_NO_NODE) {
4168 : put_page(page);
4169 : goto out;
4170 : }
4171 :
4172 : /* Migrate to the requested node */
4173 : migrated = migrate_misplaced_page(page, vma, target_nid);
4174 : if (migrated) {
4175 : page_nid = target_nid;
4176 : flags |= TNF_MIGRATED;
4177 : } else
4178 : flags |= TNF_MIGRATE_FAIL;
4179 :
4180 : out:
4181 : if (page_nid != NUMA_NO_NODE)
4182 : task_numa_fault(last_cpupid, page_nid, 1, flags);
4183 : return 0;
4184 : }
4185 :
4186 6167 : static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
4187 : {
4188 6167 : if (vma_is_anonymous(vmf->vma))
4189 1048 : return do_huge_pmd_anonymous_page(vmf);
4190 5119 : if (vmf->vma->vm_ops->huge_fault)
4191 0 : return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
4192 : return VM_FAULT_FALLBACK;
4193 : }
4194 :
4195 : /* `inline' is required to avoid gcc 4.1.2 build error */
4196 0 : static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
4197 : {
4198 0 : if (vma_is_anonymous(vmf->vma)) {
4199 0 : if (userfaultfd_huge_pmd_wp(vmf->vma, orig_pmd))
4200 : return handle_userfault(vmf, VM_UFFD_WP);
4201 0 : return do_huge_pmd_wp_page(vmf, orig_pmd);
4202 : }
4203 0 : if (vmf->vma->vm_ops->huge_fault) {
4204 0 : vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
4205 :
4206 0 : if (!(ret & VM_FAULT_FALLBACK))
4207 : return ret;
4208 : }
4209 :
4210 : /* COW or write-notify handled on pte level: split pmd. */
4211 0 : __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
4212 :
4213 0 : return VM_FAULT_FALLBACK;
4214 : }
4215 :
4216 1881 : static vm_fault_t create_huge_pud(struct vm_fault *vmf)
4217 : {
4218 : #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
4219 : defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
4220 : /* No support for anonymous transparent PUD pages yet */
4221 1881 : if (vma_is_anonymous(vmf->vma))
4222 13 : goto split;
4223 1868 : if (vmf->vma->vm_ops->huge_fault) {
4224 0 : vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
4225 :
4226 0 : if (!(ret & VM_FAULT_FALLBACK))
4227 : return ret;
4228 : }
4229 1868 : split:
4230 : /* COW or write-notify not handled on PUD level: split pud.*/
4231 1881 : __split_huge_pud(vmf->vma, vmf->pud, vmf->address);
4232 : #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
4233 1881 : return VM_FAULT_FALLBACK;
4234 : }
4235 :
4236 0 : static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
4237 : {
4238 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4239 : /* No support for anonymous transparent PUD pages yet */
4240 0 : if (vma_is_anonymous(vmf->vma))
4241 : return VM_FAULT_FALLBACK;
4242 0 : if (vmf->vma->vm_ops->huge_fault)
4243 0 : return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
4244 : #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
4245 : return VM_FAULT_FALLBACK;
4246 : }
4247 :
4248 : /*
4249 : * These routines also need to handle stuff like marking pages dirty
4250 : * and/or accessed for architectures that don't do it in hardware (most
4251 : * RISC architectures). The early dirtying is also good on the i386.
4252 : *
4253 : * There is also a hook called "update_mmu_cache()" that architectures
4254 : * with external mmu caches can use to update those (ie the Sparc or
4255 : * PowerPC hashed page tables that act as extended TLBs).
4256 : *
4257 : * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
4258 : * concurrent faults).
4259 : *
4260 : * The mmap_lock may have been released depending on flags and our return value.
4261 : * See filemap_fault() and __lock_page_or_retry().
4262 : */
4263 171922 : static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
4264 : {
4265 171922 : pte_t entry;
4266 :
4267 171922 : if (unlikely(pmd_none(*vmf->pmd))) {
4268 : /*
4269 : * Leave __pte_alloc() until later: because vm_ops->fault may
4270 : * want to allocate a huge page, and if we expose the page table
4271 : * for an instant, it will be difficult to retract from
4272 : * concurrent faults and from rmap lookups.
4273 : */
4274 8637 : vmf->pte = NULL;
4275 : } else {
4276 : /*
4277 : * If a huge pmd materialized under us just retry later. Use
4278 : * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead
4279 : * of pmd_trans_huge() to ensure the pmd didn't become
4280 : * pmd_trans_huge under us and then back to pmd_none, as a
4281 : * result of MADV_DONTNEED running immediately after a huge pmd
4282 : * fault in a different thread of this mm, in turn leading to a
4283 : * misleading pmd_trans_huge() retval. All we have to ensure is
4284 : * that it is a regular pmd that we can walk with
4285 : * pte_offset_map() and we can do that through an atomic read
4286 : * in C, which is what pmd_trans_unstable() provides.
4287 : */
4288 163285 : if (pmd_devmap_trans_unstable(vmf->pmd))
4289 : return 0;
4290 : /*
4291 : * A regular pmd is established and it can't morph into a huge
4292 : * pmd from under us anymore at this point because we hold the
4293 : * mmap_lock read mode and khugepaged takes it in write mode.
4294 : * So now it's safe to run pte_offset_map().
4295 : */
4296 163286 : vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
4297 163286 : vmf->orig_pte = *vmf->pte;
4298 :
4299 : /*
4300 : * some architectures can have larger ptes than wordsize,
4301 : * e.g. ppc44x-defconfig has CONFIG_PTE_64BIT=y and
4302 : * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic
4303 : * accesses. The code below just needs a consistent view
4304 : * for the ifs and we later double check anyway with the
4305 : * ptl lock held. So here a barrier will do.
4306 : */
4307 163286 : barrier();
4308 163288 : if (pte_none(vmf->orig_pte)) {
4309 115549 : pte_unmap(vmf->pte);
4310 115549 : vmf->pte = NULL;
4311 : }
4312 : }
4313 :
4314 171925 : if (!vmf->pte) {
4315 124186 : if (vma_is_anonymous(vmf->vma))
4316 43703 : return do_anonymous_page(vmf);
4317 : else
4318 80483 : return do_fault(vmf);
4319 : }
4320 :
4321 47739 : if (!pte_present(vmf->orig_pte))
4322 0 : return do_swap_page(vmf);
4323 :
4324 47739 : if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
4325 : return do_numa_page(vmf);
4326 :
4327 47739 : vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
4328 47739 : spin_lock(vmf->ptl);
4329 47740 : entry = vmf->orig_pte;
4330 47740 : if (unlikely(!pte_same(*vmf->pte, entry))) {
4331 0 : update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
4332 0 : goto unlock;
4333 : }
4334 47740 : if (vmf->flags & FAULT_FLAG_WRITE) {
4335 47740 : if (!pte_write(entry))
4336 47433 : return do_wp_page(vmf);
4337 307 : entry = pte_mkdirty(entry);
4338 : }
4339 307 : entry = pte_mkyoung(entry);
4340 307 : if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
4341 307 : vmf->flags & FAULT_FLAG_WRITE)) {
4342 307 : update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
4343 : } else {
4344 : /* Skip spurious TLB flush for retried page fault */
4345 307 : if (vmf->flags & FAULT_FLAG_TRIED)
4346 : goto unlock;
4347 : /*
4348 : * This is needed only for protection faults but the arch code
4349 : * is not yet telling us if this is a protection fault or not.
4350 : * This still avoids useless tlb flushes for .text page faults
4351 : * with threads.
4352 : */
4353 : if (vmf->flags & FAULT_FLAG_WRITE)
4354 307 : flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
4355 : }
4356 307 : unlock:
4357 307 : pte_unmap_unlock(vmf->pte, vmf->ptl);
4358 307 : return 0;
4359 : }
4360 :
4361 : /*
4362 : * By the time we get here, we already hold the mm semaphore
4363 : *
4364 : * The mmap_lock may have been released depending on flags and our
4365 : * return value. See filemap_fault() and __lock_page_or_retry().
4366 : */
4367 171939 : static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
4368 : unsigned long address, unsigned int flags)
4369 : {
4370 171939 : struct vm_fault vmf = {
4371 : .vma = vma,
4372 171939 : .address = address & PAGE_MASK,
4373 : .flags = flags,
4374 171939 : .pgoff = linear_page_index(vma, address),
4375 171939 : .gfp_mask = __get_fault_gfp_mask(vma),
4376 : };
4377 171939 : unsigned int dirty = flags & FAULT_FLAG_WRITE;
4378 171939 : struct mm_struct *mm = vma->vm_mm;
4379 171939 : pgd_t *pgd;
4380 171939 : p4d_t *p4d;
4381 171939 : vm_fault_t ret;
4382 :
4383 171939 : pgd = pgd_offset(mm, address);
4384 171939 : p4d = p4d_alloc(mm, pgd, address);
4385 171939 : if (!p4d)
4386 : return VM_FAULT_OOM;
4387 :
4388 171939 : vmf.pud = pud_alloc(mm, p4d, address);
4389 171939 : if (!vmf.pud)
4390 : return VM_FAULT_OOM;
4391 171939 : retry_pud:
4392 171939 : if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
4393 1881 : ret = create_huge_pud(&vmf);
4394 1881 : if (!(ret & VM_FAULT_FALLBACK))
4395 0 : return ret;
4396 : } else {
4397 170058 : pud_t orig_pud = *vmf.pud;
4398 :
4399 170058 : barrier();
4400 170057 : if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
4401 :
4402 : /* NUMA case for anonymous PUDs would go here */
4403 :
4404 0 : if (dirty && !pud_write(orig_pud)) {
4405 0 : ret = wp_huge_pud(&vmf, orig_pud);
4406 0 : if (!(ret & VM_FAULT_FALLBACK))
4407 0 : return ret;
4408 : } else {
4409 0 : huge_pud_set_accessed(&vmf, orig_pud);
4410 0 : return 0;
4411 : }
4412 : }
4413 : }
4414 :
4415 171938 : vmf.pmd = pmd_alloc(mm, vmf.pud, address);
4416 171939 : if (!vmf.pmd)
4417 : return VM_FAULT_OOM;
4418 :
4419 : /* Huge pud page fault raced with pmd_alloc? */
4420 171939 : if (pud_trans_unstable(vmf.pud))
4421 0 : goto retry_pud;
4422 :
4423 171939 : if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
4424 6167 : ret = create_huge_pmd(&vmf);
4425 6167 : if (!(ret & VM_FAULT_FALLBACK))
4426 : return ret;
4427 : } else {
4428 165772 : pmd_t orig_pmd = *vmf.pmd;
4429 :
4430 165772 : barrier();
4431 165771 : if (unlikely(is_swap_pmd(orig_pmd))) {
4432 0 : VM_BUG_ON(thp_migration_supported() &&
4433 : !is_pmd_migration_entry(orig_pmd));
4434 0 : if (is_pmd_migration_entry(orig_pmd))
4435 0 : pmd_migration_entry_wait(mm, vmf.pmd);
4436 0 : return 0;
4437 : }
4438 165771 : if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
4439 0 : if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
4440 : return do_huge_pmd_numa_page(&vmf, orig_pmd);
4441 :
4442 0 : if (dirty && !pmd_write(orig_pmd)) {
4443 0 : ret = wp_huge_pmd(&vmf, orig_pmd);
4444 0 : if (!(ret & VM_FAULT_FALLBACK))
4445 : return ret;
4446 : } else {
4447 0 : huge_pmd_set_accessed(&vmf, orig_pmd);
4448 0 : return 0;
4449 : }
4450 : }
4451 : }
4452 :
4453 171921 : return handle_pte_fault(&vmf);
4454 : }
4455 :
4456 : /**
4457 : * mm_account_fault - Do page fault accounting
4458 : *
4459 : * @regs: the pt_regs struct pointer. When set to NULL, will skip accounting
4460 : * of perf event counters, but we'll still do the per-task accounting to
4461 : * the task that triggered this page fault.
4462 : * @address: the faulted address.
4463 : * @flags: the fault flags.
4464 : * @ret: the fault retcode.
4465 : *
4466 : * This will take care of most of the page fault accounting. Meanwhile, it
4467 : * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
4468 : * updates. However note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
4469 : * still be in per-arch page fault handlers at the entry of page fault.
4470 : */
4471 171930 : static inline void mm_account_fault(struct pt_regs *regs,
4472 : unsigned long address, unsigned int flags,
4473 : vm_fault_t ret)
4474 : {
4475 171930 : bool major;
4476 :
4477 : /*
4478 : * We don't do accounting for some specific faults:
4479 : *
4480 : * - Unsuccessful faults (e.g. when the address wasn't valid). That
4481 : * includes arch_vma_access_permitted() failing before reaching here.
4482 : * So this is not a "this many hardware page faults" counter. We
4483 : * should use the hw profiling for that.
4484 : *
4485 : * - Incomplete faults (VM_FAULT_RETRY). They will only be counted
4486 : * once they're completed.
4487 : */
4488 171930 : if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY))
4489 : return;
4490 :
4491 : /*
4492 : * We define the fault as a major fault when the final successful fault
4493 : * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't
4494 : * handle it immediately previously).
4495 : */
4496 170725 : major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);
4497 :
4498 170725 : if (major)
4499 1195 : current->maj_flt++;
4500 : else
4501 169530 : current->min_flt++;
4502 :
4503 : /*
4504 : * If the fault is done for GUP, regs will be NULL. We only do the
4505 : * accounting for the per-thread fault counters of the task that triggered the
4506 : * fault, and we skip the perf event updates.
4507 : */
4508 170725 : if (!regs)
4509 : return;
4510 :
4511 168213 : if (major)
4512 1195 : perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
4513 : else
4514 167018 : perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
4515 : }
4516 :
4517 : /*
4518 : * By the time we get here, we already hold the mm semaphore
4519 : *
4520 : * The mmap_lock may have been released depending on flags and our
4521 : * return value. See filemap_fault() and __lock_page_or_retry().
4522 : */
4523 171934 : vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
4524 : unsigned int flags, struct pt_regs *regs)
4525 : {
4526 171934 : vm_fault_t ret;
4527 :
4528 171934 : __set_current_state(TASK_RUNNING);
4529 :
4530 171934 : count_vm_event(PGFAULT);
4531 171934 : count_memcg_event_mm(vma->vm_mm, PGFAULT);
4532 :
4533 : /* do counter updates before entering really critical section. */
4534 171934 : check_sync_rss_stat(current);
4535 :
4536 171934 : if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
4537 171932 : flags & FAULT_FLAG_INSTRUCTION,
4538 171932 : flags & FAULT_FLAG_REMOTE))
4539 : return VM_FAULT_SIGSEGV;
4540 :
4541 : /*
4542 : * Enable the memcg OOM handling for faults triggered in user
4543 : * space. Kernel faults are handled more gracefully.
4544 : */
4545 171934 : if (flags & FAULT_FLAG_USER)
4546 : mem_cgroup_enter_user_fault();
4547 :
4548 171934 : if (unlikely(is_vm_hugetlb_page(vma)))
4549 : ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
4550 : else
4551 171934 : ret = __handle_mm_fault(vma, address, flags);
4552 :
4553 171928 : if (flags & FAULT_FLAG_USER) {
4554 162524 : mem_cgroup_exit_user_fault();
4555 : /*
4556 : * The task may have entered a memcg OOM situation but
4557 : * if the allocation error was handled gracefully (no
4558 : * VM_FAULT_OOM), there is no need to kill anything.
4559 : * Just clean up the OOM state peacefully.
4560 : */
4561 162524 : if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
4562 171928 : mem_cgroup_oom_synchronize(false);
4563 : }
4564 :
4565 171928 : mm_account_fault(regs, address, flags, ret);
4566 :
4567 171928 : return ret;
4568 : }
4569 : EXPORT_SYMBOL_GPL(handle_mm_fault);
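/*
 * Editorial sketch (not part of memory.c): the rough shape of an arch
 * page-fault handler around handle_mm_fault(), loosely modelled on
 * arch/<arch>/mm/fault.c. Error paths, signal delivery and stack expansion
 * are omitted; local names are illustrative only.
 */
#if 0	/* example only */
static void example_arch_fault(struct mm_struct *mm, unsigned long address,
			       unsigned int flags, struct pt_regs *regs)
{
	struct vm_area_struct *vma;
	vm_fault_t fault;

retry:
	mmap_read_lock(mm);
	vma = find_vma(mm, address);
	if (!vma || vma->vm_start > address)
		goto bad_area;

	fault = handle_mm_fault(vma, address, flags, regs);
	if (fault & VM_FAULT_RETRY) {
		/* with FAULT_FLAG_ALLOW_RETRY, mmap_lock was dropped for us */
		flags |= FAULT_FLAG_TRIED;
		goto retry;
	}
	mmap_read_unlock(mm);
	return;
bad_area:
	mmap_read_unlock(mm);
}
#endif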
4570 :
4571 : #ifndef __PAGETABLE_P4D_FOLDED
4572 : /*
4573 : * Allocate p4d page table.
4574 : * We've already handled the fast-path in-line.
4575 : */
4576 : int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
4577 : {
4578 : p4d_t *new = p4d_alloc_one(mm, address);
4579 : if (!new)
4580 : return -ENOMEM;
4581 :
4582 : smp_wmb(); /* See comment in __pte_alloc */
4583 :
4584 : spin_lock(&mm->page_table_lock);
4585 : if (pgd_present(*pgd)) /* Another has populated it */
4586 : p4d_free(mm, new);
4587 : else
4588 : pgd_populate(mm, pgd, new);
4589 : spin_unlock(&mm->page_table_lock);
4590 : return 0;
4591 : }
4592 : #endif /* __PAGETABLE_P4D_FOLDED */
4593 :
4594 : #ifndef __PAGETABLE_PUD_FOLDED
4595 : /*
4596 : * Allocate page upper directory.
4597 : * We've already handled the fast-path in-line.
4598 : */
4599 7132 : int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
4600 : {
4601 7132 : pud_t *new = pud_alloc_one(mm, address);
4602 7132 : if (!new)
4603 : return -ENOMEM;
4604 :
4605 7132 : smp_wmb(); /* See comment in __pte_alloc */
4606 :
4607 7132 : spin_lock(&mm->page_table_lock);
4608 7132 : if (!p4d_present(*p4d)) {
4609 7132 : mm_inc_nr_puds(mm);
4610 7132 : p4d_populate(mm, p4d, new);
4611 : } else /* Another has populated it */
4612 0 : pud_free(mm, new);
4613 7132 : spin_unlock(&mm->page_table_lock);
4614 7132 : return 0;
4615 : }
4616 : #endif /* __PAGETABLE_PUD_FOLDED */
4617 :
4618 : #ifndef __PAGETABLE_PMD_FOLDED
4619 : /*
4620 : * Allocate page middle directory.
4621 : * We've already handled the fast-path in-line.
4622 : */
4623 8967 : int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
4624 : {
4625 8967 : spinlock_t *ptl;
4626 8967 : pmd_t *new = pmd_alloc_one(mm, address);
4627 8967 : if (!new)
4628 : return -ENOMEM;
4629 :
4630 8967 : smp_wmb(); /* See comment in __pte_alloc */
4631 :
4632 8967 : ptl = pud_lock(mm, pud);
4633 17934 : if (!pud_present(*pud)) {
4634 8967 : mm_inc_nr_pmds(mm);
4635 8967 : pud_populate(mm, pud, new);
4636 : } else /* Another has populated it */
4637 0 : pmd_free(mm, new);
4638 8967 : spin_unlock(ptl);
4639 8967 : return 0;
4640 : }
4641 : #endif /* __PAGETABLE_PMD_FOLDED */
4642 :
4643 0 : int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
4644 : struct mmu_notifier_range *range, pte_t **ptepp,
4645 : pmd_t **pmdpp, spinlock_t **ptlp)
4646 : {
4647 0 : pgd_t *pgd;
4648 0 : p4d_t *p4d;
4649 0 : pud_t *pud;
4650 0 : pmd_t *pmd;
4651 0 : pte_t *ptep;
4652 :
4653 0 : pgd = pgd_offset(mm, address);
4654 0 : if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
4655 : goto out;
4656 :
4657 0 : p4d = p4d_offset(pgd, address);
4658 0 : if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
4659 0 : goto out;
4660 :
4661 0 : pud = pud_offset(p4d, address);
4662 0 : if (pud_none(*pud) || unlikely(pud_bad(*pud)))
4663 0 : goto out;
4664 :
4665 0 : pmd = pmd_offset(pud, address);
4666 0 : VM_BUG_ON(pmd_trans_huge(*pmd));
4667 :
4668 0 : if (pmd_huge(*pmd)) {
4669 : if (!pmdpp)
4670 : goto out;
4671 :
4672 : if (range) {
4673 : mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
4674 : NULL, mm, address & PMD_MASK,
4675 : (address & PMD_MASK) + PMD_SIZE);
4676 : mmu_notifier_invalidate_range_start(range);
4677 : }
4678 : *ptlp = pmd_lock(mm, pmd);
4679 : if (pmd_huge(*pmd)) {
4680 : *pmdpp = pmd;
4681 : return 0;
4682 : }
4683 : spin_unlock(*ptlp);
4684 : if (range)
4685 0 : mmu_notifier_invalidate_range_end(range);
4686 : }
4687 :
4688 0 : if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
4689 0 : goto out;
4690 :
4691 0 : if (range) {
4692 0 : mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
4693 : address & PAGE_MASK,
4694 : (address & PAGE_MASK) + PAGE_SIZE);
4695 0 : mmu_notifier_invalidate_range_start(range);
4696 : }
4697 0 : ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
4698 0 : if (!pte_present(*ptep))
4699 0 : goto unlock;
4700 0 : *ptepp = ptep;
4701 0 : return 0;
4702 0 : unlock:
4703 0 : pte_unmap_unlock(ptep, *ptlp);
4704 0 : if (range)
4705 0 : mmu_notifier_invalidate_range_end(range);
4706 0 : out:
4707 : return -EINVAL;
4708 : }
4709 :
4710 : /**
4711 : * follow_pte - look up PTE at a user virtual address
4712 : * @mm: the mm_struct of the target address space
4713 : * @address: user virtual address
4714 : * @ptepp: location to store found PTE
4715 : * @ptlp: location to store the lock for the PTE
4716 : *
4717 : * On a successful return, the pointer to the PTE is stored in @ptepp;
4718 : * the corresponding lock is taken and its location is stored in @ptlp.
4719 : * The contents of the PTE are only stable until @ptlp is released;
4720 : * any further use, if any, must be protected against invalidation
4721 : * with MMU notifiers.
4722 : *
4723 : * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore
4724 : * should be taken for read.
4725 : *
4726 : * KVM uses this function. While it is arguably less bad than ``follow_pfn``,
4727 : * it is not a good general-purpose API.
4728 : *
4729 : * Return: zero on success, -ve otherwise.
4730 : */
4731 0 : int follow_pte(struct mm_struct *mm, unsigned long address,
4732 : pte_t **ptepp, spinlock_t **ptlp)
4733 : {
4734 0 : return follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp);
4735 : }
4736 : EXPORT_SYMBOL_GPL(follow_pte);
4737 :
4738 : /**
4739 : * follow_pfn - look up PFN at a user virtual address
4740 : * @vma: memory mapping
4741 : * @address: user virtual address
4742 : * @pfn: location to store found PFN
4743 : *
4744 : * Only IO mappings and raw PFN mappings are allowed.
4745 : *
4746 : * This function does not allow the caller to read the permissions
4747 : * of the PTE. Do not use it.
4748 : *
4749 : * Return: zero and the pfn at @pfn on success, -ve otherwise.
4750 : */
4751 0 : int follow_pfn(struct vm_area_struct *vma, unsigned long address,
4752 : unsigned long *pfn)
4753 : {
4754 0 : int ret = -EINVAL;
4755 0 : spinlock_t *ptl;
4756 0 : pte_t *ptep;
4757 :
4758 0 : if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4759 : return ret;
4760 :
4761 0 : ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
4762 0 : if (ret)
4763 : return ret;
4764 0 : *pfn = pte_pfn(*ptep);
4765 0 : pte_unmap_unlock(ptep, ptl);
4766 0 : return 0;
4767 : }
4768 : EXPORT_SYMBOL(follow_pfn);
4769 :
4770 : #ifdef CONFIG_HAVE_IOREMAP_PROT
4771 0 : int follow_phys(struct vm_area_struct *vma,
4772 : unsigned long address, unsigned int flags,
4773 : unsigned long *prot, resource_size_t *phys)
4774 : {
4775 0 : int ret = -EINVAL;
4776 0 : pte_t *ptep, pte;
4777 0 : spinlock_t *ptl;
4778 :
4779 0 : if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4780 0 : goto out;
4781 :
4782 0 : if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
4783 0 : goto out;
4784 0 : pte = *ptep;
4785 :
4786 0 : if ((flags & FOLL_WRITE) && !pte_write(pte))
4787 0 : goto unlock;
4788 :
4789 0 : *prot = pgprot_val(pte_pgprot(pte));
4790 0 : *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
4791 :
4792 0 : ret = 0;
4793 0 : unlock:
4794 0 : pte_unmap_unlock(ptep, ptl);
4795 0 : out:
4796 0 : return ret;
4797 : }
4798 :
4799 : /**
4800 : * generic_access_phys - generic implementation for iomem mmap access
4801 : * @vma: the vma to access
4802 : * @addr: userspace address, not relative offset within @vma
4803 : * @buf: buffer to read/write
4804 : * @len: length of transfer
4805 : * @write: set to FOLL_WRITE when writing, otherwise reading
4806 : *
4807 : * This is a generic implementation for &vm_operations_struct.access for an
4808 : * iomem mapping. This callback is used by access_process_vm() when the @vma is
4809 : * not page based.
4810 : */
4811 0 : int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
4812 : void *buf, int len, int write)
4813 : {
4814 0 : resource_size_t phys_addr;
4815 0 : unsigned long prot = 0;
4816 0 : void __iomem *maddr;
4817 0 : pte_t *ptep, pte;
4818 0 : spinlock_t *ptl;
4819 0 : int offset = offset_in_page(addr);
4820 0 : int ret = -EINVAL;
4821 :
4822 0 : if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4823 : return -EINVAL;
4824 :
4825 0 : retry:
4826 0 : if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
4827 : return -EINVAL;
4828 0 : pte = *ptep;
4829 0 : pte_unmap_unlock(ptep, ptl);
4830 :
4831 0 : prot = pgprot_val(pte_pgprot(pte));
4832 0 : phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
4833 :
4834 0 : if ((write & FOLL_WRITE) && !pte_write(pte))
4835 : return -EINVAL;
4836 :
4837 0 : maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
4838 0 : if (!maddr)
4839 : return -ENOMEM;
4840 :
4841 0 : if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
4842 0 : goto out_unmap;
4843 :
4844 0 : if (!pte_same(pte, *ptep)) {
4845 0 : pte_unmap_unlock(ptep, ptl);
4846 0 : iounmap(maddr);
4847 :
4848 0 : goto retry;
4849 : }
4850 :
4851 0 : if (write)
4852 0 : memcpy_toio(maddr + offset, buf, len);
4853 : else
4854 0 : memcpy_fromio(buf, maddr + offset, len);
4855 0 : ret = len;
4856 0 : pte_unmap_unlock(ptep, ptl);
4857 0 : out_unmap:
4858 0 : iounmap(maddr);
4859 :
4860 0 : return ret;
4861 : }
4862 : EXPORT_SYMBOL_GPL(generic_access_phys);
4863 : #endif
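/*
 * Editorial sketch (not part of memory.c): drivers that mmap iomem can plug
 * generic_access_phys() into their vm_operations_struct so that
 * access_process_vm()/ptrace can still read such mappings; compare
 * mmap_mem_ops in drivers/char/mem.c. Sketch only.
 */
#if 0	/* example only */
static const struct vm_operations_struct example_iomem_vm_ops = {
#ifdef CONFIG_HAVE_IOREMAP_PROT
	.access = generic_access_phys,
#endif
};
#endif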
4864 :
4865 : /*
4866 : * Access another process' address space as given in mm.
4867 : */
4868 220 : int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
4869 : int len, unsigned int gup_flags)
4870 : {
4871 220 : struct vm_area_struct *vma;
4872 220 : void *old_buf = buf;
4873 220 : int write = gup_flags & FOLL_WRITE;
4874 :
4875 220 : if (mmap_read_lock_killable(mm))
4876 : return 0;
4877 :
4878 : /* ignore errors, just check how much was successfully transferred */
4879 440 : while (len) {
4880 220 : int bytes, ret, offset;
4881 220 : void *maddr;
4882 220 : struct page *page = NULL;
4883 :
4884 220 : ret = get_user_pages_remote(mm, addr, 1,
4885 : gup_flags, &page, &vma, NULL);
4886 220 : if (ret <= 0) {
4887 : #ifndef CONFIG_HAVE_IOREMAP_PROT
4888 : break;
4889 : #else
4890 : /*
4891 : * Check if this is a VM_IO | VM_PFNMAP VMA, which
4892 : * we can access using slightly different code.
4893 : */
4894 0 : vma = find_vma(mm, addr);
4895 0 : if (!vma || vma->vm_start > addr)
4896 : break;
4897 0 : if (vma->vm_ops && vma->vm_ops->access)
4898 0 : ret = vma->vm_ops->access(vma, addr, buf,
4899 : len, write);
4900 0 : if (ret <= 0)
4901 : break;
4902 : bytes = ret;
4903 : #endif
4904 : } else {
4905 220 : bytes = len;
4906 220 : offset = addr & (PAGE_SIZE-1);
4907 220 : if (bytes > PAGE_SIZE-offset)
4908 0 : bytes = PAGE_SIZE-offset;
4909 :
4910 220 : maddr = kmap(page);
4911 220 : if (write) {
4912 0 : copy_to_user_page(vma, page, addr,
4913 : maddr + offset, buf, bytes);
4914 0 : set_page_dirty_lock(page);
4915 : } else {
4916 220 : copy_from_user_page(vma, page, addr,
4917 : buf, maddr + offset, bytes);
4918 : }
4919 220 : kunmap(page);
4920 220 : put_page(page);
4921 : }
4922 220 : len -= bytes;
4923 220 : buf += bytes;
4924 220 : addr += bytes;
4925 : }
4926 220 : mmap_read_unlock(mm);
4927 :
4928 220 : return buf - old_buf;
4929 : }
4930 :
4931 : /**
4932 : * access_remote_vm - access another process' address space
4933 : * @mm: the mm_struct of the target address space
4934 : * @addr: start address to access
4935 : * @buf: source or destination buffer
4936 : * @len: number of bytes to transfer
4937 : * @gup_flags: flags modifying lookup behaviour
4938 : *
4939 : * The caller must hold a reference on @mm.
4940 : *
4941 : * Return: number of bytes copied from source to destination.
4942 : */
4943 220 : int access_remote_vm(struct mm_struct *mm, unsigned long addr,
4944 : void *buf, int len, unsigned int gup_flags)
4945 : {
4946 220 : return __access_remote_vm(mm, addr, buf, len, gup_flags);
4947 : }
4948 :
4949 : /*
4950 : * Access another process' address space.
4951 : * Source/target buffer must be in kernel space;
4952 : * do not walk the page tables directly, use get_user_pages().
4953 : */
4954 0 : int access_process_vm(struct task_struct *tsk, unsigned long addr,
4955 : void *buf, int len, unsigned int gup_flags)
4956 : {
4957 0 : struct mm_struct *mm;
4958 0 : int ret;
4959 :
4960 0 : mm = get_task_mm(tsk);
4961 0 : if (!mm)
4962 : return 0;
4963 :
4964 0 : ret = __access_remote_vm(mm, addr, buf, len, gup_flags);
4965 :
4966 0 : mmput(mm);
4967 :
4968 0 : return ret;
4969 : }
4970 : EXPORT_SYMBOL_GPL(access_process_vm);
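
/*
 * Illustrative sketch (not part of this file): this is roughly how a
 * ptrace-style debugger reads one word from a tracee via
 * access_process_vm(). FOLL_FORCE lets the tracer read mappings that are
 * not currently readable by the tracee itself. "example_peek_word" is a
 * hypothetical helper.
 */
static int example_peek_word(struct task_struct *tsk, unsigned long addr,
			     unsigned long *val)
{
	int copied;

	copied = access_process_vm(tsk, addr, val, sizeof(*val), FOLL_FORCE);
	return copied == sizeof(*val) ? 0 : -EIO;
}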
4971 :
4972 : /*
4973 : * Print the name of a VMA.
4974 : */
4975 0 : void print_vma_addr(char *prefix, unsigned long ip)
4976 : {
4977 0 : struct mm_struct *mm = current->mm;
4978 0 : struct vm_area_struct *vma;
4979 :
4980 : /*
4981 : * We might be running from an atomic context, so we cannot sleep.
4982 : */
4983 0 : if (!mmap_read_trylock(mm))
4984 : return;
4985 :
4986 0 : vma = find_vma(mm, ip);
4987 0 : if (vma && vma->vm_file) {
4988 0 : struct file *f = vma->vm_file;
4989 0 : char *buf = (char *)__get_free_page(GFP_NOWAIT);
4990 0 : if (buf) {
4991 0 : char *p;
4992 :
4993 0 : p = file_path(f, buf, PAGE_SIZE);
4994 0 : if (IS_ERR(p))
4995 0 : p = "?";
4996 0 : printk("%s%s[%lx+%lx]", prefix, kbasename(p),
4997 : vma->vm_start,
4998 0 : vma->vm_end - vma->vm_start);
4999 0 : free_page((unsigned long)buf);
5000 : }
5001 : }
5002 0 : mmap_read_unlock(mm);
5003 : }
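
/*
 * Illustrative sketch (not part of this file): architecture fault handlers
 * typically call print_vma_addr() when logging a user-space crash, e.g.
 * "segfault at ... in libfoo.so[7f12ab000+2000]". The helper below is a
 * hypothetical, simplified version of what arch code such as
 * arch/x86/mm/fault.c does.
 */
static void example_show_signal_msg(struct pt_regs *regs, unsigned long addr)
{
	printk("%s[%d]: segfault at %lx ip %lx",
	       current->comm, task_pid_nr(current), addr,
	       instruction_pointer(regs));
	print_vma_addr(KERN_CONT " in ", instruction_pointer(regs));
	printk(KERN_CONT "\n");
}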
5004 :
5005 : #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
5006 419805 : void __might_fault(const char *file, int line)
5007 : {
5008 : /*
5009 : * Some code (nfs/sunrpc) uses socket ops on kernel memory while
5010 : * holding the mmap_lock; this is safe because kernel memory doesn't
5011 : * get paged out, so we'll never actually fault, and the annotations
5012 : * below would generate false positives.
5013 : */
5014 419805 : if (uaccess_kernel())
5015 : return;
5016 419805 : if (pagefault_disabled())
5017 : return;
5018 418849 : __might_sleep(file, line, 0);
5019 : #if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
5020 418871 : if (current->mm)
5021 418871 : might_lock_read(&current->mm->mmap_lock);
5022 : #endif
5023 : }
5024 : EXPORT_SYMBOL(__might_fault);
5025 : #endif
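
/*
 * Illustrative sketch (not part of this file): user-copy helpers invoke
 * the might_fault() wrapper (which expands to __might_fault(__FILE__,
 * __LINE__) on debug kernels) before touching user memory, so "user
 * access while atomic" bugs are reported even when the page happens to
 * be resident and no fault actually occurs. "example_get_user_u32" is a
 * hypothetical helper.
 */
static int example_get_user_u32(u32 *dst, const u32 __user *src)
{
	might_fault();	/* may sleep: warn if called from atomic context */
	return copy_from_user(dst, src, sizeof(*dst)) ? -EFAULT : 0;
}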
5026 :
5027 : #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
5028 : /*
5029 : * Process all subpages of the specified huge page with the specified
5030 : * operation. The target subpage will be processed last to keep its
5031 : * cache lines hot.
5032 : */
5033 17 : static inline void process_huge_page(
5034 : unsigned long addr_hint, unsigned int pages_per_huge_page,
5035 : void (*process_subpage)(unsigned long addr, int idx, void *arg),
5036 : void *arg)
5037 : {
5038 17 : int i, n, base, l;
5039 17 : unsigned long addr = addr_hint &
5040 17 : ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
5041 :
5042 : /* Process target subpage last to keep its cache lines hot */
5043 17 : might_sleep();
5044 17 : n = (addr_hint - addr) / PAGE_SIZE;
5045 17 : if (2 * n <= pages_per_huge_page) {
5046 : /* If target subpage in first half of huge page */
5047 4 : base = 0;
5048 4 : l = n;
5049 : /* Process subpages at the end of huge page */
5050 2052 : for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
5051 2048 : cond_resched();
5052 2048 : process_subpage(addr + i * PAGE_SIZE, i, arg);
5053 : }
5054 : } else {
5055 : /* If target subpage in second half of huge page */
5056 13 : base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
5057 13 : l = pages_per_huge_page - n;
5058 : /* Process subpages at the beginning of the huge page */
5059 5473 : for (i = 0; i < base; i++) {
5060 5460 : cond_resched();
5061 5460 : process_subpage(addr + i * PAGE_SIZE, i, arg);
5062 : }
5063 : }
5064 : /*
5065 : * Process remaining subpages in left-right-left-right pattern
5066 : * towards the target subpage
5067 : */
5068 615 : for (i = 0; i < l; i++) {
5069 598 : int left_idx = base + i;
5070 598 : int right_idx = base + 2 * l - 1 - i;
5071 :
5072 598 : cond_resched();
5073 598 : process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
5074 598 : cond_resched();
5075 598 : process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
5076 : }
5077 17 : }
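
/*
 * Worked example of the ordering above (hypothetical numbers): with
 * pages_per_huge_page == 8 and the target subpage at index n == 2
 * (first half, so base == 0 and l == 2), the subpages are processed in
 * the order 7, 6, 5, 4, 0, 3, 1, 2: the tail of the huge page first,
 * then converging left-right towards the target, which is touched last
 * so its cache lines are still hot when the faulting task resumes.
 */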
5078 :
5079 0 : static void clear_gigantic_page(struct page *page,
5080 : unsigned long addr,
5081 : unsigned int pages_per_huge_page)
5082 : {
5083 0 : int i;
5084 0 : struct page *p = page;
5085 :
5086 0 : might_sleep();
5087 0 : for (i = 0; i < pages_per_huge_page;
5088 0 : i++, p = mem_map_next(p, page, i)) {
5089 0 : cond_resched();
5090 0 : clear_user_highpage(p, addr + i * PAGE_SIZE);
5091 : }
5092 0 : }
5093 :
5094 8704 : static void clear_subpage(unsigned long addr, int idx, void *arg)
5095 : {
5096 8704 : struct page *page = arg;
5097 :
5098 8704 : clear_user_highpage(page + idx, addr);
5099 8704 : }
5100 :
5101 17 : void clear_huge_page(struct page *page,
5102 : unsigned long addr_hint, unsigned int pages_per_huge_page)
5103 : {
5104 17 : unsigned long addr = addr_hint &
5105 17 : ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
5106 :
5107 17 : if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
5108 0 : clear_gigantic_page(page, addr, pages_per_huge_page);
5109 0 : return;
5110 : }
5111 :
5112 17 : process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
5113 : }
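
/*
 * Illustrative sketch (not part of this file): an anonymous THP fault
 * handler zeroes the freshly allocated huge page with the faulting
 * address as the hint, so the subpage the task actually touched is
 * cleared last. "example_prep_thp" and its arguments are hypothetical;
 * compare __do_huge_pmd_anonymous_page() in mm/huge_memory.c.
 */
static void example_prep_thp(struct page *page, unsigned long fault_addr)
{
	clear_huge_page(page, fault_addr, HPAGE_PMD_NR);
	__SetPageUptodate(page);
}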
5114 :
5115 0 : static void copy_user_gigantic_page(struct page *dst, struct page *src,
5116 : unsigned long addr,
5117 : struct vm_area_struct *vma,
5118 : unsigned int pages_per_huge_page)
5119 : {
5120 0 : int i;
5121 0 : struct page *dst_base = dst;
5122 0 : struct page *src_base = src;
5123 :
5124 0 : for (i = 0; i < pages_per_huge_page; ) {
5125 0 : cond_resched();
5126 0 : copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
5127 :
5128 0 : i++;
5129 0 : dst = mem_map_next(dst, dst_base, i);
5130 0 : src = mem_map_next(src, src_base, i);
5131 : }
5132 0 : }
5133 :
5134 : struct copy_subpage_arg {
5135 : struct page *dst;
5136 : struct page *src;
5137 : struct vm_area_struct *vma;
5138 : };
5139 :
5140 0 : static void copy_subpage(unsigned long addr, int idx, void *arg)
5141 : {
5142 0 : struct copy_subpage_arg *copy_arg = arg;
5143 :
5144 0 : copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
5145 : addr, copy_arg->vma);
5146 0 : }
5147 :
5148 0 : void copy_user_huge_page(struct page *dst, struct page *src,
5149 : unsigned long addr_hint, struct vm_area_struct *vma,
5150 : unsigned int pages_per_huge_page)
5151 : {
5152 0 : unsigned long addr = addr_hint &
5153 0 : ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
5154 0 : struct copy_subpage_arg arg = {
5155 : .dst = dst,
5156 : .src = src,
5157 : .vma = vma,
5158 : };
5159 :
5160 0 : if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
5161 0 : copy_user_gigantic_page(dst, src, addr, vma,
5162 : pages_per_huge_page);
5163 0 : return;
5164 : }
5165 :
5166 0 : process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
5167 : }
5168 :
5169 0 : long copy_huge_page_from_user(struct page *dst_page,
5170 : const void __user *usr_src,
5171 : unsigned int pages_per_huge_page,
5172 : bool allow_pagefault)
5173 : {
5174 0 : void *src = (void *)usr_src;
5175 0 : void *page_kaddr;
5176 0 : unsigned long i, rc = 0;
5177 0 : unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
5178 0 : struct page *subpage = dst_page;
5179 :
5180 0 : for (i = 0; i < pages_per_huge_page;
5181 0 : i++, subpage = mem_map_next(subpage, dst_page, i)) {
5182 0 : if (allow_pagefault)
5183 0 : page_kaddr = kmap(subpage);
5184 : else
5185 0 : page_kaddr = kmap_atomic(subpage);
5186 0 : rc = copy_from_user(page_kaddr,
5187 0 : (const void __user *)(src + i * PAGE_SIZE),
5188 : PAGE_SIZE);
5189 0 : if (allow_pagefault)
5190 0 : kunmap(subpage);
5191 : else
5192 0 : kunmap_atomic(page_kaddr);
5193 :
5194 0 : ret_val -= (PAGE_SIZE - rc);
5195 0 : if (rc)
5196 : break;
5197 :
5198 0 : cond_resched();
5199 : }
5200 0 : return ret_val;
5201 : }
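
/*
 * Illustrative sketch (not part of this file): copy_huge_page_from_user()
 * returns the number of bytes left uncopied, so callers such as the
 * userfaultfd UFFDIO_COPY path treat any non-zero return as -EFAULT.
 * "example_fill_huge_page" and "nr_subpages" are hypothetical.
 */
static int example_fill_huge_page(struct page *dst, const void __user *src,
				  unsigned int nr_subpages)
{
	if (copy_huge_page_from_user(dst, src, nr_subpages, true))
		return -EFAULT;
	return 0;
}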
5202 : #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
5203 :
5204 : #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
5205 :
5206 : static struct kmem_cache *page_ptl_cachep;
5207 :
5208 1 : void __init ptlock_cache_init(void)
5209 : {
5210 1 : page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
5211 : SLAB_PANIC, NULL);
5212 1 : }
5213 :
5214 28139 : bool ptlock_alloc(struct page *page)
5215 : {
5216 28139 : spinlock_t *ptl;
5217 :
5218 28139 : ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
5219 28139 : if (!ptl)
5220 : return false;
5221 28139 : page->ptl = ptl;
5222 28139 : return true;
5223 : }
5224 :
5225 27810 : void ptlock_free(struct page *page)
5226 : {
5227 27810 : kmem_cache_free(page_ptl_cachep, page->ptl);
5228 25835 : }
5229 : #endif
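
/*
 * Illustrative sketch (not part of this file): with split PTE locks and
 * ALLOC_SPLIT_PTLOCKS (spinlock_t too large to embed in struct page),
 * ptlock_alloc()/ptlock_free() above are driven by the page-table page
 * constructor and destructor, roughly like the simplified pair below;
 * the real helpers live in include/linux/mm.h.
 */
static inline bool example_pte_page_ctor(struct page *page)
{
	if (!ptlock_alloc(page))	/* page->ptl = kmem_cache_alloc(...) */
		return false;
	spin_lock_init(page->ptl);
	return true;
}

static inline void example_pte_page_dtor(struct page *page)
{
	ptlock_free(page);		/* kmem_cache_free(page->ptl) */
}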
|