LCOV - code coverage report
Current view: top level - mm - rmap.c (source / functions)
Test: landlock.info
Date: 2021-04-22 12:43:58
Coverage:   Lines: 287 / 698 (41.1 %)   Functions: 23 / 46 (50.0 %)

          Line data    Source code
       1             : /*
       2             :  * mm/rmap.c - physical to virtual reverse mappings
       3             :  *
       4             :  * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
       5             :  * Released under the General Public License (GPL).
       6             :  *
       7             :  * Simple, low overhead reverse mapping scheme.
       8             :  * Please try to keep this thing as modular as possible.
       9             :  *
      10             :  * Provides methods for unmapping each kind of mapped page:
      11             :  * the anon methods track anonymous pages, and
      12             :  * the file methods track pages belonging to an inode.
      13             :  *
      14             :  * Original design by Rik van Riel <riel@conectiva.com.br> 2001
      15             :  * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
      16             :  * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
      17             :  * Contributions by Hugh Dickins 2003, 2004
      18             :  */
      19             : 
      20             : /*
      21             :  * Lock ordering in mm:
      22             :  *
      23             :  * inode->i_mutex    (while writing or truncating, not reading or faulting)
      24             :  *   mm->mmap_lock
      25             :  *     page->flags PG_locked (lock_page)   * (see hugetlbfs below)
      26             :  *       hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
      27             :  *         mapping->i_mmap_rwsem
      28             :  *           hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
      29             :  *           anon_vma->rwsem
      30             :  *             mm->page_table_lock or pte_lock
      31             :  *               swap_lock (in swap_duplicate, swap_info_get)
      32             :  *                 mmlist_lock (in mmput, drain_mmlist and others)
      33             :  *                 mapping->private_lock (in __set_page_dirty_buffers)
      34             :  *                   lock_page_memcg move_lock (in __set_page_dirty_buffers)
      35             :  *                     i_pages lock (widely used)
      36             :  *                       lruvec->lru_lock (in lock_page_lruvec_irq)
      37             :  *                 inode->i_lock (in set_page_dirty's __mark_inode_dirty)
      38             :  *                 bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
      39             :  *                   sb_lock (within inode_lock in fs/fs-writeback.c)
      40             :  *                   i_pages lock (widely used, in set_page_dirty,
      41             :  *                             in arch-dependent flush_dcache_mmap_lock,
      42             :  *                             within bdi.wb->list_lock in __sync_single_inode)
      43             :  *
      44             :  * anon_vma->rwsem,mapping->i_mutex      (memory_failure, collect_procs_anon)
      45             :  *   ->tasklist_lock
      46             :  *     pte map lock
      47             :  *
      48             :  * * hugetlbfs PageHuge() pages take locks in this order:
      49             :  *         mapping->i_mmap_rwsem
      50             :  *           hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
      51             :  *             page->flags PG_locked (lock_page)
      52             :  */
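
The ordering above is easiest to read as nested acquisitions: each lock is taken only while everything above it in the list is already held. A minimal sketch, assuming a private file-backed vma that has both a file mapping and an anon_vma; the function name and the particular subset of locks are illustrative only and rely on the headers included below:

static void lock_ordering_sketch(struct vm_area_struct *vma, struct page *page)
{
        struct mm_struct *mm = vma->vm_mm;

        mmap_read_lock(mm);                             /* mm->mmap_lock */
        lock_page(page);                                /* page->flags PG_locked */
        i_mmap_lock_read(vma->vm_file->f_mapping);      /* mapping->i_mmap_rwsem */
        anon_vma_lock_read(vma->anon_vma);              /* anon_vma->rwsem */
        spin_lock(&mm->page_table_lock);                /* mm->page_table_lock */

        /* ... work that needs all of the above ... */

        spin_unlock(&mm->page_table_lock);
        anon_vma_unlock_read(vma->anon_vma);
        i_mmap_unlock_read(vma->vm_file->f_mapping);
        unlock_page(page);
        mmap_read_unlock(mm);
}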
      53             : 
      54             : #include <linux/mm.h>
      55             : #include <linux/sched/mm.h>
      56             : #include <linux/sched/task.h>
      57             : #include <linux/pagemap.h>
      58             : #include <linux/swap.h>
      59             : #include <linux/swapops.h>
      60             : #include <linux/slab.h>
      61             : #include <linux/init.h>
      62             : #include <linux/ksm.h>
      63             : #include <linux/rmap.h>
      64             : #include <linux/rcupdate.h>
      65             : #include <linux/export.h>
      66             : #include <linux/memcontrol.h>
      67             : #include <linux/mmu_notifier.h>
      68             : #include <linux/migrate.h>
      69             : #include <linux/hugetlb.h>
      70             : #include <linux/huge_mm.h>
      71             : #include <linux/backing-dev.h>
      72             : #include <linux/page_idle.h>
      73             : #include <linux/memremap.h>
      74             : #include <linux/userfaultfd_k.h>
      75             : 
      76             : #include <asm/tlbflush.h>
      77             : 
      78             : #include <trace/events/tlb.h>
      79             : 
      80             : #include "internal.h"
      81             : 
      82             : static struct kmem_cache *anon_vma_cachep;
      83             : static struct kmem_cache *anon_vma_chain_cachep;
      84             : 
      85       40833 : static inline struct anon_vma *anon_vma_alloc(void)
      86             : {
      87       40833 :         struct anon_vma *anon_vma;
      88             : 
      89       40833 :         anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
      90       40833 :         if (anon_vma) {
      91       40833 :                 atomic_set(&anon_vma->refcount, 1);
      92       40833 :                 anon_vma->degree = 1;        /* Reference for first vma */
      93       40833 :                 anon_vma->parent = anon_vma;
      94             :                 /*
      95             :                  * Initialise the anon_vma root to point to itself. If called
      96             :                  * from fork, the root will be reset to the parent's anon_vma.
      97             :                  */
      98       40833 :                 anon_vma->root = anon_vma;
      99             :         }
     100             : 
     101       40833 :         return anon_vma;
     102             : }
     103             : 
     104       39927 : static inline void anon_vma_free(struct anon_vma *anon_vma)
     105             : {
     106       39927 :         VM_BUG_ON(atomic_read(&anon_vma->refcount));
     107             : 
     108             :         /*
     109             :          * Synchronize against page_lock_anon_vma_read() such that
     110             :          * we can safely hold the lock without the anon_vma getting
     111             :          * freed.
     112             :          *
     113             :          * Relies on the full mb implied by the atomic_dec_and_test() from
     114             :          * put_anon_vma() against the acquire barrier implied by
     115             :          * down_read_trylock() from page_lock_anon_vma_read(). This orders:
     116             :          *
     117             :          * page_lock_anon_vma_read()    VS      put_anon_vma()
     118             :          *   down_read_trylock()                  atomic_dec_and_test()
     119             :          *   LOCK                                 MB
     120             :          *   atomic_read()                        rwsem_is_locked()
     121             :          *
     122             :          * LOCK should suffice since the actual taking of the lock must
     123             :          * happen _before_ what follows.
     124             :          */
     125       39927 :         might_sleep();
     126       39927 :         if (rwsem_is_locked(&anon_vma->root->rwsem)) {
     127          18 :                 anon_vma_lock_write(anon_vma);
     128          18 :                 anon_vma_unlock_write(anon_vma);
     129             :         }
     130             : 
     131       39926 :         kmem_cache_free(anon_vma_cachep, anon_vma);
     132       39925 : }
     133             : 
     134       82839 : static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
     135             : {
     136       82839 :         return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
     137             : }
     138             : 
     139       81411 : static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
     140             : {
     141       81411 :         kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
     142       41490 : }
     143             : 
     144       82840 : static void anon_vma_chain_link(struct vm_area_struct *vma,
     145             :                                 struct anon_vma_chain *avc,
     146             :                                 struct anon_vma *anon_vma)
     147             : {
     148       82840 :         avc->vma = vma;
     149       82840 :         avc->anon_vma = anon_vma;
     150       82840 :         list_add(&avc->same_vma, &vma->anon_vma_chain);
     151       82840 :         anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
     152             : }
     153             : 
     154             : /**
     155             :  * __anon_vma_prepare - attach an anon_vma to a memory region
     156             :  * @vma: the memory region in question
     157             :  *
     158             :  * This makes sure the memory mapping described by 'vma' has
     159             :  * an 'anon_vma' attached to it, so that we can associate the
     160             :  * anonymous pages mapped into it with that anon_vma.
     161             :  *
     162             :  * The common case will be that we already have one, which
     163             :  * is handled inline by anon_vma_prepare(). But if
     164             :  * not we either need to find an adjacent mapping that we
     165             :  * can re-use the anon_vma from (very common when the only
     166             :  * reason for splitting a vma has been mprotect()), or we
     167             :  * allocate a new one.
     168             :  *
     169             :  * Anon-vma allocations are very subtle, because we may have
     170             :  * optimistically looked up an anon_vma in page_lock_anon_vma_read()
     171             :  * and that may actually touch the rwsem even in the newly
     172             :  * allocated vma (it depends on RCU to make sure that the
     173             :  * anon_vma isn't actually destroyed).
     174             :  *
     175             :  * As a result, we need to do proper anon_vma locking even
     176             :  * for the new allocation. At the same time, we do not want
     177             :  * to do any locking for the common case of already having
     178             :  * an anon_vma.
     179             :  *
     180             :  * This must be called with the mmap_lock held for reading.
     181             :  */
     182       13582 : int __anon_vma_prepare(struct vm_area_struct *vma)
     183             : {
     184       13582 :         struct mm_struct *mm = vma->vm_mm;
     185       13582 :         struct anon_vma *anon_vma, *allocated;
     186       13582 :         struct anon_vma_chain *avc;
     187             : 
     188       13582 :         might_sleep();
     189             : 
     190       13582 :         avc = anon_vma_chain_alloc(GFP_KERNEL);
     191       13582 :         if (!avc)
     192           0 :                 goto out_enomem;
     193             : 
     194       13582 :         anon_vma = find_mergeable_anon_vma(vma);
     195       13582 :         allocated = NULL;
     196       13582 :         if (!anon_vma) {
     197       13582 :                 anon_vma = anon_vma_alloc();
     198       13582 :                 if (unlikely(!anon_vma))
     199           0 :                         goto out_enomem_free_avc;
     200             :                 allocated = anon_vma;
     201             :         }
     202             : 
     203       13582 :         anon_vma_lock_write(anon_vma);
     204             :         /* page_table_lock to protect against threads */
     205       13582 :         spin_lock(&mm->page_table_lock);
     206       13582 :         if (likely(!vma->anon_vma)) {
     207       13582 :                 vma->anon_vma = anon_vma;
     208       13582 :                 anon_vma_chain_link(vma, avc, anon_vma);
     209             :                 /* vma reference or self-parent link for new root */
     210       13582 :                 anon_vma->degree++;
     211       13582 :                 allocated = NULL;
     212       13582 :                 avc = NULL;
     213             :         }
     214       13582 :         spin_unlock(&mm->page_table_lock);
     215       13582 :         anon_vma_unlock_write(anon_vma);
     216             : 
     217       13582 :         if (unlikely(allocated))
     218           0 :                 put_anon_vma(allocated);
     219       13582 :         if (unlikely(avc))
     220           0 :                 anon_vma_chain_free(avc);
     221             : 
     222             :         return 0;
     223             : 
     224           0 :  out_enomem_free_avc:
     225           0 :         anon_vma_chain_free(avc);
     226             :  out_enomem:
     227             :         return -ENOMEM;
     228             : }
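
A hedged usage sketch: callers normally go through the anon_vma_prepare() wrapper in include/linux/rmap.h, which handles the common already-attached case inline and only drops into __anon_vma_prepare() when vma->anon_vma is still NULL. The function name below is hypothetical; the pattern is what a fault path would do:

static vm_fault_t fault_path_sketch(struct vm_fault *vmf)
{
        /* The fault path already holds mmap_lock for reading. */
        if (unlikely(anon_vma_prepare(vmf->vma)))
                return VM_FAULT_OOM;    /* -ENOMEM from the slow path */

        /* ... allocate the page, then page_add_new_anon_rmap() ... */
        return 0;
}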
     229             : 
     230             : /*
     231             :  * This is a useful helper function for locking the anon_vma root as
     232             :  * we traverse the vma->anon_vma_chain, looping over anon_vma's that
     233             :  * have the same vma.
     234             :  *
     235             :  * Such anon_vmas should have the same root, so you'd expect to see
     236             :  * just a single lock of the root rwsem for the whole traversal.
     237             :  */
     238      123426 : static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
     239             : {
     240      123426 :         struct anon_vma *new_root = anon_vma->root;
     241      123426 :         if (new_root != root) {
     242       79238 :                 if (WARN_ON_ONCE(root))
     243           0 :                         up_write(&root->rwsem);
     244       79238 :                 root = new_root;
     245       79238 :                 down_write(&root->rwsem);
     246             :         }
     247      123421 :         return root;
     248             : }
     249             : 
     250      168391 : static inline void unlock_anon_vma_root(struct anon_vma *root)
     251             : {
     252      168391 :         if (root)
     253       79235 :                 up_write(&root->rwsem);
     254             : }
     255             : 
     256             : /*
     257             :  * Attach the anon_vmas from src to dst.
     258             :  * Returns 0 on success, -ENOMEM on failure.
     259             :  *
     260             :  * anon_vma_clone() is called by __vma_adjust(), __split_vma(), copy_vma() and
     261             :  * anon_vma_fork(). The first three want an exact copy of src, while the last
     262             :  * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent
     263             :  * endless growth of anon_vma. Since dst->anon_vma is set to NULL before the
     264             :  * call, we can identify this case by checking (!dst->anon_vma && src->anon_vma).
     265             :  *
     266             :  * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
     267             :  * and reuse existing anon_vma which has no vmas and only one child anon_vma.
     268             :  * This prevents the anon_vma hierarchy from degrading into an endless linear
     269             :  * chain when a task forks repeatedly. On the other hand, an anon_vma with
     270             :  * more than one child is not reused even if it has no live vma, so the rmap
     271             :  * walker has a good chance of avoiding a scan of the whole hierarchy when it
     272             :  * searches for where a page is mapped.
     273             :  */
     274       52824 : int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
     275             : {
     276       52824 :         struct anon_vma_chain *avc, *pavc;
     277       52824 :         struct anon_vma *root = NULL;
     278             : 
     279       94830 :         list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
     280       42006 :                 struct anon_vma *anon_vma;
     281             : 
     282       42006 :                 avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
     283       42007 :                 if (unlikely(!avc)) {
     284           0 :                         unlock_anon_vma_root(root);
     285           0 :                         root = NULL;
     286           0 :                         avc = anon_vma_chain_alloc(GFP_KERNEL);
     287           0 :                         if (!avc)
     288           0 :                                 goto enomem_failure;
     289             :                 }
     290       42007 :                 anon_vma = pavc->anon_vma;
     291       42007 :                 root = lock_anon_vma_root(root, anon_vma);
     292       42007 :                 anon_vma_chain_link(dst, avc, anon_vma);
     293             : 
     294             :                 /*
     295             :                  * Reuse an existing anon_vma if its degree is lower than two,
     296             :                  * which means it has no vma and only one anon_vma child.
     297             :                  *
     298             :                  * Do not choose the parent anon_vma, otherwise the first child
     299             :                  * will always reuse it. The root anon_vma is never reused:
     300             :                  * it has a self-parent reference and at least one child.
     301             :                  */
     302       42006 :                 if (!dst->anon_vma && src->anon_vma &&
     303        8603 :                     anon_vma != src->anon_vma && anon_vma->degree < 2)
     304           0 :                         dst->anon_vma = anon_vma;
     305             :         }
     306       52824 :         if (dst->anon_vma)
     307        6152 :                 dst->anon_vma->degree++;
     308       52824 :         unlock_anon_vma_root(root);
     309             :         return 0;
     310             : 
     311           0 :  enomem_failure:
     312             :         /*
     313             :          * dst->anon_vma is dropped here otherwise its degree can be incorrectly
     314             :          * decremented in unlink_anon_vmas().
     315             :          * We can safely do this because callers of anon_vma_clone() don't care
     316             :          * about dst->anon_vma if anon_vma_clone() failed.
     317             :          */
     318           0 :         dst->anon_vma = NULL;
     319           0 :         unlink_anon_vmas(dst);
     320           0 :         return -ENOMEM;
     321             : }
     322             : 
     323             : /*
     324             :  * Attach vma to its own anon_vma, as well as to the anon_vmas that
     325             :  * the corresponding VMA in the parent process is attached to.
     326             :  * Returns 0 on success, non-zero on failure.
     327             :  */
     328       60180 : int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
     329             : {
     330       60180 :         struct anon_vma_chain *avc;
     331       60180 :         struct anon_vma *anon_vma;
     332       60180 :         int error;
     333             : 
     334             :         /* Don't bother if the parent process has no anon_vma here. */
     335       60180 :         if (!pvma->anon_vma)
     336             :                 return 0;
     337             : 
     338             :         /* Drop inherited anon_vma, we'll reuse existing or allocate new. */
     339       27251 :         vma->anon_vma = NULL;
     340             : 
     341             :         /*
     342             :          * First, attach the new VMA to the parent VMA's anon_vmas,
     343             :          * so rmap can find non-COWed pages in child processes.
     344             :          */
     345       27251 :         error = anon_vma_clone(vma, pvma);
     346       27251 :         if (error)
     347             :                 return error;
     348             : 
     349             :         /* An existing anon_vma has been reused, all done then. */
     350       27251 :         if (vma->anon_vma)
     351             :                 return 0;
     352             : 
     353             :         /* Then add our own anon_vma. */
     354       27251 :         anon_vma = anon_vma_alloc();
     355       27251 :         if (!anon_vma)
     356           0 :                 goto out_error;
     357       27251 :         avc = anon_vma_chain_alloc(GFP_KERNEL);
     358       27251 :         if (!avc)
     359           0 :                 goto out_error_free_anon_vma;
     360             : 
     361             :         /*
     362             :          * The root anon_vma's rwsem is the lock actually used when we
     363             :          * lock any of the anon_vmas in this anon_vma tree.
     364             :          */
     365       27251 :         anon_vma->root = pvma->anon_vma->root;
     366       27251 :         anon_vma->parent = pvma->anon_vma;
     367             :         /*
     368             :          * With refcounts, an anon_vma can stay around longer than the
     369             :          * process it belongs to. The root anon_vma needs to be pinned until
     370             :          * this anon_vma is freed, because the lock lives in the root.
     371             :          */
     372       27251 :         get_anon_vma(anon_vma->root);
     373             :         /* Mark this anon_vma as the one where our new (COWed) pages go. */
     374       27251 :         vma->anon_vma = anon_vma;
     375       27251 :         anon_vma_lock_write(anon_vma);
     376       27251 :         anon_vma_chain_link(vma, avc, anon_vma);
     377       27251 :         anon_vma->parent->degree++;
     378       27251 :         anon_vma_unlock_write(anon_vma);
     379             : 
     380       27251 :         return 0;
     381             : 
     382           0 :  out_error_free_anon_vma:
     383           0 :         put_anon_vma(anon_vma);
     384           0 :  out_error:
     385           0 :         unlink_anon_vmas(vma);
     386           0 :         return -ENOMEM;
     387             : }
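
A hedged sketch of the fork-time caller, loosely modelled on dup_mmap() in kernel/fork.c: each vma is duplicated first, then attached both to the parent's anon_vma hierarchy and to a fresh anon_vma of its own. The helper name is illustrative:

static int dup_one_vma_sketch(struct vm_area_struct *parent_vma)
{
        struct vm_area_struct *child_vma = vm_area_dup(parent_vma);

        if (!child_vma)
                return -ENOMEM;
        if (anon_vma_fork(child_vma, parent_vma)) {
                vm_area_free(child_vma);
                return -ENOMEM;
        }
        /* ... link child_vma into the child's mm ... */
        return 0;
}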
     388             : 
     389      115566 : void unlink_anon_vmas(struct vm_area_struct *vma)
     390             : {
     391      115566 :         struct anon_vma_chain *avc, *next;
     392      115566 :         struct anon_vma *root = NULL;
     393             : 
     394             :         /*
     395             :          * Unlink each anon_vma chained to the VMA.  This list is ordered
     396             :          * from newest to oldest, ensuring the root anon_vma gets freed last.
     397             :          */
     398      196970 :         list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
     399       81403 :                 struct anon_vma *anon_vma = avc->anon_vma;
     400             : 
     401       81403 :                 root = lock_anon_vma_root(root, anon_vma);
     402       81404 :                 anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
     403             : 
     404             :                 /*
     405             :                  * Leave empty anon_vmas on the list - we'll need
     406             :                  * to free them outside the lock.
     407             :                  */
     408       81411 :                 if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
     409       39914 :                         anon_vma->parent->degree--;
     410       39914 :                         continue;
     411             :                 }
     412             : 
     413       41497 :                 list_del(&avc->same_vma);
     414      122901 :                 anon_vma_chain_free(avc);
     415             :         }
     416      115567 :         if (vma->anon_vma) {
     417       45831 :                 vma->anon_vma->degree--;
     418             : 
     419             :                 /*
     420             :                  * vma may still be needed after unlink, and the anon_vma
     421             :                  * will be prepared again when a fault is handled.
     422             :                  */
     423       45831 :                 vma->anon_vma = NULL;
     424             :         }
     425      115567 :         unlock_anon_vma_root(root);
     426             : 
     427             :         /*
     428             :          * Iterate the list once more, it now only contains empty and unlinked
     429             :          * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
     430             :          * needing to write-acquire the anon_vma->root->rwsem.
     431             :          */
     432      155474 :         list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
     433       39915 :                 struct anon_vma *anon_vma = avc->anon_vma;
     434             : 
     435       39915 :                 VM_WARN_ON(anon_vma->degree);
     436       39915 :                 put_anon_vma(anon_vma);
     437             : 
     438       39914 :                 list_del(&avc->same_vma);
     439       39914 :                 anon_vma_chain_free(avc);
     440             :         }
     441      115559 : }
     442             : 
     443        4464 : static void anon_vma_ctor(void *data)
     444             : {
     445        4464 :         struct anon_vma *anon_vma = data;
     446             : 
     447        4464 :         init_rwsem(&anon_vma->rwsem);
     448        4464 :         atomic_set(&anon_vma->refcount, 0);
     449        4463 :         anon_vma->rb_root = RB_ROOT_CACHED;
     450        4463 : }
     451             : 
     452           1 : void __init anon_vma_init(void)
     453             : {
     454           1 :         anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
     455             :                         0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
     456             :                         anon_vma_ctor);
     457           1 :         anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
     458             :                         SLAB_PANIC|SLAB_ACCOUNT);
     459           1 : }
     460             : 
     461             : /*
     462             :  * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
     463             :  *
     464             :  * Since there is no serialization whatsoever against page_remove_rmap(),
     465             :  * the best this function can do is return an anon_vma with an elevated
     466             :  * refcount that might have been relevant to this page.
     467             :  *
     468             :  * The page might have been remapped to a different anon_vma or the anon_vma
     469             :  * returned may already be freed (and even reused).
     470             :  *
     471             :  * In case it was remapped to a different anon_vma, the new anon_vma will be a
     472             :  * child of the old anon_vma, and the anon_vma lifetime rules will therefore
     473             :  * ensure that any anon_vma obtained from the page will still be valid for as
     474             :  * long as we observe page_mapped() [ hence all those page_mapped() tests ].
     475             :  *
     476             :  * All users of this function must be very careful when walking the anon_vma
     477             :  * chain and verify that the page in question is indeed mapped in it
     478             :  * [ something equivalent to page_mapped_in_vma() ].
     479             :  *
     480             :  * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
     481             :  * page_remove_rmap() that the anon_vma pointer from page->mapping is valid
     482             :  * if there is a mapcount, we can dereference the anon_vma after observing
     483             :  * those.
     484             :  */
     485           0 : struct anon_vma *page_get_anon_vma(struct page *page)
     486             : {
     487           0 :         struct anon_vma *anon_vma = NULL;
     488           0 :         unsigned long anon_mapping;
     489             : 
     490           0 :         rcu_read_lock();
     491           0 :         anon_mapping = (unsigned long)READ_ONCE(page->mapping);
     492           0 :         if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
     493           0 :                 goto out;
     494           0 :         if (!page_mapped(page))
     495           0 :                 goto out;
     496             : 
     497           0 :         anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
     498           0 :         if (!atomic_inc_not_zero(&anon_vma->refcount)) {
     499           0 :                 anon_vma = NULL;
     500           0 :                 goto out;
     501             :         }
     502             : 
     503             :         /*
     504             :          * If this page is still mapped, then its anon_vma cannot have been
     505             :          * freed.  But if it has been unmapped, we have no security against the
     506             :          * anon_vma structure being freed and reused (for another anon_vma:
     507             :          * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
     508             :          * above cannot corrupt).
     509             :          */
     510           0 :         if (!page_mapped(page)) {
     511           0 :                 rcu_read_unlock();
     512           0 :                 put_anon_vma(anon_vma);
     513           0 :                 return NULL;
     514             :         }
     515           0 : out:
     516           0 :         rcu_read_unlock();
     517             : 
     518           0 :         return anon_vma;
     519             : }
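
A hedged usage sketch, loosely modelled on page migration: the reference is taken while the page is locked and still mapped, and must be dropped with put_anon_vma(). The function name is illustrative:

static void pin_anon_vma_sketch(struct page *page)
{
        struct anon_vma *anon_vma;

        lock_page(page);
        anon_vma = page_get_anon_vma(page);     /* may return NULL */
        if (anon_vma) {
                /* ... safe to use anon_vma while the reference is held ... */
                put_anon_vma(anon_vma);
        }
        unlock_page(page);
}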
     520             : 
     521             : /*
     522             :  * Similar to page_get_anon_vma() except it locks the anon_vma.
     523             :  *
     524             :  * It's a little more complex as it tries to keep the fast path to a single
     525             :  * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
     526             :  * reference like with page_get_anon_vma() and then block on the mutex.
     527             :  */
     528           0 : struct anon_vma *page_lock_anon_vma_read(struct page *page)
     529             : {
     530           0 :         struct anon_vma *anon_vma = NULL;
     531           0 :         struct anon_vma *root_anon_vma;
     532           0 :         unsigned long anon_mapping;
     533             : 
     534           0 :         rcu_read_lock();
     535           0 :         anon_mapping = (unsigned long)READ_ONCE(page->mapping);
     536           0 :         if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
     537           0 :                 goto out;
     538           0 :         if (!page_mapped(page))
     539           0 :                 goto out;
     540             : 
     541           0 :         anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
     542           0 :         root_anon_vma = READ_ONCE(anon_vma->root);
     543           0 :         if (down_read_trylock(&root_anon_vma->rwsem)) {
     544             :                 /*
     545             :                  * If the page is still mapped, then this anon_vma is still
     546             :                  * its anon_vma, and holding the mutex ensures that it will
     547             :                  * not go away, see anon_vma_free().
     548             :                  */
     549           0 :                 if (!page_mapped(page)) {
     550           0 :                         up_read(&root_anon_vma->rwsem);
     551           0 :                         anon_vma = NULL;
     552             :                 }
     553           0 :                 goto out;
     554             :         }
     555             : 
     556             :         /* trylock failed, we got to sleep */
     557           0 :         if (!atomic_inc_not_zero(&anon_vma->refcount)) {
     558           0 :                 anon_vma = NULL;
     559           0 :                 goto out;
     560             :         }
     561             : 
     562           0 :         if (!page_mapped(page)) {
     563           0 :                 rcu_read_unlock();
     564           0 :                 put_anon_vma(anon_vma);
     565           0 :                 return NULL;
     566             :         }
     567             : 
     568             :         /* we pinned the anon_vma, it's safe to sleep */
     569           0 :         rcu_read_unlock();
     570           0 :         anon_vma_lock_read(anon_vma);
     571             : 
     572           0 :         if (atomic_dec_and_test(&anon_vma->refcount)) {
     573             :                 /*
     574             :                  * Oops, we held the last refcount, release the lock
     575             :                  * and bail -- can't simply use put_anon_vma() because
     576             :                  * we'll deadlock on the anon_vma_lock_write() recursion.
     577             :                  */
     578           0 :                 anon_vma_unlock_read(anon_vma);
     579           0 :                 __put_anon_vma(anon_vma);
     580           0 :                 anon_vma = NULL;
     581             :         }
     582             : 
     583             :         return anon_vma;
     584             : 
     585           0 : out:
     586           0 :         rcu_read_unlock();
     587           0 :         return anon_vma;
     588             : }
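
A hedged usage sketch of the locked variant, as an rmap walker would use it: the return value can be NULL if the page was unmapped in the meantime, and the lock is dropped with page_unlock_anon_vma_read(). The function name is illustrative:

static void anon_walk_sketch(struct page *page)
{
        struct anon_vma *anon_vma = page_lock_anon_vma_read(page);

        if (!anon_vma)
                return;
        /* ... anon_vma->rb_root can be searched safely here ... */
        page_unlock_anon_vma_read(anon_vma);
}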
     589             : 
     590           0 : void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
     591             : {
     592           0 :         anon_vma_unlock_read(anon_vma);
     593           0 : }
     594             : 
     595             : #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
     596             : /*
     597             :  * Flush TLB entries for recently unmapped pages from remote CPUs. It is
     598             :  * important that, if a PTE was dirty when it was unmapped, it is flushed
     599             :  * before any IO is initiated on the page, to prevent lost writes. Similarly,
     600             :  * it must be flushed before the page is freed, to prevent data leakage.
     601             :  */
     602           0 : void try_to_unmap_flush(void)
     603             : {
     604           0 :         struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
     605             : 
     606           0 :         if (!tlb_ubc->flush_required)
     607             :                 return;
     608             : 
     609           0 :         arch_tlbbatch_flush(&tlb_ubc->arch);
     610           0 :         tlb_ubc->flush_required = false;
     611           0 :         tlb_ubc->writable = false;
     612             : }
     613             : 
     614             : /* Flush iff there are potentially writable TLB entries that can race with IO */
     615           0 : void try_to_unmap_flush_dirty(void)
     616             : {
     617           0 :         struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
     618             : 
     619           0 :         if (tlb_ubc->writable)
     620           0 :                 try_to_unmap_flush();
     621           0 : }
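
A hedged sketch of how reclaim uses the batching described above (loosely modelled on mm/vmscan.c; the function name is illustrative): pages are unmapped with TTU_BATCH_FLUSH so per-page IPIs are deferred, and one flush runs before any of them is written out or freed:

static void reclaim_batch_sketch(struct list_head *page_list)
{
        struct page *page;

        /* Defer the TLB shootdown IPIs while unmapping the whole batch. */
        list_for_each_entry(page, page_list, lru)
                try_to_unmap(page, TTU_BATCH_FLUSH);

        /*
         * Flush once for the batch before the pages are written out or
         * freed; try_to_unmap_flush_dirty() is the variant to use when
         * only IO on potentially dirty pages is about to start.
         */
        try_to_unmap_flush();
}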
     622             : 
     623           0 : static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
     624             : {
     625           0 :         struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
     626             : 
     627           0 :         arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
     628           0 :         tlb_ubc->flush_required = true;
     629             : 
     630             :         /*
     631             :          * Ensure compiler does not re-order the setting of tlb_flush_batched
     632             :          * before the PTE is cleared.
     633             :          */
     634           0 :         barrier();
     635           0 :         mm->tlb_flush_batched = true;
     636             : 
     637             :         /*
     638             :          * If the PTE was dirty then it's best to assume it's writable. The
     639             :          * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
     640             :          * before the page is queued for IO.
     641             :          */
     642           0 :         if (writable)
     643           0 :                 tlb_ubc->writable = true;
     644           0 : }
     645             : 
     646             : /*
     647             :  * Returns true if the TLB flush should be deferred to the end of a batch of
     648             :  * unmap operations to reduce IPIs.
     649             :  */
     650           0 : static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
     651             : {
     652           0 :         bool should_defer = false;
     653             : 
     654           0 :         if (!(flags & TTU_BATCH_FLUSH))
     655             :                 return false;
     656             : 
     657             :         /* If remote CPUs need to be flushed then defer the flush by batching it */
     658           0 :         if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
     659           0 :                 should_defer = true;
     660           0 :         put_cpu();
     661             : 
     662           0 :         return should_defer;
     663             : }
     664             : 
     665             : /*
     666             :  * Reclaim unmaps pages under the PTL but do not flush the TLB prior to
     667             :  * releasing the PTL if TLB flushes are batched. It's possible for a parallel
     668             :  * operation such as mprotect or munmap to race between reclaim unmapping
     669             :  * the page and flushing the page. If this race occurs, it potentially allows
     670             :  * access to data via a stale TLB entry. Tracking all mm's that have TLB
     671             :  * batching in flight would be expensive during reclaim so instead track
     672             :  * whether TLB batching occurred in the past and if so then do a flush here
     673             :  * if required. This will cost one additional flush per reclaim cycle paid
     674             :  * by the first operation at risk such as mprotect and munmap.
     675             :  *
     676             :  * This must be called under the PTL so that an access to tlb_flush_batched
     677             :  * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
     678             :  * via the PTL.
     679             :  */
     680      128052 : void flush_tlb_batched_pending(struct mm_struct *mm)
     681             : {
     682      128052 :         if (data_race(mm->tlb_flush_batched)) {
     683           0 :                 flush_tlb_mm(mm);
     684             : 
     685             :                 /*
     686             :                  * Do not allow the compiler to re-order the clearing of
     687             :                  * tlb_flush_batched before the tlb is flushed.
     688             :                  */
     689           0 :                 barrier();
     690           0 :                 mm->tlb_flush_batched = false;
     691             :         }
     692      128052 : }
     693             : #else
     694             : static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
     695             : {
     696             : }
     697             : 
     698             : static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
     699             : {
     700             :         return false;
     701             : }
     702             : #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
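
A hedged sketch of the consumer side described above flush_tlb_batched_pending(): a PTE-changing path in the mprotect/munmap family takes the PTE lock and flushes any pending batched TLB state before it inspects or modifies the PTEs. The function name is illustrative; the helpers are existing mm APIs:

static void change_ptes_sketch(struct vm_area_struct *vma, pmd_t *pmd,
                               unsigned long addr)
{
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *ptl;
        pte_t *pte;

        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        flush_tlb_batched_pending(mm);  /* must run under the PTL */

        /* ... read or rewrite *pte without racing a batched reclaim flush ... */

        pte_unmap_unlock(pte, ptl);
}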
     703             : 
     704             : /*
     705             :  * At what user virtual address is page expected in vma?
     706             :  * Caller should check the page is actually part of the vma.
     707             :  */
     708           0 : unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
     709             : {
     710           0 :         unsigned long address;
     711           0 :         if (PageAnon(page)) {
     712           0 :                 struct anon_vma *page__anon_vma = page_anon_vma(page);
     713             :                 /*
     714             :                  * Note: swapoff's unuse_vma() is more efficient with this
     715             :                  * check, and needs it to match anon_vma when KSM is active.
     716             :                  */
     717           0 :                 if (!vma->anon_vma || !page__anon_vma ||
     718           0 :                     vma->anon_vma->root != page__anon_vma->root)
     719             :                         return -EFAULT;
     720           0 :         } else if (page->mapping) {
     721           0 :                 if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping)
     722             :                         return -EFAULT;
     723             :         } else
     724             :                 return -EFAULT;
     725           0 :         address = __vma_address(page, vma);
     726           0 :         if (unlikely(address < vma->vm_start || address >= vma->vm_end))
     727           0 :                 return -EFAULT;
     728             :         return address;
     729             : }
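
A hedged usage sketch: on failure the function returns -EFAULT cast to unsigned long, so callers compare against -EFAULT rather than testing for zero. The wrapper name is illustrative:

static bool page_in_vma_sketch(struct page *page, struct vm_area_struct *vma)
{
        unsigned long addr = page_address_in_vma(page, vma);

        return addr != -EFAULT;
}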
     730             : 
     731          28 : pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
     732             : {
     733          28 :         pgd_t *pgd;
     734          28 :         p4d_t *p4d;
     735          28 :         pud_t *pud;
     736          28 :         pmd_t *pmd = NULL;
     737          28 :         pmd_t pmde;
     738             : 
     739          28 :         pgd = pgd_offset(mm, address);
     740          28 :         if (!pgd_present(*pgd))
     741             :                 goto out;
     742             : 
     743          28 :         p4d = p4d_offset(pgd, address);
     744          28 :         if (!p4d_present(*p4d))
     745           0 :                 goto out;
     746             : 
     747          28 :         pud = pud_offset(p4d, address);
     748          56 :         if (!pud_present(*pud))
     749           0 :                 goto out;
     750             : 
     751          28 :         pmd = pmd_offset(pud, address);
     752             :         /*
     753             :          * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
     754             :          * without holding anon_vma lock for write.  So when looking for a
     755             :          * genuine pmde (in which to find pte), test present and !THP together.
     756             :          */
     757          28 :         pmde = *pmd;
     758          28 :         barrier();
     759          56 :         if (!pmd_present(pmde) || pmd_trans_huge(pmde))
     760             :                 pmd = NULL;
     761           6 : out:
     762          28 :         return pmd;
     763             : }
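
A hedged usage sketch: once mm_find_pmd() has returned a present, non-THP pmd, the pte beneath it can be mapped and locked in the usual way. The function name is illustrative:

static void inspect_pte_sketch(struct mm_struct *mm, unsigned long address)
{
        pmd_t *pmd = mm_find_pmd(mm, address);
        spinlock_t *ptl;
        pte_t *pte;

        if (!pmd)
                return;
        pte = pte_offset_map_lock(mm, pmd, address, &ptl);
        /* ... examine or modify *pte under the PTE lock ... */
        pte_unmap_unlock(pte, ptl);
}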
     764             : 
     765             : struct page_referenced_arg {
     766             :         int mapcount;
     767             :         int referenced;
     768             :         unsigned long vm_flags;
     769             :         struct mem_cgroup *memcg;
     770             : };
     771             : /*
     772             :  * arg: page_referenced_arg will be passed
     773             :  */
     774           0 : static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
     775             :                         unsigned long address, void *arg)
     776             : {
     777           0 :         struct page_referenced_arg *pra = arg;
     778           0 :         struct page_vma_mapped_walk pvmw = {
     779             :                 .page = page,
     780             :                 .vma = vma,
     781             :                 .address = address,
     782             :         };
     783           0 :         int referenced = 0;
     784             : 
     785           0 :         while (page_vma_mapped_walk(&pvmw)) {
     786           0 :                 address = pvmw.address;
     787             : 
     788           0 :                 if (vma->vm_flags & VM_LOCKED) {
     789           0 :                         page_vma_mapped_walk_done(&pvmw);
     790           0 :                         pra->vm_flags |= VM_LOCKED;
     791           0 :                         return false; /* To break the loop */
     792             :                 }
     793             : 
     794           0 :                 if (pvmw.pte) {
     795           0 :                         if (ptep_clear_flush_young_notify(vma, address,
     796             :                                                 pvmw.pte)) {
     797             :                                 /*
     798             :                                  * Don't treat a reference through
     799             :                                  * a sequentially read mapping as such.
     800             :                                  * If the page has been used in another mapping,
     801             :                                  * we will catch it; if this other mapping is
     802             :                                  * already gone, the unmap path will have set
     803             :                                  * PG_referenced or activated the page.
     804             :                                  */
     805           0 :                                 if (likely(!(vma->vm_flags & VM_SEQ_READ)))
     806           0 :                                         referenced++;
     807             :                         }
     808           0 :                 } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
     809           0 :                         if (pmdp_clear_flush_young_notify(vma, address,
     810             :                                                 pvmw.pmd))
     811           0 :                                 referenced++;
     812             :                 } else {
     813             :                         /* unexpected pmd-mapped page? */
     814             :                         WARN_ON_ONCE(1);
     815             :                 }
     816             : 
     817           0 :                 pra->mapcount--;
     818             :         }
     819             : 
     820           0 :         if (referenced)
     821           0 :                 clear_page_idle(page);
     822           0 :         if (test_and_clear_page_young(page))
     823             :                 referenced++;
     824             : 
     825           0 :         if (referenced) {
     826           0 :                 pra->referenced++;
     827           0 :                 pra->vm_flags |= vma->vm_flags;
     828             :         }
     829             : 
     830           0 :         if (!pra->mapcount)
     831           0 :                 return false; /* To break the loop */
     832             : 
     833             :         return true;
     834             : }
     835             : 
     836           0 : static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
     837             : {
     838           0 :         struct page_referenced_arg *pra = arg;
     839           0 :         struct mem_cgroup *memcg = pra->memcg;
     840             : 
     841           0 :         if (!mm_match_cgroup(vma->vm_mm, memcg))
     842             :                 return true;
     843             : 
     844           0 :         return false;
     845             : }
     846             : 
     847             : /**
     848             :  * page_referenced - test if the page was referenced
     849             :  * @page: the page to test
     850             :  * @is_locked: caller holds lock on the page
     851             :  * @memcg: target memory cgroup
     852             :  * @vm_flags: collect the vm_flags of the vmas that actually referenced the page
     853             :  *
     854             :  * Quick test_and_clear_referenced for all mappings to a page,
     855             :  * returns the number of ptes which referenced the page.
     856             :  */
     857           0 : int page_referenced(struct page *page,
     858             :                     int is_locked,
     859             :                     struct mem_cgroup *memcg,
     860             :                     unsigned long *vm_flags)
     861             : {
     862           0 :         int we_locked = 0;
     863           0 :         struct page_referenced_arg pra = {
     864           0 :                 .mapcount = total_mapcount(page),
     865             :                 .memcg = memcg,
     866             :         };
     867           0 :         struct rmap_walk_control rwc = {
     868             :                 .rmap_one = page_referenced_one,
     869             :                 .arg = (void *)&pra,
     870             :                 .anon_lock = page_lock_anon_vma_read,
     871             :         };
     872             : 
     873           0 :         *vm_flags = 0;
     874           0 :         if (!pra.mapcount)
     875             :                 return 0;
     876             : 
     877           0 :         if (!page_rmapping(page))
     878             :                 return 0;
     879             : 
     880           0 :         if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
     881           0 :                 we_locked = trylock_page(page);
     882           0 :                 if (!we_locked)
     883             :                         return 1;
     884             :         }
     885             : 
     886             :         /*
     887             :          * If we are reclaiming on behalf of a cgroup, skip
     888             :          * counting on behalf of references from different
     889             :          * cgroups
     890             :          */
     891           0 :         if (memcg) {
     892           0 :                 rwc.invalid_vma = invalid_page_referenced_vma;
     893             :         }
     894             : 
     895           0 :         rmap_walk(page, &rwc);
     896           0 :         *vm_flags = pra.vm_flags;
     897             : 
     898           0 :         if (we_locked)
     899           0 :                 unlock_page(page);
     900             : 
     901           0 :         return pra.referenced;
     902             : }
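
A hedged usage sketch, loosely modelled on reclaim's page_check_references(): the return value is the number of referencing ptes, and vm_flags accumulates the flags of the vmas that referenced the page (VM_LOCKED signals an mlocked mapping). The function name is illustrative and the sketch assumes the caller already holds the page lock:

static bool was_referenced_sketch(struct page *page, struct mem_cgroup *memcg)
{
        unsigned long vm_flags;
        int referenced;

        referenced = page_referenced(page, 1, memcg, &vm_flags);
        if (vm_flags & VM_LOCKED)
                return true;    /* mlocked vma: keep the page */
        return referenced > 0;
}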
     903             : 
     904          68 : static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
     905             :                             unsigned long address, void *arg)
     906             : {
     907          68 :         struct page_vma_mapped_walk pvmw = {
     908             :                 .page = page,
     909             :                 .vma = vma,
     910             :                 .address = address,
     911             :                 .flags = PVMW_SYNC,
     912             :         };
     913          68 :         struct mmu_notifier_range range;
     914          68 :         int *cleaned = arg;
     915             : 
     916             :         /*
     917             :          * We have to assume the worst case, i.e. pmd, for invalidation. Note
     918             :          * that the page cannot be freed from this function.
     919             :          */
     920          68 :         mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
     921             :                                 0, vma, vma->vm_mm, address,
     922             :                                 min(vma->vm_end, address + page_size(page)));
     923         136 :         mmu_notifier_invalidate_range_start(&range);
     924             : 
     925         136 :         while (page_vma_mapped_walk(&pvmw)) {
     926          68 :                 int ret = 0;
     927             : 
     928          68 :                 address = pvmw.address;
     929          68 :                 if (pvmw.pte) {
     930          68 :                         pte_t entry;
     931          68 :                         pte_t *pte = pvmw.pte;
     932             : 
     933          68 :                         if (!pte_dirty(*pte) && !pte_write(*pte))
     934           0 :                                 continue;
     935             : 
     936          68 :                         flush_cache_page(vma, address, pte_pfn(*pte));
     937          68 :                         entry = ptep_clear_flush(vma, address, pte);
     938          68 :                         entry = pte_wrprotect(entry);
     939          68 :                         entry = pte_mkclean(entry);
     940          68 :                         set_pte_at(vma->vm_mm, address, pte, entry);
     941          68 :                         ret = 1;
     942             :                 } else {
     943             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
     944           0 :                         pmd_t *pmd = pvmw.pmd;
     945           0 :                         pmd_t entry;
     946             : 
     947           0 :                         if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
     948           0 :                                 continue;
     949             : 
     950           0 :                         flush_cache_page(vma, address, page_to_pfn(page));
     951           0 :                         entry = pmdp_invalidate(vma, address, pmd);
     952           0 :                         entry = pmd_wrprotect(entry);
     953           0 :                         entry = pmd_mkclean(entry);
     954           0 :                         set_pmd_at(vma->vm_mm, address, pmd, entry);
     955           0 :                         ret = 1;
     956             : #else
     957             :                         /* unexpected pmd-mapped page? */
     958             :                         WARN_ON_ONCE(1);
     959             : #endif
     960             :                 }
     961             : 
     962             :                 /*
     963             :                  * No need to call mmu_notifier_invalidate_range() as we are
     964             :                  * downgrading page table protection not changing it to point
     965             :                  * to a new page.
     966             :                  *
     967             :                  * See Documentation/vm/mmu_notifier.rst
     968             :                  */
     969          68 :                 if (ret)
     970          68 :                         (*cleaned)++;
     971             :         }
     972             : 
     973          68 :         mmu_notifier_invalidate_range_end(&range);
     974             : 
     975          68 :         return true;
     976             : }
     977             : 
     978          68 : static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
     979             : {
     980          68 :         if (vma->vm_flags & VM_SHARED)
     981          68 :                 return false;
     982             : 
     983             :         return true;
     984             : }
     985             : 
     986        1286 : int page_mkclean(struct page *page)
     987             : {
     988        1286 :         int cleaned = 0;
     989        1286 :         struct address_space *mapping;
     990        1286 :         struct rmap_walk_control rwc = {
     991             :                 .arg = (void *)&cleaned,
     992             :                 .rmap_one = page_mkclean_one,
     993             :                 .invalid_vma = invalid_mkclean_vma,
     994             :         };
     995             : 
     996        2572 :         BUG_ON(!PageLocked(page));
     997             : 
     998        1286 :         if (!page_mapped(page))
     999             :                 return 0;
    1000             : 
    1001          68 :         mapping = page_mapping(page);
    1002          68 :         if (!mapping)
    1003             :                 return 0;
    1004             : 
    1005          68 :         rmap_walk(page, &rwc);
    1006             : 
    1007          68 :         return cleaned;
    1008             : }
    1009             : EXPORT_SYMBOL_GPL(page_mkclean);
    1010             : 
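For orientation, here is a hedged sketch (not part of rmap.c) of how a writeback-style caller might drive page_mkclean(); the helper name example_wrprotect_for_writeback is invented for illustration, and the in-tree user of this interface is clear_page_dirty_for_io() in mm/page-writeback.c.

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>

/*
 * Illustrative only: write-protect and clean every PTE mapping @page so
 * that any later store re-dirties it through a fresh write fault.
 * page_mkclean() requires the page lock, see the BUG_ON above.
 */
static bool example_wrprotect_for_writeback(struct page *page)
{
        int cleaned;

        lock_page(page);
        cleaned = page_mkclean(page);   /* number of PTEs write-protected */
        unlock_page(page);

        return cleaned > 0;
}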
    1011             : /**
    1012             :  * page_move_anon_rmap - move a page to our anon_vma
    1013             :  * @page:       the page to move to our anon_vma
    1014             :  * @vma:        the vma the page belongs to
    1015             :  *
    1016             :  * When a page belongs exclusively to one process after a COW event,
    1017             :  * that page can be moved into the anon_vma that belongs to just that
    1018             :  * process, so the rmap code will not search the parent or sibling
    1019             :  * processes.
    1020             :  */
    1021           0 : void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
    1022             : {
    1023           0 :         struct anon_vma *anon_vma = vma->anon_vma;
    1024             : 
    1025           0 :         page = compound_head(page);
    1026             : 
    1027           0 :         VM_BUG_ON_PAGE(!PageLocked(page), page);
    1028           0 :         VM_BUG_ON_VMA(!anon_vma, vma);
    1029             : 
    1030           0 :         anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
    1031             :         /*
    1032             :          * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
     1033             :          * simultaneously, so a concurrent reader (e.g. page_referenced()'s
    1034             :          * PageAnon()) will not see one without the other.
    1035             :          */
    1036           0 :         WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
    1037           0 : }
    1038             : 
    1039             : /**
    1040             :  * __page_set_anon_rmap - set up new anonymous rmap
    1041             :  * @page:       Page or Hugepage to add to rmap
    1042             :  * @vma:        VM area to add page to.
     1043             :  * @address:    the user virtual address of the mapping
    1044             :  * @exclusive:  the page is exclusively owned by the current process
    1045             :  */
    1046       69131 : static void __page_set_anon_rmap(struct page *page,
    1047             :         struct vm_area_struct *vma, unsigned long address, int exclusive)
    1048             : {
    1049       69131 :         struct anon_vma *anon_vma = vma->anon_vma;
    1050             : 
    1051       69131 :         BUG_ON(!anon_vma);
    1052             : 
    1053       69131 :         if (PageAnon(page))
    1054             :                 return;
    1055             : 
    1056             :         /*
    1057             :          * If the page isn't exclusively mapped into this vma,
    1058             :          * we must use the _oldest_ possible anon_vma for the
    1059             :          * page mapping!
    1060             :          */
    1061       69131 :         if (!exclusive)
    1062           0 :                 anon_vma = anon_vma->root;
    1063             : 
    1064             :         /*
    1065             :          * page_idle does a lockless/optimistic rmap scan on page->mapping.
    1066             :          * Make sure the compiler doesn't split the stores of anon_vma and
    1067             :          * the PAGE_MAPPING_ANON type identifier, otherwise the rmap code
    1068             :          * could mistake the mapping for a struct address_space and crash.
    1069             :          */
    1070       69131 :         anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
    1071       69131 :         WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
    1072       69131 :         page->index = linear_page_index(vma, address);
    1073             : }
    1074             : 
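A minimal sketch of the decode side of this page->mapping encoding, paraphrasing what readers such as PageAnon() and page_anon_vma() do elsewhere in the tree; it is simplified (compound_head(), KSM and movable-page handling are omitted) and the helper name is invented. The WRITE_ONCE() above pairs with a READ_ONCE() like the one below so lockless readers never see a torn pointer/flag combination.

#include <linux/mm.h>
#include <linux/page-flags.h>

/*
 * Illustrative only: the low PAGE_MAPPING_ANON bit tells an anon_vma
 * pointer apart from a struct address_space pointer.
 */
static struct anon_vma *example_decode_anon_mapping(struct page *page)
{
        unsigned long mapping = (unsigned long)READ_ONCE(page->mapping);

        if (!(mapping & PAGE_MAPPING_ANON))
                return NULL;    /* file page: mapping is an address_space */

        return (struct anon_vma *)(mapping - PAGE_MAPPING_ANON);
}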
    1075             : /**
    1076             :  * __page_check_anon_rmap - sanity check anonymous rmap addition
    1077             :  * @page:       the page to add the mapping to
    1078             :  * @vma:        the vm area in which the mapping is added
    1079             :  * @address:    the user virtual address mapped
    1080             :  */
    1081           0 : static void __page_check_anon_rmap(struct page *page,
    1082             :         struct vm_area_struct *vma, unsigned long address)
    1083             : {
    1084             :         /*
    1085             :          * The page's anon-rmap details (mapping and index) are guaranteed to
    1086             :          * be set up correctly at this point.
    1087             :          *
    1088             :          * We have exclusion against page_add_anon_rmap because the caller
    1089             :          * always holds the page locked.
    1090             :          *
    1091             :          * We have exclusion against page_add_new_anon_rmap because those pages
    1092             :          * are initially only visible via the pagetables, and the pte is locked
    1093             :          * over the call to page_add_new_anon_rmap.
    1094             :          */
    1095           0 :         VM_BUG_ON_PAGE(page_anon_vma(page)->root != vma->anon_vma->root, page);
    1096           0 :         VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
    1097             :                        page);
    1098           0 : }
    1099             : 
    1100             : /**
    1101             :  * page_add_anon_rmap - add pte mapping to an anonymous page
    1102             :  * @page:       the page to add the mapping to
    1103             :  * @vma:        the vm area in which the mapping is added
    1104             :  * @address:    the user virtual address mapped
    1105             :  * @compound:   charge the page as compound or small page
    1106             :  *
    1107             :  * The caller needs to hold the pte lock, and the page must be locked in
     1108             :  * the anon_vma case: to serialize mapping and index checking after setting,
    1109             :  * and to ensure that PageAnon is not being upgraded racily to PageKsm
    1110             :  * (but PageKsm is never downgraded to PageAnon).
    1111             :  */
    1112           0 : void page_add_anon_rmap(struct page *page,
    1113             :         struct vm_area_struct *vma, unsigned long address, bool compound)
    1114             : {
    1115           0 :         do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
    1116           0 : }
    1117             : 
    1118             : /*
    1119             :  * Special version of the above for do_swap_page, which often runs
    1120             :  * into pages that are exclusively owned by the current process.
    1121             :  * Everybody else should continue to use page_add_anon_rmap above.
    1122             :  */
    1123           0 : void do_page_add_anon_rmap(struct page *page,
    1124             :         struct vm_area_struct *vma, unsigned long address, int flags)
    1125             : {
    1126           0 :         bool compound = flags & RMAP_COMPOUND;
    1127           0 :         bool first;
    1128             : 
    1129           0 :         if (unlikely(PageKsm(page)))
    1130           0 :                 lock_page_memcg(page);
    1131             :         else
    1132           0 :                 VM_BUG_ON_PAGE(!PageLocked(page), page);
    1133             : 
    1134           0 :         if (compound) {
    1135           0 :                 atomic_t *mapcount;
    1136           0 :                 VM_BUG_ON_PAGE(!PageLocked(page), page);
    1137           0 :                 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
    1138           0 :                 mapcount = compound_mapcount_ptr(page);
    1139           0 :                 first = atomic_inc_and_test(mapcount);
    1140             :         } else {
    1141           0 :                 first = atomic_inc_and_test(&page->_mapcount);
    1142             :         }
    1143             : 
    1144           0 :         if (first) {
    1145           0 :                 int nr = compound ? thp_nr_pages(page) : 1;
    1146             :                 /*
     1147             :                  * We use the irq-unsafe __{inc|mod}_lruvec_page_state because
     1148             :                  * these counters are not modified in interrupt context, and
     1149             :                  * pte lock (a spinlock) is held, which implies preemption
    1150             :                  * disabled.
    1151             :                  */
    1152           0 :                 if (compound)
    1153           0 :                         __mod_lruvec_page_state(page, NR_ANON_THPS, nr);
    1154           0 :                 __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
    1155             :         }
    1156             : 
    1157           0 :         if (unlikely(PageKsm(page))) {
    1158           0 :                 unlock_page_memcg(page);
    1159             :                 return;
    1160             :         }
    1161             : 
    1162             :         /* address might be in next vma when migration races vma_adjust */
    1163           0 :         if (first)
    1164           0 :                 __page_set_anon_rmap(page, vma, address,
    1165             :                                 flags & RMAP_EXCLUSIVE);
    1166             :         else
    1167           0 :                 __page_check_anon_rmap(page, vma, address);
    1168             : }
    1169             : 
    1170             : /**
    1171             :  * page_add_new_anon_rmap - add pte mapping to a new anonymous page
    1172             :  * @page:       the page to add the mapping to
    1173             :  * @vma:        the vm area in which the mapping is added
    1174             :  * @address:    the user virtual address mapped
    1175             :  * @compound:   charge the page as compound or small page
    1176             :  *
    1177             :  * Same as page_add_anon_rmap but must only be called on *new* pages.
    1178             :  * This means the inc-and-test can be bypassed.
    1179             :  * Page does not have to be locked.
    1180             :  */
    1181       69131 : void page_add_new_anon_rmap(struct page *page,
    1182             :         struct vm_area_struct *vma, unsigned long address, bool compound)
    1183             : {
    1184       69131 :         int nr = compound ? thp_nr_pages(page) : 1;
    1185             : 
    1186       69131 :         VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
    1187       69131 :         __SetPageSwapBacked(page);
    1188       69132 :         if (compound) {
    1189          19 :                 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
    1190             :                 /* increment count (starts at -1) */
    1191          19 :                 atomic_set(compound_mapcount_ptr(page), 0);
    1192          19 :                 if (hpage_pincount_available(page))
    1193          19 :                         atomic_set(compound_pincount_ptr(page), 0);
    1194             : 
    1195          19 :                 __mod_lruvec_page_state(page, NR_ANON_THPS, nr);
    1196             :         } else {
    1197             :                 /* Anon THP always mapped first with PMD */
    1198       69113 :                 VM_BUG_ON_PAGE(PageTransCompound(page), page);
    1199             :                 /* increment count (starts at -1) */
    1200       69113 :                 atomic_set(&page->_mapcount, 0);
    1201             :         }
    1202       69132 :         __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
    1203       69133 :         __page_set_anon_rmap(page, vma, address, 1);
    1204       69132 : }
    1205             : 
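As a usage sketch, hedged and only loosely modelled on the anonymous fault path in mm/memory.c: the helper below is invented, and it assumes the caller already allocated and memcg-charged @page, holds the pte lock for @pte, and updates the MM_ANONPAGES counter itself.

#include <linux/mm.h>
#include <linux/rmap.h>
#include <linux/swap.h>

/* Illustrative only: wire a brand-new anonymous page into one pte. */
static void example_map_new_anon_page(struct vm_area_struct *vma,
                                      unsigned long address, pte_t *pte,
                                      struct page *page)
{
        pte_t entry = mk_pte(page, vma->vm_page_prot);

        if (vma->vm_flags & VM_WRITE)
                entry = pte_mkwrite(pte_mkdirty(entry));

        /* The page is not yet visible to rmap, so no page lock is needed. */
        page_add_new_anon_rmap(page, vma, address, false);
        lru_cache_add_inactive_or_unevictable(page, vma);
        set_pte_at(vma->vm_mm, address, pte, entry);
}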
    1206             : /**
    1207             :  * page_add_file_rmap - add pte mapping to a file page
    1208             :  * @page: the page to add the mapping to
    1209             :  * @compound: charge the page as compound or small page
    1210             :  *
    1211             :  * The caller needs to hold the pte lock.
    1212             :  */
    1213      798022 : void page_add_file_rmap(struct page *page, bool compound)
    1214             : {
    1215      798022 :         int i, nr = 1;
    1216             : 
    1217      798022 :         VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
    1218      798022 :         lock_page_memcg(page);
    1219      798022 :         if (compound && PageTransHuge(page)) {
    1220           0 :                 int nr_pages = thp_nr_pages(page);
    1221             : 
    1222           0 :                 for (i = 0, nr = 0; i < nr_pages; i++) {
    1223           0 :                         if (atomic_inc_and_test(&page[i]._mapcount))
    1224           0 :                                 nr++;
    1225             :                 }
    1226           0 :                 if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
    1227           0 :                         goto out;
    1228           0 :                 if (PageSwapBacked(page))
    1229           0 :                         __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
    1230             :                                                 nr_pages);
    1231             :                 else
    1232           0 :                         __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
    1233             :                                                 nr_pages);
    1234             :         } else {
    1235      798022 :                 if (PageTransCompound(page) && page_mapping(page)) {
    1236           0 :                         VM_WARN_ON_ONCE(!PageLocked(page));
    1237             : 
    1238           0 :                         SetPageDoubleMap(compound_head(page));
    1239           0 :                         if (PageMlocked(page))
    1240           0 :                                 clear_page_mlock(compound_head(page));
    1241             :                 }
    1242     1596138 :                 if (!atomic_inc_and_test(&page->_mapcount))
    1243      695126 :                         goto out;
    1244             :         }
    1245      103007 :         __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
    1246      798133 : out:
    1247      798133 :         unlock_page_memcg(page);
    1248      798133 : }
    1249             : 
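And the file-backed counterpart, again hedged and only loosely modelled on the fault-completion path in mm/memory.c; the helper name is invented, and the pte lock plus a reference on the (locked, uptodate) page are assumed to be held.

#include <linux/mm.h>
#include <linux/rmap.h>

/* Illustrative only: insert a pte for a file page already in the page cache. */
static void example_map_file_page(struct vm_area_struct *vma,
                                  unsigned long address, pte_t *pte,
                                  struct page *page)
{
        pte_t entry = mk_pte(page, vma->vm_page_prot);

        inc_mm_counter(vma->vm_mm, mm_counter_file(page));
        page_add_file_rmap(page, false);        /* pte-mapped, not compound */
        set_pte_at(vma->vm_mm, address, pte, entry);
}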
    1250      776429 : static void page_remove_file_rmap(struct page *page, bool compound)
    1251             : {
    1252      776429 :         int i, nr = 1;
    1253             : 
    1254      776429 :         VM_BUG_ON_PAGE(compound && !PageHead(page), page);
    1255             : 
    1256             :         /* Hugepages are not counted in NR_FILE_MAPPED for now. */
    1257      776429 :         if (unlikely(PageHuge(page))) {
    1258             :                 /* hugetlb pages are always mapped with pmds */
    1259             :                 atomic_dec(compound_mapcount_ptr(page));
    1260             :                 return;
    1261             :         }
    1262             : 
    1263             :         /* page still mapped by someone else? */
    1264      776429 :         if (compound && PageTransHuge(page)) {
    1265           0 :                 int nr_pages = thp_nr_pages(page);
    1266             : 
    1267           0 :                 for (i = 0, nr = 0; i < nr_pages; i++) {
    1268           0 :                         if (atomic_add_negative(-1, &page[i]._mapcount))
    1269           0 :                                 nr++;
    1270             :                 }
    1271           0 :                 if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
    1272             :                         return;
    1273           0 :                 if (PageSwapBacked(page))
    1274           0 :                         __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
    1275             :                                                 -nr_pages);
    1276             :                 else
    1277           0 :                         __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
    1278             :                                                 -nr_pages);
    1279             :         } else {
    1280     1553185 :                 if (!atomic_add_negative(-1, &page->_mapcount))
    1281             :                         return;
    1282             :         }
    1283             : 
    1284             :         /*
    1285             :          * We use the irq-unsafe __{inc|mod}_lruvec_page_state because
    1286             :          * these counters are not modified in interrupt context, and
     1287             :          * pte lock (a spinlock) is held, which implies preemption disabled.
    1288             :          */
    1289       98037 :         __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);
    1290             : 
    1291      196082 :         if (unlikely(PageMlocked(page)))
    1292           0 :                 clear_page_mlock(page);
    1293             : }
    1294             : 
    1295          17 : static void page_remove_anon_compound_rmap(struct page *page)
    1296             : {
    1297          17 :         int i, nr;
    1298             : 
    1299          34 :         if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
    1300             :                 return;
    1301             : 
    1302             :         /* Hugepages are not counted in NR_ANON_PAGES for now. */
    1303          17 :         if (unlikely(PageHuge(page)))
    1304             :                 return;
    1305             : 
    1306          17 :         if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
    1307             :                 return;
    1308             : 
    1309          34 :         __mod_lruvec_page_state(page, NR_ANON_THPS, -thp_nr_pages(page));
    1310             : 
    1311          17 :         if (TestClearPageDoubleMap(page)) {
    1312             :                 /*
    1313             :                  * Subpages can be mapped with PTEs too. Check how many of
    1314             :                  * them are still mapped.
    1315             :                  */
    1316           0 :                 for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
    1317           0 :                         if (atomic_add_negative(-1, &page[i]._mapcount))
    1318           0 :                                 nr++;
    1319             :                 }
    1320             : 
    1321             :                 /*
    1322             :                  * Queue the page for deferred split if at least one small
    1323             :                  * page of the compound page is unmapped, but at least one
    1324             :                  * small page is still mapped.
    1325             :                  */
    1326           0 :                 if (nr && nr < thp_nr_pages(page))
    1327           0 :                         deferred_split_huge_page(page);
    1328             :         } else {
    1329          17 :                 nr = thp_nr_pages(page);
    1330             :         }
    1331             : 
    1332          34 :         if (unlikely(PageMlocked(page)))
    1333           0 :                 clear_page_mlock(page);
    1334             : 
    1335          17 :         if (nr)
    1336          17 :                 __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr);
    1337             : }
    1338             : 
    1339             : /**
    1340             :  * page_remove_rmap - take down pte mapping from a page
    1341             :  * @page:       page to remove mapping from
    1342             :  * @compound:   uncharge the page as compound or small page
    1343             :  *
    1344             :  * The caller needs to hold the pte lock.
    1345             :  */
    1346      923070 : void page_remove_rmap(struct page *page, bool compound)
    1347             : {
    1348      923070 :         lock_page_memcg(page);
    1349             : 
    1350      923070 :         if (!PageAnon(page)) {
    1351      776428 :                 page_remove_file_rmap(page, compound);
    1352      776693 :                 goto out;
    1353             :         }
    1354             : 
    1355      146642 :         if (compound) {
    1356          17 :                 page_remove_anon_compound_rmap(page);
    1357          17 :                 goto out;
    1358             :         }
    1359             : 
    1360             :         /* page still mapped by someone else? */
    1361      293262 :         if (!atomic_add_negative(-1, &page->_mapcount))
    1362       81577 :                 goto out;
    1363             : 
    1364             :         /*
     1365             :          * We use the irq-unsafe __{inc|mod}_lruvec_page_state because
     1366             :          * these counters are not modified in interrupt context, and
     1367             :          * pte lock (a spinlock) is held, which implies preemption disabled.
    1368             :          */
    1369       65060 :         __dec_lruvec_page_state(page, NR_ANON_MAPPED);
    1370             : 
    1371      130117 :         if (unlikely(PageMlocked(page)))
    1372           0 :                 clear_page_mlock(page);
    1373             : 
    1374       65057 :         if (PageTransCompound(page))
    1375           0 :                 deferred_split_huge_page(compound_head(page));
    1376             : 
    1377             :         /*
    1378             :          * It would be tidy to reset the PageAnon mapping here,
    1379             :          * but that might overwrite a racing page_add_anon_rmap
    1380             :          * which increments mapcount after us but sets mapping
    1381             :          * before us: so leave the reset to free_unref_page,
    1382             :          * and remember that it's only reliable while mapped.
    1383             :          * Leaving it set also helps swapoff to reinstate ptes
    1384             :          * faster for those pages still in swapcache.
    1385             :          */
    1386       65059 : out:
    1387      923346 :         unlock_page_memcg(page);
    1388      923346 : }
    1389             : 
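A hedged sketch of the caller's side of this, loosely modelled on the zap path in mm/memory.c; the helper name is invented, the pte lock is assumed held, and TLB flushing plus mmu notifier calls are omitted for brevity.

#include <linux/mm.h>
#include <linux/rmap.h>

/* Illustrative only: tear down one pte and drop its rmap accounting. */
static void example_unmap_one_pte(struct vm_area_struct *vma,
                                  unsigned long address, pte_t *pte,
                                  struct page *page)
{
        pte_t pteval = ptep_get_and_clear(vma->vm_mm, address, pte);

        if (pte_dirty(pteval))
                set_page_dirty(page);   /* preserve the dirty bit */

        page_remove_rmap(page, false);  /* pte-mapped, not compound */
        put_page(page);                 /* drop the mapping's reference */
}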
    1390             : /*
    1391             :  * @arg: enum ttu_flags will be passed to this argument
    1392             :  */
    1393           0 : static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
    1394             :                      unsigned long address, void *arg)
    1395             : {
    1396           0 :         struct mm_struct *mm = vma->vm_mm;
    1397           0 :         struct page_vma_mapped_walk pvmw = {
    1398             :                 .page = page,
    1399             :                 .vma = vma,
    1400             :                 .address = address,
    1401             :         };
    1402           0 :         pte_t pteval;
    1403           0 :         struct page *subpage;
    1404           0 :         bool ret = true;
    1405           0 :         struct mmu_notifier_range range;
    1406           0 :         enum ttu_flags flags = (enum ttu_flags)(long)arg;
    1407             : 
    1408             :         /* munlock has nothing to gain from examining un-locked vmas */
    1409           0 :         if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
    1410             :                 return true;
    1411             : 
    1412           0 :         if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
    1413           0 :             is_zone_device_page(page) && !is_device_private_page(page))
    1414             :                 return true;
    1415             : 
    1416           0 :         if (flags & TTU_SPLIT_HUGE_PMD) {
    1417           0 :                 split_huge_pmd_address(vma, address,
    1418           0 :                                 flags & TTU_SPLIT_FREEZE, page);
    1419             :         }
    1420             : 
    1421             :         /*
     1422             :          * For THP, we have to assume the worst case, i.e. pmd, for invalidation.
    1423             :          * For hugetlb, it could be much worse if we need to do pud
    1424             :          * invalidation in the case of pmd sharing.
    1425             :          *
     1426             :          * Note that the page cannot be freed in this function, as the
     1427             :          * caller of try_to_unmap() must hold a reference on the page.
    1428             :          */
    1429           0 :         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
    1430             :                                 address,
    1431             :                                 min(vma->vm_end, address + page_size(page)));
    1432           0 :         if (PageHuge(page)) {
    1433             :                 /*
    1434             :                  * If sharing is possible, start and end will be adjusted
    1435             :                  * accordingly.
    1436             :                  */
    1437           0 :                 adjust_range_if_pmd_sharing_possible(vma, &range.start,
    1438             :                                                      &range.end);
    1439             :         }
    1440           0 :         mmu_notifier_invalidate_range_start(&range);
    1441             : 
    1442           0 :         while (page_vma_mapped_walk(&pvmw)) {
    1443             : #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
    1444             :                 /* PMD-mapped THP migration entry */
    1445           0 :                 if (!pvmw.pte && (flags & TTU_MIGRATION)) {
    1446           0 :                         VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
    1447             : 
    1448           0 :                         set_pmd_migration_entry(&pvmw, page);
    1449           0 :                         continue;
    1450             :                 }
    1451             : #endif
    1452             : 
    1453             :                 /*
    1454             :                  * If the page is mlock()d, we cannot swap it out.
    1455             :                  * If it's recently referenced (perhaps page_referenced
    1456             :                  * skipped over this mm) then we should reactivate it.
    1457             :                  */
    1458           0 :                 if (!(flags & TTU_IGNORE_MLOCK)) {
    1459           0 :                         if (vma->vm_flags & VM_LOCKED) {
    1460             :                                 /* PTE-mapped THP are never mlocked */
    1461           0 :                                 if (!PageTransCompound(page)) {
    1462             :                                         /*
    1463             :                                          * Holding pte lock, we do *not* need
    1464             :                                          * mmap_lock here
    1465             :                                          */
    1466           0 :                                         mlock_vma_page(page);
    1467             :                                 }
    1468           0 :                                 ret = false;
    1469           0 :                                 page_vma_mapped_walk_done(&pvmw);
    1470             :                                 break;
    1471             :                         }
    1472           0 :                         if (flags & TTU_MUNLOCK)
    1473           0 :                                 continue;
    1474             :                 }
    1475             : 
    1476             :                 /* Unexpected PMD-mapped THP? */
    1477           0 :                 VM_BUG_ON_PAGE(!pvmw.pte, page);
    1478             : 
    1479           0 :                 subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
    1480           0 :                 address = pvmw.address;
    1481             : 
    1482           0 :                 if (PageHuge(page) && !PageAnon(page)) {
    1483             :                         /*
    1484             :                          * To call huge_pmd_unshare, i_mmap_rwsem must be
    1485             :                          * held in write mode.  Caller needs to explicitly
    1486             :                          * do this outside rmap routines.
    1487             :                          */
    1488             :                         VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
    1489             :                         if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
    1490             :                                 /*
    1491             :                                  * huge_pmd_unshare unmapped an entire PMD
    1492             :                                  * page.  There is no way of knowing exactly
    1493             :                                  * which PMDs may be cached for this mm, so
    1494             :                                  * we must flush them all.  start/end were
    1495             :                                  * already adjusted above to cover this range.
    1496             :                                  */
    1497             :                                 flush_cache_range(vma, range.start, range.end);
    1498             :                                 flush_tlb_range(vma, range.start, range.end);
    1499             :                                 mmu_notifier_invalidate_range(mm, range.start,
    1500             :                                                               range.end);
    1501             : 
    1502             :                                 /*
    1503             :                                  * The ref count of the PMD page was dropped
    1504             :                                  * which is part of the way map counting
    1505             :                                  * is done for shared PMDs.  Return 'true'
    1506             :                                  * here.  When there is no other sharing,
    1507             :                                  * huge_pmd_unshare returns false and we will
    1508             :                                  * unmap the actual page and drop map count
    1509             :                                  * to zero.
    1510             :                                  */
    1511           0 :                                 page_vma_mapped_walk_done(&pvmw);
    1512             :                                 break;
    1513             :                         }
    1514             :                 }
    1515             : 
    1516           0 :                 if (IS_ENABLED(CONFIG_MIGRATION) &&
    1517             :                     (flags & TTU_MIGRATION) &&
    1518           0 :                     is_zone_device_page(page)) {
    1519             :                         swp_entry_t entry;
    1520             :                         pte_t swp_pte;
    1521             : 
    1522             :                         pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte);
    1523             : 
    1524             :                         /*
    1525             :                          * Store the pfn of the page in a special migration
    1526             :                          * pte. do_swap_page() will wait until the migration
    1527             :                          * pte is removed and then restart fault handling.
    1528             :                          */
    1529             :                         entry = make_migration_entry(page, 0);
    1530             :                         swp_pte = swp_entry_to_pte(entry);
    1531             : 
    1532             :                         /*
    1533             :                          * pteval maps a zone device page and is therefore
    1534             :                          * a swap pte.
    1535             :                          */
    1536             :                         if (pte_swp_soft_dirty(pteval))
    1537             :                                 swp_pte = pte_swp_mksoft_dirty(swp_pte);
    1538             :                         if (pte_swp_uffd_wp(pteval))
    1539             :                                 swp_pte = pte_swp_mkuffd_wp(swp_pte);
    1540             :                         set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
    1541             :                         /*
     1542             :                          * No need to invalidate here: it will synchronize
     1543             :                          * against the special swap migration pte.
    1544             :                          *
    1545             :                          * The assignment to subpage above was computed from a
    1546             :                          * swap PTE which results in an invalid pointer.
    1547             :                          * Since only PAGE_SIZE pages can currently be
    1548             :                          * migrated, just set it to page. This will need to be
    1549             :                          * changed when hugepage migrations to device private
    1550             :                          * memory are supported.
    1551             :                          */
    1552             :                         subpage = page;
    1553             :                         goto discard;
    1554             :                 }
    1555             : 
    1556             :                 /* Nuke the page table entry. */
    1557           0 :                 flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
    1558           0 :                 if (should_defer_flush(mm, flags)) {
    1559             :                         /*
    1560             :                          * We clear the PTE but do not flush so potentially
    1561             :                          * a remote CPU could still be writing to the page.
    1562             :                          * If the entry was previously clean then the
    1563             :                          * architecture must guarantee that a clear->dirty
    1564             :                          * transition on a cached TLB entry is written through
    1565             :                          * and traps if the PTE is unmapped.
    1566             :                          */
    1567           0 :                         pteval = ptep_get_and_clear(mm, address, pvmw.pte);
    1568             : 
    1569           0 :                         set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
    1570             :                 } else {
    1571           0 :                         pteval = ptep_clear_flush(vma, address, pvmw.pte);
    1572             :                 }
    1573             : 
    1574             :                 /* Move the dirty bit to the page. Now the pte is gone. */
    1575           0 :                 if (pte_dirty(pteval))
    1576           0 :                         set_page_dirty(page);
    1577             : 
    1578             :                 /* Update high watermark before we lower rss */
    1579           0 :                 update_hiwater_rss(mm);
    1580             : 
    1581           0 :                 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
    1582             :                         pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
    1583             :                         if (PageHuge(page)) {
    1584             :                                 hugetlb_count_sub(compound_nr(page), mm);
    1585             :                                 set_huge_swap_pte_at(mm, address,
    1586             :                                                      pvmw.pte, pteval,
    1587             :                                                      vma_mmu_pagesize(vma));
    1588             :                         } else {
    1589             :                                 dec_mm_counter(mm, mm_counter(page));
    1590             :                                 set_pte_at(mm, address, pvmw.pte, pteval);
    1591             :                         }
    1592             : 
    1593           0 :                 } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
    1594             :                         /*
    1595             :                          * The guest indicated that the page content is of no
    1596             :                          * interest anymore. Simply discard the pte, vmscan
    1597             :                          * will take care of the rest.
    1598             :                          * A future reference will then fault in a new zero
    1599             :                          * page. When userfaultfd is active, we must not drop
    1600             :                          * this page though, as its main user (postcopy
    1601             :                          * migration) will not expect userfaults on already
    1602             :                          * copied pages.
    1603             :                          */
    1604             :                         dec_mm_counter(mm, mm_counter(page));
    1605             :                         /* We have to invalidate as we cleared the pte */
    1606           0 :                         mmu_notifier_invalidate_range(mm, address,
    1607             :                                                       address + PAGE_SIZE);
    1608           0 :                 } else if (IS_ENABLED(CONFIG_MIGRATION) &&
    1609           0 :                                 (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) {
    1610           0 :                         swp_entry_t entry;
    1611           0 :                         pte_t swp_pte;
    1612             : 
    1613           0 :                         if (arch_unmap_one(mm, vma, address, pteval) < 0) {
    1614             :                                 set_pte_at(mm, address, pvmw.pte, pteval);
    1615             :                                 ret = false;
    1616             :                                 page_vma_mapped_walk_done(&pvmw);
    1617             :                                 break;
    1618             :                         }
    1619             : 
    1620             :                         /*
    1621             :                          * Store the pfn of the page in a special migration
    1622             :                          * pte. do_swap_page() will wait until the migration
    1623             :                          * pte is removed and then restart fault handling.
    1624             :                          */
    1625           0 :                         entry = make_migration_entry(subpage,
    1626             :                                         pte_write(pteval));
    1627           0 :                         swp_pte = swp_entry_to_pte(entry);
    1628           0 :                         if (pte_soft_dirty(pteval))
    1629             :                                 swp_pte = pte_swp_mksoft_dirty(swp_pte);
    1630           0 :                         if (pte_uffd_wp(pteval))
    1631             :                                 swp_pte = pte_swp_mkuffd_wp(swp_pte);
    1632           0 :                         set_pte_at(mm, address, pvmw.pte, swp_pte);
    1633             :                         /*
     1634             :                          * No need to invalidate here: it will synchronize
     1635             :                          * against the special swap migration pte.
    1636             :                          */
    1637           0 :                 } else if (PageAnon(page)) {
    1638           0 :                         swp_entry_t entry = { .val = page_private(subpage) };
    1639           0 :                         pte_t swp_pte;
    1640             :                         /*
    1641             :                          * Store the swap location in the pte.
    1642             :                          * See handle_pte_fault() ...
    1643             :                          */
    1644           0 :                         if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) {
    1645           0 :                                 WARN_ON_ONCE(1);
    1646           0 :                                 ret = false;
    1647             :                                 /* We have to invalidate as we cleared the pte */
    1648           0 :                                 mmu_notifier_invalidate_range(mm, address,
    1649             :                                                         address + PAGE_SIZE);
    1650           0 :                                 page_vma_mapped_walk_done(&pvmw);
    1651           0 :                                 break;
    1652             :                         }
    1653             : 
    1654             :                         /* MADV_FREE page check */
    1655           0 :                         if (!PageSwapBacked(page)) {
    1656           0 :                                 if (!PageDirty(page)) {
    1657             :                                         /* Invalidate as we cleared the pte */
    1658           0 :                                         mmu_notifier_invalidate_range(mm,
    1659             :                                                 address, address + PAGE_SIZE);
    1660           0 :                                         dec_mm_counter(mm, MM_ANONPAGES);
    1661           0 :                                         goto discard;
    1662             :                                 }
    1663             : 
    1664             :                                 /*
    1665             :                                  * If the page was redirtied, it cannot be
    1666             :                                  * discarded. Remap the page to page table.
    1667             :                                  */
    1668           0 :                                 set_pte_at(mm, address, pvmw.pte, pteval);
    1669           0 :                                 SetPageSwapBacked(page);
    1670           0 :                                 ret = false;
    1671           0 :                                 page_vma_mapped_walk_done(&pvmw);
    1672             :                                 break;
    1673             :                         }
    1674             : 
    1675           0 :                         if (swap_duplicate(entry) < 0) {
    1676             :                                 set_pte_at(mm, address, pvmw.pte, pteval);
    1677             :                                 ret = false;
    1678           0 :                                 page_vma_mapped_walk_done(&pvmw);
    1679             :                                 break;
    1680             :                         }
    1681           0 :                         if (arch_unmap_one(mm, vma, address, pteval) < 0) {
    1682             :                                 set_pte_at(mm, address, pvmw.pte, pteval);
    1683             :                                 ret = false;
    1684           0 :                                 page_vma_mapped_walk_done(&pvmw);
    1685             :                                 break;
    1686             :                         }
    1687           0 :                         if (list_empty(&mm->mmlist)) {
    1688           0 :                                 spin_lock(&mmlist_lock);
    1689           0 :                                 if (list_empty(&mm->mmlist))
    1690           0 :                                         list_add(&mm->mmlist, &init_mm.mmlist);
    1691           0 :                                 spin_unlock(&mmlist_lock);
    1692             :                         }
    1693           0 :                         dec_mm_counter(mm, MM_ANONPAGES);
    1694           0 :                         inc_mm_counter(mm, MM_SWAPENTS);
    1695           0 :                         swp_pte = swp_entry_to_pte(entry);
    1696           0 :                         if (pte_soft_dirty(pteval))
    1697             :                                 swp_pte = pte_swp_mksoft_dirty(swp_pte);
    1698           0 :                         if (pte_uffd_wp(pteval))
    1699             :                                 swp_pte = pte_swp_mkuffd_wp(swp_pte);
    1700           0 :                         set_pte_at(mm, address, pvmw.pte, swp_pte);
    1701             :                         /* Invalidate as we cleared the pte */
    1702           0 :                         mmu_notifier_invalidate_range(mm, address,
    1703             :                                                       address + PAGE_SIZE);
    1704             :                 } else {
    1705             :                         /*
    1706             :                          * This is a locked file-backed page, thus it cannot
    1707             :                          * be removed from the page cache and replaced by a new
    1708             :                          * page before mmu_notifier_invalidate_range_end, so no
     1709             :                          * concurrent thread can update its page table to
     1710             :                          * point at a new page while a device is still using
     1711             :                          * this page.
    1712             :                          *
    1713             :                          * See Documentation/vm/mmu_notifier.rst
    1714             :                          */
    1715           0 :                         dec_mm_counter(mm, mm_counter_file(page));
    1716             :                 }
    1717           0 : discard:
    1718             :                 /*
     1719             :                  * No need to call mmu_notifier_invalidate_range(): it has
     1720             :                  * been done above for all cases requiring it to happen under
     1721             :                  * the page table lock, before mmu_notifier_invalidate_range_end().
    1722             :                  *
    1723             :                  * See Documentation/vm/mmu_notifier.rst
    1724             :                  */
    1725           0 :                 page_remove_rmap(subpage, PageHuge(page));
    1726           0 :                 put_page(page);
    1727             :         }
    1728             : 
    1729           0 :         mmu_notifier_invalidate_range_end(&range);
    1730             : 
    1731             :         return ret;
    1732             : }
    1733             : 
    1734           0 : static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
    1735             : {
    1736           0 :         return vma_is_temporary_stack(vma);
    1737             : }
    1738             : 
    1739           0 : static int page_not_mapped(struct page *page)
    1740             : {
    1741           0 :         return !page_mapped(page);
    1742             : }
    1743             : 
    1744             : /**
    1745             :  * try_to_unmap - try to remove all page table mappings to a page
    1746             :  * @page: the page to get unmapped
    1747             :  * @flags: action and flags
    1748             :  *
    1749             :  * Tries to remove all the page table entries which are mapping this
    1750             :  * page, used in the pageout path.  Caller must hold the page lock.
    1751             :  *
    1752             :  * If unmap is successful, return true. Otherwise, false.
    1753             :  */
    1754           0 : bool try_to_unmap(struct page *page, enum ttu_flags flags)
    1755             : {
    1756           0 :         struct rmap_walk_control rwc = {
    1757             :                 .rmap_one = try_to_unmap_one,
    1758           0 :                 .arg = (void *)flags,
    1759             :                 .done = page_not_mapped,
    1760             :                 .anon_lock = page_lock_anon_vma_read,
    1761             :         };
    1762             : 
    1763             :         /*
    1764             :          * During exec, a temporary VMA is setup and later moved.
    1765             :          * The VMA is moved under the anon_vma lock but not the
    1766             :          * page tables leading to a race where migration cannot
    1767             :          * find the migration ptes. Rather than increasing the
    1768             :          * locking requirements of exec(), migration skips
    1769             :          * temporary VMAs until after exec() completes.
    1770             :          */
    1771           0 :         if ((flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))
    1772           0 :             && !PageKsm(page) && PageAnon(page))
    1773           0 :                 rwc.invalid_vma = invalid_migration_vma;
    1774             : 
    1775           0 :         if (flags & TTU_RMAP_LOCKED)
    1776           0 :                 rmap_walk_locked(page, &rwc);
    1777             :         else
    1778           0 :                 rmap_walk(page, &rwc);
    1779             : 
     1780           0 :         return !page_mapcount(page);
    1781             : }
    1782             : 
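A hedged caller-side sketch, loosely modelled on how the page migration code drives this interface; the helper name is invented, and the page is assumed to be locked with migration entries wanted in place of the old ptes. The boolean result simply reports whether all mappings are gone.

#include <linux/mm.h>
#include <linux/rmap.h>

/* Illustrative only: unmap a locked page before migrating its contents. */
static bool example_unmap_for_migration(struct page *page)
{
        VM_BUG_ON_PAGE(!PageLocked(page), page);

        if (!page_mapped(page))
                return true;

        /* Replace every mapping with a special migration entry. */
        try_to_unmap(page, TTU_MIGRATION | TTU_IGNORE_MLOCK);

        return !page_mapped(page);
}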
    1783             : /**
    1784             :  * try_to_munlock - try to munlock a page
    1785             :  * @page: the page to be munlocked
    1786             :  *
    1787             :  * Called from munlock code.  Checks all of the VMAs mapping the page
    1788             :  * to make sure nobody else has this page mlocked. The page will be
    1789             :  * returned with PG_mlocked cleared if no other vmas have it mlocked.
    1790             :  */
    1791             : 
    1792           0 : void try_to_munlock(struct page *page)
    1793             : {
    1794           0 :         struct rmap_walk_control rwc = {
    1795             :                 .rmap_one = try_to_unmap_one,
    1796             :                 .arg = (void *)TTU_MUNLOCK,
    1797             :                 .done = page_not_mapped,
    1798             :                 .anon_lock = page_lock_anon_vma_read,
    1799             : 
    1801             : 
    1802           0 :         VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
    1803           0 :         VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
    1804             : 
    1805           0 :         rmap_walk(page, &rwc);
    1806           0 : }
    1807             : 
    1808       39903 : void __put_anon_vma(struct anon_vma *anon_vma)
    1809             : {
    1810       39903 :         struct anon_vma *root = anon_vma->root;
    1811             : 
    1812       39903 :         anon_vma_free(anon_vma);
    1813       66871 :         if (root != anon_vma && atomic_dec_and_test(&root->refcount))
    1814           0 :                 anon_vma_free(root);
    1815       39904 : }
    1816             : 
    1817           0 : static struct anon_vma *rmap_walk_anon_lock(struct page *page,
    1818             :                                         struct rmap_walk_control *rwc)
    1819             : {
    1820           0 :         struct anon_vma *anon_vma;
    1821             : 
    1822           0 :         if (rwc->anon_lock)
    1823           0 :                 return rwc->anon_lock(page);
    1824             : 
    1825             :         /*
    1826             :          * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
    1827             :          * because that depends on page_mapped(); but not all its usages
    1828             :          * are holding mmap_lock. Users without mmap_lock are required to
    1829             :          * take a reference count to prevent the anon_vma disappearing
    1830             :          */
    1831           0 :         anon_vma = page_anon_vma(page);
    1832           0 :         if (!anon_vma)
    1833             :                 return NULL;
    1834             : 
    1835           0 :         anon_vma_lock_read(anon_vma);
    1836           0 :         return anon_vma;
    1837             : }
    1838             : 
    1839             : /*
     1840             :  * rmap_walk_anon - do something to an anonymous page using the object-based
    1841             :  * rmap method
    1842             :  * @page: the page to be handled
    1843             :  * @rwc: control variable according to each walk type
    1844             :  *
    1845             :  * Find all the mappings of a page using the mapping pointer and the vma chains
    1846             :  * contained in the anon_vma struct it points to.
    1847             :  *
    1848             :  * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
    1849             :  * where the page was found will be held for write.  So, we won't recheck
    1850             :  * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
    1851             :  * LOCKED.
    1852             :  */
    1853           0 : static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
    1854             :                 bool locked)
    1855             : {
    1856           0 :         struct anon_vma *anon_vma;
    1857           0 :         pgoff_t pgoff_start, pgoff_end;
    1858           0 :         struct anon_vma_chain *avc;
    1859             : 
    1860           0 :         if (locked) {
    1861           0 :                 anon_vma = page_anon_vma(page);
    1862             :                 /* anon_vma disappear under us? */
    1863           0 :                 VM_BUG_ON_PAGE(!anon_vma, page);
    1864             :         } else {
    1865           0 :                 anon_vma = rmap_walk_anon_lock(page, rwc);
    1866             :         }
    1867           0 :         if (!anon_vma)
    1868             :                 return;
    1869             : 
    1870           0 :         pgoff_start = page_to_pgoff(page);
    1871           0 :         pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
    1872           0 :         anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
    1873             :                         pgoff_start, pgoff_end) {
    1874           0 :                 struct vm_area_struct *vma = avc->vma;
    1875           0 :                 unsigned long address = vma_address(page, vma);
    1876             : 
    1877           0 :                 cond_resched();
    1878             : 
    1879           0 :                 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
    1880           0 :                         continue;
    1881             : 
    1882           0 :                 if (!rwc->rmap_one(page, vma, address, rwc->arg))
    1883             :                         break;
    1884           0 :                 if (rwc->done && rwc->done(page))
    1885             :                         break;
    1886             :         }
    1887             : 
    1888           0 :         if (!locked)
    1889           0 :                 anon_vma_unlock_read(anon_vma);
    1890             : }
    1891             : 
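/*
 * Editor's sketch, not part of mm/rmap.c: an ->invalid_vma filter of the
 * kind consulted by the loop above.  Returning true makes rmap_walk_anon()
 * skip that VMA without calling ->rmap_one() on it.  The function name
 * skip_mlocked_vma() and the unused 'arg' cookie are illustrative only.
 */
static bool skip_mlocked_vma(struct vm_area_struct *vma, void *arg)
{
        return vma->vm_flags & VM_LOCKED;       /* skip mlock()ed mappings */
}
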
    1892             : /*
    1893             :  * rmap_walk_file - walk the mappings of a file page (object-based rmap method)
    1894             :  * @page: the page to be handled
    1895             :  * @rwc: control structure for the walk (callbacks and private argument)
    1896             :  * @locked: true if the caller already holds mapping->i_mmap_rwsem
    1897             :  *
    1898             :  * Find all the mappings of the page using the mapping pointer and the vma
    1899             :  * chains contained in the address_space struct it points to.
    1900             :  *
    1901             :  * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
    1902             :  * where the page was found is held for write, so we do not recheck vm_flags
    1903             :  * for that VMA; that should be OK, because that vma should not be VM_LOCKED.
    1904             :  */
    1905          68 : static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
    1906             :                 bool locked)
    1907             : {
    1908          68 :         struct address_space *mapping = page_mapping(page);
    1909          68 :         pgoff_t pgoff_start, pgoff_end;
    1910          68 :         struct vm_area_struct *vma;
    1911             : 
    1912             :         /*
    1913             :          * The page lock not only makes sure that page->mapping cannot
    1914             :          * suddenly be NULLified by truncation, it makes sure that the
    1915             :          * structure at mapping cannot be freed and reused yet,
    1916             :          * so we can safely take mapping->i_mmap_rwsem.
    1917             :          */
    1918         136 :         VM_BUG_ON_PAGE(!PageLocked(page), page);
    1919             : 
    1920          68 :         if (!mapping)
    1921             :                 return;
    1922             : 
    1923          68 :         pgoff_start = page_to_pgoff(page);
    1924          68 :         pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
    1925          68 :         if (!locked)
    1926          68 :                 i_mmap_lock_read(mapping);
    1927         136 :         vma_interval_tree_foreach(vma, &mapping->i_mmap,
    1928             :                         pgoff_start, pgoff_end) {
    1929          68 :                 unsigned long address = vma_address(page, vma);
    1930             : 
    1931          68 :                 cond_resched();
    1932             : 
    1933          68 :                 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
    1934           0 :                         continue;
    1935             : 
    1936          68 :                 if (!rwc->rmap_one(page, vma, address, rwc->arg))
    1937           0 :                         goto done;
    1938          68 :                 if (rwc->done && rwc->done(page))
    1939           0 :                         goto done;
    1940             :         }
    1941             : 
    1942          68 : done:
    1943          68 :         if (!locked)
    1944          68 :                 i_mmap_unlock_read(mapping);
    1945             : }
    1946             : 
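/*
 * Editor's worked example, not part of mm/rmap.c: assume a PMD-sized THP of
 * 512 base pages cached at file index 1024.  The window queried in the
 * interval-tree walks above is then
 *
 *      pgoff_start = page_to_pgoff(page)                  = 1024
 *      pgoff_end   = pgoff_start + thp_nr_pages(page) - 1 = 1535
 *
 * so every VMA whose range overlaps file indexes 1024..1535 is visited,
 * even one that maps only part of the huge page.
 */
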
    1947          68 : void rmap_walk(struct page *page, struct rmap_walk_control *rwc)
    1948             : {
    1949          68 :         if (unlikely(PageKsm(page)))
    1950           0 :                 rmap_walk_ksm(page, rwc);
    1951          68 :         else if (PageAnon(page))
    1952           0 :                 rmap_walk_anon(page, rwc, false);
    1953             :         else
    1954          68 :                 rmap_walk_file(page, rwc, false);
    1955          68 : }
    1956             : 
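/*
 * Editor's sketch, not part of mm/rmap.c: a minimal rmap_walk() user in the
 * style of page_referenced()/page_mkclean().  Only the rmap_walk_control
 * fields dereferenced by the walkers above (.rmap_one, .arg) are used; the
 * names count_one_mapping(), count_mappings() and struct mapping_count are
 * hypothetical.
 */
struct mapping_count {
        int nr;         /* number of VMAs the walk visited */
};

static bool count_one_mapping(struct page *page, struct vm_area_struct *vma,
                              unsigned long address, void *arg)
{
        struct mapping_count *mc = arg;

        mc->nr++;
        return true;    /* keep walking; returning false stops the walk */
}

static int count_mappings(struct page *page)
{
        struct mapping_count mc = { .nr = 0 };
        struct rmap_walk_control rwc = {
                .rmap_one = count_one_mapping,
                .arg = &mc,
        };

        rmap_walk(page, &rwc);  /* dispatches to the ksm/anon/file walker */
        return mc.nr;
}
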
    1957             : /* Like rmap_walk, but caller holds relevant rmap lock */
    1958           0 : void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
    1959             : {
    1960             :         /* no ksm support for now */
    1961           0 :         VM_BUG_ON_PAGE(PageKsm(page), page);
    1962           0 :         if (PageAnon(page))
    1963           0 :                 rmap_walk_anon(page, rwc, true);
    1964             :         else
    1965           0 :                 rmap_walk_file(page, rwc, true);
    1966           0 : }
    1967             : 
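/*
 * Editor's sketch, not part of mm/rmap.c: a caller that sometimes already
 * holds the relevant rmap lock picks between the two entry points above.
 * It reuses the hypothetical count_one_mapping() callback and struct
 * mapping_count from the earlier sketch; 'already_locked' is illustrative.
 */
static void count_mappings_maybe_locked(struct page *page, bool already_locked,
                                        struct mapping_count *mc)
{
        struct rmap_walk_control rwc = {
                .rmap_one = count_one_mapping,
                .arg = mc,
        };

        if (already_locked)
                rmap_walk_locked(page, &rwc);   /* caller holds anon_vma/i_mmap lock */
        else
                rmap_walk(page, &rwc);          /* walker takes and drops the lock */
}
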
    1968             : #ifdef CONFIG_HUGETLB_PAGE
    1969             : /*
    1970             :  * The following two functions are for anonymous (privately mapped) hugepages.
    1971             :  * Unlike common anonymous pages, anonymous hugepages have no accounting code
    1972             :  * and no lru code, because we handle hugepages differently from common pages.
    1973             :  */
    1974             : void hugepage_add_anon_rmap(struct page *page,
    1975             :                             struct vm_area_struct *vma, unsigned long address)
    1976             : {
    1977             :         struct anon_vma *anon_vma = vma->anon_vma;
    1978             :         int first;
    1979             : 
    1980             :         BUG_ON(!PageLocked(page));
    1981             :         BUG_ON(!anon_vma);
    1982             :         /* address might be in next vma when migration races vma_adjust */
    1983             :         first = atomic_inc_and_test(compound_mapcount_ptr(page));
    1984             :         if (first)
    1985             :                 __page_set_anon_rmap(page, vma, address, 0);
    1986             : }
    1987             : 
    1988             : void hugepage_add_new_anon_rmap(struct page *page,
    1989             :                         struct vm_area_struct *vma, unsigned long address)
    1990             : {
    1991             :         BUG_ON(address < vma->vm_start || address >= vma->vm_end);
    1992             :         atomic_set(compound_mapcount_ptr(page), 0);
    1993             :         if (hpage_pincount_available(page))
    1994             :                 atomic_set(compound_pincount_ptr(page), 0);
    1995             : 
    1996             :         __page_set_anon_rmap(page, vma, address, 1);
    1997             : }
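
/*
 * Editor's note, not part of mm/rmap.c: the compound mapcount is stored with
 * a bias of -1, so "-1" means the huge page is not mapped at all.  That is
 * why hugepage_add_anon_rmap() can detect the first mapping with
 * atomic_inc_and_test() (the -1 -> 0 transition makes the test succeed), and
 * why hugepage_add_new_anon_rmap() initialises the counter to 0, not 1, when
 * it installs the very first mapping of a freshly allocated hugepage.
 */
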
    1998             : #endif /* CONFIG_HUGETLB_PAGE */

Generated by: LCOV version 1.14