Line data Source code
1 : /*
2 : * mm/rmap.c - physical to virtual reverse mappings
3 : *
4 : * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
5 : * Released under the General Public License (GPL).
6 : *
7 : * Simple, low overhead reverse mapping scheme.
8 : * Please try to keep this thing as modular as possible.
9 : *
10 : * Provides methods for unmapping each kind of mapped page:
11 : * the anon methods track anonymous pages, and
12 : * the file methods track pages belonging to an inode.
13 : *
14 : * Original design by Rik van Riel <riel@conectiva.com.br> 2001
15 : * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
16 : * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
17 : * Contributions by Hugh Dickins 2003, 2004
18 : */
19 :
20 : /*
21 : * Lock ordering in mm:
22 : *
23 : * inode->i_mutex (while writing or truncating, not reading or faulting)
24 : * mm->mmap_lock
25 : * page->flags PG_locked (lock_page) * (see hugetlbfs below)
26 : * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
27 : * mapping->i_mmap_rwsem
28 : * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
29 : * anon_vma->rwsem
30 : * mm->page_table_lock or pte_lock
31 : * swap_lock (in swap_duplicate, swap_info_get)
32 : * mmlist_lock (in mmput, drain_mmlist and others)
33 : * mapping->private_lock (in __set_page_dirty_buffers)
34 : * lock_page_memcg move_lock (in __set_page_dirty_buffers)
35 : * i_pages lock (widely used)
36 : * lruvec->lru_lock (in lock_page_lruvec_irq)
37 : * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
38 : * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
39 : * sb_lock (within inode_lock in fs/fs-writeback.c)
40 : * i_pages lock (widely used, in set_page_dirty,
41 : * in arch-dependent flush_dcache_mmap_lock,
42 : * within bdi.wb->list_lock in __sync_single_inode)
43 : *
44 : * anon_vma->rwsem, mapping->i_mutex (memory_failure, collect_procs_anon)
45 : * ->tasklist_lock
46 : * pte map lock
47 : *
48 : * * hugetlbfs PageHuge() pages take locks in this order:
49 : * mapping->i_mmap_rwsem
50 : * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
51 : * page->flags PG_locked (lock_page)
52 : */
53 :
54 : #include <linux/mm.h>
55 : #include <linux/sched/mm.h>
56 : #include <linux/sched/task.h>
57 : #include <linux/pagemap.h>
58 : #include <linux/swap.h>
59 : #include <linux/swapops.h>
60 : #include <linux/slab.h>
61 : #include <linux/init.h>
62 : #include <linux/ksm.h>
63 : #include <linux/rmap.h>
64 : #include <linux/rcupdate.h>
65 : #include <linux/export.h>
66 : #include <linux/memcontrol.h>
67 : #include <linux/mmu_notifier.h>
68 : #include <linux/migrate.h>
69 : #include <linux/hugetlb.h>
70 : #include <linux/huge_mm.h>
71 : #include <linux/backing-dev.h>
72 : #include <linux/page_idle.h>
73 : #include <linux/memremap.h>
74 : #include <linux/userfaultfd_k.h>
75 :
76 : #include <asm/tlbflush.h>
77 :
78 : #include <trace/events/tlb.h>
79 :
80 : #include "internal.h"
81 :
82 : static struct kmem_cache *anon_vma_cachep;
83 : static struct kmem_cache *anon_vma_chain_cachep;
84 :
85 40833 : static inline struct anon_vma *anon_vma_alloc(void)
86 : {
87 40833 : struct anon_vma *anon_vma;
88 :
89 40833 : anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
90 40833 : if (anon_vma) {
91 40833 : atomic_set(&anon_vma->refcount, 1);
92 40833 : anon_vma->degree = 1; /* Reference for first vma */
93 40833 : anon_vma->parent = anon_vma;
94 : /*
95 : * Initialise the anon_vma root to point to itself. If called
96 : * from fork, the root will be reset to the parents anon_vma.
97 : */
98 40833 : anon_vma->root = anon_vma;
99 : }
100 :
101 40833 : return anon_vma;
102 : }
103 :
104 39927 : static inline void anon_vma_free(struct anon_vma *anon_vma)
105 : {
106 39927 : VM_BUG_ON(atomic_read(&anon_vma->refcount));
107 :
108 : /*
109 : * Synchronize against page_lock_anon_vma_read() such that
110 : * we can safely hold the lock without the anon_vma getting
111 : * freed.
112 : *
113 : * Relies on the full mb implied by the atomic_dec_and_test() from
114 : * put_anon_vma() against the acquire barrier implied by
115 : * down_read_trylock() from page_lock_anon_vma_read(). This orders:
116 : *
117 : * page_lock_anon_vma_read() VS put_anon_vma()
118 : * down_read_trylock() atomic_dec_and_test()
119 : * LOCK MB
120 : * atomic_read() rwsem_is_locked()
121 : *
122 : * LOCK should suffice since the actual taking of the lock must
123 : * happen _before_ what follows.
124 : */
125 39927 : might_sleep();
126 39927 : if (rwsem_is_locked(&anon_vma->root->rwsem)) {
127 18 : anon_vma_lock_write(anon_vma);
128 18 : anon_vma_unlock_write(anon_vma);
129 : }
130 :
131 39926 : kmem_cache_free(anon_vma_cachep, anon_vma);
132 39925 : }
133 :
134 82839 : static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
135 : {
136 82839 : return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
137 : }
138 :
139 81411 : static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
140 : {
141 81411 : kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
142 41490 : }
143 :
144 82840 : static void anon_vma_chain_link(struct vm_area_struct *vma,
145 : struct anon_vma_chain *avc,
146 : struct anon_vma *anon_vma)
147 : {
148 82840 : avc->vma = vma;
149 82840 : avc->anon_vma = anon_vma;
150 82840 : list_add(&avc->same_vma, &vma->anon_vma_chain);
151 82840 : anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
152 : }
153 :
154 : /**
155 : * __anon_vma_prepare - attach an anon_vma to a memory region
156 : * @vma: the memory region in question
157 : *
158 : * This makes sure the memory mapping described by 'vma' has
159 : * an 'anon_vma' attached to it, so that we can associate the
160 : * anonymous pages mapped into it with that anon_vma.
161 : *
162 : * The common case will be that we already have one, which
163 : * is handled inline by anon_vma_prepare(). But if
164 : * not we either need to find an adjacent mapping that we
165 : * can re-use the anon_vma from (very common when the only
166 : * reason for splitting a vma has been mprotect()), or we
167 : * allocate a new one.
168 : *
169 : * Anon-vma allocations are very subtle, because we may have
170 : * optimistically looked up an anon_vma in page_lock_anon_vma_read()
171 : * and that may actually touch the rwsem even in the newly
172 : * allocated vma (it depends on RCU to make sure that the
173 : * anon_vma isn't actually destroyed).
174 : *
175 : * As a result, we need to do proper anon_vma locking even
176 : * for the new allocation. At the same time, we do not want
177 : * to do any locking for the common case of already having
178 : * an anon_vma.
179 : *
180 : * This must be called with the mmap_lock held for reading.
181 : */
182 13582 : int __anon_vma_prepare(struct vm_area_struct *vma)
183 : {
184 13582 : struct mm_struct *mm = vma->vm_mm;
185 13582 : struct anon_vma *anon_vma, *allocated;
186 13582 : struct anon_vma_chain *avc;
187 :
188 13582 : might_sleep();
189 :
190 13582 : avc = anon_vma_chain_alloc(GFP_KERNEL);
191 13582 : if (!avc)
192 0 : goto out_enomem;
193 :
194 13582 : anon_vma = find_mergeable_anon_vma(vma);
195 13582 : allocated = NULL;
196 13582 : if (!anon_vma) {
197 13582 : anon_vma = anon_vma_alloc();
198 13582 : if (unlikely(!anon_vma))
199 0 : goto out_enomem_free_avc;
200 : allocated = anon_vma;
201 : }
202 :
203 13582 : anon_vma_lock_write(anon_vma);
204 : /* page_table_lock to protect against threads */
205 13582 : spin_lock(&mm->page_table_lock);
206 13582 : if (likely(!vma->anon_vma)) {
207 13582 : vma->anon_vma = anon_vma;
208 13582 : anon_vma_chain_link(vma, avc, anon_vma);
209 : /* vma reference or self-parent link for new root */
210 13582 : anon_vma->degree++;
211 13582 : allocated = NULL;
212 13582 : avc = NULL;
213 : }
214 13582 : spin_unlock(&mm->page_table_lock);
215 13582 : anon_vma_unlock_write(anon_vma);
216 :
217 13582 : if (unlikely(allocated))
218 0 : put_anon_vma(allocated);
219 13582 : if (unlikely(avc))
220 0 : anon_vma_chain_free(avc);
221 :
222 : return 0;
223 :
224 0 : out_enomem_free_avc:
225 0 : anon_vma_chain_free(avc);
226 : out_enomem:
227 : return -ENOMEM;
228 : }
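The "common case ... handled inline by anon_vma_prepare()" mentioned in the comment above lives in include/linux/rmap.h; a minimal sketch of that wrapper (reconstructed from memory, so treat the exact form as an assumption):

static inline int anon_vma_prepare(struct vm_area_struct *vma)
{
	/* fast path: the vma already has an anon_vma attached */
	if (likely(vma->anon_vma))
		return 0;

	/* slow path: allocate or reuse one, with the locking done above */
	return __anon_vma_prepare(vma);
}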
229 :
230 : /*
231 : * This is a useful helper function for locking the anon_vma root as
232 : * we traverse the vma->anon_vma_chain, looping over anon_vma's that
233 : * have the same vma.
234 : *
235 : * Such anon_vma's should have the same root, so you'd expect to see
236 : * just a single lock of the root's rwsem for the whole traversal.
237 : */
238 123426 : static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
239 : {
240 123426 : struct anon_vma *new_root = anon_vma->root;
241 123426 : if (new_root != root) {
242 79238 : if (WARN_ON_ONCE(root))
243 0 : up_write(&root->rwsem);
244 79238 : root = new_root;
245 79238 : down_write(&root->rwsem);
246 : }
247 123421 : return root;
248 : }
249 :
250 168391 : static inline void unlock_anon_vma_root(struct anon_vma *root)
251 : {
252 168391 : if (root)
253 79235 : up_write(&root->rwsem);
254 : }
255 :
256 : /*
257 : * Attach the anon_vmas from src to dst.
258 : * Returns 0 on success, -ENOMEM on failure.
259 : *
260 : * anon_vma_clone() is called by __vma_adjust(), __split_vma(), copy_vma() and
261 : * anon_vma_fork(). The first three want an exact copy of src, while the last
262 : * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent
263 : * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call,
264 : * we can identify this case by checking (!dst->anon_vma && src->anon_vma).
265 : *
266 : * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
267 : * and reuse existing anon_vma which has no vmas and only one child anon_vma.
268 : * This prevents degradation of the anon_vma hierarchy to an endless linear chain
269 : * in case of a constantly forking task. On the other hand, an anon_vma with more
270 : * than one child is not reused even if there is no live vma, so the rmap
271 : * walker has a good chance of avoiding a scan of the whole hierarchy when it
272 : * searches for where a page is mapped.
273 : */
274 52824 : int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
275 : {
276 52824 : struct anon_vma_chain *avc, *pavc;
277 52824 : struct anon_vma *root = NULL;
278 :
279 94830 : list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
280 42006 : struct anon_vma *anon_vma;
281 :
282 42006 : avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
283 42007 : if (unlikely(!avc)) {
284 0 : unlock_anon_vma_root(root);
285 0 : root = NULL;
286 0 : avc = anon_vma_chain_alloc(GFP_KERNEL);
287 0 : if (!avc)
288 0 : goto enomem_failure;
289 : }
290 42007 : anon_vma = pavc->anon_vma;
291 42007 : root = lock_anon_vma_root(root, anon_vma);
292 42007 : anon_vma_chain_link(dst, avc, anon_vma);
293 :
294 : /*
295 : * Reuse an existing anon_vma if its degree is lower than two,
296 : * which means it has no vma and only one anon_vma child.
297 : *
298 : * Do not choose the parent anon_vma, otherwise the first child
299 : * will always reuse it. Root anon_vma is never reused:
300 : * it has self-parent reference and at least one child.
301 : */
302 42006 : if (!dst->anon_vma && src->anon_vma &&
303 8603 : anon_vma != src->anon_vma && anon_vma->degree < 2)
304 0 : dst->anon_vma = anon_vma;
305 : }
306 52824 : if (dst->anon_vma)
307 6152 : dst->anon_vma->degree++;
308 52824 : unlock_anon_vma_root(root);
309 : return 0;
310 :
311 0 : enomem_failure:
312 : /*
313 : * dst->anon_vma is dropped here otherwise its degree can be incorrectly
314 : * decremented in unlink_anon_vmas().
315 : * We can safely do this because callers of anon_vma_clone() don't care
316 : * about dst->anon_vma if anon_vma_clone() failed.
317 : */
318 0 : dst->anon_vma = NULL;
319 0 : unlink_anon_vmas(dst);
320 0 : return -ENOMEM;
321 : }
322 :
323 : /*
324 : * Attach vma to its own anon_vma, as well as to the anon_vmas that
325 : * the corresponding VMA in the parent process is attached to.
326 : * Returns 0 on success, non-zero on failure.
327 : */
328 60180 : int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
329 : {
330 60180 : struct anon_vma_chain *avc;
331 60180 : struct anon_vma *anon_vma;
332 60180 : int error;
333 :
334 : /* Don't bother if the parent process has no anon_vma here. */
335 60180 : if (!pvma->anon_vma)
336 : return 0;
337 :
338 : /* Drop inherited anon_vma, we'll reuse existing or allocate new. */
339 27251 : vma->anon_vma = NULL;
340 :
341 : /*
342 : * First, attach the new VMA to the parent VMA's anon_vmas,
343 : * so rmap can find non-COWed pages in child processes.
344 : */
345 27251 : error = anon_vma_clone(vma, pvma);
346 27251 : if (error)
347 : return error;
348 :
349 : /* An existing anon_vma has been reused, all done then. */
350 27251 : if (vma->anon_vma)
351 : return 0;
352 :
353 : /* Then add our own anon_vma. */
354 27251 : anon_vma = anon_vma_alloc();
355 27251 : if (!anon_vma)
356 0 : goto out_error;
357 27251 : avc = anon_vma_chain_alloc(GFP_KERNEL);
358 27251 : if (!avc)
359 0 : goto out_error_free_anon_vma;
360 :
361 : /*
362 : * The root anon_vma's rwsem is the lock actually used when we
363 : * lock any of the anon_vmas in this anon_vma tree.
364 : */
365 27251 : anon_vma->root = pvma->anon_vma->root;
366 27251 : anon_vma->parent = pvma->anon_vma;
367 : /*
368 : * With refcounts, an anon_vma can stay around longer than the
369 : * process it belongs to. The root anon_vma needs to be pinned until
370 : * this anon_vma is freed, because the lock lives in the root.
371 : */
372 27251 : get_anon_vma(anon_vma->root);
373 : /* Mark this anon_vma as the one where our new (COWed) pages go. */
374 27251 : vma->anon_vma = anon_vma;
375 27251 : anon_vma_lock_write(anon_vma);
376 27251 : anon_vma_chain_link(vma, avc, anon_vma);
377 27251 : anon_vma->parent->degree++;
378 27251 : anon_vma_unlock_write(anon_vma);
379 :
380 27251 : return 0;
381 :
382 0 : out_error_free_anon_vma:
383 0 : put_anon_vma(anon_vma);
384 0 : out_error:
385 0 : unlink_anon_vmas(vma);
386 0 : return -ENOMEM;
387 : }
388 :
389 115566 : void unlink_anon_vmas(struct vm_area_struct *vma)
390 : {
391 115566 : struct anon_vma_chain *avc, *next;
392 115566 : struct anon_vma *root = NULL;
393 :
394 : /*
395 : * Unlink each anon_vma chained to the VMA. This list is ordered
396 : * from newest to oldest, ensuring the root anon_vma gets freed last.
397 : */
398 196970 : list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
399 81403 : struct anon_vma *anon_vma = avc->anon_vma;
400 :
401 81403 : root = lock_anon_vma_root(root, anon_vma);
402 81404 : anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
403 :
404 : /*
405 : * Leave empty anon_vmas on the list - we'll need
406 : * to free them outside the lock.
407 : */
408 81411 : if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
409 39914 : anon_vma->parent->degree--;
410 39914 : continue;
411 : }
412 :
413 41497 : list_del(&avc->same_vma);
414 122901 : anon_vma_chain_free(avc);
415 : }
416 115567 : if (vma->anon_vma) {
417 45831 : vma->anon_vma->degree--;
418 :
419 : /*
420 : * The vma is still needed after unlink; its anon_vma will be prepared
421 : * again when a fault is handled.
422 : */
423 45831 : vma->anon_vma = NULL;
424 : }
425 115567 : unlock_anon_vma_root(root);
426 :
427 : /*
428 : * Iterate the list once more, it now only contains empty and unlinked
429 : * anon_vmas; destroy them. This could not be done earlier because
430 : * __put_anon_vma() needs to write-acquire the anon_vma->root->rwsem.
431 : */
432 155474 : list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
433 39915 : struct anon_vma *anon_vma = avc->anon_vma;
434 :
435 39915 : VM_WARN_ON(anon_vma->degree);
436 39915 : put_anon_vma(anon_vma);
437 :
438 39914 : list_del(&avc->same_vma);
439 39914 : anon_vma_chain_free(avc);
440 : }
441 115559 : }
442 :
443 4464 : static void anon_vma_ctor(void *data)
444 : {
445 4464 : struct anon_vma *anon_vma = data;
446 :
447 4464 : init_rwsem(&anon_vma->rwsem);
448 4464 : atomic_set(&anon_vma->refcount, 0);
449 4463 : anon_vma->rb_root = RB_ROOT_CACHED;
450 4463 : }
451 :
452 1 : void __init anon_vma_init(void)
453 : {
454 1 : anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
455 : 0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
456 : anon_vma_ctor);
457 1 : anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
458 : SLAB_PANIC|SLAB_ACCOUNT);
459 1 : }
460 :
461 : /*
462 : * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
463 : *
464 : * Since there is no serialization whatsoever against page_remove_rmap(),
465 : * the best this function can do is return a refcount-increased anon_vma
466 : * that might have been relevant to this page.
467 : *
468 : * The page might have been remapped to a different anon_vma or the anon_vma
469 : * returned may already be freed (and even reused).
470 : *
471 : * In case it was remapped to a different anon_vma, the new anon_vma will be a
472 : * child of the old anon_vma, and the anon_vma lifetime rules will therefore
473 : * ensure that any anon_vma obtained from the page will still be valid for as
474 : * long as we observe page_mapped() [ hence all those page_mapped() tests ].
475 : *
476 : * All users of this function must be very careful when walking the anon_vma
477 : * chain and verify that the page in question is indeed mapped in it
478 : * [ something equivalent to page_mapped_in_vma() ].
479 : *
480 : * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
481 : * page_remove_rmap() that the anon_vma pointer from page->mapping is valid
482 : * if there is a mapcount, we can dereference the anon_vma after observing
483 : * those.
484 : */
485 0 : struct anon_vma *page_get_anon_vma(struct page *page)
486 : {
487 0 : struct anon_vma *anon_vma = NULL;
488 0 : unsigned long anon_mapping;
489 :
490 0 : rcu_read_lock();
491 0 : anon_mapping = (unsigned long)READ_ONCE(page->mapping);
492 0 : if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
493 0 : goto out;
494 0 : if (!page_mapped(page))
495 0 : goto out;
496 :
497 0 : anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
498 0 : if (!atomic_inc_not_zero(&anon_vma->refcount)) {
499 0 : anon_vma = NULL;
500 0 : goto out;
501 : }
502 :
503 : /*
504 : * If this page is still mapped, then its anon_vma cannot have been
505 : * freed. But if it has been unmapped, we have no security against the
506 : * anon_vma structure being freed and reused (for another anon_vma:
507 : * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
508 : * above cannot corrupt).
509 : */
510 0 : if (!page_mapped(page)) {
511 0 : rcu_read_unlock();
512 0 : put_anon_vma(anon_vma);
513 0 : return NULL;
514 : }
515 0 : out:
516 0 : rcu_read_unlock();
517 :
518 0 : return anon_vma;
519 : }
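A hedged usage sketch of the API just defined: a caller takes the reference, optionally locks the anon_vma, and must drop the reference when done. The surrounding caller context is assumed, not taken from this file.

	struct anon_vma *anon_vma = page_get_anon_vma(page);

	if (anon_vma) {
		anon_vma_lock_read(anon_vma);	/* may sleep; we hold a reference */
		/* ... walk the anon_vma interval tree via the rmap helpers ... */
		anon_vma_unlock_read(anon_vma);
		put_anon_vma(anon_vma);
	}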
520 :
521 : /*
522 : * Similar to page_get_anon_vma() except it locks the anon_vma.
523 : *
524 : * It's a little more complex as it tries to keep the fast path to a single
525 : * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
526 : * reference like with page_get_anon_vma() and then block on the mutex.
527 : */
528 0 : struct anon_vma *page_lock_anon_vma_read(struct page *page)
529 : {
530 0 : struct anon_vma *anon_vma = NULL;
531 0 : struct anon_vma *root_anon_vma;
532 0 : unsigned long anon_mapping;
533 :
534 0 : rcu_read_lock();
535 0 : anon_mapping = (unsigned long)READ_ONCE(page->mapping);
536 0 : if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
537 0 : goto out;
538 0 : if (!page_mapped(page))
539 0 : goto out;
540 :
541 0 : anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
542 0 : root_anon_vma = READ_ONCE(anon_vma->root);
543 0 : if (down_read_trylock(&root_anon_vma->rwsem)) {
544 : /*
545 : * If the page is still mapped, then this anon_vma is still
546 : * its anon_vma, and holding the mutex ensures that it will
547 : * not go away, see anon_vma_free().
548 : */
549 0 : if (!page_mapped(page)) {
550 0 : up_read(&root_anon_vma->rwsem);
551 0 : anon_vma = NULL;
552 : }
553 0 : goto out;
554 : }
555 :
556 : /* trylock failed, we got to sleep */
557 0 : if (!atomic_inc_not_zero(&anon_vma->refcount)) {
558 0 : anon_vma = NULL;
559 0 : goto out;
560 : }
561 :
562 0 : if (!page_mapped(page)) {
563 0 : rcu_read_unlock();
564 0 : put_anon_vma(anon_vma);
565 0 : return NULL;
566 : }
567 :
568 : /* we pinned the anon_vma, it's safe to sleep */
569 0 : rcu_read_unlock();
570 0 : anon_vma_lock_read(anon_vma);
571 :
572 0 : if (atomic_dec_and_test(&anon_vma->refcount)) {
573 : /*
574 : * Oops, we held the last refcount, release the lock
575 : * and bail -- can't simply use put_anon_vma() because
576 : * we'll deadlock on the anon_vma_lock_write() recursion.
577 : */
578 0 : anon_vma_unlock_read(anon_vma);
579 0 : __put_anon_vma(anon_vma);
580 0 : anon_vma = NULL;
581 : }
582 :
583 : return anon_vma;
584 :
585 0 : out:
586 0 : rcu_read_unlock();
587 0 : return anon_vma;
588 : }
589 :
590 0 : void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
591 : {
592 0 : anon_vma_unlock_read(anon_vma);
593 0 : }
594 :
595 : #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
596 : /*
597 : * Flush TLB entries for recently unmapped pages from remote CPUs. It is
598 : * important if a PTE was dirty when it was unmapped that it's flushed
599 : * before any IO is initiated on the page to prevent lost writes. Similarly,
600 : * it must be flushed before freeing to prevent data leakage.
601 : */
602 0 : void try_to_unmap_flush(void)
603 : {
604 0 : struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
605 :
606 0 : if (!tlb_ubc->flush_required)
607 : return;
608 :
609 0 : arch_tlbbatch_flush(&tlb_ubc->arch);
610 0 : tlb_ubc->flush_required = false;
611 0 : tlb_ubc->writable = false;
612 : }
613 :
614 : /* Flush iff there are potentially writable TLB entries that can race with IO */
615 0 : void try_to_unmap_flush_dirty(void)
616 : {
617 0 : struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
618 :
619 0 : if (tlb_ubc->writable)
620 0 : try_to_unmap_flush();
621 0 : }
622 :
623 0 : static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
624 : {
625 0 : struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
626 :
627 0 : arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
628 0 : tlb_ubc->flush_required = true;
629 :
630 : /*
631 : * Ensure compiler does not re-order the setting of tlb_flush_batched
632 : * before the PTE is cleared.
633 : */
634 0 : barrier();
635 0 : mm->tlb_flush_batched = true;
636 :
637 : /*
638 : * If the PTE was dirty then it's best to assume it's writable. The
639 : * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
640 : * before the page is queued for IO.
641 : */
642 0 : if (writable)
643 0 : tlb_ubc->writable = true;
644 0 : }
645 :
646 : /*
647 : * Returns true if the TLB flush should be deferred to the end of a batch of
648 : * unmap operations to reduce IPIs.
649 : */
650 0 : static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
651 : {
652 0 : bool should_defer = false;
653 :
654 0 : if (!(flags & TTU_BATCH_FLUSH))
655 : return false;
656 :
657 : /* If remote CPUs need to be flushed then defer the flush to the batch */
658 0 : if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
659 0 : should_defer = true;
660 0 : put_cpu();
661 :
662 0 : return should_defer;
663 : }
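Putting the batching helpers together, the intended flow (a sketch mirroring the logic in try_to_unmap_one() further below) is: clear the PTE under the pte lock, record the pending flush, and flush once per batch before IO or freeing.

	/* under the pte lock, for each page being unmapped: */
	if (should_defer_flush(mm, flags)) {
		pteval = ptep_get_and_clear(mm, address, pvmw.pte);
		set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
	} else {
		pteval = ptep_clear_flush(vma, address, pvmw.pte);
	}

	/* later, once per batch, before the pages are written out or freed: */
	try_to_unmap_flush();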
664 :
665 : /*
666 : * Reclaim unmaps pages under the PTL but do not flush the TLB prior to
667 : * releasing the PTL if TLB flushes are batched. It's possible for a parallel
668 : * operation such as mprotect or munmap to race between reclaim unmapping
669 : * the page and flushing the page. If this race occurs, it potentially allows
670 : * access to data via a stale TLB entry. Tracking all mm's that have TLB
671 : * batching in flight would be expensive during reclaim so instead track
672 : * whether TLB batching occurred in the past and if so then do a flush here
673 : * if required. This will cost one additional flush per reclaim cycle paid
674 : * by the first operation at risk, such as mprotect or munmap.
675 : *
676 : * This must be called under the PTL so that an access to tlb_flush_batched
677 : * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
678 : * via the PTL.
679 : */
680 128052 : void flush_tlb_batched_pending(struct mm_struct *mm)
681 : {
682 128052 : if (data_race(mm->tlb_flush_batched)) {
683 0 : flush_tlb_mm(mm);
684 :
685 : /*
686 : * Do not allow the compiler to re-order the clearing of
687 : * tlb_flush_batched before the tlb is flushed.
688 : */
689 0 : barrier();
690 0 : mm->tlb_flush_batched = false;
691 : }
692 128052 : }
693 : #else
694 : static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
695 : {
696 : }
697 :
698 : static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
699 : {
700 : return false;
701 : }
702 : #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
703 :
704 : /*
705 : * At what user virtual address is page expected in vma?
706 : * Caller should check the page is actually part of the vma.
707 : */
708 0 : unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
709 : {
710 0 : unsigned long address;
711 0 : if (PageAnon(page)) {
712 0 : struct anon_vma *page__anon_vma = page_anon_vma(page);
713 : /*
714 : * Note: swapoff's unuse_vma() is more efficient with this
715 : * check, and needs it to match anon_vma when KSM is active.
716 : */
717 0 : if (!vma->anon_vma || !page__anon_vma ||
718 0 : vma->anon_vma->root != page__anon_vma->root)
719 : return -EFAULT;
720 0 : } else if (page->mapping) {
721 0 : if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping)
722 : return -EFAULT;
723 : } else
724 : return -EFAULT;
725 0 : address = __vma_address(page, vma);
726 0 : if (unlikely(address < vma->vm_start || address >= vma->vm_end))
727 0 : return -EFAULT;
728 : return address;
729 : }
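The address computation hidden in __vma_address() is essentially the inverse of linear_page_index(); a sketch of the arithmetic (assuming the usual helpers from mm/internal.h):

	pgoff_t pgoff = page_to_pgoff(page);
	unsigned long address;

	/* linear mapping: page offset relative to the vma, scaled to bytes */
	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);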
730 :
731 28 : pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
732 : {
733 28 : pgd_t *pgd;
734 28 : p4d_t *p4d;
735 28 : pud_t *pud;
736 28 : pmd_t *pmd = NULL;
737 28 : pmd_t pmde;
738 :
739 28 : pgd = pgd_offset(mm, address);
740 28 : if (!pgd_present(*pgd))
741 : goto out;
742 :
743 28 : p4d = p4d_offset(pgd, address);
744 28 : if (!p4d_present(*p4d))
745 0 : goto out;
746 :
747 28 : pud = pud_offset(p4d, address);
748 56 : if (!pud_present(*pud))
749 0 : goto out;
750 :
751 28 : pmd = pmd_offset(pud, address);
752 : /*
753 : * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
754 : * without holding anon_vma lock for write. So when looking for a
755 : * genuine pmde (in which to find pte), test present and !THP together.
756 : */
757 28 : pmde = *pmd;
758 28 : barrier();
759 56 : if (!pmd_present(pmde) || pmd_trans_huge(pmde))
760 : pmd = NULL;
761 6 : out:
762 28 : return pmd;
763 : }
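Callers that need the pte usually follow mm_find_pmd() with pte_offset_map_lock(); a hedged sketch of that pairing:

	pmd_t *pmd = mm_find_pmd(mm, address);

	if (pmd) {
		spinlock_t *ptl;
		pte_t *pte = pte_offset_map_lock(mm, pmd, address, &ptl);

		/* ... inspect or modify *pte under the pte lock ... */
		pte_unmap_unlock(pte, ptl);
	}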
764 :
765 : struct page_referenced_arg {
766 : int mapcount;
767 : int referenced;
768 : unsigned long vm_flags;
769 : struct mem_cgroup *memcg;
770 : };
771 : /*
772 : * arg: page_referenced_arg will be passed
773 : */
774 0 : static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
775 : unsigned long address, void *arg)
776 : {
777 0 : struct page_referenced_arg *pra = arg;
778 0 : struct page_vma_mapped_walk pvmw = {
779 : .page = page,
780 : .vma = vma,
781 : .address = address,
782 : };
783 0 : int referenced = 0;
784 :
785 0 : while (page_vma_mapped_walk(&pvmw)) {
786 0 : address = pvmw.address;
787 :
788 0 : if (vma->vm_flags & VM_LOCKED) {
789 0 : page_vma_mapped_walk_done(&pvmw);
790 0 : pra->vm_flags |= VM_LOCKED;
791 0 : return false; /* To break the loop */
792 : }
793 :
794 0 : if (pvmw.pte) {
795 0 : if (ptep_clear_flush_young_notify(vma, address,
796 : pvmw.pte)) {
797 : /*
798 : * Don't treat a reference through
799 : * a sequentially read mapping as such.
800 : * If the page has been used in another mapping,
801 : * we will catch it; if this other mapping is
802 : * already gone, the unmap path will have set
803 : * PG_referenced or activated the page.
804 : */
805 0 : if (likely(!(vma->vm_flags & VM_SEQ_READ)))
806 0 : referenced++;
807 : }
808 0 : } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
809 0 : if (pmdp_clear_flush_young_notify(vma, address,
810 : pvmw.pmd))
811 0 : referenced++;
812 : } else {
813 : /* unexpected pmd-mapped page? */
814 : WARN_ON_ONCE(1);
815 : }
816 :
817 0 : pra->mapcount--;
818 : }
819 :
820 0 : if (referenced)
821 0 : clear_page_idle(page);
822 0 : if (test_and_clear_page_young(page))
823 : referenced++;
824 :
825 0 : if (referenced) {
826 0 : pra->referenced++;
827 0 : pra->vm_flags |= vma->vm_flags;
828 : }
829 :
830 0 : if (!pra->mapcount)
831 0 : return false; /* To break the loop */
832 :
833 : return true;
834 : }
835 :
836 0 : static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
837 : {
838 0 : struct page_referenced_arg *pra = arg;
839 0 : struct mem_cgroup *memcg = pra->memcg;
840 :
841 0 : if (!mm_match_cgroup(vma->vm_mm, memcg))
842 : return true;
843 :
844 0 : return false;
845 : }
846 :
847 : /**
848 : * page_referenced - test if the page was referenced
849 : * @page: the page to test
850 : * @is_locked: caller holds lock on the page
851 : * @memcg: target memory cgroup
852 : * @vm_flags: collect the vm_flags of the vmas that actually referenced the page
853 : *
854 : * Quick test_and_clear_referenced for all mappings to a page,
855 : * returns the number of ptes which referenced the page.
856 : */
857 0 : int page_referenced(struct page *page,
858 : int is_locked,
859 : struct mem_cgroup *memcg,
860 : unsigned long *vm_flags)
861 : {
862 0 : int we_locked = 0;
863 0 : struct page_referenced_arg pra = {
864 0 : .mapcount = total_mapcount(page),
865 : .memcg = memcg,
866 : };
867 0 : struct rmap_walk_control rwc = {
868 : .rmap_one = page_referenced_one,
869 : .arg = (void *)&pra,
870 : .anon_lock = page_lock_anon_vma_read,
871 : };
872 :
873 0 : *vm_flags = 0;
874 0 : if (!pra.mapcount)
875 : return 0;
876 :
877 0 : if (!page_rmapping(page))
878 : return 0;
879 :
880 0 : if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
881 0 : we_locked = trylock_page(page);
882 0 : if (!we_locked)
883 : return 1;
884 : }
885 :
886 : /*
887 : * If we are reclaiming on behalf of a cgroup, skip
888 : * counting on behalf of references from different
889 : * cgroups
890 : */
891 0 : if (memcg) {
892 0 : rwc.invalid_vma = invalid_page_referenced_vma;
893 : }
894 :
895 0 : rmap_walk(page, &rwc);
896 0 : *vm_flags = pra.vm_flags;
897 :
898 0 : if (we_locked)
899 0 : unlock_page(page);
900 :
901 0 : return pra.referenced;
902 : }
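A hedged sketch of how a reclaim-style caller consumes the result: the returned pte count and the collected vm_flags together decide whether the page is worth keeping (the keep variable is illustrative, not from this file).

	unsigned long vm_flags;
	bool keep;

	/* the page is locked by this caller, hence is_locked == 1 */
	keep = page_referenced(page, 1, NULL, &vm_flags) > 0 ||
	       (vm_flags & VM_LOCKED);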
903 :
904 68 : static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
905 : unsigned long address, void *arg)
906 : {
907 68 : struct page_vma_mapped_walk pvmw = {
908 : .page = page,
909 : .vma = vma,
910 : .address = address,
911 : .flags = PVMW_SYNC,
912 : };
913 68 : struct mmu_notifier_range range;
914 68 : int *cleaned = arg;
915 :
916 : /*
917 : * We have to assume the worst case, i.e. pmd, for invalidation. Note that
918 : * the page cannot be freed from this function.
919 : */
920 68 : mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
921 : 0, vma, vma->vm_mm, address,
922 : min(vma->vm_end, address + page_size(page)));
923 136 : mmu_notifier_invalidate_range_start(&range);
924 :
925 136 : while (page_vma_mapped_walk(&pvmw)) {
926 68 : int ret = 0;
927 :
928 68 : address = pvmw.address;
929 68 : if (pvmw.pte) {
930 68 : pte_t entry;
931 68 : pte_t *pte = pvmw.pte;
932 :
933 68 : if (!pte_dirty(*pte) && !pte_write(*pte))
934 0 : continue;
935 :
936 68 : flush_cache_page(vma, address, pte_pfn(*pte));
937 68 : entry = ptep_clear_flush(vma, address, pte);
938 68 : entry = pte_wrprotect(entry);
939 68 : entry = pte_mkclean(entry);
940 68 : set_pte_at(vma->vm_mm, address, pte, entry);
941 68 : ret = 1;
942 : } else {
943 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
944 0 : pmd_t *pmd = pvmw.pmd;
945 0 : pmd_t entry;
946 :
947 0 : if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
948 0 : continue;
949 :
950 0 : flush_cache_page(vma, address, page_to_pfn(page));
951 0 : entry = pmdp_invalidate(vma, address, pmd);
952 0 : entry = pmd_wrprotect(entry);
953 0 : entry = pmd_mkclean(entry);
954 0 : set_pmd_at(vma->vm_mm, address, pmd, entry);
955 0 : ret = 1;
956 : #else
957 : /* unexpected pmd-mapped page? */
958 : WARN_ON_ONCE(1);
959 : #endif
960 : }
961 :
962 : /*
963 : * No need to call mmu_notifier_invalidate_range() as we are
964 : * downgrading page table protection not changing it to point
965 : * to a new page.
966 : *
967 : * See Documentation/vm/mmu_notifier.rst
968 : */
969 68 : if (ret)
970 68 : (*cleaned)++;
971 : }
972 :
973 68 : mmu_notifier_invalidate_range_end(&range);
974 :
975 68 : return true;
976 : }
977 :
978 68 : static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
979 : {
980 68 : if (vma->vm_flags & VM_SHARED)
981 68 : return false;
982 :
983 : return true;
984 : }
985 :
986 1286 : int page_mkclean(struct page *page)
987 : {
988 1286 : int cleaned = 0;
989 1286 : struct address_space *mapping;
990 1286 : struct rmap_walk_control rwc = {
991 : .arg = (void *)&cleaned,
992 : .rmap_one = page_mkclean_one,
993 : .invalid_vma = invalid_mkclean_vma,
994 : };
995 :
996 2572 : BUG_ON(!PageLocked(page));
997 :
998 1286 : if (!page_mapped(page))
999 : return 0;
1000 :
1001 68 : mapping = page_mapping(page);
1002 68 : if (!mapping)
1003 : return 0;
1004 :
1005 68 : rmap_walk(page, &rwc);
1006 :
1007 68 : return cleaned;
1008 : }
1009 : EXPORT_SYMBOL_GPL(page_mkclean);
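A hedged sketch of the classic caller pattern (as in the writeback path): if any pte had to be write-protected and cleaned, the dirty state is transferred back to the page so it will still be written out.

	if (page_mkclean(page))
		set_page_dirty(page);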
1010 :
1011 : /**
1012 : * page_move_anon_rmap - move a page to our anon_vma
1013 : * @page: the page to move to our anon_vma
1014 : * @vma: the vma the page belongs to
1015 : *
1016 : * When a page belongs exclusively to one process after a COW event,
1017 : * that page can be moved into the anon_vma that belongs to just that
1018 : * process, so the rmap code will not search the parent or sibling
1019 : * processes.
1020 : */
1021 0 : void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
1022 : {
1023 0 : struct anon_vma *anon_vma = vma->anon_vma;
1024 :
1025 0 : page = compound_head(page);
1026 :
1027 0 : VM_BUG_ON_PAGE(!PageLocked(page), page);
1028 0 : VM_BUG_ON_VMA(!anon_vma, vma);
1029 :
1030 0 : anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
1031 : /*
1032 : * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
1033 : * simultaneously, so a concurrent reader (eg page_referenced()'s
1034 : * PageAnon()) will not see one without the other.
1035 : */
1036 0 : WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
1037 0 : }
1038 :
1039 : /**
1040 : * __page_set_anon_rmap - set up new anonymous rmap
1041 : * @page: Page or Hugepage to add to rmap
1042 : * @vma: VM area to add page to.
1043 : * @address: User virtual address of the mapping
1044 : * @exclusive: the page is exclusively owned by the current process
1045 : */
1046 69131 : static void __page_set_anon_rmap(struct page *page,
1047 : struct vm_area_struct *vma, unsigned long address, int exclusive)
1048 : {
1049 69131 : struct anon_vma *anon_vma = vma->anon_vma;
1050 :
1051 69131 : BUG_ON(!anon_vma);
1052 :
1053 69131 : if (PageAnon(page))
1054 : return;
1055 :
1056 : /*
1057 : * If the page isn't exclusively mapped into this vma,
1058 : * we must use the _oldest_ possible anon_vma for the
1059 : * page mapping!
1060 : */
1061 69131 : if (!exclusive)
1062 0 : anon_vma = anon_vma->root;
1063 :
1064 : /*
1065 : * page_idle does a lockless/optimistic rmap scan on page->mapping.
1066 : * Make sure the compiler doesn't split the stores of anon_vma and
1067 : * the PAGE_MAPPING_ANON type identifier, otherwise the rmap code
1068 : * could mistake the mapping for a struct address_space and crash.
1069 : */
1070 69131 : anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
1071 69131 : WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
1072 69131 : page->index = linear_page_index(vma, address);
1073 : }
1074 :
1075 : /**
1076 : * __page_check_anon_rmap - sanity check anonymous rmap addition
1077 : * @page: the page to add the mapping to
1078 : * @vma: the vm area in which the mapping is added
1079 : * @address: the user virtual address mapped
1080 : */
1081 0 : static void __page_check_anon_rmap(struct page *page,
1082 : struct vm_area_struct *vma, unsigned long address)
1083 : {
1084 : /*
1085 : * The page's anon-rmap details (mapping and index) are guaranteed to
1086 : * be set up correctly at this point.
1087 : *
1088 : * We have exclusion against page_add_anon_rmap because the caller
1089 : * always holds the page locked.
1090 : *
1091 : * We have exclusion against page_add_new_anon_rmap because those pages
1092 : * are initially only visible via the pagetables, and the pte is locked
1093 : * over the call to page_add_new_anon_rmap.
1094 : */
1095 0 : VM_BUG_ON_PAGE(page_anon_vma(page)->root != vma->anon_vma->root, page);
1096 0 : VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
1097 : page);
1098 0 : }
1099 :
1100 : /**
1101 : * page_add_anon_rmap - add pte mapping to an anonymous page
1102 : * @page: the page to add the mapping to
1103 : * @vma: the vm area in which the mapping is added
1104 : * @address: the user virtual address mapped
1105 : * @compound: charge the page as compound or small page
1106 : *
1107 : * The caller needs to hold the pte lock, and the page must be locked in
1108 : * the anon_vma case: to serialize mapping,index checking after setting,
1109 : * and to ensure that PageAnon is not being upgraded racily to PageKsm
1110 : * (but PageKsm is never downgraded to PageAnon).
1111 : */
1112 0 : void page_add_anon_rmap(struct page *page,
1113 : struct vm_area_struct *vma, unsigned long address, bool compound)
1114 : {
1115 0 : do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
1116 0 : }
1117 :
1118 : /*
1119 : * Special version of the above for do_swap_page, which often runs
1120 : * into pages that are exclusively owned by the current process.
1121 : * Everybody else should continue to use page_add_anon_rmap above.
1122 : */
1123 0 : void do_page_add_anon_rmap(struct page *page,
1124 : struct vm_area_struct *vma, unsigned long address, int flags)
1125 : {
1126 0 : bool compound = flags & RMAP_COMPOUND;
1127 0 : bool first;
1128 :
1129 0 : if (unlikely(PageKsm(page)))
1130 0 : lock_page_memcg(page);
1131 : else
1132 0 : VM_BUG_ON_PAGE(!PageLocked(page), page);
1133 :
1134 0 : if (compound) {
1135 0 : atomic_t *mapcount;
1136 0 : VM_BUG_ON_PAGE(!PageLocked(page), page);
1137 0 : VM_BUG_ON_PAGE(!PageTransHuge(page), page);
1138 0 : mapcount = compound_mapcount_ptr(page);
1139 0 : first = atomic_inc_and_test(mapcount);
1140 : } else {
1141 0 : first = atomic_inc_and_test(&page->_mapcount);
1142 : }
1143 :
1144 0 : if (first) {
1145 0 : int nr = compound ? thp_nr_pages(page) : 1;
1146 : /*
1147 : * We use the irq-unsafe __{inc|mod}_zone_page_stat because
1148 : * these counters are not modified in interrupt context, and
1149 : * pte lock(a spinlock) is held, which implies preemption
1150 : * disabled.
1151 : */
1152 0 : if (compound)
1153 0 : __mod_lruvec_page_state(page, NR_ANON_THPS, nr);
1154 0 : __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
1155 : }
1156 :
1157 0 : if (unlikely(PageKsm(page))) {
1158 0 : unlock_page_memcg(page);
1159 : return;
1160 : }
1161 :
1162 : /* address might be in next vma when migration races vma_adjust */
1163 0 : if (first)
1164 0 : __page_set_anon_rmap(page, vma, address,
1165 : flags & RMAP_EXCLUSIVE);
1166 : else
1167 0 : __page_check_anon_rmap(page, vma, address);
1168 : }
1169 :
1170 : /**
1171 : * page_add_new_anon_rmap - add pte mapping to a new anonymous page
1172 : * @page: the page to add the mapping to
1173 : * @vma: the vm area in which the mapping is added
1174 : * @address: the user virtual address mapped
1175 : * @compound: charge the page as compound or small page
1176 : *
1177 : * Same as page_add_anon_rmap but must only be called on *new* pages.
1178 : * This means the inc-and-test can be bypassed.
1179 : * Page does not have to be locked.
1180 : */
1181 69131 : void page_add_new_anon_rmap(struct page *page,
1182 : struct vm_area_struct *vma, unsigned long address, bool compound)
1183 : {
1184 69131 : int nr = compound ? thp_nr_pages(page) : 1;
1185 :
1186 69131 : VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
1187 69131 : __SetPageSwapBacked(page);
1188 69132 : if (compound) {
1189 19 : VM_BUG_ON_PAGE(!PageTransHuge(page), page);
1190 : /* increment count (starts at -1) */
1191 19 : atomic_set(compound_mapcount_ptr(page), 0);
1192 19 : if (hpage_pincount_available(page))
1193 19 : atomic_set(compound_pincount_ptr(page), 0);
1194 :
1195 19 : __mod_lruvec_page_state(page, NR_ANON_THPS, nr);
1196 : } else {
1197 : /* Anon THP always mapped first with PMD */
1198 69113 : VM_BUG_ON_PAGE(PageTransCompound(page), page);
1199 : /* increment count (starts at -1) */
1200 69113 : atomic_set(&page->_mapcount, 0);
1201 : }
1202 69132 : __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
1203 69133 : __page_set_anon_rmap(page, vma, address, 1);
1204 69132 : }
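A hedged sketch of the usual anonymous-fault sequence around this call (pte setup and error handling omitted; the exact helper names are assumptions based on the fault path): the new page is made visible via rmap and the LRU while the pte lock is held.

	/* pte lock held; page is brand new and not yet visible to rmap */
	page_add_new_anon_rmap(page, vma, address, false);
	lru_cache_add_inactive_or_unevictable(page, vma);
	set_pte_at(mm, address, pte, entry);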
1205 :
1206 : /**
1207 : * page_add_file_rmap - add pte mapping to a file page
1208 : * @page: the page to add the mapping to
1209 : * @compound: charge the page as compound or small page
1210 : *
1211 : * The caller needs to hold the pte lock.
1212 : */
1213 798022 : void page_add_file_rmap(struct page *page, bool compound)
1214 : {
1215 798022 : int i, nr = 1;
1216 :
1217 798022 : VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
1218 798022 : lock_page_memcg(page);
1219 798022 : if (compound && PageTransHuge(page)) {
1220 0 : int nr_pages = thp_nr_pages(page);
1221 :
1222 0 : for (i = 0, nr = 0; i < nr_pages; i++) {
1223 0 : if (atomic_inc_and_test(&page[i]._mapcount))
1224 0 : nr++;
1225 : }
1226 0 : if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
1227 0 : goto out;
1228 0 : if (PageSwapBacked(page))
1229 0 : __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
1230 : nr_pages);
1231 : else
1232 0 : __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
1233 : nr_pages);
1234 : } else {
1235 798022 : if (PageTransCompound(page) && page_mapping(page)) {
1236 0 : VM_WARN_ON_ONCE(!PageLocked(page));
1237 :
1238 0 : SetPageDoubleMap(compound_head(page));
1239 0 : if (PageMlocked(page))
1240 0 : clear_page_mlock(compound_head(page));
1241 : }
1242 1596138 : if (!atomic_inc_and_test(&page->_mapcount))
1243 695126 : goto out;
1244 : }
1245 103007 : __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
1246 798133 : out:
1247 798133 : unlock_page_memcg(page);
1248 798133 : }
1249 :
1250 776429 : static void page_remove_file_rmap(struct page *page, bool compound)
1251 : {
1252 776429 : int i, nr = 1;
1253 :
1254 776429 : VM_BUG_ON_PAGE(compound && !PageHead(page), page);
1255 :
1256 : /* Hugepages are not counted in NR_FILE_MAPPED for now. */
1257 776429 : if (unlikely(PageHuge(page))) {
1258 : /* hugetlb pages are always mapped with pmds */
1259 : atomic_dec(compound_mapcount_ptr(page));
1260 : return;
1261 : }
1262 :
1263 : /* page still mapped by someone else? */
1264 776429 : if (compound && PageTransHuge(page)) {
1265 0 : int nr_pages = thp_nr_pages(page);
1266 :
1267 0 : for (i = 0, nr = 0; i < nr_pages; i++) {
1268 0 : if (atomic_add_negative(-1, &page[i]._mapcount))
1269 0 : nr++;
1270 : }
1271 0 : if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
1272 : return;
1273 0 : if (PageSwapBacked(page))
1274 0 : __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
1275 : -nr_pages);
1276 : else
1277 0 : __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
1278 : -nr_pages);
1279 : } else {
1280 1553185 : if (!atomic_add_negative(-1, &page->_mapcount))
1281 : return;
1282 : }
1283 :
1284 : /*
1285 : * We use the irq-unsafe __{inc|mod}_lruvec_page_state because
1286 : * these counters are not modified in interrupt context, and
1287 : * pte lock(a spinlock) is held, which implies preemption disabled.
1288 : */
1289 98037 : __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);
1290 :
1291 196082 : if (unlikely(PageMlocked(page)))
1292 0 : clear_page_mlock(page);
1293 : }
1294 :
1295 17 : static void page_remove_anon_compound_rmap(struct page *page)
1296 : {
1297 17 : int i, nr;
1298 :
1299 34 : if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
1300 : return;
1301 :
1302 : /* Hugepages are not counted in NR_ANON_PAGES for now. */
1303 17 : if (unlikely(PageHuge(page)))
1304 : return;
1305 :
1306 17 : if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
1307 : return;
1308 :
1309 34 : __mod_lruvec_page_state(page, NR_ANON_THPS, -thp_nr_pages(page));
1310 :
1311 17 : if (TestClearPageDoubleMap(page)) {
1312 : /*
1313 : * Subpages can be mapped with PTEs too. Check how many of
1314 : * them are still mapped.
1315 : */
1316 0 : for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
1317 0 : if (atomic_add_negative(-1, &page[i]._mapcount))
1318 0 : nr++;
1319 : }
1320 :
1321 : /*
1322 : * Queue the page for deferred split if at least one small
1323 : * page of the compound page is unmapped, but at least one
1324 : * small page is still mapped.
1325 : */
1326 0 : if (nr && nr < thp_nr_pages(page))
1327 0 : deferred_split_huge_page(page);
1328 : } else {
1329 17 : nr = thp_nr_pages(page);
1330 : }
1331 :
1332 34 : if (unlikely(PageMlocked(page)))
1333 0 : clear_page_mlock(page);
1334 :
1335 17 : if (nr)
1336 17 : __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr);
1337 : }
1338 :
1339 : /**
1340 : * page_remove_rmap - take down pte mapping from a page
1341 : * @page: page to remove mapping from
1342 : * @compound: uncharge the page as compound or small page
1343 : *
1344 : * The caller needs to hold the pte lock.
1345 : */
1346 923070 : void page_remove_rmap(struct page *page, bool compound)
1347 : {
1348 923070 : lock_page_memcg(page);
1349 :
1350 923070 : if (!PageAnon(page)) {
1351 776428 : page_remove_file_rmap(page, compound);
1352 776693 : goto out;
1353 : }
1354 :
1355 146642 : if (compound) {
1356 17 : page_remove_anon_compound_rmap(page);
1357 17 : goto out;
1358 : }
1359 :
1360 : /* page still mapped by someone else? */
1361 293262 : if (!atomic_add_negative(-1, &page->_mapcount))
1362 81577 : goto out;
1363 :
1364 : /*
1365 : * We use the irq-unsafe __{inc|mod}_zone_page_stat because
1366 : * these counters are not modified in interrupt context, and
1367 : * pte lock(a spinlock) is held, which implies preemption disabled.
1368 : */
1369 65060 : __dec_lruvec_page_state(page, NR_ANON_MAPPED);
1370 :
1371 130117 : if (unlikely(PageMlocked(page)))
1372 0 : clear_page_mlock(page);
1373 :
1374 65057 : if (PageTransCompound(page))
1375 0 : deferred_split_huge_page(compound_head(page));
1376 :
1377 : /*
1378 : * It would be tidy to reset the PageAnon mapping here,
1379 : * but that might overwrite a racing page_add_anon_rmap
1380 : * which increments mapcount after us but sets mapping
1381 : * before us: so leave the reset to free_unref_page,
1382 : * and remember that it's only reliable while mapped.
1383 : * Leaving it set also helps swapoff to reinstate ptes
1384 : * faster for those pages still in swapcache.
1385 : */
1386 65059 : out:
1387 923346 : unlock_page_memcg(page);
1388 923346 : }
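On the teardown side, a hedged sketch of the pairing used by unmap paths (it mirrors the discard path of try_to_unmap_one() below): drop the rmap under the pte lock, then release the page reference.

	/* pte lock held; the pte itself has already been cleared */
	page_remove_rmap(page, false);
	put_page(page);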
1389 :
1390 : /*
1391 : * @arg: enum ttu_flags will be passed to this argument
1392 : */
1393 0 : static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1394 : unsigned long address, void *arg)
1395 : {
1396 0 : struct mm_struct *mm = vma->vm_mm;
1397 0 : struct page_vma_mapped_walk pvmw = {
1398 : .page = page,
1399 : .vma = vma,
1400 : .address = address,
1401 : };
1402 0 : pte_t pteval;
1403 0 : struct page *subpage;
1404 0 : bool ret = true;
1405 0 : struct mmu_notifier_range range;
1406 0 : enum ttu_flags flags = (enum ttu_flags)(long)arg;
1407 :
1408 : /* munlock has nothing to gain from examining un-locked vmas */
1409 0 : if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
1410 : return true;
1411 :
1412 0 : if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
1413 0 : is_zone_device_page(page) && !is_device_private_page(page))
1414 : return true;
1415 :
1416 0 : if (flags & TTU_SPLIT_HUGE_PMD) {
1417 0 : split_huge_pmd_address(vma, address,
1418 0 : flags & TTU_SPLIT_FREEZE, page);
1419 : }
1420 :
1421 : /*
1422 : * For THP, we have to assume the worst case, i.e. pmd, for invalidation.
1423 : * For hugetlb, it could be much worse if we need to do pud
1424 : * invalidation in the case of pmd sharing.
1425 : *
1426 : * Note that the page cannot be freed in this function, as the caller of
1427 : * try_to_unmap() must hold a reference on the page.
1428 : */
1429 0 : mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1430 : address,
1431 : min(vma->vm_end, address + page_size(page)));
1432 0 : if (PageHuge(page)) {
1433 : /*
1434 : * If sharing is possible, start and end will be adjusted
1435 : * accordingly.
1436 : */
1437 0 : adjust_range_if_pmd_sharing_possible(vma, &range.start,
1438 : &range.end);
1439 : }
1440 0 : mmu_notifier_invalidate_range_start(&range);
1441 :
1442 0 : while (page_vma_mapped_walk(&pvmw)) {
1443 : #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
1444 : /* PMD-mapped THP migration entry */
1445 0 : if (!pvmw.pte && (flags & TTU_MIGRATION)) {
1446 0 : VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
1447 :
1448 0 : set_pmd_migration_entry(&pvmw, page);
1449 0 : continue;
1450 : }
1451 : #endif
1452 :
1453 : /*
1454 : * If the page is mlock()d, we cannot swap it out.
1455 : * If it's recently referenced (perhaps page_referenced
1456 : * skipped over this mm) then we should reactivate it.
1457 : */
1458 0 : if (!(flags & TTU_IGNORE_MLOCK)) {
1459 0 : if (vma->vm_flags & VM_LOCKED) {
1460 : /* PTE-mapped THP are never mlocked */
1461 0 : if (!PageTransCompound(page)) {
1462 : /*
1463 : * Holding pte lock, we do *not* need
1464 : * mmap_lock here
1465 : */
1466 0 : mlock_vma_page(page);
1467 : }
1468 0 : ret = false;
1469 0 : page_vma_mapped_walk_done(&pvmw);
1470 : break;
1471 : }
1472 0 : if (flags & TTU_MUNLOCK)
1473 0 : continue;
1474 : }
1475 :
1476 : /* Unexpected PMD-mapped THP? */
1477 0 : VM_BUG_ON_PAGE(!pvmw.pte, page);
1478 :
1479 0 : subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
1480 0 : address = pvmw.address;
1481 :
1482 0 : if (PageHuge(page) && !PageAnon(page)) {
1483 : /*
1484 : * To call huge_pmd_unshare, i_mmap_rwsem must be
1485 : * held in write mode. Caller needs to explicitly
1486 : * do this outside rmap routines.
1487 : */
1488 : VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
1489 : if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
1490 : /*
1491 : * huge_pmd_unshare unmapped an entire PMD
1492 : * page. There is no way of knowing exactly
1493 : * which PMDs may be cached for this mm, so
1494 : * we must flush them all. start/end were
1495 : * already adjusted above to cover this range.
1496 : */
1497 : flush_cache_range(vma, range.start, range.end);
1498 : flush_tlb_range(vma, range.start, range.end);
1499 : mmu_notifier_invalidate_range(mm, range.start,
1500 : range.end);
1501 :
1502 : /*
1503 : * The ref count of the PMD page was dropped
1504 : * which is part of the way map counting
1505 : * is done for shared PMDs. Return 'true'
1506 : * here. When there is no other sharing,
1507 : * huge_pmd_unshare returns false and we will
1508 : * unmap the actual page and drop map count
1509 : * to zero.
1510 : */
1511 0 : page_vma_mapped_walk_done(&pvmw);
1512 : break;
1513 : }
1514 : }
1515 :
1516 0 : if (IS_ENABLED(CONFIG_MIGRATION) &&
1517 : (flags & TTU_MIGRATION) &&
1518 0 : is_zone_device_page(page)) {
1519 : swp_entry_t entry;
1520 : pte_t swp_pte;
1521 :
1522 : pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte);
1523 :
1524 : /*
1525 : * Store the pfn of the page in a special migration
1526 : * pte. do_swap_page() will wait until the migration
1527 : * pte is removed and then restart fault handling.
1528 : */
1529 : entry = make_migration_entry(page, 0);
1530 : swp_pte = swp_entry_to_pte(entry);
1531 :
1532 : /*
1533 : * pteval maps a zone device page and is therefore
1534 : * a swap pte.
1535 : */
1536 : if (pte_swp_soft_dirty(pteval))
1537 : swp_pte = pte_swp_mksoft_dirty(swp_pte);
1538 : if (pte_swp_uffd_wp(pteval))
1539 : swp_pte = pte_swp_mkuffd_wp(swp_pte);
1540 : set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
1541 : /*
1542 : * No need to invalidate here; it will synchronize
1543 : * against the special swap migration pte.
1544 : *
1545 : * The assignment to subpage above was computed from a
1546 : * swap PTE which results in an invalid pointer.
1547 : * Since only PAGE_SIZE pages can currently be
1548 : * migrated, just set it to page. This will need to be
1549 : * changed when hugepage migrations to device private
1550 : * memory are supported.
1551 : */
1552 : subpage = page;
1553 : goto discard;
1554 : }
1555 :
1556 : /* Nuke the page table entry. */
1557 0 : flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
1558 0 : if (should_defer_flush(mm, flags)) {
1559 : /*
1560 : * We clear the PTE but do not flush so potentially
1561 : * a remote CPU could still be writing to the page.
1562 : * If the entry was previously clean then the
1563 : * architecture must guarantee that a clear->dirty
1564 : * transition on a cached TLB entry is written through
1565 : * and traps if the PTE is unmapped.
1566 : */
1567 0 : pteval = ptep_get_and_clear(mm, address, pvmw.pte);
1568 :
1569 0 : set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
1570 : } else {
1571 0 : pteval = ptep_clear_flush(vma, address, pvmw.pte);
1572 : }
1573 :
1574 : /* Move the dirty bit to the page. Now the pte is gone. */
1575 0 : if (pte_dirty(pteval))
1576 0 : set_page_dirty(page);
1577 :
1578 : /* Update high watermark before we lower rss */
1579 0 : update_hiwater_rss(mm);
1580 :
1581 0 : if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
1582 : pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
1583 : if (PageHuge(page)) {
1584 : hugetlb_count_sub(compound_nr(page), mm);
1585 : set_huge_swap_pte_at(mm, address,
1586 : pvmw.pte, pteval,
1587 : vma_mmu_pagesize(vma));
1588 : } else {
1589 : dec_mm_counter(mm, mm_counter(page));
1590 : set_pte_at(mm, address, pvmw.pte, pteval);
1591 : }
1592 :
1593 0 : } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
1594 : /*
1595 : * The guest indicated that the page content is of no
1596 : * interest anymore. Simply discard the pte, vmscan
1597 : * will take care of the rest.
1598 : * A future reference will then fault in a new zero
1599 : * page. When userfaultfd is active, we must not drop
1600 : * this page though, as its main user (postcopy
1601 : * migration) will not expect userfaults on already
1602 : * copied pages.
1603 : */
1604 : dec_mm_counter(mm, mm_counter(page));
1605 : /* We have to invalidate as we cleared the pte */
1606 0 : mmu_notifier_invalidate_range(mm, address,
1607 : address + PAGE_SIZE);
1608 0 : } else if (IS_ENABLED(CONFIG_MIGRATION) &&
1609 0 : (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) {
1610 0 : swp_entry_t entry;
1611 0 : pte_t swp_pte;
1612 :
1613 0 : if (arch_unmap_one(mm, vma, address, pteval) < 0) {
1614 : set_pte_at(mm, address, pvmw.pte, pteval);
1615 : ret = false;
1616 : page_vma_mapped_walk_done(&pvmw);
1617 : break;
1618 : }
1619 :
1620 : /*
1621 : * Store the pfn of the page in a special migration
1622 : * pte. do_swap_page() will wait until the migration
1623 : * pte is removed and then restart fault handling.
1624 : */
1625 0 : entry = make_migration_entry(subpage,
1626 : pte_write(pteval));
1627 0 : swp_pte = swp_entry_to_pte(entry);
1628 0 : if (pte_soft_dirty(pteval))
1629 : swp_pte = pte_swp_mksoft_dirty(swp_pte);
1630 0 : if (pte_uffd_wp(pteval))
1631 : swp_pte = pte_swp_mkuffd_wp(swp_pte);
1632 0 : set_pte_at(mm, address, pvmw.pte, swp_pte);
1633 : /*
1634 : * No need to invalidate here; it will synchronize
1635 : * against the special swap migration pte.
1636 : */
1637 0 : } else if (PageAnon(page)) {
1638 0 : swp_entry_t entry = { .val = page_private(subpage) };
1639 0 : pte_t swp_pte;
1640 : /*
1641 : * Store the swap location in the pte.
1642 : * See handle_pte_fault() ...
1643 : */
1644 0 : if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) {
1645 0 : WARN_ON_ONCE(1);
1646 0 : ret = false;
1647 : /* We have to invalidate as we cleared the pte */
1648 0 : mmu_notifier_invalidate_range(mm, address,
1649 : address + PAGE_SIZE);
1650 0 : page_vma_mapped_walk_done(&pvmw);
1651 0 : break;
1652 : }
1653 :
1654 : /* MADV_FREE page check */
1655 0 : if (!PageSwapBacked(page)) {
1656 0 : if (!PageDirty(page)) {
1657 : /* Invalidate as we cleared the pte */
1658 0 : mmu_notifier_invalidate_range(mm,
1659 : address, address + PAGE_SIZE);
1660 0 : dec_mm_counter(mm, MM_ANONPAGES);
1661 0 : goto discard;
1662 : }
1663 :
1664 : /*
1665 : * If the page was redirtied, it cannot be
1666 : 				 * discarded. Remap the page into the page table.
1667 : */
1668 0 : set_pte_at(mm, address, pvmw.pte, pteval);
1669 0 : SetPageSwapBacked(page);
1670 0 : ret = false;
1671 0 : page_vma_mapped_walk_done(&pvmw);
1672 : break;
1673 : }
1674 :
1675 0 : if (swap_duplicate(entry) < 0) {
1676 : set_pte_at(mm, address, pvmw.pte, pteval);
1677 : ret = false;
1678 0 : page_vma_mapped_walk_done(&pvmw);
1679 : break;
1680 : }
1681 0 : if (arch_unmap_one(mm, vma, address, pteval) < 0) {
1682 : set_pte_at(mm, address, pvmw.pte, pteval);
1683 : ret = false;
1684 0 : page_vma_mapped_walk_done(&pvmw);
1685 : break;
1686 : }
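			/*
			 * Make sure this mm is on the global mmlist so that
			 * swapoff (try_to_unuse()) can find the swap entry
			 * we are about to install and bring the page back.
			 */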
1687 0 : if (list_empty(&mm->mmlist)) {
1688 0 : spin_lock(&mmlist_lock);
1689 0 : if (list_empty(&mm->mmlist))
1690 0 : list_add(&mm->mmlist, &init_mm.mmlist);
1691 0 : spin_unlock(&mmlist_lock);
1692 : }
1693 0 : dec_mm_counter(mm, MM_ANONPAGES);
1694 0 : inc_mm_counter(mm, MM_SWAPENTS);
1695 0 : swp_pte = swp_entry_to_pte(entry);
1696 0 : if (pte_soft_dirty(pteval))
1697 : swp_pte = pte_swp_mksoft_dirty(swp_pte);
1698 0 : if (pte_uffd_wp(pteval))
1699 : swp_pte = pte_swp_mkuffd_wp(swp_pte);
1700 0 : set_pte_at(mm, address, pvmw.pte, swp_pte);
1701 : /* Invalidate as we cleared the pte */
1702 0 : mmu_notifier_invalidate_range(mm, address,
1703 : address + PAGE_SIZE);
1704 : } else {
1705 : /*
1706 : * This is a locked file-backed page, thus it cannot
1707 : * be removed from the page cache and replaced by a new
1708 : * page before mmu_notifier_invalidate_range_end, so no
1709 : 			 * concurrent thread can update its page table to
1710 : 			 * point at a new page while a device is still
1711 : 			 * using this page.
1712 : *
1713 : * See Documentation/vm/mmu_notifier.rst
1714 : */
1715 0 : dec_mm_counter(mm, mm_counter_file(page));
1716 : }
1717 0 : discard:
1718 : /*
1719 : 		 * No need to call mmu_notifier_invalidate_range(): it has been
1720 : 		 * done above for all cases that require it to happen under the
1721 : 		 * page table lock, before mmu_notifier_invalidate_range_end().
1722 : *
1723 : * See Documentation/vm/mmu_notifier.rst
1724 : */
1725 0 : page_remove_rmap(subpage, PageHuge(page));
1726 0 : put_page(page);
1727 : }
1728 :
1729 0 : mmu_notifier_invalidate_range_end(&range);
1730 :
1731 : return ret;
1732 : }
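/*
 * Illustrative sketch (not part of this file): how a migration entry
 * installed above is consumed by the fault path. It uses the swapops.h
 * helpers seen above; "ptep" is a hypothetical pte pointer, and locking
 * and error handling are omitted.
 *
 *	pte_t pteval = *ptep;
 *	swp_entry_t entry = pte_to_swp_entry(pteval);
 *	if (is_migration_entry(entry)) {
 *		struct page *page = migration_entry_to_page(entry);
 *		// do_swap_page() ends up in migration_entry_wait(), which
 *		// sleeps on this page's lock and then retries the fault
 *		// once remove_migration_ptes() has installed the final pte.
 *	}
 */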
1733 :
1734 0 : static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
1735 : {
1736 0 : return vma_is_temporary_stack(vma);
1737 : }
1738 :
1739 0 : static int page_not_mapped(struct page *page)
1740 : {
1741 0 : return !page_mapped(page);
1742 : }
1743 :
1744 : /**
1745 : * try_to_unmap - try to remove all page table mappings to a page
1746 : * @page: the page to get unmapped
1747 : * @flags: action and flags
1748 : *
1749 : * Tries to remove all the page table entries which are mapping this
1750 : * page, used in the pageout path. Caller must hold the page lock.
1751 : *
1752 : * If unmap is successful, return true. Otherwise, false.
1753 : */
1754 0 : bool try_to_unmap(struct page *page, enum ttu_flags flags)
1755 : {
1756 0 : struct rmap_walk_control rwc = {
1757 : .rmap_one = try_to_unmap_one,
1758 0 : .arg = (void *)flags,
1759 : .done = page_not_mapped,
1760 : .anon_lock = page_lock_anon_vma_read,
1761 : };
1762 :
1763 : /*
1764 : 	 * During exec, a temporary VMA is set up and later moved.
1765 : * The VMA is moved under the anon_vma lock but not the
1766 : * page tables leading to a race where migration cannot
1767 : * find the migration ptes. Rather than increasing the
1768 : * locking requirements of exec(), migration skips
1769 : * temporary VMAs until after exec() completes.
1770 : */
1771 0 : if ((flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))
1772 0 : && !PageKsm(page) && PageAnon(page))
1773 0 : rwc.invalid_vma = invalid_migration_vma;
1774 :
1775 0 : if (flags & TTU_RMAP_LOCKED)
1776 0 : rmap_walk_locked(page, &rwc);
1777 : else
1778 0 : rmap_walk(page, &rwc);
1779 :
1780 0 : 	return !page_mapcount(page);
1781 : }
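/*
 * Illustrative sketch (not part of this file): a minimal reclaim-style
 * caller. try_to_unmap() requires the page lock; the flag choice here
 * is only an example.
 *
 *	lock_page(page);
 *	if (page_mapped(page))
 *		try_to_unmap(page, TTU_BATCH_FLUSH);
 *	if (!page_mapped(page)) {
 *		// all ptes are gone; the page can move on to pageout or
 *		// freeing once the caller drops its own references.
 *	}
 *	unlock_page(page);
 */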
1782 :
1783 : /**
1784 : * try_to_munlock - try to munlock a page
1785 : * @page: the page to be munlocked
1786 : *
1787 : * Called from munlock code. Checks all of the VMAs mapping the page
1788 : * to make sure nobody else has this page mlocked. The page will be
1789 : 	 * returned with PG_mlocked cleared if no other VMAs have it mlocked.
1790 : */
1792 0 : void try_to_munlock(struct page *page)
1793 : {
1794 0 : struct rmap_walk_control rwc = {
1795 : .rmap_one = try_to_unmap_one,
1796 : .arg = (void *)TTU_MUNLOCK,
1797 : .done = page_not_mapped,
1798 : .anon_lock = page_lock_anon_vma_read,
1800 : };
1801 :
1802 0 : VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
1803 0 : VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
1804 :
1805 0 : rmap_walk(page, &rwc);
1806 0 : }
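/*
 * Illustrative sketch (not part of this file): a munlock-style caller.
 * The page must be locked and already isolated from the LRU (see the
 * VM_BUG_ONs above).
 *
 *	try_to_munlock(page);
 *	if (!PageMlocked(page)) {
 *		// no remaining VM_LOCKED vma maps the page; it may now be
 *		// returned to an evictable LRU list.
 *	}
 */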
1807 :
1808 39903 : void __put_anon_vma(struct anon_vma *anon_vma)
1809 : {
1810 39903 : struct anon_vma *root = anon_vma->root;
1811 :
1812 39903 : anon_vma_free(anon_vma);
1813 66871 : if (root != anon_vma && atomic_dec_and_test(&root->refcount))
1814 0 : anon_vma_free(root);
1815 39904 : }
1816 :
1817 0 : static struct anon_vma *rmap_walk_anon_lock(struct page *page,
1818 : struct rmap_walk_control *rwc)
1819 : {
1820 0 : struct anon_vma *anon_vma;
1821 :
1822 0 : if (rwc->anon_lock)
1823 0 : return rwc->anon_lock(page);
1824 :
1825 : /*
1826 : * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
1827 : * because that depends on page_mapped(); but not all its usages
1828 : * are holding mmap_lock. Users without mmap_lock are required to
1829 : 	 * take a reference count to prevent the anon_vma from disappearing.
1830 : */
1831 0 : anon_vma = page_anon_vma(page);
1832 0 : if (!anon_vma)
1833 : return NULL;
1834 :
1835 0 : anon_vma_lock_read(anon_vma);
1836 0 : return anon_vma;
1837 : }
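/*
 * Illustrative sketch (not part of this file): the reference-count
 * pattern mentioned above for callers that do not hold mmap_lock.
 *
 *	struct anon_vma *anon_vma = page_get_anon_vma(page);	// takes a ref
 *	if (anon_vma) {
 *		anon_vma_lock_read(anon_vma);
 *		// ... walk anon_vma->rb_root here ...
 *		anon_vma_unlock_read(anon_vma);
 *		put_anon_vma(anon_vma);				// drops the ref
 *	}
 */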
1838 :
1839 : /*
1840 : * rmap_walk_anon - do something to an anonymous page using the object-based
1841 : * rmap method
1842 : * @page: the page to be handled
1843 : * @rwc: control variable according to each walk type
1844 : *
1845 : * Find all the mappings of a page using the mapping pointer and the vma chains
1846 : * contained in the anon_vma struct it points to.
1847 : *
1848 : * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
1849 : * where the page was found will be held for write. So, we won't recheck
1850 : * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1851 : * LOCKED.
1852 : */
1853 0 : static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
1854 : bool locked)
1855 : {
1856 0 : struct anon_vma *anon_vma;
1857 0 : pgoff_t pgoff_start, pgoff_end;
1858 0 : struct anon_vma_chain *avc;
1859 :
1860 0 : if (locked) {
1861 0 : anon_vma = page_anon_vma(page);
1862 : 		/* did the anon_vma disappear under us? */
1863 0 : VM_BUG_ON_PAGE(!anon_vma, page);
1864 : } else {
1865 0 : anon_vma = rmap_walk_anon_lock(page, rwc);
1866 : }
1867 0 : if (!anon_vma)
1868 : return;
1869 :
1870 0 : pgoff_start = page_to_pgoff(page);
1871 0 : pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
1872 0 : anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
1873 : pgoff_start, pgoff_end) {
1874 0 : struct vm_area_struct *vma = avc->vma;
1875 0 : unsigned long address = vma_address(page, vma);
1876 :
1877 0 : cond_resched();
1878 :
1879 0 : if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
1880 0 : continue;
1881 :
1882 0 : if (!rwc->rmap_one(page, vma, address, rwc->arg))
1883 : break;
1884 0 : if (rwc->done && rwc->done(page))
1885 : break;
1886 : }
1887 :
1888 0 : if (!locked)
1889 0 : anon_vma_unlock_read(anon_vma);
1890 : }
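/*
 * Illustrative sketch (not part of this file): a minimal custom walk
 * that counts the VMAs mapping a locked page. The callback name is
 * hypothetical; the rmap_walk_control fields are the ones used above.
 *
 *	static bool count_one_mapping(struct page *page,
 *			struct vm_area_struct *vma, unsigned long address,
 *			void *arg)
 *	{
 *		(*(int *)arg)++;
 *		return true;		// true means: keep walking
 *	}
 *
 *	int nr_vmas = 0;
 *	struct rmap_walk_control rwc = {
 *		.rmap_one	= count_one_mapping,
 *		.arg		= &nr_vmas,
 *		.anon_lock	= page_lock_anon_vma_read,
 *	};
 *	rmap_walk(page, &rwc);		// caller holds the page lock
 */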
1891 :
1892 : /*
1893 : * rmap_walk_file - do something to a file page using the object-based rmap method
1894 : * @page: the page to be handled
1895 : * @rwc: control variable according to each walk type
1896 : *
1897 : * Find all the mappings of a page using the mapping pointer and the vma chains
1898 : * contained in the address_space struct it points to.
1899 : *
1900 : * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
1901 : * where the page was found will be held for write. So, we won't recheck
1902 : * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1903 : * LOCKED.
1904 : */
1905 68 : static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
1906 : bool locked)
1907 : {
1908 68 : struct address_space *mapping = page_mapping(page);
1909 68 : pgoff_t pgoff_start, pgoff_end;
1910 68 : struct vm_area_struct *vma;
1911 :
1912 : /*
1913 : * The page lock not only makes sure that page->mapping cannot
1914 : * suddenly be NULLified by truncation, it makes sure that the
1915 : * structure at mapping cannot be freed and reused yet,
1916 : * so we can safely take mapping->i_mmap_rwsem.
1917 : */
1918 136 : VM_BUG_ON_PAGE(!PageLocked(page), page);
1919 :
1920 68 : if (!mapping)
1921 : return;
1922 :
1923 68 : pgoff_start = page_to_pgoff(page);
1924 68 : pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
1925 68 : if (!locked)
1926 68 : i_mmap_lock_read(mapping);
1927 136 : vma_interval_tree_foreach(vma, &mapping->i_mmap,
1928 : pgoff_start, pgoff_end) {
1929 68 : unsigned long address = vma_address(page, vma);
1930 :
1931 68 : cond_resched();
1932 :
1933 68 : if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
1934 0 : continue;
1935 :
1936 68 : if (!rwc->rmap_one(page, vma, address, rwc->arg))
1937 0 : goto done;
1938 68 : if (rwc->done && rwc->done(page))
1939 0 : goto done;
1940 : }
1941 :
1942 68 : done:
1943 68 : if (!locked)
1944 68 : i_mmap_unlock_read(mapping);
1945 : }
1946 :
1947 68 : void rmap_walk(struct page *page, struct rmap_walk_control *rwc)
1948 : {
1949 68 : if (unlikely(PageKsm(page)))
1950 0 : rmap_walk_ksm(page, rwc);
1951 68 : else if (PageAnon(page))
1952 0 : rmap_walk_anon(page, rwc, false);
1953 : else
1954 68 : rmap_walk_file(page, rwc, false);
1955 68 : }
1956 :
1957 : /* Like rmap_walk, but caller holds relevant rmap lock */
1958 0 : void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
1959 : {
1960 : /* no ksm support for now */
1961 0 : VM_BUG_ON_PAGE(PageKsm(page), page);
1962 0 : if (PageAnon(page))
1963 0 : rmap_walk_anon(page, rwc, true);
1964 : else
1965 0 : rmap_walk_file(page, rwc, true);
1966 0 : }
1967 :
1968 : #ifdef CONFIG_HUGETLB_PAGE
1969 : /*
1970 : * The following two functions are for anonymous (private mapped) hugepages.
1971 : * Unlike common anonymous pages, anonymous hugepages have no accounting code
1972 : * and no lru code, because we handle hugepages differently from common pages.
1973 : */
1974 : void hugepage_add_anon_rmap(struct page *page,
1975 : struct vm_area_struct *vma, unsigned long address)
1976 : {
1977 : struct anon_vma *anon_vma = vma->anon_vma;
1978 : int first;
1979 :
1980 : BUG_ON(!PageLocked(page));
1981 : BUG_ON(!anon_vma);
1982 : /* address might be in next vma when migration races vma_adjust */
1983 : first = atomic_inc_and_test(compound_mapcount_ptr(page));
1984 : if (first)
1985 : __page_set_anon_rmap(page, vma, address, 0);
1986 : }
1987 :
1988 : void hugepage_add_new_anon_rmap(struct page *page,
1989 : struct vm_area_struct *vma, unsigned long address)
1990 : {
1991 : BUG_ON(address < vma->vm_start || address >= vma->vm_end);
1992 : atomic_set(compound_mapcount_ptr(page), 0);
1993 : if (hpage_pincount_available(page))
1994 : atomic_set(compound_pincount_ptr(page), 0);
1995 :
1996 : __page_set_anon_rmap(page, vma, address, 1);
1997 : }
1998 : #endif /* CONFIG_HUGETLB_PAGE */