Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
3 :
4 : #include <linux/mm.h>
5 : #include <linux/sched.h>
6 : #include <linux/sched/mm.h>
7 : #include <linux/sched/coredump.h>
8 : #include <linux/mmu_notifier.h>
9 : #include <linux/rmap.h>
10 : #include <linux/swap.h>
11 : #include <linux/mm_inline.h>
12 : #include <linux/kthread.h>
13 : #include <linux/khugepaged.h>
14 : #include <linux/freezer.h>
15 : #include <linux/mman.h>
16 : #include <linux/hashtable.h>
17 : #include <linux/userfaultfd_k.h>
18 : #include <linux/page_idle.h>
19 : #include <linux/swapops.h>
20 : #include <linux/shmem_fs.h>
21 :
22 : #include <asm/tlb.h>
23 : #include <asm/pgalloc.h>
24 : #include "internal.h"
25 :
26 : enum scan_result {
27 : SCAN_FAIL,
28 : SCAN_SUCCEED,
29 : SCAN_PMD_NULL,
30 : SCAN_EXCEED_NONE_PTE,
31 : SCAN_EXCEED_SWAP_PTE,
32 : SCAN_EXCEED_SHARED_PTE,
33 : SCAN_PTE_NON_PRESENT,
34 : SCAN_PTE_UFFD_WP,
35 : SCAN_PAGE_RO,
36 : SCAN_LACK_REFERENCED_PAGE,
37 : SCAN_PAGE_NULL,
38 : SCAN_SCAN_ABORT,
39 : SCAN_PAGE_COUNT,
40 : SCAN_PAGE_LRU,
41 : SCAN_PAGE_LOCK,
42 : SCAN_PAGE_ANON,
43 : SCAN_PAGE_COMPOUND,
44 : SCAN_ANY_PROCESS,
45 : SCAN_VMA_NULL,
46 : SCAN_VMA_CHECK,
47 : SCAN_ADDRESS_RANGE,
48 : SCAN_SWAP_CACHE_PAGE,
49 : SCAN_DEL_PAGE_LRU,
50 : SCAN_ALLOC_HUGE_PAGE_FAIL,
51 : SCAN_CGROUP_CHARGE_FAIL,
52 : SCAN_TRUNCATED,
53 : SCAN_PAGE_HAS_PRIVATE,
54 : };
55 :
56 : #define CREATE_TRACE_POINTS
57 : #include <trace/events/huge_memory.h>
58 :
59 : static struct task_struct *khugepaged_thread __read_mostly;
60 : static DEFINE_MUTEX(khugepaged_mutex);
61 :
62 : /* default: scan 8*512 ptes (or vmas) every 10 seconds */
63 : static unsigned int khugepaged_pages_to_scan __read_mostly;
64 : static unsigned int khugepaged_pages_collapsed;
65 : static unsigned int khugepaged_full_scans;
66 : static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
67 : /* during fragmentation poll the hugepage allocator once every minute */
68 : static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
69 : static unsigned long khugepaged_sleep_expire;
70 : static DEFINE_SPINLOCK(khugepaged_mm_lock);
71 : static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
72 : /*
73 : * By default, collapse a hugepage if there is at least one pte mapped,
74 : * just as would have happened at page fault time had the vma been
75 : * large enough.
76 : */
77 : static unsigned int khugepaged_max_ptes_none __read_mostly;
78 : static unsigned int khugepaged_max_ptes_swap __read_mostly;
79 : static unsigned int khugepaged_max_ptes_shared __read_mostly;
80 :
81 : #define MM_SLOTS_HASH_BITS 10
82 : static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
83 :
84 : static struct kmem_cache *mm_slot_cache __read_mostly;
85 :
86 : #define MAX_PTE_MAPPED_THP 8
87 :
88 : /**
89 : * struct mm_slot - hash lookup from mm to mm_slot
90 : * @hash: hash collision list
91 : * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
92 : * @mm: the mm that this information is valid for
93 : * @nr_pte_mapped_thp: number of pte-mapped THPs
94 : * @pte_mapped_thp: array of addresses of pte-mapped THPs
95 : */
96 : struct mm_slot {
97 : struct hlist_node hash;
98 : struct list_head mm_node;
99 : struct mm_struct *mm;
100 :
101 : /* pte-mapped THP in this mm */
102 : int nr_pte_mapped_thp;
103 : unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP];
104 : };
105 :
106 : /**
107 : * struct khugepaged_scan - cursor for scanning
108 : * @mm_head: the head of the mm list to scan
109 : * @mm_slot: the current mm_slot we are scanning
110 : * @address: the next address inside that mm_slot to be scanned
111 : *
112 : * There is only one khugepaged_scan instance of this cursor structure.
113 : */
114 : struct khugepaged_scan {
115 : struct list_head mm_head;
116 : struct mm_slot *mm_slot;
117 : unsigned long address;
118 : };
119 :
120 : static struct khugepaged_scan khugepaged_scan = {
121 : .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
122 : };
123 :
124 : #ifdef CONFIG_SYSFS
125 0 : static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
126 : struct kobj_attribute *attr,
127 : char *buf)
128 : {
129 0 : return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs);
130 : }
131 :
132 0 : static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
133 : struct kobj_attribute *attr,
134 : const char *buf, size_t count)
135 : {
136 0 : unsigned int msecs;
137 0 : int err;
138 :
139 0 : err = kstrtouint(buf, 10, &msecs);
140 0 : if (err)
141 : return -EINVAL;
142 :
143 0 : khugepaged_scan_sleep_millisecs = msecs;
144 0 : khugepaged_sleep_expire = 0;
145 0 : wake_up_interruptible(&khugepaged_wait);
146 :
147 0 : return count;
148 : }
149 : static struct kobj_attribute scan_sleep_millisecs_attr =
150 : __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
151 : scan_sleep_millisecs_store);
152 :
153 0 : static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
154 : struct kobj_attribute *attr,
155 : char *buf)
156 : {
157 0 : return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
158 : }
159 :
160 0 : static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
161 : struct kobj_attribute *attr,
162 : const char *buf, size_t count)
163 : {
164 0 : unsigned int msecs;
165 0 : int err;
166 :
167 0 : err = kstrtouint(buf, 10, &msecs);
168 0 : if (err)
169 : return -EINVAL;
170 :
171 0 : khugepaged_alloc_sleep_millisecs = msecs;
172 0 : khugepaged_sleep_expire = 0;
173 0 : wake_up_interruptible(&khugepaged_wait);
174 :
175 0 : return count;
176 : }
177 : static struct kobj_attribute alloc_sleep_millisecs_attr =
178 : __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
179 : alloc_sleep_millisecs_store);
180 :
181 0 : static ssize_t pages_to_scan_show(struct kobject *kobj,
182 : struct kobj_attribute *attr,
183 : char *buf)
184 : {
185 0 : return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan);
186 : }
187 0 : static ssize_t pages_to_scan_store(struct kobject *kobj,
188 : struct kobj_attribute *attr,
189 : const char *buf, size_t count)
190 : {
191 0 : unsigned int pages;
192 0 : int err;
193 :
194 0 : err = kstrtouint(buf, 10, &pages);
195 0 : if (err || !pages)
196 : return -EINVAL;
197 :
198 0 : khugepaged_pages_to_scan = pages;
199 :
200 0 : return count;
201 : }
202 : static struct kobj_attribute pages_to_scan_attr =
203 : __ATTR(pages_to_scan, 0644, pages_to_scan_show,
204 : pages_to_scan_store);
205 :
206 0 : static ssize_t pages_collapsed_show(struct kobject *kobj,
207 : struct kobj_attribute *attr,
208 : char *buf)
209 : {
210 0 : return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed);
211 : }
212 : static struct kobj_attribute pages_collapsed_attr =
213 : __ATTR_RO(pages_collapsed);
214 :
215 0 : static ssize_t full_scans_show(struct kobject *kobj,
216 : struct kobj_attribute *attr,
217 : char *buf)
218 : {
219 0 : return sysfs_emit(buf, "%u\n", khugepaged_full_scans);
220 : }
221 : static struct kobj_attribute full_scans_attr =
222 : __ATTR_RO(full_scans);
223 :
224 0 : static ssize_t khugepaged_defrag_show(struct kobject *kobj,
225 : struct kobj_attribute *attr, char *buf)
226 : {
227 0 : return single_hugepage_flag_show(kobj, attr, buf,
228 : TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
229 : }
230 0 : static ssize_t khugepaged_defrag_store(struct kobject *kobj,
231 : struct kobj_attribute *attr,
232 : const char *buf, size_t count)
233 : {
234 0 : return single_hugepage_flag_store(kobj, attr, buf, count,
235 : TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
236 : }
237 : static struct kobj_attribute khugepaged_defrag_attr =
238 : __ATTR(defrag, 0644, khugepaged_defrag_show,
239 : khugepaged_defrag_store);
240 :
241 : /*
242 : * max_ptes_none controls whether khugepaged should collapse hugepages
243 : * over unmapped ptes, which can increase the memory footprint of the
244 : * vmas. When max_ptes_none is 0, khugepaged will not reduce the free
245 : * memory available in the system as it runs. Increasing max_ptes_none
246 : * instead allows khugepaged to reduce the amount of free memory in the
247 : * system during its scan.
248 : */
249 0 : static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
250 : struct kobj_attribute *attr,
251 : char *buf)
252 : {
253 0 : return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none);
254 : }
255 0 : static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
256 : struct kobj_attribute *attr,
257 : const char *buf, size_t count)
258 : {
259 0 : int err;
260 0 : unsigned long max_ptes_none;
261 :
262 0 : err = kstrtoul(buf, 10, &max_ptes_none);
263 0 : if (err || max_ptes_none > HPAGE_PMD_NR-1)
264 : return -EINVAL;
265 :
266 0 : khugepaged_max_ptes_none = max_ptes_none;
267 :
268 0 : return count;
269 : }
270 : static struct kobj_attribute khugepaged_max_ptes_none_attr =
271 : __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
272 : khugepaged_max_ptes_none_store);
273 :
274 0 : static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj,
275 : struct kobj_attribute *attr,
276 : char *buf)
277 : {
278 0 : return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap);
279 : }
280 :
281 0 : static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj,
282 : struct kobj_attribute *attr,
283 : const char *buf, size_t count)
284 : {
285 0 : int err;
286 0 : unsigned long max_ptes_swap;
287 :
288 0 : err = kstrtoul(buf, 10, &max_ptes_swap);
289 0 : if (err || max_ptes_swap > HPAGE_PMD_NR-1)
290 : return -EINVAL;
291 :
292 0 : khugepaged_max_ptes_swap = max_ptes_swap;
293 :
294 0 : return count;
295 : }
296 :
297 : static struct kobj_attribute khugepaged_max_ptes_swap_attr =
298 : __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
299 : khugepaged_max_ptes_swap_store);
300 :
301 0 : static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj,
302 : struct kobj_attribute *attr,
303 : char *buf)
304 : {
305 0 : return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared);
306 : }
307 :
308 0 : static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj,
309 : struct kobj_attribute *attr,
310 : const char *buf, size_t count)
311 : {
312 0 : int err;
313 0 : unsigned long max_ptes_shared;
314 :
315 0 : err = kstrtoul(buf, 10, &max_ptes_shared);
316 0 : if (err || max_ptes_shared > HPAGE_PMD_NR-1)
317 : return -EINVAL;
318 :
319 0 : khugepaged_max_ptes_shared = max_ptes_shared;
320 :
321 0 : return count;
322 : }
323 :
324 : static struct kobj_attribute khugepaged_max_ptes_shared_attr =
325 : __ATTR(max_ptes_shared, 0644, khugepaged_max_ptes_shared_show,
326 : khugepaged_max_ptes_shared_store);
327 :
328 : static struct attribute *khugepaged_attr[] = {
329 : &khugepaged_defrag_attr.attr,
330 : &khugepaged_max_ptes_none_attr.attr,
331 : &khugepaged_max_ptes_swap_attr.attr,
332 : &khugepaged_max_ptes_shared_attr.attr,
333 : &pages_to_scan_attr.attr,
334 : &pages_collapsed_attr.attr,
335 : &full_scans_attr.attr,
336 : &scan_sleep_millisecs_attr.attr,
337 : &alloc_sleep_millisecs_attr.attr,
338 : NULL,
339 : };
340 :
341 : struct attribute_group khugepaged_attr_group = {
342 : .attrs = khugepaged_attr,
343 : .name = "khugepaged",
344 : };
345 : #endif /* CONFIG_SYSFS */
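/*
 * With CONFIG_SYSFS enabled, the attribute group above is exposed as files
 * under /sys/kernel/mm/transparent_hugepage/khugepaged/. A minimal userspace
 * sketch for reading one of those tunables follows; the helper name
 * read_khugepaged_tunable() is illustrative, not part of any kernel API.
 */
#include <stdio.h>

int read_khugepaged_tunable(const char *name, unsigned int *val)
{
	char path[256];
	FILE *f;
	int ret = -1;

	/* Build the sysfs path for the requested tunable, e.g. "pages_to_scan". */
	snprintf(path, sizeof(path),
		 "/sys/kernel/mm/transparent_hugepage/khugepaged/%s", name);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%u", val) == 1)
		ret = 0;	/* all tunables here are plain unsigned integers */
	fclose(f);
	return ret;
}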
346 :
347 0 : int hugepage_madvise(struct vm_area_struct *vma,
348 : unsigned long *vm_flags, int advice)
349 : {
350 0 : switch (advice) {
351 0 : case MADV_HUGEPAGE:
352 : #ifdef CONFIG_S390
353 : /*
354 : * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
355 : * can't handle this properly after s390_enable_sie, so we simply
356 : * ignore the madvise to prevent qemu from causing a SIGSEGV.
357 : */
358 : if (mm_has_pgste(vma->vm_mm))
359 : return 0;
360 : #endif
361 0 : *vm_flags &= ~VM_NOHUGEPAGE;
362 0 : *vm_flags |= VM_HUGEPAGE;
363 : /*
364 : * If the vma becomes a good candidate for khugepaged to scan,
365 : * register it here without waiting for a page fault that
366 : * may not happen any time soon.
367 : */
368 0 : if (!(*vm_flags & VM_NO_KHUGEPAGED) &&
369 0 : khugepaged_enter_vma_merge(vma, *vm_flags))
370 0 : return -ENOMEM;
371 : break;
372 0 : case MADV_NOHUGEPAGE:
373 0 : *vm_flags &= ~VM_HUGEPAGE;
374 0 : *vm_flags |= VM_NOHUGEPAGE;
375 : /*
376 : * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
377 : * this vma, even if the mm remains registered in khugepaged
378 : * (it may have been registered before VM_NOHUGEPAGE was set).
379 : */
380 0 : break;
381 : }
382 :
383 : return 0;
384 : }
385 :
386 1 : int __init khugepaged_init(void)
387 : {
388 1 : mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
389 : sizeof(struct mm_slot),
390 : __alignof__(struct mm_slot), 0, NULL);
391 1 : if (!mm_slot_cache)
392 : return -ENOMEM;
393 :
394 1 : khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
395 1 : khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
396 1 : khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
397 1 : khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
398 :
399 1 : return 0;
400 : }
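/*
 * Worked example of the defaults above, assuming 4K base pages with 2M huge
 * pages, i.e. HPAGE_PMD_NR == 512:
 *   khugepaged_pages_to_scan   = 512 * 8 = 4096 ptes scanned per wakeup
 *   khugepaged_max_ptes_none   = 512 - 1 = 511
 *   khugepaged_max_ptes_swap   = 512 / 8 = 64
 *   khugepaged_max_ptes_shared = 512 / 2 = 256
 */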
401 :
402 0 : void __init khugepaged_destroy(void)
403 : {
404 0 : kmem_cache_destroy(mm_slot_cache);
405 0 : }
406 :
407 25 : static inline struct mm_slot *alloc_mm_slot(void)
408 : {
409 25 : if (!mm_slot_cache) /* initialization failed */
410 : return NULL;
411 25 : return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
412 : }
413 :
414 20 : static inline void free_mm_slot(struct mm_slot *mm_slot)
415 : {
416 20 : kmem_cache_free(mm_slot_cache, mm_slot);
417 : }
418 :
419 20 : static struct mm_slot *get_mm_slot(struct mm_struct *mm)
420 : {
421 20 : struct mm_slot *mm_slot;
422 :
423 40 : hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
424 20 : if (mm == mm_slot->mm)
425 20 : return mm_slot;
426 :
427 : return NULL;
428 : }
429 :
430 25 : static void insert_to_mm_slots_hash(struct mm_struct *mm,
431 : struct mm_slot *mm_slot)
432 : {
433 25 : mm_slot->mm = mm;
434 25 : hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
435 25 : }
436 :
437 1768 : static inline int khugepaged_test_exit(struct mm_struct *mm)
438 : {
439 3536 : return atomic_read(&mm->mm_users) == 0;
440 : }
441 :
442 6312 : static bool hugepage_vma_check(struct vm_area_struct *vma,
443 : unsigned long vm_flags)
444 : {
445 : /* Explicitly disabled through madvise. */
446 6312 : if ((vm_flags & VM_NOHUGEPAGE) ||
447 6312 : test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
448 0 : return false;
449 :
450 : /* Enabled via shmem mount options or sysfs settings. */
451 9906 : if (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) {
452 0 : return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
453 : HPAGE_PMD_NR);
454 : }
455 :
456 : /* THP settings require madvise. */
457 6312 : if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
458 : return false;
459 :
460 : /* Read-only file mappings need to be aligned for THP to work. */
461 6312 : if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
462 : (vm_flags & VM_DENYWRITE)) {
463 : return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
464 : HPAGE_PMD_NR);
465 : }
466 :
467 6312 : if (!vma->anon_vma || vma->vm_ops)
468 : return false;
469 2649 : if (vma_is_temporary_stack(vma))
470 : return false;
471 2649 : return !(vm_flags & VM_NO_KHUGEPAGED);
472 : }
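/*
 * Summarising the checks above: a vma is accepted when THP is not explicitly
 * disabled for it (no VM_NOHUGEPAGE, no MMF_DISABLE_THP), and it is either a
 * suitably aligned huge-enabled shmem mapping, or, after passing the
 * madvise/"always" policy check, a suitably aligned read-only file mapping
 * (CONFIG_READ_ONLY_THP_FOR_FS) or a plain anonymous vma (has an anon_vma,
 * no vm_ops, not a temporary stack, no VM_NO_KHUGEPAGED).
 */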
473 :
474 25 : int __khugepaged_enter(struct mm_struct *mm)
475 : {
476 25 : struct mm_slot *mm_slot;
477 25 : int wakeup;
478 :
479 25 : mm_slot = alloc_mm_slot();
480 25 : if (!mm_slot)
481 : return -ENOMEM;
482 :
483 : /* __khugepaged_exit() must not run from under us */
484 25 : VM_BUG_ON_MM(atomic_read(&mm->mm_users) == 0, mm);
485 25 : if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
486 0 : free_mm_slot(mm_slot);
487 0 : return 0;
488 : }
489 :
490 25 : spin_lock(&khugepaged_mm_lock);
491 25 : insert_to_mm_slots_hash(mm, mm_slot);
492 : /*
493 : * Insert just behind the scanning cursor, to let the area settle
494 : * down a little.
495 : */
496 25 : wakeup = list_empty(&khugepaged_scan.mm_head);
497 25 : list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
498 25 : spin_unlock(&khugepaged_mm_lock);
499 :
500 25 : mmgrab(mm);
501 25 : if (wakeup)
502 1 : wake_up_interruptible(&khugepaged_wait);
503 :
504 : return 0;
505 : }
506 :
507 4615 : int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
508 : unsigned long vm_flags)
509 : {
510 4615 : unsigned long hstart, hend;
511 :
512 : /*
513 : * khugepaged only supports read-only non-shmem files. It does
514 : * not yet work on special mappings, and file-private shmem THP
515 : * is not supported.
516 : */
517 4615 : if (!hugepage_vma_check(vma, vm_flags))
518 : return 0;
519 :
520 2476 : hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
521 2476 : hend = vma->vm_end & HPAGE_PMD_MASK;
522 2476 : if (hstart < hend)
523 9 : return khugepaged_enter(vma, vm_flags);
524 : return 0;
525 : }
526 :
527 20 : void __khugepaged_exit(struct mm_struct *mm)
528 : {
529 20 : struct mm_slot *mm_slot;
530 20 : int free = 0;
531 :
532 20 : spin_lock(&khugepaged_mm_lock);
533 20 : mm_slot = get_mm_slot(mm);
534 20 : if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
535 20 : hash_del(&mm_slot->hash);
536 20 : list_del(&mm_slot->mm_node);
537 20 : free = 1;
538 : }
539 20 : spin_unlock(&khugepaged_mm_lock);
540 :
541 20 : if (free) {
542 20 : clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
543 20 : free_mm_slot(mm_slot);
544 20 : mmdrop(mm);
545 0 : } else if (mm_slot) {
546 : /*
547 : * This is required to serialize against
548 : * khugepaged_test_exit() (which is guaranteed to run
549 : * under mmap sem read mode). Stop here (after we
550 : * return all pagetables will be destroyed) until
551 : * khugepaged has finished working on the pagetables
552 : * under the mmap_lock.
553 : */
554 0 : mmap_write_lock(mm);
555 0 : mmap_write_unlock(mm);
556 : }
557 20 : }
558 :
559 10 : static void release_pte_page(struct page *page)
560 : {
561 30 : mod_node_page_state(page_pgdat(page),
562 10 : NR_ISOLATED_ANON + page_is_file_lru(page),
563 10 : -compound_nr(page));
564 10 : unlock_page(page);
565 10 : putback_lru_page(page);
566 10 : }
567 :
568 0 : static void release_pte_pages(pte_t *pte, pte_t *_pte,
569 : struct list_head *compound_pagelist)
570 : {
571 0 : struct page *page, *tmp;
572 :
573 0 : while (--_pte >= pte) {
574 0 : pte_t pteval = *_pte;
575 :
576 0 : page = pte_page(pteval);
577 0 : if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) &&
578 0 : !PageCompound(page))
579 0 : release_pte_page(page);
580 : }
581 :
582 0 : list_for_each_entry_safe(page, tmp, compound_pagelist, lru) {
583 0 : list_del(&page->lru);
584 0 : release_pte_page(page);
585 : }
586 0 : }
587 :
588 20 : static bool is_refcount_suitable(struct page *page)
589 : {
590 20 : int expected_refcount;
591 :
592 20 : expected_refcount = total_mapcount(page);
593 20 : if (PageSwapCache(page))
594 : expected_refcount += compound_nr(page);
595 :
596 20 : return page_count(page) == expected_refcount;
597 : }
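/*
 * Example, assuming a base (non-compound) anonymous page mapped by a single
 * process and not in the swap cache: the expected state is
 * total_mapcount() == 1 and page_count() == 1, so the page is suitable.
 * An extra GUP pin raises page_count() to 2 while the mapcount stays 1,
 * and the page is then rejected as externally pinned.
 */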
598 :
599 2 : static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
600 : unsigned long address,
601 : pte_t *pte,
602 : struct list_head *compound_pagelist)
603 : {
604 2 : struct page *page = NULL;
605 2 : pte_t *_pte;
606 2 : int none_or_zero = 0, shared = 0, result = 0, referenced = 0;
607 2 : bool writable = false;
608 :
609 1026 : for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
610 1024 : _pte++, address += PAGE_SIZE) {
611 1024 : pte_t pteval = *_pte;
612 1024 : if (pte_none(pteval) || (pte_present(pteval) &&
613 10 : is_zero_pfn(pte_pfn(pteval)))) {
614 1014 : if (!userfaultfd_armed(vma) &&
615 1014 : ++none_or_zero <= khugepaged_max_ptes_none) {
616 1014 : continue;
617 : } else {
618 0 : result = SCAN_EXCEED_NONE_PTE;
619 0 : goto out;
620 : }
621 : }
622 10 : if (!pte_present(pteval)) {
623 0 : result = SCAN_PTE_NON_PRESENT;
624 0 : goto out;
625 : }
626 10 : page = vm_normal_page(vma, address, pteval);
627 10 : if (unlikely(!page)) {
628 0 : result = SCAN_PAGE_NULL;
629 0 : goto out;
630 : }
631 :
632 10 : VM_BUG_ON_PAGE(!PageAnon(page), page);
633 :
634 10 : if (page_mapcount(page) > 1 &&
635 4 : ++shared > khugepaged_max_ptes_shared) {
636 0 : result = SCAN_EXCEED_SHARED_PTE;
637 0 : goto out;
638 : }
639 :
640 20 : if (PageCompound(page)) {
641 0 : struct page *p;
642 0 : page = compound_head(page);
643 :
644 : /*
645 : * Check if we have dealt with the compound page
646 : * already
647 : */
648 0 : list_for_each_entry(p, compound_pagelist, lru) {
649 0 : if (page == p)
650 0 : goto next;
651 : }
652 : }
653 :
654 : /*
655 : * We can do it before isolate_lru_page because the
656 : * page can't be freed from under us. NOTE: PG_lock
657 : * is needed to serialize against split_huge_page
658 : * when invoked from the VM.
659 : */
660 10 : if (!trylock_page(page)) {
661 0 : result = SCAN_PAGE_LOCK;
662 0 : goto out;
663 : }
664 :
665 : /*
666 : * Check if the page has any GUP (or other external) pins.
667 : *
668 : * The page table that maps the page has already been unlinked
669 : * from the page table tree, so this process cannot get
670 : * an additional pin on the page.
671 : *
672 : * New pins can come later if the page is shared across fork,
673 : * but not from this process. The other process cannot write to
674 : * the page, only trigger CoW.
675 : */
676 10 : if (!is_refcount_suitable(page)) {
677 0 : unlock_page(page);
678 0 : result = SCAN_PAGE_COUNT;
679 0 : goto out;
680 : }
681 10 : if (!pte_write(pteval) && PageSwapCache(page) &&
682 : !reuse_swap_page(page, NULL)) {
683 : /*
684 : * Page is in the swap cache and cannot be re-used.
685 : * It cannot be collapsed into a THP.
686 : */
687 : unlock_page(page);
688 : result = SCAN_SWAP_CACHE_PAGE;
689 : goto out;
690 : }
691 :
692 : /*
693 : * Isolate the page to avoid collapsing a hugepage
694 : * currently in use by the VM.
695 : */
696 10 : if (isolate_lru_page(page)) {
697 0 : unlock_page(page);
698 0 : result = SCAN_DEL_PAGE_LRU;
699 0 : goto out;
700 : }
701 30 : mod_node_page_state(page_pgdat(page),
702 10 : NR_ISOLATED_ANON + page_is_file_lru(page),
703 10 : compound_nr(page));
704 20 : VM_BUG_ON_PAGE(!PageLocked(page), page);
705 20 : VM_BUG_ON_PAGE(PageLRU(page), page);
706 :
707 20 : if (PageCompound(page))
708 0 : list_add_tail(&page->lru, compound_pagelist);
709 10 : next:
710 : /* There should be enough young ptes to collapse the page */
711 10 : if (pte_young(pteval) ||
712 6 : page_is_young(page) || PageReferenced(page) ||
713 10 : mmu_notifier_test_young(vma->vm_mm, address))
714 8 : referenced++;
715 :
716 10 : if (pte_write(pteval))
717 2 : writable = true;
718 : }
719 2 : if (likely(writable)) {
720 2 : if (likely(referenced)) {
721 2 : result = SCAN_SUCCEED;
722 2 : trace_mm_collapse_huge_page_isolate(page, none_or_zero,
723 : referenced, writable, result);
724 2 : return 1;
725 : }
726 : } else {
727 : result = SCAN_PAGE_RO;
728 : }
729 :
730 0 : out:
731 0 : release_pte_pages(pte, _pte, compound_pagelist);
732 0 : trace_mm_collapse_huge_page_isolate(page, none_or_zero,
733 : referenced, writable, result);
734 0 : return 0;
735 : }
736 :
737 2 : static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
738 : struct vm_area_struct *vma,
739 : unsigned long address,
740 : spinlock_t *ptl,
741 : struct list_head *compound_pagelist)
742 : {
743 2 : struct page *src_page, *tmp;
744 2 : pte_t *_pte;
745 1026 : for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
746 1024 : _pte++, page++, address += PAGE_SIZE) {
747 1024 : pte_t pteval = *_pte;
748 :
749 1024 : if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
750 1014 : clear_user_highpage(page, address);
751 1014 : add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
752 1014 : if (is_zero_pfn(pte_pfn(pteval))) {
753 : /*
754 : * ptl mostly unnecessary.
755 : */
756 0 : spin_lock(ptl);
757 : /*
758 : * paravirt calls inside pte_clear here are
759 : * superfluous.
760 : */
761 0 : pte_clear(vma->vm_mm, address, _pte);
762 1024 : spin_unlock(ptl);
763 : }
764 : } else {
765 10 : src_page = pte_page(pteval);
766 10 : copy_user_highpage(page, src_page, address, vma);
767 20 : if (!PageCompound(src_page))
768 10 : release_pte_page(src_page);
769 : /*
770 : * ptl mostly unnecessary, but preempt has to
771 : * be disabled to update the per-cpu stats
772 : * inside page_remove_rmap().
773 : */
774 10 : spin_lock(ptl);
775 : /*
776 : * paravirt calls inside pte_clear here are
777 : * superfluous.
778 : */
779 10 : pte_clear(vma->vm_mm, address, _pte);
780 10 : page_remove_rmap(src_page, false);
781 10 : spin_unlock(ptl);
782 10 : free_page_and_swap_cache(src_page);
783 : }
784 : }
785 :
786 2 : list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
787 0 : list_del(&src_page->lru);
788 0 : release_pte_page(src_page);
789 : }
790 2 : }
791 :
792 0 : static void khugepaged_alloc_sleep(void)
793 : {
794 0 : DEFINE_WAIT(wait);
795 :
796 0 : add_wait_queue(&khugepaged_wait, &wait);
797 0 : freezable_schedule_timeout_interruptible(
798 : msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
799 0 : remove_wait_queue(&khugepaged_wait, &wait);
800 0 : }
801 :
802 : static int khugepaged_node_load[MAX_NUMNODES];
803 :
804 10 : static bool khugepaged_scan_abort(int nid)
805 : {
806 10 : int i;
807 :
808 : /*
809 : * If node_reclaim_mode is disabled, then no extra effort is made to
810 : * allocate memory locally.
811 : */
812 10 : if (!node_reclaim_mode)
813 : return false;
814 :
815 : /* If there is a count for this node already, it must be acceptable */
816 0 : if (khugepaged_node_load[nid])
817 : return false;
818 :
819 0 : for (i = 0; i < MAX_NUMNODES; i++) {
820 0 : if (!khugepaged_node_load[i])
821 0 : continue;
822 0 : if (node_distance(nid, i) > node_reclaim_distance)
823 : return true;
824 : }
825 : return false;
826 : }
827 :
828 : /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
829 2 : static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
830 : {
831 2 : return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
832 : }
833 :
834 : #ifdef CONFIG_NUMA
835 2 : static int khugepaged_find_target_node(void)
836 : {
837 2 : static int last_khugepaged_target_node = NUMA_NO_NODE;
838 2 : int nid, target_node = 0, max_value = 0;
839 :
840 : /* find first node with max normal pages hit */
841 130 : for (nid = 0; nid < MAX_NUMNODES; nid++)
842 128 : if (khugepaged_node_load[nid] > max_value) {
843 2 : max_value = khugepaged_node_load[nid];
844 2 : target_node = nid;
845 : }
846 :
847 : /* do some balance if several nodes have the same hit record */
848 2 : if (target_node <= last_khugepaged_target_node)
849 64 : for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
850 63 : nid++)
851 63 : if (max_value == khugepaged_node_load[nid]) {
852 : target_node = nid;
853 : break;
854 : }
855 :
856 2 : last_khugepaged_target_node = target_node;
857 2 : return target_node;
858 : }
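/*
 * In effect, the target is the node that backed most of the pages seen by
 * the last scan, and ties with the previously chosen node are broken by
 * advancing to the next node with the same hit count, so repeated collapses
 * are spread across equally loaded nodes.
 */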
859 :
860 17 : static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
861 : {
862 17 : if (IS_ERR(*hpage)) {
863 0 : if (!*wait)
864 : return false;
865 :
866 0 : *wait = false;
867 0 : *hpage = NULL;
868 0 : khugepaged_alloc_sleep();
869 17 : } else if (*hpage) {
870 0 : put_page(*hpage);
871 0 : *hpage = NULL;
872 : }
873 :
874 : return true;
875 : }
876 :
877 : static struct page *
878 2 : khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
879 : {
880 2 : VM_BUG_ON_PAGE(*hpage, *hpage);
881 :
882 2 : *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
883 2 : if (unlikely(!*hpage)) {
884 0 : count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
885 0 : *hpage = ERR_PTR(-ENOMEM);
886 0 : return NULL;
887 : }
888 :
889 2 : prep_transhuge_page(*hpage);
890 2 : count_vm_event(THP_COLLAPSE_ALLOC);
891 2 : return *hpage;
892 : }
893 : #else
894 : static int khugepaged_find_target_node(void)
895 : {
896 : return 0;
897 : }
898 :
899 : static inline struct page *alloc_khugepaged_hugepage(void)
900 : {
901 : struct page *page;
902 :
903 : page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
904 : HPAGE_PMD_ORDER);
905 : if (page)
906 : prep_transhuge_page(page);
907 : return page;
908 : }
909 :
910 : static struct page *khugepaged_alloc_hugepage(bool *wait)
911 : {
912 : struct page *hpage;
913 :
914 : do {
915 : hpage = alloc_khugepaged_hugepage();
916 : if (!hpage) {
917 : count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
918 : if (!*wait)
919 : return NULL;
920 :
921 : *wait = false;
922 : khugepaged_alloc_sleep();
923 : } else
924 : count_vm_event(THP_COLLAPSE_ALLOC);
925 : } while (unlikely(!hpage) && likely(khugepaged_enabled()));
926 :
927 : return hpage;
928 : }
929 :
930 : static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
931 : {
932 : /*
933 : * If the hpage allocated earlier was briefly exposed in page cache
934 : * before collapse_file() failed, it is possible that racing lookups
935 : * have not yet completed, and would then be unpleasantly surprised by
936 : * finding the hpage reused for the same mapping at a different offset.
937 : * Just release the previous allocation if there is any danger of that.
938 : */
939 : if (*hpage && page_count(*hpage) > 1) {
940 : put_page(*hpage);
941 : *hpage = NULL;
942 : }
943 :
944 : if (!*hpage)
945 : *hpage = khugepaged_alloc_hugepage(wait);
946 :
947 : if (unlikely(!*hpage))
948 : return false;
949 :
950 : return true;
951 : }
952 :
953 : static struct page *
954 : khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
955 : {
956 : VM_BUG_ON(!*hpage);
957 :
958 : return *hpage;
959 : }
960 : #endif
961 :
962 : /*
963 : * If the mmap_lock was temporarily dropped, revalidate the vma
964 : * before taking the mmap_lock again.
965 : * Return 0 on success, otherwise return a non-zero
966 : * value (scan result code).
967 : */
968 :
969 4 : static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
970 : struct vm_area_struct **vmap)
971 : {
972 4 : struct vm_area_struct *vma;
973 4 : unsigned long hstart, hend;
974 :
975 4 : if (unlikely(khugepaged_test_exit(mm)))
976 : return SCAN_ANY_PROCESS;
977 :
978 4 : *vmap = vma = find_vma(mm, address);
979 4 : if (!vma)
980 : return SCAN_VMA_NULL;
981 :
982 4 : hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
983 4 : hend = vma->vm_end & HPAGE_PMD_MASK;
984 4 : if (address < hstart || address + HPAGE_PMD_SIZE > hend)
985 : return SCAN_ADDRESS_RANGE;
986 4 : if (!hugepage_vma_check(vma, vma->vm_flags))
987 : return SCAN_VMA_CHECK;
988 : /* Anon VMA expected */
989 4 : if (!vma->anon_vma || vma->vm_ops)
990 0 : return SCAN_VMA_CHECK;
991 : return 0;
992 : }
993 :
994 : /*
995 : * Bring missing pages in from swap, to complete THP collapse.
996 : * Only done if khugepaged_scan_pmd believes it is worthwhile.
997 : *
998 : * Called and returns without pte mapped or spinlocks held,
999 : * but with mmap_lock held to protect against vma changes.
1000 : */
1001 :
1002 0 : static bool __collapse_huge_page_swapin(struct mm_struct *mm,
1003 : struct vm_area_struct *vma,
1004 : unsigned long haddr, pmd_t *pmd,
1005 : int referenced)
1006 : {
1007 0 : int swapped_in = 0;
1008 0 : vm_fault_t ret = 0;
1009 0 : unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
1010 :
1011 0 : for (address = haddr; address < end; address += PAGE_SIZE) {
1012 0 : struct vm_fault vmf = {
1013 : .vma = vma,
1014 : .address = address,
1015 0 : .pgoff = linear_page_index(vma, haddr),
1016 : .flags = FAULT_FLAG_ALLOW_RETRY,
1017 : .pmd = pmd,
1018 : };
1019 :
1020 0 : vmf.pte = pte_offset_map(pmd, address);
1021 0 : vmf.orig_pte = *vmf.pte;
1022 0 : if (!is_swap_pte(vmf.orig_pte)) {
1023 0 : pte_unmap(vmf.pte);
1024 0 : continue;
1025 : }
1026 0 : swapped_in++;
1027 0 : ret = do_swap_page(&vmf);
1028 :
1029 : /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
1030 0 : if (ret & VM_FAULT_RETRY) {
1031 0 : mmap_read_lock(mm);
1032 0 : if (hugepage_vma_revalidate(mm, haddr, &vma)) {
1033 : /* vma is no longer available, don't continue to swapin */
1034 0 : trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
1035 0 : return false;
1036 : }
1037 : /* check if the pmd is still valid */
1038 0 : if (mm_find_pmd(mm, haddr) != pmd) {
1039 0 : trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
1040 0 : return false;
1041 : }
1042 : }
1043 0 : if (ret & VM_FAULT_ERROR) {
1044 0 : trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
1045 0 : return false;
1046 : }
1047 : }
1048 :
1049 : /* Drain LRU add pagevec to remove extra pin on the swapped in pages */
1050 0 : if (swapped_in)
1051 0 : lru_add_drain();
1052 :
1053 0 : trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
1054 0 : return true;
1055 : }
1056 :
1057 2 : static void collapse_huge_page(struct mm_struct *mm,
1058 : unsigned long address,
1059 : struct page **hpage,
1060 : int node, int referenced, int unmapped)
1061 : {
1062 2 : LIST_HEAD(compound_pagelist);
1063 2 : pmd_t *pmd, _pmd;
1064 2 : pte_t *pte;
1065 2 : pgtable_t pgtable;
1066 2 : struct page *new_page;
1067 2 : spinlock_t *pmd_ptl, *pte_ptl;
1068 2 : int isolated = 0, result = 0;
1069 2 : struct vm_area_struct *vma;
1070 2 : struct mmu_notifier_range range;
1071 2 : gfp_t gfp;
1072 :
1073 2 : VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1074 :
1075 : /* Only allocate from the target node */
1076 2 : gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
1077 :
1078 : /*
1079 : * Before allocating the hugepage, release the mmap_lock read lock.
1080 : * The allocation can take potentially a long time if it involves
1081 : * sync compaction, and we do not need to hold the mmap_lock during
1082 : * that. We will recheck the vma after taking it again in write mode.
1083 : */
1084 2 : mmap_read_unlock(mm);
1085 2 : new_page = khugepaged_alloc_page(hpage, gfp, node);
1086 2 : if (!new_page) {
1087 0 : result = SCAN_ALLOC_HUGE_PAGE_FAIL;
1088 0 : goto out_nolock;
1089 : }
1090 :
1091 2 : if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
1092 : result = SCAN_CGROUP_CHARGE_FAIL;
1093 : goto out_nolock;
1094 : }
1095 2 : count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
1096 :
1097 2 : mmap_read_lock(mm);
1098 2 : result = hugepage_vma_revalidate(mm, address, &vma);
1099 2 : if (result) {
1100 0 : mmap_read_unlock(mm);
1101 0 : goto out_nolock;
1102 : }
1103 :
1104 2 : pmd = mm_find_pmd(mm, address);
1105 2 : if (!pmd) {
1106 0 : result = SCAN_PMD_NULL;
1107 0 : mmap_read_unlock(mm);
1108 0 : goto out_nolock;
1109 : }
1110 :
1111 : /*
1112 : * __collapse_huge_page_swapin always returns with mmap_lock locked.
1113 : * If it fails, we release mmap_lock and jump out_nolock.
1114 : * Continuing to collapse causes inconsistency.
1115 : */
1116 2 : if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
1117 : pmd, referenced)) {
1118 0 : mmap_read_unlock(mm);
1119 0 : goto out_nolock;
1120 : }
1121 :
1122 2 : mmap_read_unlock(mm);
1123 : /*
1124 : * Prevent all access to the pagetables, with the exception of
1125 : * gup_fast (handled later by ptep_clear_flush) and the rest of
1126 : * the VM (handled by the anon_vma lock + PG_lock).
1127 : */
1128 2 : mmap_write_lock(mm);
1129 2 : result = hugepage_vma_revalidate(mm, address, &vma);
1130 2 : if (result)
1131 0 : goto out;
1132 : /* check if the pmd is still valid */
1133 2 : if (mm_find_pmd(mm, address) != pmd)
1134 0 : goto out;
1135 :
1136 2 : anon_vma_lock_write(vma->anon_vma);
1137 :
1138 2 : mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
1139 : address, address + HPAGE_PMD_SIZE);
1140 2 : mmu_notifier_invalidate_range_start(&range);
1141 :
1142 2 : pte = pte_offset_map(pmd, address);
1143 2 : pte_ptl = pte_lockptr(mm, pmd);
1144 :
1145 2 : pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
1146 : /*
1147 : * After this gup_fast can't run anymore. This also removes
1148 : * any huge TLB entry from the CPU so we won't allow
1149 : * huge and small TLB entries for the same virtual address
1150 : * to avoid the risk of CPU bugs in that area.
1151 : */
1152 2 : _pmd = pmdp_collapse_flush(vma, address, pmd);
1153 2 : spin_unlock(pmd_ptl);
1154 2 : mmu_notifier_invalidate_range_end(&range);
1155 :
1156 2 : spin_lock(pte_ptl);
1157 2 : isolated = __collapse_huge_page_isolate(vma, address, pte,
1158 : &compound_pagelist);
1159 2 : spin_unlock(pte_ptl);
1160 :
1161 2 : if (unlikely(!isolated)) {
1162 0 : pte_unmap(pte);
1163 0 : spin_lock(pmd_ptl);
1164 0 : BUG_ON(!pmd_none(*pmd));
1165 : /*
1166 : * We can only use set_pmd_at when establishing
1167 : * hugepmds and never for establishing regular pmds that
1168 : * point to regular pagetables. Use pmd_populate for that.
1169 : */
1170 0 : pmd_populate(mm, pmd, pmd_pgtable(_pmd));
1171 0 : spin_unlock(pmd_ptl);
1172 0 : anon_vma_unlock_write(vma->anon_vma);
1173 0 : result = SCAN_FAIL;
1174 0 : goto out;
1175 : }
1176 :
1177 : /*
1178 : * All pages are isolated and locked so anon_vma rmap
1179 : * can't run anymore.
1180 : */
1181 2 : anon_vma_unlock_write(vma->anon_vma);
1182 :
1183 2 : __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
1184 : &compound_pagelist);
1185 2 : pte_unmap(pte);
1186 2 : __SetPageUptodate(new_page);
1187 2 : pgtable = pmd_pgtable(_pmd);
1188 :
1189 2 : _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
1190 2 : _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
1191 :
1192 : /*
1193 : * spin_lock() below is not the equivalent of smp_wmb(), so
1194 : * this is needed to prevent the copy_huge_page writes from
1195 : * becoming visible after the set_pmd_at() write.
1196 : */
1197 2 : smp_wmb();
1198 :
1199 2 : spin_lock(pmd_ptl);
1200 2 : BUG_ON(!pmd_none(*pmd));
1201 2 : page_add_new_anon_rmap(new_page, vma, address, true);
1202 2 : lru_cache_add_inactive_or_unevictable(new_page, vma);
1203 2 : pgtable_trans_huge_deposit(mm, pmd, pgtable);
1204 2 : set_pmd_at(mm, address, pmd, _pmd);
1205 2 : update_mmu_cache_pmd(vma, address, pmd);
1206 2 : spin_unlock(pmd_ptl);
1207 :
1208 2 : *hpage = NULL;
1209 :
1210 2 : khugepaged_pages_collapsed++;
1211 2 : result = SCAN_SUCCEED;
1212 2 : out_up_write:
1213 2 : mmap_write_unlock(mm);
1214 2 : out_nolock:
1215 2 : if (!IS_ERR_OR_NULL(*hpage))
1216 2 : mem_cgroup_uncharge(*hpage);
1217 2 : trace_mm_collapse_huge_page(mm, isolated, result);
1218 2 : return;
1219 0 : out:
1220 0 : goto out_up_write;
1221 : }
1222 :
1223 32 : static int khugepaged_scan_pmd(struct mm_struct *mm,
1224 : struct vm_area_struct *vma,
1225 : unsigned long address,
1226 : struct page **hpage)
1227 : {
1228 32 : pmd_t *pmd;
1229 32 : pte_t *pte, *_pte;
1230 32 : int ret = 0, result = 0, referenced = 0;
1231 32 : int none_or_zero = 0, shared = 0;
1232 32 : struct page *page = NULL;
1233 32 : unsigned long _address;
1234 32 : spinlock_t *ptl;
1235 32 : int node = NUMA_NO_NODE, unmapped = 0;
1236 32 : bool writable = false;
1237 :
1238 32 : VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1239 :
1240 32 : pmd = mm_find_pmd(mm, address);
1241 32 : if (!pmd) {
1242 30 : result = SCAN_PMD_NULL;
1243 30 : goto out;
1244 : }
1245 :
1246 2 : memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
1247 4 : pte = pte_offset_map_lock(mm, pmd, address, &ptl);
1248 1028 : for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
1249 1024 : _pte++, _address += PAGE_SIZE) {
1250 1024 : pte_t pteval = *_pte;
1251 1024 : if (is_swap_pte(pteval)) {
1252 0 : if (++unmapped <= khugepaged_max_ptes_swap) {
1253 : /*
1254 : * Always be strict with uffd-wp
1255 : * enabled swap entries. Please see
1256 : * comment below for pte_uffd_wp().
1257 : */
1258 0 : if (pte_swp_uffd_wp(pteval)) {
1259 : result = SCAN_PTE_UFFD_WP;
1260 0 : goto out_unmap;
1261 : }
1262 1014 : continue;
1263 : } else {
1264 0 : result = SCAN_EXCEED_SWAP_PTE;
1265 0 : goto out_unmap;
1266 : }
1267 : }
1268 1024 : if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
1269 1014 : if (!userfaultfd_armed(vma) &&
1270 1014 : ++none_or_zero <= khugepaged_max_ptes_none) {
1271 1014 : continue;
1272 : } else {
1273 0 : result = SCAN_EXCEED_NONE_PTE;
1274 0 : goto out_unmap;
1275 : }
1276 : }
1277 10 : if (!pte_present(pteval)) {
1278 0 : result = SCAN_PTE_NON_PRESENT;
1279 0 : goto out_unmap;
1280 : }
1281 10 : if (pte_uffd_wp(pteval)) {
1282 : /*
1283 : * Don't collapse the page if any of the small
1284 : * PTEs are armed with uffd write protection.
1285 : * Here we can also mark the new huge pmd as
1286 : * write protected if any of the small ones is
1287 : * marked but that could bring unknown
1288 : * userfault messages that falls outside of
1289 : * the registered range. So, just be simple.
1290 : */
1291 : result = SCAN_PTE_UFFD_WP;
1292 : goto out_unmap;
1293 : }
1294 10 : if (pte_write(pteval))
1295 2 : writable = true;
1296 :
1297 10 : page = vm_normal_page(vma, _address, pteval);
1298 10 : if (unlikely(!page)) {
1299 0 : result = SCAN_PAGE_NULL;
1300 0 : goto out_unmap;
1301 : }
1302 :
1303 10 : if (page_mapcount(page) > 1 &&
1304 4 : ++shared > khugepaged_max_ptes_shared) {
1305 0 : result = SCAN_EXCEED_SHARED_PTE;
1306 0 : goto out_unmap;
1307 : }
1308 :
1309 10 : page = compound_head(page);
1310 :
1311 : /*
1312 : * Record which node the original page is from and save this
1313 : * information to khugepaged_node_load[].
1314 : * Khugepaged will allocate the hugepage from the node that
1315 : * has the max hit record.
1316 : */
1317 10 : node = page_to_nid(page);
1318 10 : if (khugepaged_scan_abort(node)) {
1319 0 : result = SCAN_SCAN_ABORT;
1320 0 : goto out_unmap;
1321 : }
1322 10 : khugepaged_node_load[node]++;
1323 20 : if (!PageLRU(page)) {
1324 0 : result = SCAN_PAGE_LRU;
1325 0 : goto out_unmap;
1326 : }
1327 20 : if (PageLocked(page)) {
1328 0 : result = SCAN_PAGE_LOCK;
1329 0 : goto out_unmap;
1330 : }
1331 10 : if (!PageAnon(page)) {
1332 0 : result = SCAN_PAGE_ANON;
1333 0 : goto out_unmap;
1334 : }
1335 :
1336 : /*
1337 : * Check if the page has any GUP (or other external) pins.
1338 : *
1339 : * Here the check is racy it may see totmal_mapcount > refcount
1340 : * in some cases.
1341 : * For example, one process with one forked child process.
1342 : * The parent has the PMD split due to MADV_DONTNEED, then
1343 : * the child is trying unmap the whole PMD, but khugepaged
1344 : * may be scanning the parent between the child has
1345 : * PageDoubleMap flag cleared and dec the mapcount. So
1346 : * khugepaged may see total_mapcount > refcount.
1347 : *
1348 : * But such case is ephemeral we could always retry collapse
1349 : * later. However it may report false positive if the page
1350 : * has excessive GUP pins (i.e. 512). Anyway the same check
1351 : * will be done again later the risk seems low.
1352 : */
1353 10 : if (!is_refcount_suitable(page)) {
1354 0 : result = SCAN_PAGE_COUNT;
1355 0 : goto out_unmap;
1356 : }
1357 10 : if (pte_young(pteval) ||
1358 6 : page_is_young(page) || PageReferenced(page) ||
1359 10 : mmu_notifier_test_young(vma->vm_mm, address))
1360 8 : referenced++;
1361 : }
1362 2 : if (!writable) {
1363 : result = SCAN_PAGE_RO;
1364 2 : } else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) {
1365 : result = SCAN_LACK_REFERENCED_PAGE;
1366 : } else {
1367 2 : result = SCAN_SUCCEED;
1368 2 : ret = 1;
1369 : }
1370 2 : out_unmap:
1371 2 : pte_unmap_unlock(pte, ptl);
1372 2 : if (ret) {
1373 2 : node = khugepaged_find_target_node();
1374 : /* collapse_huge_page will return with the mmap_lock released */
1375 2 : collapse_huge_page(mm, address, hpage, node,
1376 : referenced, unmapped);
1377 : }
1378 0 : out:
1379 32 : trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
1380 : none_or_zero, result, unmapped);
1381 32 : return ret;
1382 : }
1383 :
1384 9 : static void collect_mm_slot(struct mm_slot *mm_slot)
1385 : {
1386 9 : struct mm_struct *mm = mm_slot->mm;
1387 :
1388 27 : lockdep_assert_held(&khugepaged_mm_lock);
1389 :
1390 9 : if (khugepaged_test_exit(mm)) {
1391 : /* free mm_slot */
1392 0 : hash_del(&mm_slot->hash);
1393 0 : list_del(&mm_slot->mm_node);
1394 :
1395 : /*
1396 : * Not strictly needed because the mm exited already.
1397 : *
1398 : * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1399 : */
1400 :
1401 : /* khugepaged_mm_lock actually not necessary for the below */
1402 0 : free_mm_slot(mm_slot);
1403 0 : mmdrop(mm);
1404 : }
1405 9 : }
1406 :
1407 : #ifdef CONFIG_SHMEM
1408 : /*
1409 : * Notify khugepaged that the given addr of the mm maps a pte-mapped
1410 : * THP. Then khugepaged should try to collapse the page table.
1411 : */
1412 0 : static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
1413 : unsigned long addr)
1414 : {
1415 0 : struct mm_slot *mm_slot;
1416 :
1417 0 : VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
1418 :
1419 0 : spin_lock(&khugepaged_mm_lock);
1420 0 : mm_slot = get_mm_slot(mm);
1421 0 : if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP))
1422 0 : mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
1423 0 : spin_unlock(&khugepaged_mm_lock);
1424 0 : return 0;
1425 : }
1426 :
1427 : /**
1428 : * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
1429 : * address haddr.
1430 : *
1431 : * @mm: process address space where collapse happens
1432 : * @addr: THP collapse address
1433 : *
1434 : * This function checks whether all the PTEs in the PMD are pointing to the
1435 : * right THP. If so, retract the page table so the THP can refault in
1436 : * as pmd-mapped.
1437 : */
1438 0 : void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
1439 : {
1440 0 : unsigned long haddr = addr & HPAGE_PMD_MASK;
1441 0 : struct vm_area_struct *vma = find_vma(mm, haddr);
1442 0 : struct page *hpage;
1443 0 : pte_t *start_pte, *pte;
1444 0 : pmd_t *pmd, _pmd;
1445 0 : spinlock_t *ptl;
1446 0 : int count = 0;
1447 0 : int i;
1448 :
1449 0 : if (!vma || !vma->vm_file ||
1450 0 : vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE)
1451 : return;
1452 :
1453 : /*
1454 : * This vm_flags may not have VM_HUGEPAGE if the page was not
1455 : * collapsed by this mm. But we can still collapse if the page is
1456 : * a valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check()
1457 : * will not fail the vma for missing VM_HUGEPAGE.
1458 : */
1459 0 : if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
1460 : return;
1461 :
1462 0 : hpage = find_lock_page(vma->vm_file->f_mapping,
1463 : linear_page_index(vma, haddr));
1464 0 : if (!hpage)
1465 : return;
1466 :
1467 0 : if (!PageHead(hpage))
1468 0 : goto drop_hpage;
1469 :
1470 0 : pmd = mm_find_pmd(mm, haddr);
1471 0 : if (!pmd)
1472 0 : goto drop_hpage;
1473 :
1474 0 : start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
1475 :
1476 : /* step 1: check all mapped PTEs are to the right huge page */
1477 0 : for (i = 0, addr = haddr, pte = start_pte;
1478 0 : i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
1479 0 : struct page *page;
1480 :
1481 : /* empty pte, skip */
1482 0 : if (pte_none(*pte))
1483 0 : continue;
1484 :
1485 : /* page swapped out, abort */
1486 0 : if (!pte_present(*pte))
1487 0 : goto abort;
1488 :
1489 0 : page = vm_normal_page(vma, addr, *pte);
1490 :
1491 : /*
1492 : * Note that uprobe, debugger, or MAP_PRIVATE may change the
1493 : * page table, but the new page will not be a subpage of hpage.
1494 : */
1495 0 : if (hpage + i != page)
1496 0 : goto abort;
1497 0 : count++;
1498 : }
1499 :
1500 : /* step 2: adjust rmap */
1501 0 : for (i = 0, addr = haddr, pte = start_pte;
1502 0 : i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
1503 0 : struct page *page;
1504 :
1505 0 : if (pte_none(*pte))
1506 0 : continue;
1507 0 : page = vm_normal_page(vma, addr, *pte);
1508 0 : page_remove_rmap(page, false);
1509 : }
1510 :
1511 0 : pte_unmap_unlock(start_pte, ptl);
1512 :
1513 : /* step 3: set proper refcount and mm_counters. */
1514 0 : if (count) {
1515 0 : page_ref_sub(hpage, count);
1516 0 : add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
1517 : }
1518 :
1519 : /* step 4: collapse pmd */
1520 0 : ptl = pmd_lock(vma->vm_mm, pmd);
1521 0 : _pmd = pmdp_collapse_flush(vma, haddr, pmd);
1522 0 : spin_unlock(ptl);
1523 0 : mm_dec_nr_ptes(mm);
1524 0 : pte_free(mm, pmd_pgtable(_pmd));
1525 :
1526 0 : drop_hpage:
1527 0 : unlock_page(hpage);
1528 0 : put_page(hpage);
1529 0 : return;
1530 :
1531 0 : abort:
1532 0 : pte_unmap_unlock(start_pte, ptl);
1533 0 : goto drop_hpage;
1534 : }
1535 :
1536 15 : static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
1537 : {
1538 15 : struct mm_struct *mm = mm_slot->mm;
1539 15 : int i;
1540 :
1541 15 : if (likely(mm_slot->nr_pte_mapped_thp == 0))
1542 : return 0;
1543 :
1544 0 : if (!mmap_write_trylock(mm))
1545 : return -EBUSY;
1546 :
1547 0 : if (unlikely(khugepaged_test_exit(mm)))
1548 0 : goto out;
1549 :
1550 0 : for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++)
1551 0 : collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]);
1552 :
1553 0 : out:
1554 0 : mm_slot->nr_pte_mapped_thp = 0;
1555 0 : mmap_write_unlock(mm);
1556 0 : return 0;
1557 : }
1558 :
1559 0 : static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
1560 : {
1561 0 : struct vm_area_struct *vma;
1562 0 : struct mm_struct *mm;
1563 0 : unsigned long addr;
1564 0 : pmd_t *pmd, _pmd;
1565 :
1566 0 : i_mmap_lock_write(mapping);
1567 0 : vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1568 : /*
1569 : * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
1570 : * got written to. These VMAs are likely not worth the cost of
1571 : * taking mmap_write_lock(mm), as the PMD-mapping is likely to
1572 : * be split later anyway.
1573 : *
1574 : * Note that the vma->anon_vma check is racy: it can be set up
1575 : * by the fault path after the check but before we take the
1576 : * mmap_lock. But the page lock would prevent establishing any
1577 : * new ptes of the page, so we are safe.
1578 : *
1579 : * An alternative would be to drop the check, but check that the
1580 : * page table is clear before calling pmdp_collapse_flush() under
1581 : * ptl. It has a higher chance of recovering the THP for the VMA,
1582 : * but has a higher cost too.
1583 : */
1584 0 : if (vma->anon_vma)
1585 0 : continue;
1586 0 : addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
1587 0 : if (addr & ~HPAGE_PMD_MASK)
1588 0 : continue;
1589 0 : if (vma->vm_end < addr + HPAGE_PMD_SIZE)
1590 0 : continue;
1591 0 : mm = vma->vm_mm;
1592 0 : pmd = mm_find_pmd(mm, addr);
1593 0 : if (!pmd)
1594 0 : continue;
1595 : /*
1596 : * We need exclusive mmap_lock to retract page table.
1597 : *
1598 : * We use trylock due to lock inversion: we need to acquire
1599 : * mmap_lock while holding page lock. Fault path does it in
1600 : * reverse order. Trylock is a way to avoid deadlock.
1601 : */
1602 0 : if (mmap_write_trylock(mm)) {
1603 0 : if (!khugepaged_test_exit(mm)) {
1604 0 : spinlock_t *ptl = pmd_lock(mm, pmd);
1605 : /* assume page table is clear */
1606 0 : _pmd = pmdp_collapse_flush(vma, addr, pmd);
1607 0 : spin_unlock(ptl);
1608 0 : mm_dec_nr_ptes(mm);
1609 0 : pte_free(mm, pmd_pgtable(_pmd));
1610 : }
1611 0 : mmap_write_unlock(mm);
1612 : } else {
1613 : /* Try again later */
1614 0 : khugepaged_add_pte_mapped_thp(mm, addr);
1615 : }
1616 : }
1617 0 : i_mmap_unlock_write(mapping);
1618 0 : }
1619 :
1620 : /**
1621 : * collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
1622 : *
1623 : * @mm: process address space where collapse happens
1624 : * @file: file that the collapse operates on
1625 : * @start: collapse start page index
1626 : * @hpage: newly allocated huge page for collapse
1627 : * @node: the node the new huge page is allocated from
1628 : *
1629 : * The basic scheme is simple, the details are more complex:
1630 : * - allocate and lock a new huge page;
1631 : * - scan page cache replacing old pages with the new one
1632 : * + swap/gup in pages if necessary;
1633 : * + fill in gaps;
1634 : * + keep old pages around in case rollback is required;
1635 : * - if replacing succeeds:
1636 : * + copy data over;
1637 : * + free old pages;
1638 : * + unlock huge page;
1639 : * - if replacing failed:
1640 : * + put all pages back and unfreeze them;
1641 : * + restore gaps in the page cache;
1642 : * + unlock and free huge page;
1643 : */
1644 0 : static void collapse_file(struct mm_struct *mm,
1645 : struct file *file, pgoff_t start,
1646 : struct page **hpage, int node)
1647 : {
1648 0 : struct address_space *mapping = file->f_mapping;
1649 0 : gfp_t gfp;
1650 0 : struct page *new_page;
1651 0 : pgoff_t index, end = start + HPAGE_PMD_NR;
1652 0 : LIST_HEAD(pagelist);
1653 0 : XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
1654 0 : int nr_none = 0, result = SCAN_SUCCEED;
1655 0 : bool is_shmem = shmem_file(file);
1656 0 : int nr;
1657 :
1658 0 : VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
1659 0 : VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
1660 :
1661 : /* Only allocate from the target node */
1662 0 : gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
1663 :
1664 0 : new_page = khugepaged_alloc_page(hpage, gfp, node);
1665 0 : if (!new_page) {
1666 0 : result = SCAN_ALLOC_HUGE_PAGE_FAIL;
1667 0 : goto out;
1668 : }
1669 :
1670 0 : if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
1671 : result = SCAN_CGROUP_CHARGE_FAIL;
1672 : goto out;
1673 : }
1674 0 : count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
1675 :
1676 : /* This will be less messy when we use multi-index entries */
1677 0 : do {
1678 0 : xas_lock_irq(&xas);
1679 0 : xas_create_range(&xas);
1680 0 : if (!xas_error(&xas))
1681 : break;
1682 0 : xas_unlock_irq(&xas);
1683 0 : if (!xas_nomem(&xas, GFP_KERNEL)) {
1684 0 : result = SCAN_FAIL;
1685 0 : goto out;
1686 : }
1687 : } while (1);
1688 :
1689 0 : __SetPageLocked(new_page);
1690 0 : if (is_shmem)
1691 0 : __SetPageSwapBacked(new_page);
1692 0 : new_page->index = start;
1693 0 : new_page->mapping = mapping;
1694 :
1695 : /*
1696 : * At this point the new_page is locked and not up-to-date.
1697 : * It's safe to insert it into the page cache, because nobody would
1698 : * be able to map it or use it in another way until we unlock it.
1699 : */
1700 :
1701 0 : xas_set(&xas, start);
1702 0 : for (index = start; index < end; index++) {
1703 0 : struct page *page = xas_next(&xas);
1704 :
1705 0 : VM_BUG_ON(index != xas.xa_index);
1706 0 : if (is_shmem) {
1707 0 : if (!page) {
1708 : /*
1709 : * Stop if extent has been truncated or
1710 : * hole-punched, and is now completely
1711 : * empty.
1712 : */
1713 0 : if (index == start) {
1714 0 : if (!xas_next_entry(&xas, end - 1)) {
1715 0 : result = SCAN_TRUNCATED;
1716 0 : goto xa_locked;
1717 : }
1718 0 : xas_set(&xas, index);
1719 : }
1720 0 : if (!shmem_charge(mapping->host, 1)) {
1721 0 : result = SCAN_FAIL;
1722 0 : goto xa_locked;
1723 : }
1724 0 : xas_store(&xas, new_page);
1725 0 : nr_none++;
1726 0 : continue;
1727 : }
1728 :
1729 0 : if (xa_is_value(page) || !PageUptodate(page)) {
1730 0 : xas_unlock_irq(&xas);
1731 : /* swap in or instantiate fallocated page */
1732 0 : if (shmem_getpage(mapping->host, index, &page,
1733 : SGP_NOHUGE)) {
1734 0 : result = SCAN_FAIL;
1735 0 : goto xa_unlocked;
1736 : }
1737 0 : } else if (trylock_page(page)) {
1738 0 : get_page(page);
1739 0 : xas_unlock_irq(&xas);
1740 : } else {
1741 0 : result = SCAN_PAGE_LOCK;
1742 0 : goto xa_locked;
1743 : }
1744 : } else { /* !is_shmem */
1745 0 : if (!page || xa_is_value(page)) {
1746 0 : xas_unlock_irq(&xas);
1747 0 : page_cache_sync_readahead(mapping, &file->f_ra,
1748 : file, index,
1749 : end - index);
1750 : /* drain pagevecs to help isolate_lru_page() */
1751 0 : lru_add_drain();
1752 0 : page = find_lock_page(mapping, index);
1753 0 : if (unlikely(page == NULL)) {
1754 0 : result = SCAN_FAIL;
1755 0 : goto xa_unlocked;
1756 : }
1757 0 : } else if (PageDirty(page)) {
1758 : /*
1759 : * khugepaged only works on read-only fd,
1760 : * so this page is dirty because it hasn't
1761 : * been flushed since first write. There
1762 : * won't be new dirty pages.
1763 : *
1764 : * Trigger async flush here and hope the
1765 : * writeback is done when khugepaged
1766 : * revisits this page.
1767 : *
1768 : * This is a one-off situation. We are not
1769 : * forcing writeback in loop.
1770 : */
1771 0 : xas_unlock_irq(&xas);
1772 0 : filemap_flush(mapping);
1773 0 : result = SCAN_FAIL;
1774 0 : goto xa_unlocked;
1775 0 : } else if (trylock_page(page)) {
1776 0 : get_page(page);
1777 0 : xas_unlock_irq(&xas);
1778 : } else {
1779 0 : result = SCAN_PAGE_LOCK;
1780 0 : goto xa_locked;
1781 : }
1782 : }
1783 :
1784 : /*
1785 : * The page must be locked, so we can drop the i_pages lock
1786 : * without racing with truncate.
1787 : */
1788 0 : VM_BUG_ON_PAGE(!PageLocked(page), page);
1789 :
1790 : /* make sure the page is up to date */
1791 0 : if (unlikely(!PageUptodate(page))) {
1792 0 : result = SCAN_FAIL;
1793 0 : goto out_unlock;
1794 : }
1795 :
1796 : /*
1797 : * If file was truncated then extended, or hole-punched, before
1798 : * we locked the first page, then a THP might be there already.
1799 : */
1800 0 : if (PageTransCompound(page)) {
1801 0 : result = SCAN_PAGE_COMPOUND;
1802 0 : goto out_unlock;
1803 : }
1804 :
1805 0 : if (page_mapping(page) != mapping) {
1806 0 : result = SCAN_TRUNCATED;
1807 0 : goto out_unlock;
1808 : }
1809 :
1810 0 : if (!is_shmem && PageDirty(page)) {
1811 : /*
1812 : * khugepaged only works on read-only fds, so this
1813 : * page is dirty because it hasn't been flushed
1814 : * since the first write.
1815 : */
1816 0 : result = SCAN_FAIL;
1817 0 : goto out_unlock;
1818 : }
1819 :
1820 0 : if (isolate_lru_page(page)) {
1821 0 : result = SCAN_DEL_PAGE_LRU;
1822 0 : goto out_unlock;
1823 : }
1824 :
1825 0 : if (page_has_private(page) &&
1826 0 : !try_to_release_page(page, GFP_KERNEL)) {
1827 0 : result = SCAN_PAGE_HAS_PRIVATE;
1828 0 : putback_lru_page(page);
1829 0 : goto out_unlock;
1830 : }
1831 :
1832 0 : if (page_mapped(page))
1833 0 : unmap_mapping_pages(mapping, index, 1, false);
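                /*
                 * Any remaining PTE mappings must be zapped before the
                 * refcount freeze below: each mapping holds its own page
                 * reference, and the freeze relies on the count being
                 * exactly the three references listed in the next comment.
                 */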
1834 :
1835 0 : xas_lock_irq(&xas);
1836 0 : xas_set(&xas, index);
1837 :
1838 0 : VM_BUG_ON_PAGE(page != xas_load(&xas), page);
1839 0 : VM_BUG_ON_PAGE(page_mapped(page), page);
1840 :
1841 : /*
1842 : * The page is expected to have page_count() == 3:
1843 : * - one from the pin we hold on it;
1844 : * - one from the page cache;
1845 : * - one from isolate_lru_page().
1846 : */
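                /*
                 * page_ref_freeze(page, 3) succeeds only if the refcount is
                 * exactly 3 and atomically replaces it with 0, so no new
                 * references can be taken while the page is swapped out of
                 * the page cache.  Conceptually (a sketch of the idea, not
                 * necessarily the exact helper):
                 *
                 *	frozen = (atomic_cmpxchg(&page->_refcount, 3, 0) == 3);
                 */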
1847 0 : if (!page_ref_freeze(page, 3)) {
1848 0 : result = SCAN_PAGE_COUNT;
1849 0 : xas_unlock_irq(&xas);
1850 0 : putback_lru_page(page);
1851 0 : goto out_unlock;
1852 : }
1853 :
1854 : /*
1855 : * Add the page to the list to be able to undo the collapse if
1856 : * something goes wrong.
1857 : */
1858 0 : list_add_tail(&page->lru, &pagelist);
1859 :
1860 : /* Finally, replace with the new page. */
1861 0 : xas_store(&xas, new_page);
1862 0 : continue;
1863 0 : out_unlock:
1864 0 : unlock_page(page);
1865 0 : put_page(page);
1866 0 : goto xa_unlocked;
1867 : }
1868 0 : nr = thp_nr_pages(new_page);
1869 :
1870 0 : if (is_shmem)
1871 0 : __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr);
1872 : else {
1873 0 : __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr);
1874 0 : filemap_nr_thps_inc(mapping);
1875 : }
1876 :
1877 0 : if (nr_none) {
1878 0 : __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
1879 0 : if (is_shmem)
1880 0 : __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
1881 : }
1882 :
1883 0 : xa_locked:
1884 0 : xas_unlock_irq(&xas);
1885 0 : xa_unlocked:
1886 :
1887 0 : if (result == SCAN_SUCCEED) {
1888 0 : struct page *page, *tmp;
1889 :
1890 : /*
1891 : * Replacing the old pages with the new one has succeeded; now we
1892 : * need to copy their contents and free the old pages.
1893 : */
1894 0 : index = start;
1895 0 : list_for_each_entry_safe(page, tmp, &pagelist, lru) {
1896 0 : while (index < page->index) {
1897 0 : clear_highpage(new_page + (index % HPAGE_PMD_NR));
1898 0 : index++;
1899 : }
1900 0 : copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
1901 : page);
1902 0 : list_del(&page->lru);
1903 0 : page->mapping = NULL;
1904 0 : page_ref_unfreeze(page, 1);
1905 0 : ClearPageActive(page);
1906 0 : ClearPageUnevictable(page);
1907 0 : unlock_page(page);
1908 0 : put_page(page);
1909 0 : index++;
1910 : }
1911 0 : while (index < end) {
1912 0 : clear_highpage(new_page + (index % HPAGE_PMD_NR));
1913 0 : index++;
1914 : }
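                /*
                 * Subpage addressing in the two loops above: "start" is
                 * HPAGE_PMD_NR-aligned here, so (index % HPAGE_PMD_NR) is
                 * simply index - start, and new_page + k is the k-th
                 * subpage of the compound page.  With 2MiB huge pages
                 * (HPAGE_PMD_NR == 512), file offset start + 5, for
                 * example, is copied into subpage new_page + 5; offsets
                 * that were holes are cleared instead.
                 */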
1915 :
1916 0 : SetPageUptodate(new_page);
1917 0 : page_ref_add(new_page, HPAGE_PMD_NR - 1);
1918 0 : if (is_shmem)
1919 0 : set_page_dirty(new_page);
1920 0 : lru_cache_add(new_page);
1921 :
1922 : /*
1923 : * Remove pte page tables, so we can re-fault the page as huge.
1924 : */
1925 0 : retract_page_tables(mapping, start);
1926 0 : *hpage = NULL;
1927 :
1928 0 : khugepaged_pages_collapsed++;
1929 : } else {
1930 0 : struct page *page;
1931 :
1932 : /* Something went wrong: roll back page cache changes */
1933 0 : xas_lock_irq(&xas);
1934 0 : mapping->nrpages -= nr_none;
1935 :
1936 0 : if (is_shmem)
1937 0 : shmem_uncharge(mapping->host, nr_none);
1938 :
1939 0 : xas_set(&xas, start);
1940 0 : xas_for_each(&xas, page, end - 1) {
1941 0 : page = list_first_entry_or_null(&pagelist,
1942 : struct page, lru);
1943 0 : if (!page || xas.xa_index < page->index) {
1944 0 : if (!nr_none)
1945 : break;
1946 0 : nr_none--;
1947 : /* Put holes back where they were */
1948 0 : xas_store(&xas, NULL);
1949 0 : continue;
1950 : }
1951 :
1952 0 : VM_BUG_ON_PAGE(page->index != xas.xa_index, page);
1953 :
1954 : /* Unfreeze the page. */
1955 0 : list_del(&page->lru);
1956 0 : page_ref_unfreeze(page, 2);
1957 0 : xas_store(&xas, page);
1958 0 : xas_pause(&xas);
1959 0 : xas_unlock_irq(&xas);
1960 0 : unlock_page(page);
1961 0 : putback_lru_page(page);
1962 0 : xas_lock_irq(&xas);
1963 : }
1964 0 : VM_BUG_ON(nr_none);
1965 0 : xas_unlock_irq(&xas);
1966 :
1967 0 : new_page->mapping = NULL;
1968 : }
1969 :
1970 0 : unlock_page(new_page);
1971 0 : out:
1972 0 : VM_BUG_ON(!list_empty(&pagelist));
1973 0 : if (!IS_ERR_OR_NULL(*hpage))
1974 0 : mem_cgroup_uncharge(*hpage);
1975 : /* TODO: tracepoints */
1976 0 : }
1977 :
1978 0 : static void khugepaged_scan_file(struct mm_struct *mm,
1979 : struct file *file, pgoff_t start, struct page **hpage)
1980 : {
1981 0 : struct page *page = NULL;
1982 0 : struct address_space *mapping = file->f_mapping;
1983 0 : XA_STATE(xas, &mapping->i_pages, start);
1984 0 : int present, swap;
1985 0 : int node = NUMA_NO_NODE;
1986 0 : int result = SCAN_SUCCEED;
1987 :
1988 0 : present = 0;
1989 0 : swap = 0;
1990 0 : memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
1991 0 : rcu_read_lock();
1992 0 : xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
1993 0 : if (xas_retry(&xas, page))
1994 0 : continue;
1995 :
1996 0 : if (xa_is_value(page)) {
1997 0 : if (++swap > khugepaged_max_ptes_swap) {
1998 : result = SCAN_EXCEED_SWAP_PTE;
1999 : break;
2000 : }
2001 0 : continue;
2002 : }
2003 :
2004 0 : if (PageTransCompound(page)) {
2005 : result = SCAN_PAGE_COMPOUND;
2006 : break;
2007 : }
2008 :
2009 0 : node = page_to_nid(page);
2010 0 : if (khugepaged_scan_abort(node)) {
2011 : result = SCAN_SCAN_ABORT;
2012 : break;
2013 : }
2014 0 : khugepaged_node_load[node]++;
2015 :
2016 0 : if (!PageLRU(page)) {
2017 : result = SCAN_PAGE_LRU;
2018 : break;
2019 : }
2020 :
2021 0 : if (page_count(page) !=
2022 0 : 1 + page_mapcount(page) + page_has_private(page)) {
2023 : result = SCAN_PAGE_COUNT;
2024 : break;
2025 : }
2026 :
2027 : /*
2028 : * We probably should check if the page is referenced here, but
2029 : * nobody would transfer pte_young() to PageReferenced() for us.
2030 : * And an rmap walk here is just too costly...
2031 : */
2032 :
2033 0 : present++;
2034 :
2035 0 : if (need_resched()) {
2036 0 : xas_pause(&xas);
2037 0 : cond_resched_rcu();
2038 : }
2039 : }
2040 0 : rcu_read_unlock();
2041 :
2042 0 : if (result == SCAN_SUCCEED) {
2043 0 : if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
2044 0 : result = SCAN_EXCEED_NONE_PTE;
2045 : } else {
2046 0 : node = khugepaged_find_target_node();
2047 0 : collapse_file(mm, file, start, hpage, node);
2048 : }
2049 : }
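        /*
         * khugepaged_node_load[], filled during the scan above, drives the
         * NUMA placement of the collapsed page: on NUMA builds,
         * khugepaged_find_target_node() (defined earlier in this file)
         * picks the node that contributed the most pages, while non-NUMA
         * builds simply use node 0.  A minimal sketch of that selection
         * (not the exact helper, which also spreads ties across nodes):
         *
         *	int nid, target = 0, max = 0;
         *	for (nid = 0; nid < MAX_NUMNODES; nid++)
         *		if (khugepaged_node_load[nid] > max) {
         *			max = khugepaged_node_load[nid];
         *			target = nid;
         *		}
         */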
2050 :
2051 : /* TODO: tracepoints */
2052 0 : }
2053 : #else
2054 : static void khugepaged_scan_file(struct mm_struct *mm,
2055 : struct file *file, pgoff_t start, struct page **hpage)
2056 : {
2057 : BUILD_BUG();
2058 : }
2059 :
2060 : static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
2061 : {
2062 : return 0;
2063 : }
2064 : #endif
2065 :
2066 15 : static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2067 : struct page **hpage)
2068 : __releases(&khugepaged_mm_lock)
2069 : __acquires(&khugepaged_mm_lock)
2070 : {
2071 15 : struct mm_slot *mm_slot;
2072 15 : struct mm_struct *mm;
2073 15 : struct vm_area_struct *vma;
2074 15 : int progress = 0;
2075 :
2076 15 : VM_BUG_ON(!pages);
2077 45 : lockdep_assert_held(&khugepaged_mm_lock);
2078 :
2079 15 : if (khugepaged_scan.mm_slot)
2080 : mm_slot = khugepaged_scan.mm_slot;
2081 : else {
2082 3 : mm_slot = list_entry(khugepaged_scan.mm_head.next,
2083 : struct mm_slot, mm_node);
2084 3 : khugepaged_scan.address = 0;
2085 3 : khugepaged_scan.mm_slot = mm_slot;
2086 : }
2087 15 : spin_unlock(&khugepaged_mm_lock);
2088 15 : khugepaged_collapse_pte_mapped_thps(mm_slot);
2089 :
2090 15 : mm = mm_slot->mm;
2091 : /*
2092 : * Don't block on the mmap lock (to avoid long wait times); just move
2093 : * on to the next mm on the list.
2094 : */
2095 15 : vma = NULL;
2096 15 : if (unlikely(!mmap_read_trylock(mm)))
2097 0 : goto breakouterloop_mmap_lock;
2098 15 : if (likely(!khugepaged_test_exit(mm)))
2099 15 : vma = find_vma(mm, khugepaged_scan.address);
2100 :
2101 : progress++;
2102 1702 : for (; vma; vma = vma->vm_next) {
2103 1693 : unsigned long hstart, hend;
2104 :
2105 1693 : cond_resched();
2106 1693 : if (unlikely(khugepaged_test_exit(mm))) {
2107 0 : progress++;
2108 0 : break;
2109 : }
2110 1693 : if (!hugepage_vma_check(vma, vma->vm_flags)) {
2111 1524 : skip:
2112 1675 : progress++;
2113 1675 : continue;
2114 : }
2115 169 : hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2116 169 : hend = vma->vm_end & HPAGE_PMD_MASK;
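                /*
                 * Worked example of the alignment above, assuming 2MiB
                 * PMD-sized huge pages (x86-64 with 4KiB base pages): a VMA
                 * spanning [0x00201000, 0x00a01000) gives hstart =
                 * 0x00400000 (rounded up) and hend = 0x00a00000 (rounded
                 * down), i.e. three PMD-sized ranges to scan.  A VMA
                 * smaller than 2MiB ends up with hstart >= hend and is
                 * skipped just below.
                 */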
2117 169 : if (hstart >= hend)
2118 151 : goto skip;
2119 18 : if (khugepaged_scan.address > hend)
2120 0 : goto skip;
2121 18 : if (khugepaged_scan.address < hstart)
2122 13 : khugepaged_scan.address = hstart;
2123 18 : VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2124 18 : if (shmem_file(vma->vm_file) && !shmem_huge_enabled(vma))
2125 0 : goto skip;
2126 :
2127 44 : while (khugepaged_scan.address < hend) {
2128 32 : int ret;
2129 32 : cond_resched();
2130 32 : if (unlikely(khugepaged_test_exit(mm)))
2131 0 : goto breakouterloop;
2132 :
2133 32 : VM_BUG_ON(khugepaged_scan.address < hstart ||
2134 : khugepaged_scan.address + HPAGE_PMD_SIZE >
2135 : hend);
2136 32 : if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
2137 0 : struct file *file = get_file(vma->vm_file);
2138 0 : pgoff_t pgoff = linear_page_index(vma,
2139 : khugepaged_scan.address);
2140 :
2141 0 : mmap_read_unlock(mm);
2142 0 : ret = 1;
2143 0 : khugepaged_scan_file(mm, file, pgoff, hpage);
2144 0 : fput(file);
2145 : } else {
2146 32 : ret = khugepaged_scan_pmd(mm, vma,
2147 : khugepaged_scan.address,
2148 : hpage);
2149 : }
2150 : /* move to next address */
2151 32 : khugepaged_scan.address += HPAGE_PMD_SIZE;
2152 32 : progress += HPAGE_PMD_NR;
2153 32 : if (ret)
2154 : /* we released mmap_lock, so break out of the loop */
2155 2 : goto breakouterloop_mmap_lock;
2156 30 : if (progress >= pages)
2157 4 : goto breakouterloop;
2158 : }
2159 : }
2160 9 : breakouterloop:
2161 13 : mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
2162 15 : breakouterloop_mmap_lock:
2163 :
2164 15 : spin_lock(&khugepaged_mm_lock);
2165 15 : VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
2166 : /*
2167 : * Release the current mm_slot if this mm is about to die, or
2168 : * if we scanned all vmas of this mm.
2169 : */
2170 15 : if (khugepaged_test_exit(mm) || !vma) {
2171 : /*
2172 : * Make sure that if mm_users reaches zero while khugepaged is
2173 : * running here, khugepaged_exit() will find khugepaged_scan.mm_slot
2174 : * no longer pointing to the exiting mm.
2175 : */
2176 9 : if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
2177 7 : khugepaged_scan.mm_slot = list_entry(
2178 : mm_slot->mm_node.next,
2179 : struct mm_slot, mm_node);
2180 7 : khugepaged_scan.address = 0;
2181 : } else {
2182 2 : khugepaged_scan.mm_slot = NULL;
2183 2 : khugepaged_full_scans++;
2184 : }
2185 :
2186 9 : collect_mm_slot(mm_slot);
2187 : }
2188 :
2189 15 : return progress;
2190 : }
2191 :
2192 23 : static int khugepaged_has_work(void)
2193 : {
2194 44 : return !list_empty(&khugepaged_scan.mm_head) &&
2195 21 : khugepaged_enabled();
2196 : }
2197 :
2198 3 : static int khugepaged_wait_event(void)
2199 : {
2200 5 : return !list_empty(&khugepaged_scan.mm_head) ||
2201 2 : kthread_should_stop();
2202 : }
2203 :
2204 6 : static void khugepaged_do_scan(void)
2205 : {
2206 6 : struct page *hpage = NULL;
2207 6 : unsigned int progress = 0, pass_through_head = 0;
2208 6 : unsigned int pages = khugepaged_pages_to_scan;
2209 6 : bool wait = true;
2210 :
2211 6 : barrier(); /* snapshot khugepaged_pages_to_scan into the local 'pages' */
2212 :
2213 6 : lru_add_drain_all();
2214 :
2215 6 : while (progress < pages) {
2216 17 : if (!khugepaged_prealloc_page(&hpage, &wait))
2217 : break;
2218 :
2219 17 : cond_resched();
2220 :
2221 17 : if (unlikely(kthread_should_stop() || try_to_freeze()))
2222 : break;
2223 :
2224 17 : spin_lock(&khugepaged_mm_lock);
2225 17 : if (!khugepaged_scan.mm_slot)
2226 5 : pass_through_head++;
2227 33 : if (khugepaged_has_work() &&
2228 : pass_through_head < 2)
2229 15 : progress += khugepaged_scan_mm_slot(pages - progress,
2230 15 : &hpage);
2231 : else
2232 : progress = pages;
2233 40 : spin_unlock(&khugepaged_mm_lock);
2234 : }
2235 :
2236 6 : if (!IS_ERR_OR_NULL(hpage))
2237 0 : put_page(hpage);
2238 6 : }
2239 :
2240 14 : static bool khugepaged_should_wakeup(void)
2241 : {
2242 14 : return kthread_should_stop() ||
2243 14 : time_after_eq(jiffies, khugepaged_sleep_expire);
2244 : }
2245 :
2246 6 : static void khugepaged_wait_work(void)
2247 : {
2248 6 : if (khugepaged_has_work()) {
2249 5 : const unsigned long scan_sleep_jiffies =
2250 5 : msecs_to_jiffies(khugepaged_scan_sleep_millisecs);
2251 :
2252 5 : if (!scan_sleep_jiffies)
2253 : return;
2254 :
2255 5 : khugepaged_sleep_expire = jiffies + scan_sleep_jiffies;
2256 9 : wait_event_freezable_timeout(khugepaged_wait,
2257 : khugepaged_should_wakeup(),
2258 : scan_sleep_jiffies);
2259 4 : return;
2260 : }
2261 :
2262 1 : if (khugepaged_enabled())
2263 2 : wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
2264 : }
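/*
 * Sleep policy of the two helpers above: when there is work queued,
 * khugepaged sleeps for scan_sleep_millisecs between passes;
 * khugepaged_sleep_expire marks when that timed sleep is allowed to end,
 * so wakeups before the expiry (other than kthread_stop()) simply go back
 * to sleep.  With nothing to scan, and khugepaged still enabled, it
 * instead blocks on khugepaged_wait until an mm is registered or the
 * thread is stopped.
 */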
2265 :
2266 1 : static int khugepaged(void *none)
2267 : {
2268 1 : struct mm_slot *mm_slot;
2269 :
2270 1 : set_freezable();
2271 1 : set_user_nice(current, MAX_NICE);
2272 :
2273 6 : while (!kthread_should_stop()) {
2274 6 : khugepaged_do_scan();
2275 6 : khugepaged_wait_work();
2276 : }
2277 :
2278 0 : spin_lock(&khugepaged_mm_lock);
2279 0 : mm_slot = khugepaged_scan.mm_slot;
2280 0 : khugepaged_scan.mm_slot = NULL;
2281 0 : if (mm_slot)
2282 0 : collect_mm_slot(mm_slot);
2283 0 : spin_unlock(&khugepaged_mm_lock);
2284 0 : return 0;
2285 : }
2286 :
2287 1 : static void set_recommended_min_free_kbytes(void)
2288 : {
2289 1 : struct zone *zone;
2290 1 : int nr_zones = 0;
2291 1 : unsigned long recommended_min;
2292 :
2293 4 : for_each_populated_zone(zone) {
2294 : /*
2295 : * We don't need to worry about fragmentation of
2296 : * ZONE_MOVABLE since it only has movable pages.
2297 : */
2298 1 : if (zone_idx(zone) > gfp_zone(GFP_USER))
2299 0 : continue;
2300 :
2301 1 : nr_zones++;
2302 : }
2303 :
2304 : /* Ensure 2 pageblocks are free to assist fragmentation avoidance */
2305 1 : recommended_min = pageblock_nr_pages * nr_zones * 2;
2306 :
2307 : /*
2308 : * Make sure that on average at least two pageblocks are almost free
2309 : * of another type, one for a migratetype to fall back to and a
2310 : * second to avoid subsequent fallbacks of other types. There are 3
2311 : * MIGRATE_TYPES we care about.
2312 : */
2313 1 : recommended_min += pageblock_nr_pages * nr_zones *
2314 : MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
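        /*
         * Worked example, assuming x86-64 defaults (4KiB base pages, 2MiB
         * pageblocks, so pageblock_nr_pages == 512, MIGRATE_PCPTYPES == 3)
         * and a single eligible zone:
         *
         *	recommended_min = 512 * 1 * 2		(= 1024 pages)
         *			+ 512 * 1 * 3 * 3	(= 4608 pages)
         *			= 5632 pages
         *
         * which, unless capped by the 5%-of-lowmem limit below, becomes
         * 5632 << (PAGE_SHIFT - 10) = 22528 kB (about 22MiB).
         */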
2315 :
2316 : /* never allow reserving more than 5% of the lowmem */
2317 1 : recommended_min = min(recommended_min,
2318 : (unsigned long) nr_free_buffer_pages() / 20);
2319 1 : recommended_min <<= (PAGE_SHIFT-10);
2320 :
2321 1 : if (recommended_min > min_free_kbytes) {
2322 1 : if (user_min_free_kbytes >= 0)
2323 0 : pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
2324 : min_free_kbytes, recommended_min);
2325 :
2326 1 : min_free_kbytes = recommended_min;
2327 : }
2328 1 : setup_per_zone_wmarks();
2329 1 : }
2330 :
2331 1 : int start_stop_khugepaged(void)
2332 : {
2333 1 : int err = 0;
2334 :
2335 1 : mutex_lock(&khugepaged_mutex);
2336 1 : if (khugepaged_enabled()) {
2337 1 : if (!khugepaged_thread)
2338 1 : khugepaged_thread = kthread_run(khugepaged, NULL,
2339 : "khugepaged");
2340 1 : if (IS_ERR(khugepaged_thread)) {
2341 0 : pr_err("khugepaged: kthread_run(khugepaged) failed\n");
2342 0 : err = PTR_ERR(khugepaged_thread);
2343 0 : khugepaged_thread = NULL;
2344 0 : goto fail;
2345 : }
2346 :
2347 1 : if (!list_empty(&khugepaged_scan.mm_head))
2348 0 : wake_up_interruptible(&khugepaged_wait);
2349 :
2350 1 : set_recommended_min_free_kbytes();
2351 0 : } else if (khugepaged_thread) {
2352 0 : kthread_stop(khugepaged_thread);
2353 0 : khugepaged_thread = NULL;
2354 : }
2355 0 : fail:
2356 1 : mutex_unlock(&khugepaged_mutex);
2357 1 : return err;
2358 : }
2359 :
2360 1 : void khugepaged_min_free_kbytes_update(void)
2361 : {
2362 1 : mutex_lock(&khugepaged_mutex);
2363 1 : if (khugepaged_enabled() && khugepaged_thread)
2364 0 : set_recommended_min_free_kbytes();
2365 1 : mutex_unlock(&khugepaged_mutex);
2366 1 : }