LCOV - code coverage report
Current view: top level - mm - khugepaged.c
Test: landlock.info
Date: 2021-04-22 12:43:58
                  Hit    Total    Coverage
Lines:            458     1041      44.0 %
Functions:         30       56      53.6 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
       3             : 
       4             : #include <linux/mm.h>
       5             : #include <linux/sched.h>
       6             : #include <linux/sched/mm.h>
       7             : #include <linux/sched/coredump.h>
       8             : #include <linux/mmu_notifier.h>
       9             : #include <linux/rmap.h>
      10             : #include <linux/swap.h>
      11             : #include <linux/mm_inline.h>
      12             : #include <linux/kthread.h>
      13             : #include <linux/khugepaged.h>
      14             : #include <linux/freezer.h>
      15             : #include <linux/mman.h>
      16             : #include <linux/hashtable.h>
      17             : #include <linux/userfaultfd_k.h>
      18             : #include <linux/page_idle.h>
      19             : #include <linux/swapops.h>
      20             : #include <linux/shmem_fs.h>
      21             : 
      22             : #include <asm/tlb.h>
      23             : #include <asm/pgalloc.h>
      24             : #include "internal.h"
      25             : 
      26             : enum scan_result {
      27             :         SCAN_FAIL,
      28             :         SCAN_SUCCEED,
      29             :         SCAN_PMD_NULL,
      30             :         SCAN_EXCEED_NONE_PTE,
      31             :         SCAN_EXCEED_SWAP_PTE,
      32             :         SCAN_EXCEED_SHARED_PTE,
      33             :         SCAN_PTE_NON_PRESENT,
      34             :         SCAN_PTE_UFFD_WP,
      35             :         SCAN_PAGE_RO,
      36             :         SCAN_LACK_REFERENCED_PAGE,
      37             :         SCAN_PAGE_NULL,
      38             :         SCAN_SCAN_ABORT,
      39             :         SCAN_PAGE_COUNT,
      40             :         SCAN_PAGE_LRU,
      41             :         SCAN_PAGE_LOCK,
      42             :         SCAN_PAGE_ANON,
      43             :         SCAN_PAGE_COMPOUND,
      44             :         SCAN_ANY_PROCESS,
      45             :         SCAN_VMA_NULL,
      46             :         SCAN_VMA_CHECK,
      47             :         SCAN_ADDRESS_RANGE,
      48             :         SCAN_SWAP_CACHE_PAGE,
      49             :         SCAN_DEL_PAGE_LRU,
      50             :         SCAN_ALLOC_HUGE_PAGE_FAIL,
      51             :         SCAN_CGROUP_CHARGE_FAIL,
      52             :         SCAN_TRUNCATED,
      53             :         SCAN_PAGE_HAS_PRIVATE,
      54             : };
      55             : 
      56             : #define CREATE_TRACE_POINTS
      57             : #include <trace/events/huge_memory.h>
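                     : 
                     : /*
                     :  * Each SCAN_* status above feeds the huge_memory trace events pulled
                     :  * in here (this file calls, e.g., trace_mm_collapse_huge_page_isolate()
                     :  * and trace_mm_collapse_huge_page_swapin() below), so a failed or
                     :  * aborted collapse can be attributed to a specific cause from a trace.
                     :  */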
      58             : 
      59             : static struct task_struct *khugepaged_thread __read_mostly;
      60             : static DEFINE_MUTEX(khugepaged_mutex);
      61             : 
      62             : /* default: scan 8*512 ptes (or vmas) every 10 seconds */
      63             : static unsigned int khugepaged_pages_to_scan __read_mostly;
      64             : static unsigned int khugepaged_pages_collapsed;
      65             : static unsigned int khugepaged_full_scans;
      66             : static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
      67             : /* during fragmentation, poll the hugepage allocator once every minute */
      68             : static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
      69             : static unsigned long khugepaged_sleep_expire;
      70             : static DEFINE_SPINLOCK(khugepaged_mm_lock);
      71             : static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
      72             : /*
      73             :  * By default, collapse hugepages if there is at least one pte mapped,
      74             :  * just as would have happened had the vma been large enough at
      75             :  * page-fault time.
      76             :  */
      77             : static unsigned int khugepaged_max_ptes_none __read_mostly;
      78             : static unsigned int khugepaged_max_ptes_swap __read_mostly;
      79             : static unsigned int khugepaged_max_ptes_shared __read_mostly;
      80             : 
      81             : #define MM_SLOTS_HASH_BITS 10
      82             : static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
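                     : /* 1 << 10 == 1024 buckets; mm_slots are hashed by their mm pointer */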
      83             : 
      84             : static struct kmem_cache *mm_slot_cache __read_mostly;
      85             : 
      86             : #define MAX_PTE_MAPPED_THP 8
      87             : 
      88             : /**
      89             :  * struct mm_slot - hash lookup from mm to mm_slot
      90             :  * @hash: hash collision list
      91             :  * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
      92             :  * @mm: the mm that this information is valid for
      93             :  * @nr_pte_mapped_thp: number of pte-mapped THPs
      94             :  * @pte_mapped_thp: addresses of the pte-mapped THPs
      95             :  */
      96             : struct mm_slot {
      97             :         struct hlist_node hash;
      98             :         struct list_head mm_node;
      99             :         struct mm_struct *mm;
     100             : 
     101             :         /* pte-mapped THP in this mm */
     102             :         int nr_pte_mapped_thp;
     103             :         unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP];
     104             : };
     105             : 
     106             : /**
     107             :  * struct khugepaged_scan - cursor for scanning
     108             :  * @mm_head: the head of the mm list to scan
     109             :  * @mm_slot: the current mm_slot we are scanning
     110             :  * @address: the next address inside that mm_slot to be scanned
     111             :  *
     112             :  * There is only one khugepaged_scan instance of this cursor structure.
     113             :  */
     114             : struct khugepaged_scan {
     115             :         struct list_head mm_head;
     116             :         struct mm_slot *mm_slot;
     117             :         unsigned long address;
     118             : };
     119             : 
     120             : static struct khugepaged_scan khugepaged_scan = {
     121             :         .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
     122             : };
     123             : 
     124             : #ifdef CONFIG_SYSFS
     125           0 : static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
     126             :                                          struct kobj_attribute *attr,
     127             :                                          char *buf)
     128             : {
     129           0 :         return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs);
     130             : }
     131             : 
     132           0 : static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
     133             :                                           struct kobj_attribute *attr,
     134             :                                           const char *buf, size_t count)
     135             : {
     136           0 :         unsigned int msecs;
     137           0 :         int err;
     138             : 
     139           0 :         err = kstrtouint(buf, 10, &msecs);
     140           0 :         if (err)
     141             :                 return -EINVAL;
     142             : 
     143           0 :         khugepaged_scan_sleep_millisecs = msecs;
     144           0 :         khugepaged_sleep_expire = 0;
     145           0 :         wake_up_interruptible(&khugepaged_wait);
     146             : 
     147           0 :         return count;
     148             : }
     149             : static struct kobj_attribute scan_sleep_millisecs_attr =
     150             :         __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
     151             :                scan_sleep_millisecs_store);
     152             : 
     153           0 : static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
     154             :                                           struct kobj_attribute *attr,
     155             :                                           char *buf)
     156             : {
     157           0 :         return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
     158             : }
     159             : 
     160           0 : static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
     161             :                                            struct kobj_attribute *attr,
     162             :                                            const char *buf, size_t count)
     163             : {
     164           0 :         unsigned int msecs;
     165           0 :         int err;
     166             : 
     167           0 :         err = kstrtouint(buf, 10, &msecs);
     168           0 :         if (err)
     169             :                 return -EINVAL;
     170             : 
     171           0 :         khugepaged_alloc_sleep_millisecs = msecs;
     172           0 :         khugepaged_sleep_expire = 0;
     173           0 :         wake_up_interruptible(&khugepaged_wait);
     174             : 
     175           0 :         return count;
     176             : }
     177             : static struct kobj_attribute alloc_sleep_millisecs_attr =
     178             :         __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
     179             :                alloc_sleep_millisecs_store);
     180             : 
     181           0 : static ssize_t pages_to_scan_show(struct kobject *kobj,
     182             :                                   struct kobj_attribute *attr,
     183             :                                   char *buf)
     184             : {
     185           0 :         return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan);
     186             : }
     187           0 : static ssize_t pages_to_scan_store(struct kobject *kobj,
     188             :                                    struct kobj_attribute *attr,
     189             :                                    const char *buf, size_t count)
     190             : {
     191           0 :         unsigned int pages;
     192           0 :         int err;
     193             : 
     194           0 :         err = kstrtouint(buf, 10, &pages);
     195           0 :         if (err || !pages)
     196             :                 return -EINVAL;
     197             : 
     198           0 :         khugepaged_pages_to_scan = pages;
     199             : 
     200           0 :         return count;
     201             : }
     202             : static struct kobj_attribute pages_to_scan_attr =
     203             :         __ATTR(pages_to_scan, 0644, pages_to_scan_show,
     204             :                pages_to_scan_store);
     205             : 
     206           0 : static ssize_t pages_collapsed_show(struct kobject *kobj,
     207             :                                     struct kobj_attribute *attr,
     208             :                                     char *buf)
     209             : {
     210           0 :         return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed);
     211             : }
     212             : static struct kobj_attribute pages_collapsed_attr =
     213             :         __ATTR_RO(pages_collapsed);
     214             : 
     215           0 : static ssize_t full_scans_show(struct kobject *kobj,
     216             :                                struct kobj_attribute *attr,
     217             :                                char *buf)
     218             : {
     219           0 :         return sysfs_emit(buf, "%u\n", khugepaged_full_scans);
     220             : }
     221             : static struct kobj_attribute full_scans_attr =
     222             :         __ATTR_RO(full_scans);
     223             : 
     224           0 : static ssize_t khugepaged_defrag_show(struct kobject *kobj,
     225             :                                       struct kobj_attribute *attr, char *buf)
     226             : {
     227           0 :         return single_hugepage_flag_show(kobj, attr, buf,
     228             :                                          TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
     229             : }
     230           0 : static ssize_t khugepaged_defrag_store(struct kobject *kobj,
     231             :                                        struct kobj_attribute *attr,
     232             :                                        const char *buf, size_t count)
     233             : {
     234           0 :         return single_hugepage_flag_store(kobj, attr, buf, count,
     235             :                                  TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
     236             : }
     237             : static struct kobj_attribute khugepaged_defrag_attr =
     238             :         __ATTR(defrag, 0644, khugepaged_defrag_show,
     239             :                khugepaged_defrag_store);
     240             : 
     241             : /*
     242             :  * max_ptes_none controls whether khugepaged should collapse hugepages
     243             :  * over any unmapped ptes, potentially increasing the memory footprint
     244             :  * of the vmas. When max_ptes_none is 0, khugepaged will not reduce
     245             :  * the available free memory in the system as it runs. Increasing
     246             :  * max_ptes_none instead potentially reduces the free memory in the
     247             :  * system during the khugepaged scan.
     248             :  */
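                     : /*
                     :  * For example, with 2MB huge pages on x86-64 (HPAGE_PMD_NR == 512),
                     :  * khugepaged_init() below sets max_ptes_none to HPAGE_PMD_NR - 1 == 511:
                     :  * a region can be collapsed even when only a single pte is populated,
                     :  * at the cost of up to 511 newly allocated, zero-filled subpages.
                     :  */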
     249           0 : static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
     250             :                                              struct kobj_attribute *attr,
     251             :                                              char *buf)
     252             : {
     253           0 :         return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none);
     254             : }
     255           0 : static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
     256             :                                               struct kobj_attribute *attr,
     257             :                                               const char *buf, size_t count)
     258             : {
     259           0 :         int err;
     260           0 :         unsigned long max_ptes_none;
     261             : 
     262           0 :         err = kstrtoul(buf, 10, &max_ptes_none);
     263           0 :         if (err || max_ptes_none > HPAGE_PMD_NR-1)
     264             :                 return -EINVAL;
     265             : 
     266           0 :         khugepaged_max_ptes_none = max_ptes_none;
     267             : 
     268           0 :         return count;
     269             : }
     270             : static struct kobj_attribute khugepaged_max_ptes_none_attr =
     271             :         __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
     272             :                khugepaged_max_ptes_none_store);
     273             : 
     274           0 : static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj,
     275             :                                              struct kobj_attribute *attr,
     276             :                                              char *buf)
     277             : {
     278           0 :         return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap);
     279             : }
     280             : 
     281           0 : static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj,
     282             :                                               struct kobj_attribute *attr,
     283             :                                               const char *buf, size_t count)
     284             : {
     285           0 :         int err;
     286           0 :         unsigned long max_ptes_swap;
     287             : 
     288           0 :         err  = kstrtoul(buf, 10, &max_ptes_swap);
     289           0 :         if (err || max_ptes_swap > HPAGE_PMD_NR-1)
     290             :                 return -EINVAL;
     291             : 
     292           0 :         khugepaged_max_ptes_swap = max_ptes_swap;
     293             : 
     294           0 :         return count;
     295             : }
     296             : 
     297             : static struct kobj_attribute khugepaged_max_ptes_swap_attr =
     298             :         __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
     299             :                khugepaged_max_ptes_swap_store);
     300             : 
     301           0 : static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj,
     302             :                                                struct kobj_attribute *attr,
     303             :                                                char *buf)
     304             : {
     305           0 :         return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared);
     306             : }
     307             : 
     308           0 : static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj,
     309             :                                               struct kobj_attribute *attr,
     310             :                                               const char *buf, size_t count)
     311             : {
     312           0 :         int err;
     313           0 :         unsigned long max_ptes_shared;
     314             : 
     315           0 :         err  = kstrtoul(buf, 10, &max_ptes_shared);
     316           0 :         if (err || max_ptes_shared > HPAGE_PMD_NR-1)
     317             :                 return -EINVAL;
     318             : 
     319           0 :         khugepaged_max_ptes_shared = max_ptes_shared;
     320             : 
     321           0 :         return count;
     322             : }
     323             : 
     324             : static struct kobj_attribute khugepaged_max_ptes_shared_attr =
     325             :         __ATTR(max_ptes_shared, 0644, khugepaged_max_ptes_shared_show,
     326             :                khugepaged_max_ptes_shared_store);
     327             : 
     328             : static struct attribute *khugepaged_attr[] = {
     329             :         &khugepaged_defrag_attr.attr,
     330             :         &khugepaged_max_ptes_none_attr.attr,
     331             :         &khugepaged_max_ptes_swap_attr.attr,
     332             :         &khugepaged_max_ptes_shared_attr.attr,
     333             :         &pages_to_scan_attr.attr,
     334             :         &pages_collapsed_attr.attr,
     335             :         &full_scans_attr.attr,
     336             :         &scan_sleep_millisecs_attr.attr,
     337             :         &alloc_sleep_millisecs_attr.attr,
     338             :         NULL,
     339             : };
     340             : 
     341             : struct attribute_group khugepaged_attr_group = {
     342             :         .attrs = khugepaged_attr,
     343             :         .name = "khugepaged",
     344             : };
     345             : #endif /* CONFIG_SYSFS */
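                     : 
                     : /*
                     :  * With CONFIG_SYSFS enabled, this group is exposed under
                     :  * /sys/kernel/mm/transparent_hugepage/khugepaged/; for example,
                     :  * writing "0" to scan_sleep_millisecs there ends up in
                     :  * scan_sleep_millisecs_store() above and wakes khugepaged immediately.
                     :  */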
     346             : 
     347           0 : int hugepage_madvise(struct vm_area_struct *vma,
     348             :                      unsigned long *vm_flags, int advice)
     349             : {
     350           0 :         switch (advice) {
     351           0 :         case MADV_HUGEPAGE:
     352             : #ifdef CONFIG_S390
     353             :                 /*
     354             :                  * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
     355             :                  * can't handle this properly after s390_enable_sie, so we simply
     356             :                  * ignore the madvise to prevent qemu from causing a SIGSEGV.
     357             :                  */
     358             :                 if (mm_has_pgste(vma->vm_mm))
     359             :                         return 0;
     360             : #endif
     361           0 :                 *vm_flags &= ~VM_NOHUGEPAGE;
     362           0 :                 *vm_flags |= VM_HUGEPAGE;
     363             :                 /*
      364             :                  * If the vma becomes good for khugepaged to scan,
      365             :                  * register it here without waiting for a page fault
      366             :                  * that may not happen any time soon.
     367             :                  */
     368           0 :                 if (!(*vm_flags & VM_NO_KHUGEPAGED) &&
     369           0 :                                 khugepaged_enter_vma_merge(vma, *vm_flags))
     370           0 :                         return -ENOMEM;
     371             :                 break;
     372           0 :         case MADV_NOHUGEPAGE:
     373           0 :                 *vm_flags &= ~VM_HUGEPAGE;
     374           0 :                 *vm_flags |= VM_NOHUGEPAGE;
     375             :                 /*
      376             :                  * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
      377             :                  * this vma, even if the mm stays registered because it was
      378             :                  * registered before VM_NOHUGEPAGE was set.
     379             :                  */
     380           0 :                 break;
     381             :         }
     382             : 
     383             :         return 0;
     384             : }
     385             : 
     386           1 : int __init khugepaged_init(void)
     387             : {
     388           1 :         mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
     389             :                                           sizeof(struct mm_slot),
     390             :                                           __alignof__(struct mm_slot), 0, NULL);
     391           1 :         if (!mm_slot_cache)
     392             :                 return -ENOMEM;
     393             : 
     394           1 :         khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
     395           1 :         khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
     396           1 :         khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
     397           1 :         khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
     398             : 
     399           1 :         return 0;
     400             : }
     401             : 
     402           0 : void __init khugepaged_destroy(void)
     403             : {
     404           0 :         kmem_cache_destroy(mm_slot_cache);
     405           0 : }
     406             : 
     407          25 : static inline struct mm_slot *alloc_mm_slot(void)
     408             : {
     409          25 :         if (!mm_slot_cache)     /* initialization failed */
     410             :                 return NULL;
     411          25 :         return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
     412             : }
     413             : 
     414          20 : static inline void free_mm_slot(struct mm_slot *mm_slot)
     415             : {
     416          20 :         kmem_cache_free(mm_slot_cache, mm_slot);
     417             : }
     418             : 
     419          20 : static struct mm_slot *get_mm_slot(struct mm_struct *mm)
     420             : {
     421          20 :         struct mm_slot *mm_slot;
     422             : 
     423          40 :         hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
     424          20 :                 if (mm == mm_slot->mm)
     425          20 :                         return mm_slot;
     426             : 
     427             :         return NULL;
     428             : }
     429             : 
     430          25 : static void insert_to_mm_slots_hash(struct mm_struct *mm,
     431             :                                     struct mm_slot *mm_slot)
     432             : {
     433          25 :         mm_slot->mm = mm;
     434          25 :         hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
     435          25 : }
     436             : 
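                     : /*
                     :  * mm_users == 0 means every user of the address space is gone.
                     :  * khugepaged itself holds only an mm_count reference (mmgrab() in
                     :  * __khugepaged_enter()), so this test notices exited processes
                     :  * without keeping their address space alive.
                     :  */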
     437        1768 : static inline int khugepaged_test_exit(struct mm_struct *mm)
     438             : {
     439        3536 :         return atomic_read(&mm->mm_users) == 0;
     440             : }
     441             : 
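                     : /*
                     :  * For the file-backed cases below, collapse only makes sense when file
                     :  * offsets and virtual addresses share PMD alignment, which is what
                     :  * IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, HPAGE_PMD_NR)
                     :  * tests: the subtraction is the page index the file would have at
                     :  * virtual address 0. E.g. with 4K pages and 2MB PMDs (HPAGE_PMD_NR ==
                     :  * 512), vm_start == 0x200000 with vm_pgoff 0 passes (512 - 0 is a
                     :  * multiple of 512), while vm_pgoff 1 fails.
                     :  */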
     442        6312 : static bool hugepage_vma_check(struct vm_area_struct *vma,
     443             :                                unsigned long vm_flags)
     444             : {
     445             :         /* Explicitly disabled through madvise. */
     446        6312 :         if ((vm_flags & VM_NOHUGEPAGE) ||
     447        6312 :             test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
     448           0 :                 return false;
     449             : 
     450             :         /* Enabled via shmem mount options or sysfs settings. */
     451        9906 :         if (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) {
     452           0 :                 return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
     453             :                                 HPAGE_PMD_NR);
     454             :         }
     455             : 
     456             :         /* THP settings require madvise. */
     457        6312 :         if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
     458             :                 return false;
     459             : 
     460             :         /* Read-only file mappings need to be aligned for THP to work. */
     461        6312 :         if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
     462             :             (vm_flags & VM_DENYWRITE)) {
     463             :                 return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
     464             :                                 HPAGE_PMD_NR);
     465             :         }
     466             : 
     467        6312 :         if (!vma->anon_vma || vma->vm_ops)
     468             :                 return false;
     469        2649 :         if (vma_is_temporary_stack(vma))
     470             :                 return false;
     471        2649 :         return !(vm_flags & VM_NO_KHUGEPAGED);
     472             : }
     473             : 
     474          25 : int __khugepaged_enter(struct mm_struct *mm)
     475             : {
     476          25 :         struct mm_slot *mm_slot;
     477          25 :         int wakeup;
     478             : 
     479          25 :         mm_slot = alloc_mm_slot();
     480          25 :         if (!mm_slot)
     481             :                 return -ENOMEM;
     482             : 
     483             :         /* __khugepaged_exit() must not run from under us */
     484          25 :         VM_BUG_ON_MM(atomic_read(&mm->mm_users) == 0, mm);
     485          25 :         if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
     486           0 :                 free_mm_slot(mm_slot);
     487           0 :                 return 0;
     488             :         }
     489             : 
     490          25 :         spin_lock(&khugepaged_mm_lock);
     491          25 :         insert_to_mm_slots_hash(mm, mm_slot);
     492             :         /*
     493             :          * Insert just behind the scanning cursor, to let the area settle
     494             :          * down a little.
     495             :          */
     496          25 :         wakeup = list_empty(&khugepaged_scan.mm_head);
     497          25 :         list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
     498          25 :         spin_unlock(&khugepaged_mm_lock);
     499             : 
     500          25 :         mmgrab(mm);
     501          25 :         if (wakeup)
     502           1 :                 wake_up_interruptible(&khugepaged_wait);
     503             : 
     504             :         return 0;
     505             : }
     506             : 
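                     : /*
                     :  * hstart rounds vm_start up and hend rounds vm_end down to a PMD
                     :  * boundary, so hstart < hend iff the vma covers at least one fully
                     :  * aligned PMD-sized range. E.g. with 2MB PMDs, a vma spanning
                     :  * 0x1ff000-0x5ff000 yields hstart 0x200000 and hend 0x400000.
                     :  */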
     507        4615 : int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
     508             :                                unsigned long vm_flags)
     509             : {
     510        4615 :         unsigned long hstart, hend;
     511             : 
     512             :         /*
      513             :          * For non-shmem files, khugepaged supports only read-only
      514             :          * mappings. It does not yet work on special mappings, and
      515             :          * file-private shmem THP is not supported.
     516             :          */
     517        4615 :         if (!hugepage_vma_check(vma, vm_flags))
     518             :                 return 0;
     519             : 
     520        2476 :         hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
     521        2476 :         hend = vma->vm_end & HPAGE_PMD_MASK;
     522        2476 :         if (hstart < hend)
     523           9 :                 return khugepaged_enter(vma, vm_flags);
     524             :         return 0;
     525             : }
     526             : 
     527          20 : void __khugepaged_exit(struct mm_struct *mm)
     528             : {
     529          20 :         struct mm_slot *mm_slot;
     530          20 :         int free = 0;
     531             : 
     532          20 :         spin_lock(&khugepaged_mm_lock);
     533          20 :         mm_slot = get_mm_slot(mm);
     534          20 :         if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
     535          20 :                 hash_del(&mm_slot->hash);
     536          20 :                 list_del(&mm_slot->mm_node);
     537          20 :                 free = 1;
     538             :         }
     539          20 :         spin_unlock(&khugepaged_mm_lock);
     540             : 
     541          20 :         if (free) {
     542          20 :                 clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
     543          20 :                 free_mm_slot(mm_slot);
     544          20 :                 mmdrop(mm);
     545           0 :         } else if (mm_slot) {
     546             :                 /*
      547             :                  * This is required to serialize against
      548             :                  * khugepaged_test_exit() (which is guaranteed to run
      549             :                  * under mmap_lock read mode). Stop here (after we
      550             :                  * return, all pagetables will be destroyed) until
      551             :                  * khugepaged has finished working on the pagetables
      552             :                  * under the mmap_lock.
     553             :                  */
     554           0 :                 mmap_write_lock(mm);
     555           0 :                 mmap_write_unlock(mm);
     556             :         }
     557          20 : }
     558             : 
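                     : /*
                     :  * Undo the NR_ISOLATED_ANON/NR_ISOLATED_FILE accounting done when the
                     :  * page was isolated in __collapse_huge_page_isolate() and put the page
                     :  * back on its LRU list.
                     :  */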
     559          10 : static void release_pte_page(struct page *page)
     560             : {
     561          30 :         mod_node_page_state(page_pgdat(page),
     562          10 :                         NR_ISOLATED_ANON + page_is_file_lru(page),
     563          10 :                         -compound_nr(page));
     564          10 :         unlock_page(page);
     565          10 :         putback_lru_page(page);
     566          10 : }
     567             : 
     568           0 : static void release_pte_pages(pte_t *pte, pte_t *_pte,
     569             :                 struct list_head *compound_pagelist)
     570             : {
     571           0 :         struct page *page, *tmp;
     572             : 
     573           0 :         while (--_pte >= pte) {
     574           0 :                 pte_t pteval = *_pte;
     575             : 
     576           0 :                 page = pte_page(pteval);
     577           0 :                 if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) &&
     578           0 :                                 !PageCompound(page))
     579           0 :                         release_pte_page(page);
     580             :         }
     581             : 
     582           0 :         list_for_each_entry_safe(page, tmp, compound_pagelist, lru) {
     583           0 :                 list_del(&page->lru);
     584           0 :                 release_pte_page(page);
     585             :         }
     586           0 : }
     587             : 
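                     : /*
                     :  * A page has an extra (e.g. GUP) pin when its refcount exceeds what
                     :  * its mappings and the swap cache account for. E.g. a base page mapped
                     :  * by two processes and not in the swap cache has total_mapcount == 2,
                     :  * so any page_count above 2 indicates an external pin and the page is
                     :  * not suitable for collapse.
                     :  */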
     588          20 : static bool is_refcount_suitable(struct page *page)
     589             : {
     590          20 :         int expected_refcount;
     591             : 
     592          20 :         expected_refcount = total_mapcount(page);
     593          20 :         if (PageSwapCache(page))
     594             :                 expected_refcount += compound_nr(page);
     595             : 
     596          20 :         return page_count(page) == expected_refcount;
     597             : }
     598             : 
     599           2 : static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
     600             :                                         unsigned long address,
     601             :                                         pte_t *pte,
     602             :                                         struct list_head *compound_pagelist)
     603             : {
     604           2 :         struct page *page = NULL;
     605           2 :         pte_t *_pte;
     606           2 :         int none_or_zero = 0, shared = 0, result = 0, referenced = 0;
     607           2 :         bool writable = false;
     608             : 
     609        1026 :         for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
     610        1024 :              _pte++, address += PAGE_SIZE) {
     611        1024 :                 pte_t pteval = *_pte;
     612        1024 :                 if (pte_none(pteval) || (pte_present(pteval) &&
     613          10 :                                 is_zero_pfn(pte_pfn(pteval)))) {
     614        1014 :                         if (!userfaultfd_armed(vma) &&
     615        1014 :                             ++none_or_zero <= khugepaged_max_ptes_none) {
     616        1014 :                                 continue;
     617             :                         } else {
     618           0 :                                 result = SCAN_EXCEED_NONE_PTE;
     619           0 :                                 goto out;
     620             :                         }
     621             :                 }
     622          10 :                 if (!pte_present(pteval)) {
     623           0 :                         result = SCAN_PTE_NON_PRESENT;
     624           0 :                         goto out;
     625             :                 }
     626          10 :                 page = vm_normal_page(vma, address, pteval);
     627          10 :                 if (unlikely(!page)) {
     628           0 :                         result = SCAN_PAGE_NULL;
     629           0 :                         goto out;
     630             :                 }
     631             : 
     632          10 :                 VM_BUG_ON_PAGE(!PageAnon(page), page);
     633             : 
     634          10 :                 if (page_mapcount(page) > 1 &&
     635           4 :                                 ++shared > khugepaged_max_ptes_shared) {
     636           0 :                         result = SCAN_EXCEED_SHARED_PTE;
     637           0 :                         goto out;
     638             :                 }
     639             : 
     640          20 :                 if (PageCompound(page)) {
     641           0 :                         struct page *p;
     642           0 :                         page = compound_head(page);
     643             : 
     644             :                         /*
     645             :                          * Check if we have dealt with the compound page
     646             :                          * already
     647             :                          */
     648           0 :                         list_for_each_entry(p, compound_pagelist, lru) {
     649           0 :                                 if (page == p)
     650           0 :                                         goto next;
     651             :                         }
     652             :                 }
     653             : 
     654             :                 /*
     655             :                  * We can do it before isolate_lru_page because the
     656             :                  * page can't be freed from under us. NOTE: PG_lock
     657             :                  * is needed to serialize against split_huge_page
     658             :                  * when invoked from the VM.
     659             :                  */
     660          10 :                 if (!trylock_page(page)) {
     661           0 :                         result = SCAN_PAGE_LOCK;
     662           0 :                         goto out;
     663             :                 }
     664             : 
     665             :                 /*
     666             :                  * Check if the page has any GUP (or other external) pins.
     667             :                  *
      668             :                  * The page table that maps the page has already been
      669             :                  * unlinked from the page table tree, so this process
      670             :                  * cannot get an additional pin on the page.
     671             :                  *
     672             :                  * New pins can come later if the page is shared across fork,
     673             :                  * but not from this process. The other process cannot write to
     674             :                  * the page, only trigger CoW.
     675             :                  */
     676          10 :                 if (!is_refcount_suitable(page)) {
     677           0 :                         unlock_page(page);
     678           0 :                         result = SCAN_PAGE_COUNT;
     679           0 :                         goto out;
     680             :                 }
     681          10 :                 if (!pte_write(pteval) && PageSwapCache(page) &&
     682             :                                 !reuse_swap_page(page, NULL)) {
     683             :                         /*
     684             :                          * Page is in the swap cache and cannot be re-used.
     685             :                          * It cannot be collapsed into a THP.
     686             :                          */
     687             :                         unlock_page(page);
     688             :                         result = SCAN_SWAP_CACHE_PAGE;
     689             :                         goto out;
     690             :                 }
     691             : 
     692             :                 /*
      693             :                  * Isolate the page to avoid collapsing a hugepage
     694             :                  * currently in use by the VM.
     695             :                  */
     696          10 :                 if (isolate_lru_page(page)) {
     697           0 :                         unlock_page(page);
     698           0 :                         result = SCAN_DEL_PAGE_LRU;
     699           0 :                         goto out;
     700             :                 }
     701          30 :                 mod_node_page_state(page_pgdat(page),
     702          10 :                                 NR_ISOLATED_ANON + page_is_file_lru(page),
     703          10 :                                 compound_nr(page));
     704          20 :                 VM_BUG_ON_PAGE(!PageLocked(page), page);
     705          20 :                 VM_BUG_ON_PAGE(PageLRU(page), page);
     706             : 
     707          20 :                 if (PageCompound(page))
     708           0 :                         list_add_tail(&page->lru, compound_pagelist);
     709          10 : next:
      710             :                 /* There should be enough young ptes to collapse the page */
     711          10 :                 if (pte_young(pteval) ||
     712           6 :                     page_is_young(page) || PageReferenced(page) ||
     713          10 :                     mmu_notifier_test_young(vma->vm_mm, address))
     714           8 :                         referenced++;
     715             : 
     716          10 :                 if (pte_write(pteval))
     717           2 :                         writable = true;
     718             :         }
     719           2 :         if (likely(writable)) {
     720           2 :                 if (likely(referenced)) {
     721           2 :                         result = SCAN_SUCCEED;
     722           2 :                         trace_mm_collapse_huge_page_isolate(page, none_or_zero,
     723             :                                                             referenced, writable, result);
     724           2 :                         return 1;
     725             :                 }
     726             :         } else {
     727             :                 result = SCAN_PAGE_RO;
     728             :         }
     729             : 
     730           0 : out:
     731           0 :         release_pte_pages(pte, _pte, compound_pagelist);
     732           0 :         trace_mm_collapse_huge_page_isolate(page, none_or_zero,
     733             :                                             referenced, writable, result);
     734           0 :         return 0;
     735             : }
     736             : 
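                     : /*
                     :  * Copy the contents of the isolated ptes into the new huge page:
                     :  * pte_none/zero-pfn entries become cleared subpages, everything else
                     :  * is copied, after which the old base pages are unmapped and released.
                     :  */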
     737           2 : static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
     738             :                                       struct vm_area_struct *vma,
     739             :                                       unsigned long address,
     740             :                                       spinlock_t *ptl,
     741             :                                       struct list_head *compound_pagelist)
     742             : {
     743           2 :         struct page *src_page, *tmp;
     744           2 :         pte_t *_pte;
     745        1026 :         for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
     746        1024 :                                 _pte++, page++, address += PAGE_SIZE) {
     747        1024 :                 pte_t pteval = *_pte;
     748             : 
     749        1024 :                 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
     750        1014 :                         clear_user_highpage(page, address);
     751        1014 :                         add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
     752        1014 :                         if (is_zero_pfn(pte_pfn(pteval))) {
     753             :                                 /*
     754             :                                  * ptl mostly unnecessary.
     755             :                                  */
     756           0 :                                 spin_lock(ptl);
     757             :                                 /*
     758             :                                  * paravirt calls inside pte_clear here are
     759             :                                  * superfluous.
     760             :                                  */
     761           0 :                                 pte_clear(vma->vm_mm, address, _pte);
     762        1024 :                                 spin_unlock(ptl);
     763             :                         }
     764             :                 } else {
     765          10 :                         src_page = pte_page(pteval);
     766          10 :                         copy_user_highpage(page, src_page, address, vma);
     767          20 :                         if (!PageCompound(src_page))
     768          10 :                                 release_pte_page(src_page);
     769             :                         /*
     770             :                          * ptl mostly unnecessary, but preempt has to
     771             :                          * be disabled to update the per-cpu stats
     772             :                          * inside page_remove_rmap().
     773             :                          */
     774          10 :                         spin_lock(ptl);
     775             :                         /*
     776             :                          * paravirt calls inside pte_clear here are
     777             :                          * superfluous.
     778             :                          */
     779          10 :                         pte_clear(vma->vm_mm, address, _pte);
     780          10 :                         page_remove_rmap(src_page, false);
     781          10 :                         spin_unlock(ptl);
     782          10 :                         free_page_and_swap_cache(src_page);
     783             :                 }
     784             :         }
     785             : 
     786           2 :         list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
     787           0 :                 list_del(&src_page->lru);
     788           0 :                 release_pte_page(src_page);
     789             :         }
     790           2 : }
     791             : 
     792           0 : static void khugepaged_alloc_sleep(void)
     793             : {
     794           0 :         DEFINE_WAIT(wait);
     795             : 
     796           0 :         add_wait_queue(&khugepaged_wait, &wait);
     797           0 :         freezable_schedule_timeout_interruptible(
     798             :                 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
     799           0 :         remove_wait_queue(&khugepaged_wait, &wait);
     800           0 : }
     801             : 
     802             : static int khugepaged_node_load[MAX_NUMNODES];
     803             : 
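                     : /*
                     :  * khugepaged_node_load[] counts, for the current scan, how many present
                     :  * ptes were backed by each node. A scan is abandoned when it would mix
                     :  * nodes farther apart than node_reclaim_distance: e.g. pages already
                     :  * seen on node 0 and a new page on node 2 abort the scan if
                     :  * node_distance(0, 2) exceeds node_reclaim_distance (by default
                     :  * RECLAIM_DISTANCE, typically 30).
                     :  */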
     804          10 : static bool khugepaged_scan_abort(int nid)
     805             : {
     806          10 :         int i;
     807             : 
     808             :         /*
     809             :          * If node_reclaim_mode is disabled, then no extra effort is made to
     810             :          * allocate memory locally.
     811             :          */
     812          10 :         if (!node_reclaim_mode)
     813             :                 return false;
     814             : 
     815             :         /* If there is a count for this node already, it must be acceptable */
     816           0 :         if (khugepaged_node_load[nid])
     817             :                 return false;
     818             : 
     819           0 :         for (i = 0; i < MAX_NUMNODES; i++) {
     820           0 :                 if (!khugepaged_node_load[i])
     821           0 :                         continue;
     822           0 :                 if (node_distance(nid, i) > node_reclaim_distance)
     823             :                         return true;
     824             :         }
     825             :         return false;
     826             : }
     827             : 
     828             : /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
     829           2 : static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
     830             : {
     831           2 :         return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
     832             : }
     833             : 
     834             : #ifdef CONFIG_NUMA
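                     : /*
                     :  * Pick the node that backed most of the ptes just scanned, breaking
                     :  * ties round-robin: e.g. if nodes 0 and 1 both hit 256 ptes and the
                     :  * previous target was node 0, the second loop advances the target to
                     :  * node 1 so repeated collapses do not all land on the lowest node.
                     :  */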
     835           2 : static int khugepaged_find_target_node(void)
     836             : {
     837           2 :         static int last_khugepaged_target_node = NUMA_NO_NODE;
     838           2 :         int nid, target_node = 0, max_value = 0;
     839             : 
      840             :         /* find the first node with the most normal pages hit */
     841         130 :         for (nid = 0; nid < MAX_NUMNODES; nid++)
     842         128 :                 if (khugepaged_node_load[nid] > max_value) {
     843           2 :                         max_value = khugepaged_node_load[nid];
     844           2 :                         target_node = nid;
     845             :                 }
     846             : 
      847             :         /* do some balancing if several nodes have the same hit count */
     848           2 :         if (target_node <= last_khugepaged_target_node)
     849          64 :                 for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
     850          63 :                                 nid++)
     851          63 :                         if (max_value == khugepaged_node_load[nid]) {
     852             :                                 target_node = nid;
     853             :                                 break;
     854             :                         }
     855             : 
     856           2 :         last_khugepaged_target_node = target_node;
     857           2 :         return target_node;
     858             : }
     859             : 
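                     : /*
                     :  * In the NUMA case the actual allocation happens later (the target
                     :  * node depends on the scan), so this only drops a stale preallocated
                     :  * page and, after a previous allocation failure (*hpage holding an
                     :  * ERR_PTR), sleeps once before the next attempt.
                     :  */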
     860          17 : static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
     861             : {
     862          17 :         if (IS_ERR(*hpage)) {
     863           0 :                 if (!*wait)
     864             :                         return false;
     865             : 
     866           0 :                 *wait = false;
     867           0 :                 *hpage = NULL;
     868           0 :                 khugepaged_alloc_sleep();
     869          17 :         } else if (*hpage) {
     870           0 :                 put_page(*hpage);
     871           0 :                 *hpage = NULL;
     872             :         }
     873             : 
     874             :         return true;
     875             : }
     876             : 
     877             : static struct page *
     878           2 : khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
     879             : {
     880           2 :         VM_BUG_ON_PAGE(*hpage, *hpage);
     881             : 
     882           2 :         *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
     883           2 :         if (unlikely(!*hpage)) {
     884           0 :                 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
     885           0 :                 *hpage = ERR_PTR(-ENOMEM);
     886           0 :                 return NULL;
     887             :         }
     888             : 
     889           2 :         prep_transhuge_page(*hpage);
     890           2 :         count_vm_event(THP_COLLAPSE_ALLOC);
     891           2 :         return *hpage;
     892             : }
     893             : #else
     894             : static int khugepaged_find_target_node(void)
     895             : {
     896             :         return 0;
     897             : }
     898             : 
     899             : static inline struct page *alloc_khugepaged_hugepage(void)
     900             : {
     901             :         struct page *page;
     902             : 
     903             :         page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
     904             :                            HPAGE_PMD_ORDER);
     905             :         if (page)
     906             :                 prep_transhuge_page(page);
     907             :         return page;
     908             : }
     909             : 
     910             : static struct page *khugepaged_alloc_hugepage(bool *wait)
     911             : {
     912             :         struct page *hpage;
     913             : 
     914             :         do {
     915             :                 hpage = alloc_khugepaged_hugepage();
     916             :                 if (!hpage) {
     917             :                         count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
     918             :                         if (!*wait)
     919             :                                 return NULL;
     920             : 
     921             :                         *wait = false;
     922             :                         khugepaged_alloc_sleep();
     923             :                 } else
     924             :                         count_vm_event(THP_COLLAPSE_ALLOC);
     925             :         } while (unlikely(!hpage) && likely(khugepaged_enabled()));
     926             : 
     927             :         return hpage;
     928             : }
     929             : 
     930             : static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
     931             : {
     932             :         /*
     933             :          * If the hpage allocated earlier was briefly exposed in page cache
     934             :          * before collapse_file() failed, it is possible that racing lookups
     935             :          * have not yet completed, and would then be unpleasantly surprised by
     936             :          * finding the hpage reused for the same mapping at a different offset.
     937             :          * Just release the previous allocation if there is any danger of that.
     938             :          */
     939             :         if (*hpage && page_count(*hpage) > 1) {
     940             :                 put_page(*hpage);
     941             :                 *hpage = NULL;
     942             :         }
     943             : 
     944             :         if (!*hpage)
     945             :                 *hpage = khugepaged_alloc_hugepage(wait);
     946             : 
     947             :         if (unlikely(!*hpage))
     948             :                 return false;
     949             : 
     950             :         return true;
     951             : }
     952             : 
     953             : static struct page *
     954             : khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
     955             : {
     956             :         VM_BUG_ON(!*hpage);
     957             : 
     958             :         return  *hpage;
     959             : }
     960             : #endif
     961             : 
     962             : /*
      963             :  * If the mmap_lock was temporarily dropped, revalidate the vma
      964             :  * after re-taking it.
      965             :  * Return 0 on success; otherwise return a non-zero scan result
      966             :  * code.
     967             :  */
     968             : 
     969           4 : static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
     970             :                 struct vm_area_struct **vmap)
     971             : {
     972           4 :         struct vm_area_struct *vma;
     973           4 :         unsigned long hstart, hend;
     974             : 
     975           4 :         if (unlikely(khugepaged_test_exit(mm)))
     976             :                 return SCAN_ANY_PROCESS;
     977             : 
     978           4 :         *vmap = vma = find_vma(mm, address);
     979           4 :         if (!vma)
     980             :                 return SCAN_VMA_NULL;
     981             : 
     982           4 :         hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
     983           4 :         hend = vma->vm_end & HPAGE_PMD_MASK;
     984           4 :         if (address < hstart || address + HPAGE_PMD_SIZE > hend)
     985             :                 return SCAN_ADDRESS_RANGE;
     986           4 :         if (!hugepage_vma_check(vma, vma->vm_flags))
     987             :                 return SCAN_VMA_CHECK;
     988             :         /* Anon VMA expected */
     989           4 :         if (!vma->anon_vma || vma->vm_ops)
     990           0 :                 return SCAN_VMA_CHECK;
     991             :         return 0;
     992             : }
     993             : 
     994             : /*
     995             :  * Bring missing pages in from swap, to complete THP collapse.
     996             :  * Only done if khugepaged_scan_pmd believes it is worthwhile.
     997             :  *
     998             :  * Called and returns without pte mapped or spinlocks held,
     999             :  * but with mmap_lock held to protect against vma changes.
    1000             :  */
    1001             : 
    1002           0 : static bool __collapse_huge_page_swapin(struct mm_struct *mm,
    1003             :                                         struct vm_area_struct *vma,
    1004             :                                         unsigned long haddr, pmd_t *pmd,
    1005             :                                         int referenced)
    1006             : {
    1007           0 :         int swapped_in = 0;
    1008           0 :         vm_fault_t ret = 0;
    1009           0 :         unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
    1010             : 
    1011           0 :         for (address = haddr; address < end; address += PAGE_SIZE) {
    1012           0 :                 struct vm_fault vmf = {
    1013             :                         .vma = vma,
    1014             :                         .address = address,
    1015           0 :                         .pgoff = linear_page_index(vma, haddr),
    1016             :                         .flags = FAULT_FLAG_ALLOW_RETRY,
    1017             :                         .pmd = pmd,
    1018             :                 };
    1019             : 
    1020           0 :                 vmf.pte = pte_offset_map(pmd, address);
    1021           0 :                 vmf.orig_pte = *vmf.pte;
    1022           0 :                 if (!is_swap_pte(vmf.orig_pte)) {
    1023           0 :                         pte_unmap(vmf.pte);
    1024           0 :                         continue;
    1025             :                 }
    1026           0 :                 swapped_in++;
    1027           0 :                 ret = do_swap_page(&vmf);
    1028             : 
    1029             :                 /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
    1030           0 :                 if (ret & VM_FAULT_RETRY) {
    1031           0 :                         mmap_read_lock(mm);
    1032           0 :                         if (hugepage_vma_revalidate(mm, haddr, &vma)) {
     1033             :                                 /* vma is no longer available, don't continue the swapin */
    1034           0 :                                 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
    1035           0 :                                 return false;
    1036             :                         }
    1037             :                         /* check if the pmd is still valid */
    1038           0 :                         if (mm_find_pmd(mm, haddr) != pmd) {
    1039           0 :                                 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
    1040           0 :                                 return false;
    1041             :                         }
    1042             :                 }
    1043           0 :                 if (ret & VM_FAULT_ERROR) {
    1044           0 :                         trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
    1045           0 :                         return false;
    1046             :                 }
    1047             :         }
    1048             : 
     1049             :         /* Drain the LRU-add pagevec to remove the extra pin on the swapped-in pages */
    1050           0 :         if (swapped_in)
    1051           0 :                 lru_add_drain();
    1052             : 
    1053           0 :         trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
    1054           0 :         return true;
    1055             : }
    1056             : 
    1057           2 : static void collapse_huge_page(struct mm_struct *mm,
    1058             :                                    unsigned long address,
    1059             :                                    struct page **hpage,
    1060             :                                    int node, int referenced, int unmapped)
    1061             : {
    1062           2 :         LIST_HEAD(compound_pagelist);
    1063           2 :         pmd_t *pmd, _pmd;
    1064           2 :         pte_t *pte;
    1065           2 :         pgtable_t pgtable;
    1066           2 :         struct page *new_page;
    1067           2 :         spinlock_t *pmd_ptl, *pte_ptl;
    1068           2 :         int isolated = 0, result = 0;
    1069           2 :         struct vm_area_struct *vma;
    1070           2 :         struct mmu_notifier_range range;
    1071           2 :         gfp_t gfp;
    1072             : 
    1073           2 :         VM_BUG_ON(address & ~HPAGE_PMD_MASK);
    1074             : 
    1075             :         /* Only allocate from the target node */
    1076           2 :         gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
    1077             : 
    1078             :         /*
    1079             :          * Before allocating the hugepage, release the mmap_lock read lock.
     1080             :          * The allocation can potentially take a long time if it involves
    1081             :          * sync compaction, and we do not need to hold the mmap_lock during
    1082             :          * that. We will recheck the vma after taking it again in write mode.
    1083             :          */
    1084           2 :         mmap_read_unlock(mm);
    1085           2 :         new_page = khugepaged_alloc_page(hpage, gfp, node);
    1086           2 :         if (!new_page) {
    1087           0 :                 result = SCAN_ALLOC_HUGE_PAGE_FAIL;
    1088           0 :                 goto out_nolock;
    1089             :         }
    1090             : 
    1091           2 :         if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
    1092             :                 result = SCAN_CGROUP_CHARGE_FAIL;
    1093             :                 goto out_nolock;
    1094             :         }
    1095           2 :         count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
    1096             : 
    1097           2 :         mmap_read_lock(mm);
    1098           2 :         result = hugepage_vma_revalidate(mm, address, &vma);
    1099           2 :         if (result) {
    1100           0 :                 mmap_read_unlock(mm);
    1101           0 :                 goto out_nolock;
    1102             :         }
    1103             : 
    1104           2 :         pmd = mm_find_pmd(mm, address);
    1105           2 :         if (!pmd) {
    1106           0 :                 result = SCAN_PMD_NULL;
    1107           0 :                 mmap_read_unlock(mm);
    1108           0 :                 goto out_nolock;
    1109             :         }
    1110             : 
    1111             :         /*
    1112             :          * __collapse_huge_page_swapin always returns with mmap_lock locked.
     1113             :          * If it fails, we release the mmap_lock and jump to out_nolock.
     1114             :          * Continuing to collapse would cause inconsistency.
    1115             :          */
    1116           2 :         if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
    1117             :                                                      pmd, referenced)) {
    1118           0 :                 mmap_read_unlock(mm);
    1119           0 :                 goto out_nolock;
    1120             :         }
    1121             : 
    1122           2 :         mmap_read_unlock(mm);
    1123             :         /*
     1124             :          * Prevent all access to the pagetables, with the exception of
     1125             :          * gup_fast (handled later by the ptep_clear_flush) and the VM
     1126             :          * fault path (handled by the anon_vma lock + PG_lock).
    1127             :          */
    1128           2 :         mmap_write_lock(mm);
    1129           2 :         result = hugepage_vma_revalidate(mm, address, &vma);
    1130           2 :         if (result)
    1131           0 :                 goto out;
    1132             :         /* check if the pmd is still valid */
    1133           2 :         if (mm_find_pmd(mm, address) != pmd)
    1134           0 :                 goto out;
    1135             : 
    1136           2 :         anon_vma_lock_write(vma->anon_vma);
    1137             : 
    1138           2 :         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
    1139             :                                 address, address + HPAGE_PMD_SIZE);
    1140           2 :         mmu_notifier_invalidate_range_start(&range);
    1141             : 
    1142           2 :         pte = pte_offset_map(pmd, address);
    1143           2 :         pte_ptl = pte_lockptr(mm, pmd);
    1144             : 
    1145           2 :         pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
    1146             :         /*
     1147             :          * After this, gup_fast can't run anymore. This also removes
     1148             :          * any huge TLB entry from the CPU, so we won't allow
     1149             :          * huge and small TLB entries for the same virtual address,
     1150             :          * avoiding the risk of CPU bugs in that area.
    1151             :          */
    1152           2 :         _pmd = pmdp_collapse_flush(vma, address, pmd);
    1153           2 :         spin_unlock(pmd_ptl);
    1154           2 :         mmu_notifier_invalidate_range_end(&range);
    1155             : 
    1156           2 :         spin_lock(pte_ptl);
    1157           2 :         isolated = __collapse_huge_page_isolate(vma, address, pte,
    1158             :                         &compound_pagelist);
    1159           2 :         spin_unlock(pte_ptl);
    1160             : 
    1161           2 :         if (unlikely(!isolated)) {
    1162           0 :                 pte_unmap(pte);
    1163           0 :                 spin_lock(pmd_ptl);
    1164           0 :                 BUG_ON(!pmd_none(*pmd));
    1165             :                 /*
    1166             :                  * We can only use set_pmd_at when establishing
     1167             :                  * hugepmds and never for establishing regular pmds that
     1168             :                  * point to regular pagetables. Use pmd_populate for that.
    1169             :                  */
    1170           0 :                 pmd_populate(mm, pmd, pmd_pgtable(_pmd));
    1171           0 :                 spin_unlock(pmd_ptl);
    1172           0 :                 anon_vma_unlock_write(vma->anon_vma);
    1173           0 :                 result = SCAN_FAIL;
    1174           0 :                 goto out;
    1175             :         }
    1176             : 
    1177             :         /*
    1178             :          * All pages are isolated and locked so anon_vma rmap
    1179             :          * can't run anymore.
    1180             :          */
    1181           2 :         anon_vma_unlock_write(vma->anon_vma);
    1182             : 
    1183           2 :         __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
    1184             :                         &compound_pagelist);
    1185           2 :         pte_unmap(pte);
    1186           2 :         __SetPageUptodate(new_page);
    1187           2 :         pgtable = pmd_pgtable(_pmd);
    1188             : 
    1189           2 :         _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
    1190           2 :         _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
    1191             : 
    1192             :         /*
    1193             :          * spin_lock() below is not the equivalent of smp_wmb(), so
     1194             :          * this is needed to prevent the copy_huge_page writes from
     1195             :          * becoming visible after the set_pmd_at() write.
    1196             :          */
    1197           2 :         smp_wmb();
    1198             : 
    1199           2 :         spin_lock(pmd_ptl);
    1200           2 :         BUG_ON(!pmd_none(*pmd));
    1201           2 :         page_add_new_anon_rmap(new_page, vma, address, true);
    1202           2 :         lru_cache_add_inactive_or_unevictable(new_page, vma);
    1203           2 :         pgtable_trans_huge_deposit(mm, pmd, pgtable);
    1204           2 :         set_pmd_at(mm, address, pmd, _pmd);
    1205           2 :         update_mmu_cache_pmd(vma, address, pmd);
    1206           2 :         spin_unlock(pmd_ptl);
    1207             : 
    1208           2 :         *hpage = NULL;
    1209             : 
    1210           2 :         khugepaged_pages_collapsed++;
    1211           2 :         result = SCAN_SUCCEED;
    1212           2 : out_up_write:
    1213           2 :         mmap_write_unlock(mm);
    1214           2 : out_nolock:
    1215           2 :         if (!IS_ERR_OR_NULL(*hpage))
    1216           2 :                 mem_cgroup_uncharge(*hpage);
    1217           2 :         trace_mm_collapse_huge_page(mm, isolated, result);
    1218           2 :         return;
    1219           0 : out:
    1220           0 :         goto out_up_write;
    1221             : }
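
Editor's note: the smp_wmb() in collapse_huge_page() orders the huge-page copy before the set_pmd_at() that publishes it to other page-table walkers. The same publish pattern can be sketched in userspace with C11 release/acquire atomics; this is an illustrative analogue, not the kernel's primitive, and it runs single-threaded here only to stay self-contained (in practice publisher and consumer would be different threads).

    #include <stdatomic.h>
    #include <stdio.h>

    static int payload[4];
    static _Atomic(int *) published = NULL;

    /* Writer: fill the data, then publish the pointer with release order,
     * so the payload writes cannot become visible after the pointer does. */
    static void publish(void)
    {
            for (int i = 0; i < 4; i++)
                    payload[i] = i * i;
            atomic_store_explicit(&published, payload, memory_order_release);
    }

    /* Reader: the acquire load pairs with the release store; once the
     * pointer is observed, the payload writes are visible too. */
    static void consume(void)
    {
            int *p = atomic_load_explicit(&published, memory_order_acquire);
            if (p)
                    printf("payload[3] = %d\n", p[3]);
    }

    int main(void)
    {
            publish();
            consume();
            return 0;
    }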
    1222             : 
    1223          32 : static int khugepaged_scan_pmd(struct mm_struct *mm,
    1224             :                                struct vm_area_struct *vma,
    1225             :                                unsigned long address,
    1226             :                                struct page **hpage)
    1227             : {
    1228          32 :         pmd_t *pmd;
    1229          32 :         pte_t *pte, *_pte;
    1230          32 :         int ret = 0, result = 0, referenced = 0;
    1231          32 :         int none_or_zero = 0, shared = 0;
    1232          32 :         struct page *page = NULL;
    1233          32 :         unsigned long _address;
    1234          32 :         spinlock_t *ptl;
    1235          32 :         int node = NUMA_NO_NODE, unmapped = 0;
    1236          32 :         bool writable = false;
    1237             : 
    1238          32 :         VM_BUG_ON(address & ~HPAGE_PMD_MASK);
    1239             : 
    1240          32 :         pmd = mm_find_pmd(mm, address);
    1241          32 :         if (!pmd) {
    1242          30 :                 result = SCAN_PMD_NULL;
    1243          30 :                 goto out;
    1244             :         }
    1245             : 
    1246           2 :         memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
    1247           4 :         pte = pte_offset_map_lock(mm, pmd, address, &ptl);
    1248        1028 :         for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
    1249        1024 :              _pte++, _address += PAGE_SIZE) {
    1250        1024 :                 pte_t pteval = *_pte;
    1251        1024 :                 if (is_swap_pte(pteval)) {
    1252           0 :                         if (++unmapped <= khugepaged_max_ptes_swap) {
    1253             :                                 /*
    1254             :                                  * Always be strict with uffd-wp
    1255             :                                  * enabled swap entries.  Please see
    1256             :                                  * comment below for pte_uffd_wp().
    1257             :                                  */
    1258           0 :                                 if (pte_swp_uffd_wp(pteval)) {
    1259             :                                         result = SCAN_PTE_UFFD_WP;
    1260           0 :                                         goto out_unmap;
    1261             :                                 }
    1262        1014 :                                 continue;
    1263             :                         } else {
    1264           0 :                                 result = SCAN_EXCEED_SWAP_PTE;
    1265           0 :                                 goto out_unmap;
    1266             :                         }
    1267             :                 }
    1268        1024 :                 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
    1269        1014 :                         if (!userfaultfd_armed(vma) &&
    1270        1014 :                             ++none_or_zero <= khugepaged_max_ptes_none) {
    1271        1014 :                                 continue;
    1272             :                         } else {
    1273           0 :                                 result = SCAN_EXCEED_NONE_PTE;
    1274           0 :                                 goto out_unmap;
    1275             :                         }
    1276             :                 }
    1277          10 :                 if (!pte_present(pteval)) {
    1278           0 :                         result = SCAN_PTE_NON_PRESENT;
    1279           0 :                         goto out_unmap;
    1280             :                 }
    1281          10 :                 if (pte_uffd_wp(pteval)) {
    1282             :                         /*
    1283             :                          * Don't collapse the page if any of the small
    1284             :                          * PTEs are armed with uffd write protection.
     1285             :                          * Here we could also mark the new huge pmd as
     1286             :                          * write protected if any of the small ones is
     1287             :                          * marked, but that could bring unknown
     1288             :                          * userfault messages that fall outside of
     1289             :                          * the registered range.  So, just keep it simple.
    1290             :                          */
    1291             :                         result = SCAN_PTE_UFFD_WP;
    1292             :                         goto out_unmap;
    1293             :                 }
    1294          10 :                 if (pte_write(pteval))
    1295           2 :                         writable = true;
    1296             : 
    1297          10 :                 page = vm_normal_page(vma, _address, pteval);
    1298          10 :                 if (unlikely(!page)) {
    1299           0 :                         result = SCAN_PAGE_NULL;
    1300           0 :                         goto out_unmap;
    1301             :                 }
    1302             : 
    1303          10 :                 if (page_mapcount(page) > 1 &&
    1304           4 :                                 ++shared > khugepaged_max_ptes_shared) {
    1305           0 :                         result = SCAN_EXCEED_SHARED_PTE;
    1306           0 :                         goto out_unmap;
    1307             :                 }
    1308             : 
    1309          10 :                 page = compound_head(page);
    1310             : 
    1311             :                 /*
    1312             :                  * Record which node the original page is from and save this
    1313             :                  * information to khugepaged_node_load[].
     1314             :                  * Khugepaged will allocate the hugepage from the node with
     1315             :                  * the max hit record.
    1316             :                  */
    1317          10 :                 node = page_to_nid(page);
    1318          10 :                 if (khugepaged_scan_abort(node)) {
    1319           0 :                         result = SCAN_SCAN_ABORT;
    1320           0 :                         goto out_unmap;
    1321             :                 }
    1322          10 :                 khugepaged_node_load[node]++;
    1323          20 :                 if (!PageLRU(page)) {
    1324           0 :                         result = SCAN_PAGE_LRU;
    1325           0 :                         goto out_unmap;
    1326             :                 }
    1327          20 :                 if (PageLocked(page)) {
    1328           0 :                         result = SCAN_PAGE_LOCK;
    1329           0 :                         goto out_unmap;
    1330             :                 }
    1331          10 :                 if (!PageAnon(page)) {
    1332           0 :                         result = SCAN_PAGE_ANON;
    1333           0 :                         goto out_unmap;
    1334             :                 }
    1335             : 
    1336             :                 /*
    1337             :                  * Check if the page has any GUP (or other external) pins.
    1338             :                  *
     1339             :                  * Here the check is racy: it may see total_mapcount > refcount
     1340             :                  * in some cases.
     1341             :                  * For example: one process with one forked child process.
     1342             :                  * The parent has the PMD split due to MADV_DONTNEED, then
     1343             :                  * the child is trying to unmap the whole PMD, but khugepaged
     1344             :                  * may be scanning the parent between the child clearing the
     1345             :                  * PageDoubleMap flag and decrementing the mapcount.  So
     1346             :                  * khugepaged may see total_mapcount > refcount.
     1347             :                  *
     1348             :                  * But such a case is ephemeral; we could always retry the
     1349             :                  * collapse later.  However, it may report a false positive if
     1350             :                  * the page has excessive GUP pins (i.e. 512).  Anyway, the same
     1351             :                  * check will be done again later, so the risk seems low.
    1352             :                  */
    1353          10 :                 if (!is_refcount_suitable(page)) {
    1354           0 :                         result = SCAN_PAGE_COUNT;
    1355           0 :                         goto out_unmap;
    1356             :                 }
    1357          10 :                 if (pte_young(pteval) ||
    1358           6 :                     page_is_young(page) || PageReferenced(page) ||
    1359          10 :                     mmu_notifier_test_young(vma->vm_mm, address))
    1360           8 :                         referenced++;
    1361             :         }
    1362           2 :         if (!writable) {
    1363             :                 result = SCAN_PAGE_RO;
    1364           2 :         } else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) {
    1365             :                 result = SCAN_LACK_REFERENCED_PAGE;
    1366             :         } else {
    1367           2 :                 result = SCAN_SUCCEED;
    1368           2 :                 ret = 1;
    1369             :         }
    1370           2 : out_unmap:
    1371           2 :         pte_unmap_unlock(pte, ptl);
    1372           2 :         if (ret) {
    1373           2 :                 node = khugepaged_find_target_node();
    1374             :                 /* collapse_huge_page will return with the mmap_lock released */
    1375           2 :                 collapse_huge_page(mm, address, hpage, node,
    1376             :                                 referenced, unmapped);
    1377             :         }
    1378           0 : out:
    1379          32 :         trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
    1380             :                                      none_or_zero, result, unmapped);
    1381          32 :         return ret;
    1382             : }
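
Editor's note: khugepaged_scan_pmd() aborts when the counts of empty, swapped-out, or shared PTEs exceed tunable thresholds (SCAN_EXCEED_NONE_PTE, SCAN_EXCEED_SWAP_PTE, SCAN_EXCEED_SHARED_PTE). Those thresholds are exported through sysfs on THP-enabled kernels. A small sketch that prints them follows; it assumes sysfs is mounted at /sys and simply skips knobs that are absent on a given kernel.

    #include <stdio.h>

    /* khugepaged thresholds consulted by the scan loop above. */
    static const char *knobs[] = {
            "/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none",
            "/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_swap",
            "/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_shared",
    };

    int main(void)
    {
            for (unsigned i = 0; i < sizeof(knobs) / sizeof(knobs[0]); i++) {
                    FILE *f = fopen(knobs[i], "r");
                    unsigned long val;

                    if (!f)
                            continue;  /* knob not present on this kernel */
                    if (fscanf(f, "%lu", &val) == 1)
                            printf("%s = %lu\n", knobs[i], val);
                    fclose(f);
            }
            return 0;
    }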
    1383             : 
    1384           9 : static void collect_mm_slot(struct mm_slot *mm_slot)
    1385             : {
    1386           9 :         struct mm_struct *mm = mm_slot->mm;
    1387             : 
    1388          27 :         lockdep_assert_held(&khugepaged_mm_lock);
    1389             : 
    1390           9 :         if (khugepaged_test_exit(mm)) {
    1391             :                 /* free mm_slot */
    1392           0 :                 hash_del(&mm_slot->hash);
    1393           0 :                 list_del(&mm_slot->mm_node);
    1394             : 
    1395             :                 /*
    1396             :                  * Not strictly needed because the mm exited already.
    1397             :                  *
    1398             :                  * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
    1399             :                  */
    1400             : 
    1401             :                 /* khugepaged_mm_lock actually not necessary for the below */
    1402           0 :                 free_mm_slot(mm_slot);
    1403           0 :                 mmdrop(mm);
    1404             :         }
    1405           9 : }
    1406             : 
    1407             : #ifdef CONFIG_SHMEM
    1408             : /*
     1409             :  * Notify khugepaged that the given addr of the mm is a pte-mapped THP. Then
    1410             :  * khugepaged should try to collapse the page table.
    1411             :  */
    1412           0 : static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
    1413             :                                          unsigned long addr)
    1414             : {
    1415           0 :         struct mm_slot *mm_slot;
    1416             : 
    1417           0 :         VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
    1418             : 
    1419           0 :         spin_lock(&khugepaged_mm_lock);
    1420           0 :         mm_slot = get_mm_slot(mm);
    1421           0 :         if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP))
    1422           0 :                 mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
    1423           0 :         spin_unlock(&khugepaged_mm_lock);
    1424           0 :         return 0;
    1425             : }
    1426             : 
    1427             : /**
    1428             :  * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
    1429             :  * address haddr.
    1430             :  *
    1431             :  * @mm: process address space where collapse happens
    1432             :  * @addr: THP collapse address
    1433             :  *
    1434             :  * This function checks whether all the PTEs in the PMD are pointing to the
     1435             :  * right THP. If so, retract the page table so the THP can refault in
     1436             :  * as pmd-mapped.
    1437             :  */
    1438           0 : void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
    1439             : {
    1440           0 :         unsigned long haddr = addr & HPAGE_PMD_MASK;
    1441           0 :         struct vm_area_struct *vma = find_vma(mm, haddr);
    1442           0 :         struct page *hpage;
    1443           0 :         pte_t *start_pte, *pte;
    1444           0 :         pmd_t *pmd, _pmd;
    1445           0 :         spinlock_t *ptl;
    1446           0 :         int count = 0;
    1447           0 :         int i;
    1448             : 
    1449           0 :         if (!vma || !vma->vm_file ||
    1450           0 :             vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE)
    1451             :                 return;
    1452             : 
    1453             :         /*
    1454             :          * This vm_flags may not have VM_HUGEPAGE if the page was not
    1455             :          * collapsed by this mm. But we can still collapse if the page is
     1456             :          * a valid THP. Add the extra VM_HUGEPAGE so hugepage_vma_check()
     1457             :          * will not fail the vma for a missing VM_HUGEPAGE.
    1458             :          */
    1459           0 :         if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
    1460             :                 return;
    1461             : 
    1462           0 :         hpage = find_lock_page(vma->vm_file->f_mapping,
    1463             :                                linear_page_index(vma, haddr));
    1464           0 :         if (!hpage)
    1465             :                 return;
    1466             : 
    1467           0 :         if (!PageHead(hpage))
    1468           0 :                 goto drop_hpage;
    1469             : 
    1470           0 :         pmd = mm_find_pmd(mm, haddr);
    1471           0 :         if (!pmd)
    1472           0 :                 goto drop_hpage;
    1473             : 
    1474           0 :         start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
    1475             : 
     1476             :         /* step 1: check that all mapped PTEs point to the right huge page */
    1477           0 :         for (i = 0, addr = haddr, pte = start_pte;
    1478           0 :              i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
    1479           0 :                 struct page *page;
    1480             : 
    1481             :                 /* empty pte, skip */
    1482           0 :                 if (pte_none(*pte))
    1483           0 :                         continue;
    1484             : 
    1485             :                 /* page swapped out, abort */
    1486           0 :                 if (!pte_present(*pte))
    1487           0 :                         goto abort;
    1488             : 
    1489           0 :                 page = vm_normal_page(vma, addr, *pte);
    1490             : 
    1491             :                 /*
    1492             :                  * Note that uprobe, debugger, or MAP_PRIVATE may change the
    1493             :                  * page table, but the new page will not be a subpage of hpage.
    1494             :                  */
    1495           0 :                 if (hpage + i != page)
    1496           0 :                         goto abort;
    1497           0 :                 count++;
    1498             :         }
    1499             : 
    1500             :         /* step 2: adjust rmap */
    1501           0 :         for (i = 0, addr = haddr, pte = start_pte;
    1502           0 :              i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
    1503           0 :                 struct page *page;
    1504             : 
    1505           0 :                 if (pte_none(*pte))
    1506           0 :                         continue;
    1507           0 :                 page = vm_normal_page(vma, addr, *pte);
    1508           0 :                 page_remove_rmap(page, false);
    1509             :         }
    1510             : 
    1511           0 :         pte_unmap_unlock(start_pte, ptl);
    1512             : 
    1513             :         /* step 3: set proper refcount and mm_counters. */
    1514           0 :         if (count) {
    1515           0 :                 page_ref_sub(hpage, count);
    1516           0 :                 add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
    1517             :         }
    1518             : 
    1519             :         /* step 4: collapse pmd */
    1520           0 :         ptl = pmd_lock(vma->vm_mm, pmd);
    1521           0 :         _pmd = pmdp_collapse_flush(vma, haddr, pmd);
    1522           0 :         spin_unlock(ptl);
    1523           0 :         mm_dec_nr_ptes(mm);
    1524           0 :         pte_free(mm, pmd_pgtable(_pmd));
    1525             : 
    1526           0 : drop_hpage:
    1527           0 :         unlock_page(hpage);
    1528           0 :         put_page(hpage);
    1529           0 :         return;
    1530             : 
    1531           0 : abort:
    1532           0 :         pte_unmap_unlock(start_pte, ptl);
    1533           0 :         goto drop_hpage;
    1534             : }
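
Editor's note: hugepage_vma_check() (called above with VM_HUGEPAGE OR'ed in) gates collapse on the vma's hugepage eligibility. From userspace, a mapping is typically opted in by setting VM_HUGEPAGE via madvise(MADV_HUGEPAGE), which is a real Linux API; the sketch below is illustrative, with error handling trimmed, and the 2MB extent size is an assumption.

    #include <sys/mman.h>
    #include <stdio.h>

    int main(void)
    {
            size_t len = 4UL << 20;  /* two assumed-2MB huge-page extents */
            void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (p == MAP_FAILED)
                    return 1;
            /* Sets VM_HUGEPAGE on the vma, making it a khugepaged candidate. */
            if (madvise(p, len, MADV_HUGEPAGE))
                    perror("madvise");
            return 0;
    }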
    1535             : 
    1536          15 : static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
    1537             : {
    1538          15 :         struct mm_struct *mm = mm_slot->mm;
    1539          15 :         int i;
    1540             : 
    1541          15 :         if (likely(mm_slot->nr_pte_mapped_thp == 0))
    1542             :                 return 0;
    1543             : 
    1544           0 :         if (!mmap_write_trylock(mm))
    1545             :                 return -EBUSY;
    1546             : 
    1547           0 :         if (unlikely(khugepaged_test_exit(mm)))
    1548           0 :                 goto out;
    1549             : 
    1550           0 :         for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++)
    1551           0 :                 collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]);
    1552             : 
    1553           0 : out:
    1554           0 :         mm_slot->nr_pte_mapped_thp = 0;
    1555           0 :         mmap_write_unlock(mm);
    1556           0 :         return 0;
    1557             : }
    1558             : 
    1559           0 : static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
    1560             : {
    1561           0 :         struct vm_area_struct *vma;
    1562           0 :         struct mm_struct *mm;
    1563           0 :         unsigned long addr;
    1564           0 :         pmd_t *pmd, _pmd;
    1565             : 
    1566           0 :         i_mmap_lock_write(mapping);
    1567           0 :         vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
    1568             :                 /*
    1569             :                  * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
     1570             :                  * got written to. These VMAs are likely not worth the cost
     1571             :                  * of mmap_write_lock(mm), as the PMD-mapping is likely to
     1572             :                  * be split later.
     1573             :                  *
     1574             :                  * Note that the vma->anon_vma check is racy: it can be set
     1575             :                  * up by the fault path after the check but before we take
     1576             :                  * the mmap_lock. But the page lock would prevent
     1577             :                  * establishing any new ptes of the page, so we are safe.
     1578             :                  *
     1579             :                  * An alternative would be to drop the check, but check that
     1580             :                  * the page table is clear before calling
     1581             :                  * pmdp_collapse_flush() under the ptl. That has a higher
     1582             :                  * chance of recovering a THP for the VMA, but also a higher cost.
    1583             :                  */
    1584           0 :                 if (vma->anon_vma)
    1585           0 :                         continue;
    1586           0 :                 addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
    1587           0 :                 if (addr & ~HPAGE_PMD_MASK)
    1588           0 :                         continue;
    1589           0 :                 if (vma->vm_end < addr + HPAGE_PMD_SIZE)
    1590           0 :                         continue;
    1591           0 :                 mm = vma->vm_mm;
    1592           0 :                 pmd = mm_find_pmd(mm, addr);
    1593           0 :                 if (!pmd)
    1594           0 :                         continue;
    1595             :                 /*
     1596             :                  * We need the exclusive mmap_lock to retract the page table.
     1597             :                  *
     1598             :                  * We use trylock due to lock inversion: we need to acquire the
     1599             :                  * mmap_lock while holding the page lock. The fault path does it
     1600             :                  * in the reverse order. Trylock is a way to avoid deadlock.
    1601             :                  */
    1602           0 :                 if (mmap_write_trylock(mm)) {
    1603           0 :                         if (!khugepaged_test_exit(mm)) {
    1604           0 :                                 spinlock_t *ptl = pmd_lock(mm, pmd);
    1605             :                                 /* assume page table is clear */
    1606           0 :                                 _pmd = pmdp_collapse_flush(vma, addr, pmd);
    1607           0 :                                 spin_unlock(ptl);
    1608           0 :                                 mm_dec_nr_ptes(mm);
    1609           0 :                                 pte_free(mm, pmd_pgtable(_pmd));
    1610             :                         }
    1611           0 :                         mmap_write_unlock(mm);
    1612             :                 } else {
    1613             :                         /* Try again later */
    1614           0 :                         khugepaged_add_pte_mapped_thp(mm, addr);
    1615             :                 }
    1616             :         }
    1617           0 :         i_mmap_unlock_write(mapping);
    1618           0 : }
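
Editor's note: the trylock comment in retract_page_tables() describes a classic deadlock-avoidance pattern: when two paths need the same pair of locks in opposite orders, one side must back off and retry later rather than block. A minimal pthread sketch of the pattern follows; the lock names mirror the kernel's for readability, but this is illustrative userspace code, not the kernel's locking.

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t mmap_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Mirrors retract_page_tables(): we already hold "page_lock" and need
     * "mmap_lock", while the fault path takes them in the opposite order.
     * Trylock lets us back off and defer instead of deadlocking. */
    static int retract(void)
    {
            int done = 0;

            pthread_mutex_lock(&page_lock);
            if (pthread_mutex_trylock(&mmap_lock) == 0) {
                    /* ... work that needs both locks ... */
                    done = 1;
                    pthread_mutex_unlock(&mmap_lock);
            } else {
                    /* Contended: record the work and try again later, as
                     * khugepaged_add_pte_mapped_thp() does above. */
            }
            pthread_mutex_unlock(&page_lock);
            return done;
    }

    int main(void)
    {
            printf("retract %s\n", retract() ? "succeeded" : "deferred");
            return 0;
    }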
    1619             : 
    1620             : /**
     1621             :  * collapse_file - collapse filemap/tmpfs/shmem pages into a huge one.
     1622             :  *
     1623             :  * @mm: process address space where the collapse happens
     1624             :  * @file: file that the collapse operates on
     1625             :  * @start: collapse start offset in the page cache
     1626             :  * @hpage: newly allocated huge page for the collapse
     1627             :  * @node: appointed node the new huge page is allocated from
    1628             :  *
    1629             :  * Basic scheme is simple, details are more complex:
    1630             :  *  - allocate and lock a new huge page;
    1631             :  *  - scan page cache replacing old pages with the new one
    1632             :  *    + swap/gup in pages if necessary;
    1633             :  *    + fill in gaps;
    1634             :  *    + keep old pages around in case rollback is required;
    1635             :  *  - if replacing succeeds:
    1636             :  *    + copy data over;
    1637             :  *    + free old pages;
    1638             :  *    + unlock huge page;
     1639             :  *  - if replacing fails:
    1640             :  *    + put all pages back and unfreeze them;
    1641             :  *    + restore gaps in the page cache;
    1642             :  *    + unlock and free huge page;
    1643             :  */
    1644           0 : static void collapse_file(struct mm_struct *mm,
    1645             :                 struct file *file, pgoff_t start,
    1646             :                 struct page **hpage, int node)
    1647             : {
    1648           0 :         struct address_space *mapping = file->f_mapping;
    1649           0 :         gfp_t gfp;
    1650           0 :         struct page *new_page;
    1651           0 :         pgoff_t index, end = start + HPAGE_PMD_NR;
    1652           0 :         LIST_HEAD(pagelist);
    1653           0 :         XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
    1654           0 :         int nr_none = 0, result = SCAN_SUCCEED;
    1655           0 :         bool is_shmem = shmem_file(file);
    1656           0 :         int nr;
    1657             : 
    1658           0 :         VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
    1659           0 :         VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
    1660             : 
    1661             :         /* Only allocate from the target node */
    1662           0 :         gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
    1663             : 
    1664           0 :         new_page = khugepaged_alloc_page(hpage, gfp, node);
    1665           0 :         if (!new_page) {
    1666           0 :                 result = SCAN_ALLOC_HUGE_PAGE_FAIL;
    1667           0 :                 goto out;
    1668             :         }
    1669             : 
    1670           0 :         if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
    1671             :                 result = SCAN_CGROUP_CHARGE_FAIL;
    1672             :                 goto out;
    1673             :         }
    1674           0 :         count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
    1675             : 
    1676             :         /* This will be less messy when we use multi-index entries */
    1677           0 :         do {
    1678           0 :                 xas_lock_irq(&xas);
    1679           0 :                 xas_create_range(&xas);
    1680           0 :                 if (!xas_error(&xas))
    1681             :                         break;
    1682           0 :                 xas_unlock_irq(&xas);
    1683           0 :                 if (!xas_nomem(&xas, GFP_KERNEL)) {
    1684           0 :                         result = SCAN_FAIL;
    1685           0 :                         goto out;
    1686             :                 }
    1687             :         } while (1);
    1688             : 
    1689           0 :         __SetPageLocked(new_page);
    1690           0 :         if (is_shmem)
    1691           0 :                 __SetPageSwapBacked(new_page);
    1692           0 :         new_page->index = start;
    1693           0 :         new_page->mapping = mapping;
    1694             : 
    1695             :         /*
    1696             :          * At this point the new_page is locked and not up-to-date.
    1697             :          * It's safe to insert it into the page cache, because nobody would
    1698             :          * be able to map it or use it in another way until we unlock it.
    1699             :          */
    1700             : 
    1701           0 :         xas_set(&xas, start);
    1702           0 :         for (index = start; index < end; index++) {
    1703           0 :                 struct page *page = xas_next(&xas);
    1704             : 
    1705           0 :                 VM_BUG_ON(index != xas.xa_index);
    1706           0 :                 if (is_shmem) {
    1707           0 :                         if (!page) {
    1708             :                                 /*
    1709             :                                  * Stop if extent has been truncated or
    1710             :                                  * hole-punched, and is now completely
    1711             :                                  * empty.
    1712             :                                  */
    1713           0 :                                 if (index == start) {
    1714           0 :                                         if (!xas_next_entry(&xas, end - 1)) {
    1715           0 :                                                 result = SCAN_TRUNCATED;
    1716           0 :                                                 goto xa_locked;
    1717             :                                         }
    1718           0 :                                         xas_set(&xas, index);
    1719             :                                 }
    1720           0 :                                 if (!shmem_charge(mapping->host, 1)) {
    1721           0 :                                         result = SCAN_FAIL;
    1722           0 :                                         goto xa_locked;
    1723             :                                 }
    1724           0 :                                 xas_store(&xas, new_page);
    1725           0 :                                 nr_none++;
    1726           0 :                                 continue;
    1727             :                         }
    1728             : 
    1729           0 :                         if (xa_is_value(page) || !PageUptodate(page)) {
    1730           0 :                                 xas_unlock_irq(&xas);
    1731             :                                 /* swap in or instantiate fallocated page */
    1732           0 :                                 if (shmem_getpage(mapping->host, index, &page,
    1733             :                                                   SGP_NOHUGE)) {
    1734           0 :                                         result = SCAN_FAIL;
    1735           0 :                                         goto xa_unlocked;
    1736             :                                 }
    1737           0 :                         } else if (trylock_page(page)) {
    1738           0 :                                 get_page(page);
    1739           0 :                                 xas_unlock_irq(&xas);
    1740             :                         } else {
    1741           0 :                                 result = SCAN_PAGE_LOCK;
    1742           0 :                                 goto xa_locked;
    1743             :                         }
    1744             :                 } else {        /* !is_shmem */
    1745           0 :                         if (!page || xa_is_value(page)) {
    1746           0 :                                 xas_unlock_irq(&xas);
    1747           0 :                                 page_cache_sync_readahead(mapping, &file->f_ra,
    1748             :                                                           file, index,
    1749             :                                                           end - index);
    1750             :                                 /* drain pagevecs to help isolate_lru_page() */
    1751           0 :                                 lru_add_drain();
    1752           0 :                                 page = find_lock_page(mapping, index);
    1753           0 :                                 if (unlikely(page == NULL)) {
    1754           0 :                                         result = SCAN_FAIL;
    1755           0 :                                         goto xa_unlocked;
    1756             :                                 }
    1757           0 :                         } else if (PageDirty(page)) {
    1758             :                                 /*
     1759             :                  * khugepaged only works on a read-only fd,
     1760             :                  * so this page is dirty because it hasn't
     1761             :                  * been flushed since the first write. There
    1762             :                                  * won't be new dirty pages.
    1763             :                                  *
    1764             :                                  * Trigger async flush here and hope the
    1765             :                                  * writeback is done when khugepaged
    1766             :                                  * revisits this page.
    1767             :                                  *
    1768             :                                  * This is a one-off situation. We are not
     1769             :                  * forcing writeback in a loop.
    1770             :                                  */
    1771           0 :                                 xas_unlock_irq(&xas);
    1772           0 :                                 filemap_flush(mapping);
    1773           0 :                                 result = SCAN_FAIL;
    1774           0 :                                 goto xa_unlocked;
    1775           0 :                         } else if (trylock_page(page)) {
    1776           0 :                                 get_page(page);
    1777           0 :                                 xas_unlock_irq(&xas);
    1778             :                         } else {
    1779           0 :                                 result = SCAN_PAGE_LOCK;
    1780           0 :                                 goto xa_locked;
    1781             :                         }
    1782             :                 }
    1783             : 
    1784             :                 /*
    1785             :                  * The page must be locked, so we can drop the i_pages lock
    1786             :                  * without racing with truncate.
    1787             :                  */
    1788           0 :                 VM_BUG_ON_PAGE(!PageLocked(page), page);
    1789             : 
    1790             :                 /* make sure the page is up to date */
    1791           0 :                 if (unlikely(!PageUptodate(page))) {
    1792           0 :                         result = SCAN_FAIL;
    1793           0 :                         goto out_unlock;
    1794             :                 }
    1795             : 
    1796             :                 /*
    1797             :                  * If file was truncated then extended, or hole-punched, before
    1798             :                  * we locked the first page, then a THP might be there already.
    1799             :                  */
    1800           0 :                 if (PageTransCompound(page)) {
    1801           0 :                         result = SCAN_PAGE_COMPOUND;
    1802           0 :                         goto out_unlock;
    1803             :                 }
    1804             : 
    1805           0 :                 if (page_mapping(page) != mapping) {
    1806           0 :                         result = SCAN_TRUNCATED;
    1807           0 :                         goto out_unlock;
    1808             :                 }
    1809             : 
    1810           0 :                 if (!is_shmem && PageDirty(page)) {
    1811             :                         /*
     1812             :                          * khugepaged only works on a read-only fd, so this
     1813             :                          * page is dirty because it hasn't been flushed
     1814             :                          * since the first write.
    1815             :                          */
    1816           0 :                         result = SCAN_FAIL;
    1817           0 :                         goto out_unlock;
    1818             :                 }
    1819             : 
    1820           0 :                 if (isolate_lru_page(page)) {
    1821           0 :                         result = SCAN_DEL_PAGE_LRU;
    1822           0 :                         goto out_unlock;
    1823             :                 }
    1824             : 
    1825           0 :                 if (page_has_private(page) &&
    1826           0 :                     !try_to_release_page(page, GFP_KERNEL)) {
    1827           0 :                         result = SCAN_PAGE_HAS_PRIVATE;
    1828           0 :                         putback_lru_page(page);
    1829           0 :                         goto out_unlock;
    1830             :                 }
    1831             : 
    1832           0 :                 if (page_mapped(page))
    1833           0 :                         unmap_mapping_pages(mapping, index, 1, false);
    1834             : 
    1835           0 :                 xas_lock_irq(&xas);
    1836           0 :                 xas_set(&xas, index);
    1837             : 
    1838           0 :                 VM_BUG_ON_PAGE(page != xas_load(&xas), page);
    1839           0 :                 VM_BUG_ON_PAGE(page_mapped(page), page);
    1840             : 
    1841             :                 /*
    1842             :                  * The page is expected to have page_count() == 3:
    1843             :                  *  - we hold a pin on it;
    1844             :                  *  - one reference from page cache;
    1845             :                  *  - one from isolate_lru_page;
    1846             :                  */
    1847           0 :                 if (!page_ref_freeze(page, 3)) {
    1848           0 :                         result = SCAN_PAGE_COUNT;
    1849           0 :                         xas_unlock_irq(&xas);
    1850           0 :                         putback_lru_page(page);
    1851           0 :                         goto out_unlock;
    1852             :                 }
    1853             : 
    1854             :                 /*
    1855             :                  * Add the page to the list to be able to undo the collapse if
     1856             :                  * something goes wrong.
    1857             :                  */
    1858           0 :                 list_add_tail(&page->lru, &pagelist);
    1859             : 
    1860             :                 /* Finally, replace with the new page. */
    1861           0 :                 xas_store(&xas, new_page);
    1862           0 :                 continue;
    1863           0 : out_unlock:
    1864           0 :                 unlock_page(page);
    1865           0 :                 put_page(page);
    1866           0 :                 goto xa_unlocked;
    1867             :         }
    1868           0 :         nr = thp_nr_pages(new_page);
    1869             : 
    1870           0 :         if (is_shmem)
    1871           0 :                 __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr);
    1872             :         else {
    1873           0 :                 __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr);
    1874           0 :                 filemap_nr_thps_inc(mapping);
    1875             :         }
    1876             : 
    1877           0 :         if (nr_none) {
    1878           0 :                 __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
    1879           0 :                 if (is_shmem)
    1880           0 :                         __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
    1881             :         }
    1882             : 
    1883           0 : xa_locked:
    1884           0 :         xas_unlock_irq(&xas);
    1885           0 : xa_unlocked:
    1886             : 
    1887           0 :         if (result == SCAN_SUCCEED) {
    1888           0 :                 struct page *page, *tmp;
    1889             : 
    1890             :                 /*
     1891             :                  * Replacing old pages with the new one has succeeded; now we
    1892             :                  * need to copy the content and free the old pages.
    1893             :                  */
    1894           0 :                 index = start;
    1895           0 :                 list_for_each_entry_safe(page, tmp, &pagelist, lru) {
    1896           0 :                         while (index < page->index) {
    1897           0 :                                 clear_highpage(new_page + (index % HPAGE_PMD_NR));
    1898           0 :                                 index++;
    1899             :                         }
    1900           0 :                         copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
    1901             :                                         page);
    1902           0 :                         list_del(&page->lru);
    1903           0 :                         page->mapping = NULL;
    1904           0 :                         page_ref_unfreeze(page, 1);
    1905           0 :                         ClearPageActive(page);
    1906           0 :                         ClearPageUnevictable(page);
    1907           0 :                         unlock_page(page);
    1908           0 :                         put_page(page);
    1909           0 :                         index++;
    1910             :                 }
    1911           0 :                 while (index < end) {
    1912           0 :                         clear_highpage(new_page + (index % HPAGE_PMD_NR));
    1913           0 :                         index++;
    1914             :                 }
    1915             : 
    1916           0 :                 SetPageUptodate(new_page);
    1917           0 :                 page_ref_add(new_page, HPAGE_PMD_NR - 1);
    1918           0 :                 if (is_shmem)
    1919           0 :                         set_page_dirty(new_page);
    1920           0 :                 lru_cache_add(new_page);
    1921             : 
    1922             :                 /*
    1923             :                  * Remove pte page tables, so we can re-fault the page as huge.
    1924             :                  */
    1925           0 :                 retract_page_tables(mapping, start);
    1926           0 :                 *hpage = NULL;
    1927             : 
    1928           0 :                 khugepaged_pages_collapsed++;
    1929             :         } else {
    1930           0 :                 struct page *page;
    1931             : 
    1932             :                 /* Something went wrong: roll back page cache changes */
    1933           0 :                 xas_lock_irq(&xas);
    1934           0 :                 mapping->nrpages -= nr_none;
    1935             : 
    1936           0 :                 if (is_shmem)
    1937           0 :                         shmem_uncharge(mapping->host, nr_none);
    1938             : 
    1939           0 :                 xas_set(&xas, start);
    1940           0 :                 xas_for_each(&xas, page, end - 1) {
    1941           0 :                         page = list_first_entry_or_null(&pagelist,
    1942             :                                         struct page, lru);
    1943           0 :                         if (!page || xas.xa_index < page->index) {
    1944           0 :                                 if (!nr_none)
    1945             :                                         break;
    1946           0 :                                 nr_none--;
    1947             :                                 /* Put holes back where they were */
    1948           0 :                                 xas_store(&xas, NULL);
    1949           0 :                                 continue;
    1950             :                         }
    1951             : 
    1952           0 :                         VM_BUG_ON_PAGE(page->index != xas.xa_index, page);
    1953             : 
    1954             :                         /* Unfreeze to two refs: the page cache's and ours, which putback_lru_page() drops. */
    1955           0 :                         list_del(&page->lru);
    1956           0 :                         page_ref_unfreeze(page, 2);
    1957           0 :                         xas_store(&xas, page);
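                     :                         /*
                     :                          * Pause the iteration and drop the xarray lock so
                     :                          * the page can be unlocked and put back on the LRU
                     :                          * without holding a spinlock.
                     :                          */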
    1958           0 :                         xas_pause(&xas);
    1959           0 :                         xas_unlock_irq(&xas);
    1960           0 :                         unlock_page(page);
    1961           0 :                         putback_lru_page(page);
    1962           0 :                         xas_lock_irq(&xas);
    1963             :                 }
    1964           0 :                 VM_BUG_ON(nr_none);
    1965           0 :                 xas_unlock_irq(&xas);
    1966             : 
    1967           0 :                 new_page->mapping = NULL;
    1968             :         }
    1969             : 
    1970           0 :         unlock_page(new_page);
    1971           0 : out:
    1972           0 :         VM_BUG_ON(!list_empty(&pagelist));
    1973           0 :         if (!IS_ERR_OR_NULL(*hpage))
    1974           0 :                 mem_cgroup_uncharge(*hpage);
    1975             :         /* TODO: tracepoints */
    1976           0 : }
    1977             : 
    1978           0 : static void khugepaged_scan_file(struct mm_struct *mm,
    1979             :                 struct file *file, pgoff_t start, struct page **hpage)
    1980             : {
    1981           0 :         struct page *page = NULL;
    1982           0 :         struct address_space *mapping = file->f_mapping;
    1983           0 :         XA_STATE(xas, &mapping->i_pages, start);
    1984           0 :         int present, swap;
    1985           0 :         int node = NUMA_NO_NODE;
    1986           0 :         int result = SCAN_SUCCEED;
    1987             : 
    1988           0 :         present = 0;
    1989           0 :         swap = 0;
    1990           0 :         memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
    1991           0 :         rcu_read_lock();
    1992           0 :         xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
    1993           0 :                 if (xas_retry(&xas, page))
    1994           0 :                         continue;
    1995             : 
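                     :                 /*
                     :                  * A value entry means the page is not present (e.g. a
                     :                  * swapped-out shmem page); count it against the
                     :                  * max_ptes_swap limit.
                     :                  */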
    1996           0 :                 if (xa_is_value(page)) {
    1997           0 :                         if (++swap > khugepaged_max_ptes_swap) {
    1998             :                                 result = SCAN_EXCEED_SWAP_PTE;
    1999             :                                 break;
    2000             :                         }
    2001           0 :                         continue;
    2002             :                 }
    2003             : 
    2004           0 :                 if (PageTransCompound(page)) {
    2005             :                         result = SCAN_PAGE_COMPOUND;
    2006             :                         break;
    2007             :                 }
    2008             : 
    2009           0 :                 node = page_to_nid(page);
    2010           0 :                 if (khugepaged_scan_abort(node)) {
    2011             :                         result = SCAN_SCAN_ABORT;
    2012             :                         break;
    2013             :                 }
    2014           0 :                 khugepaged_node_load[node]++;
    2015             : 
    2016           0 :                 if (!PageLRU(page)) {
    2017             :                         result = SCAN_PAGE_LRU;
    2018             :                         break;
    2019             :                 }
    2020             : 
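                     :                 /*
                     :                  * A page with no extra users holds exactly one reference
                     :                  * from the page cache, one per PTE mapping it, and one if
                     :                  * it has private (e.g. buffer head) data; any other count
                     :                  * means the page is busy elsewhere.
                     :                  */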
    2021           0 :                 if (page_count(page) !=
    2022           0 :                     1 + page_mapcount(page) + page_has_private(page)) {
    2023             :                         result = SCAN_PAGE_COUNT;
    2024             :                         break;
    2025             :                 }
    2026             : 
    2027             :                 /*
    2028             :                  * We should probably check whether the page is referenced
    2029             :                  * here, but nothing transfers pte_young() to PageReferenced()
    2030             :                  * for us, and an rmap walk here is just too costly...
    2031             :                  */
    2032             : 
    2033           0 :                 present++;
    2034             : 
    2035           0 :                 if (need_resched()) {
    2036           0 :                         xas_pause(&xas);
    2037           0 :                         cond_resched_rcu();
    2038             :                 }
    2039             :         }
    2040           0 :         rcu_read_unlock();
    2041             : 
    2042           0 :         if (result == SCAN_SUCCEED) {
    2043           0 :                 if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
    2044           0 :                         result = SCAN_EXCEED_NONE_PTE;
    2045             :                 } else {
    2046           0 :                         node = khugepaged_find_target_node();
    2047           0 :                         collapse_file(mm, file, start, hpage, node);
    2048             :                 }
    2049             :         }
    2050             : 
    2051             :         /* TODO: tracepoints */
    2052           0 : }
    2053             : #else
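                     : /* Stubs for !CONFIG_SHMEM builds: file collapse support is compiled out. */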
    2054             : static void khugepaged_scan_file(struct mm_struct *mm,
    2055             :                 struct file *file, pgoff_t start, struct page **hpage)
    2056             : {
    2057             :         BUILD_BUG();
    2058             : }
    2059             : 
    2060             : static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
    2061             : {
    2062             :         return 0;
    2063             : }
    2064             : #endif
    2065             : 
    2066          15 : static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
    2067             :                                             struct page **hpage)
    2068             :         __releases(&khugepaged_mm_lock)
    2069             :         __acquires(&khugepaged_mm_lock)
    2070             : {
    2071          15 :         struct mm_slot *mm_slot;
    2072          15 :         struct mm_struct *mm;
    2073          15 :         struct vm_area_struct *vma;
    2074          15 :         int progress = 0;
    2075             : 
    2076          15 :         VM_BUG_ON(!pages);
    2077          45 :         lockdep_assert_held(&khugepaged_mm_lock);
    2078             : 
    2079          15 :         if (khugepaged_scan.mm_slot)
    2080             :                 mm_slot = khugepaged_scan.mm_slot;
    2081             :         else {
    2082           3 :                 mm_slot = list_entry(khugepaged_scan.mm_head.next,
    2083             :                                      struct mm_slot, mm_node);
    2084           3 :                 khugepaged_scan.address = 0;
    2085           3 :                 khugepaged_scan.mm_slot = mm_slot;
    2086             :         }
    2087          15 :         spin_unlock(&khugepaged_mm_lock);
    2088          15 :         khugepaged_collapse_pte_mapped_thps(mm_slot);
    2089             : 
    2090          15 :         mm = mm_slot->mm;
    2091             :         /*
    2092             :          * Don't wait for the mmap lock (to avoid long wait times).  Just
    2093             :          * move to the next mm on the list.
    2094             :          */
    2095          15 :         vma = NULL;
    2096          15 :         if (unlikely(!mmap_read_trylock(mm)))
    2097           0 :                 goto breakouterloop_mmap_lock;
    2098          15 :         if (likely(!khugepaged_test_exit(mm)))
    2099          15 :                 vma = find_vma(mm, khugepaged_scan.address);
    2100             : 
    2101             :         progress++;
    2102        1702 :         for (; vma; vma = vma->vm_next) {
    2103        1693 :                 unsigned long hstart, hend;
    2104             : 
    2105        1693 :                 cond_resched();
    2106        1693 :                 if (unlikely(khugepaged_test_exit(mm))) {
    2107           0 :                         progress++;
    2108           0 :                         break;
    2109             :                 }
    2110        1693 :                 if (!hugepage_vma_check(vma, vma->vm_flags)) {
    2111        1524 : skip:
    2112        1675 :                         progress++;
    2113        1675 :                         continue;
    2114             :                 }
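                     :                 /*
                     :                  * Clamp the scan window to the PMD-aligned part of the
                     :                  * VMA: round vm_start up and vm_end down to HPAGE_PMD_SIZE
                     :                  * boundaries; only that aligned middle can be collapsed.
                     :                  */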
    2115         169 :                 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
    2116         169 :                 hend = vma->vm_end & HPAGE_PMD_MASK;
    2117         169 :                 if (hstart >= hend)
    2118         151 :                         goto skip;
    2119          18 :                 if (khugepaged_scan.address > hend)
    2120           0 :                         goto skip;
    2121          18 :                 if (khugepaged_scan.address < hstart)
    2122          13 :                         khugepaged_scan.address = hstart;
    2123          18 :                 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
    2124          18 :                 if (shmem_file(vma->vm_file) && !shmem_huge_enabled(vma))
    2125           0 :                         goto skip;
    2126             : 
    2127          44 :                 while (khugepaged_scan.address < hend) {
    2128          32 :                         int ret;
    2129          32 :                         cond_resched();
    2130          32 :                         if (unlikely(khugepaged_test_exit(mm)))
    2131           0 :                                 goto breakouterloop;
    2132             : 
    2133          32 :                         VM_BUG_ON(khugepaged_scan.address < hstart ||
    2134             :                                   khugepaged_scan.address + HPAGE_PMD_SIZE >
    2135             :                                   hend);
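                     :                         /*
                     :                          * File-backed VMAs are scanned with the mmap lock
                     :                          * dropped (the file reference pins the mapping);
                     :                          * anonymous ranges are scanned under the read lock
                     :                          * by khugepaged_scan_pmd().
                     :                          */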
    2136          32 :                         if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
    2137           0 :                                 struct file *file = get_file(vma->vm_file);
    2138           0 :                                 pgoff_t pgoff = linear_page_index(vma,
    2139             :                                                 khugepaged_scan.address);
    2140             : 
    2141           0 :                                 mmap_read_unlock(mm);
    2142           0 :                                 ret = 1;
    2143           0 :                                 khugepaged_scan_file(mm, file, pgoff, hpage);
    2144           0 :                                 fput(file);
    2145             :                         } else {
    2146          32 :                                 ret = khugepaged_scan_pmd(mm, vma,
    2147             :                                                 khugepaged_scan.address,
    2148             :                                                 hpage);
    2149             :                         }
    2150             :                         /* move to next address */
    2151          32 :                         khugepaged_scan.address += HPAGE_PMD_SIZE;
    2152          32 :                         progress += HPAGE_PMD_NR;
    2153          32 :                         if (ret)
    2154             :                                 /* we released mmap_lock so break loop */
    2155           2 :                                 goto breakouterloop_mmap_lock;
    2156          30 :                         if (progress >= pages)
    2157           4 :                                 goto breakouterloop;
    2158             :                 }
    2159             :         }
    2160           9 : breakouterloop:
    2161          13 :         mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
    2162          15 : breakouterloop_mmap_lock:
    2163             : 
    2164          15 :         spin_lock(&khugepaged_mm_lock);
    2165          15 :         VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
    2166             :         /*
    2167             :          * Release the current mm_slot if this mm is about to die, or
    2168             :          * if we scanned all vmas of this mm.
    2169             :          */
    2170          15 :         if (khugepaged_test_exit(mm) || !vma) {
    2171             :                 /*
    2172             :                  * Make sure that if mm_users reaches zero while
    2173             :                  * khugepaged runs here, khugepaged_exit() will find
    2174             :                  * mm_slot no longer pointing to the exiting mm.
    2175             :                  */
    2176           9 :                 if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
    2177           7 :                         khugepaged_scan.mm_slot = list_entry(
    2178             :                                 mm_slot->mm_node.next,
    2179             :                                 struct mm_slot, mm_node);
    2180           7 :                         khugepaged_scan.address = 0;
    2181             :                 } else {
    2182           2 :                         khugepaged_scan.mm_slot = NULL;
    2183           2 :                         khugepaged_full_scans++;
    2184             :                 }
    2185             : 
    2186           9 :                 collect_mm_slot(mm_slot);
    2187             :         }
    2188             : 
    2189          15 :         return progress;
    2190             : }
    2191             : 
    2192          23 : static int khugepaged_has_work(void)
    2193             : {
    2194          44 :         return !list_empty(&khugepaged_scan.mm_head) &&
    2195          21 :                 khugepaged_enabled();
    2196             : }
    2197             : 
    2198           3 : static int khugepaged_wait_event(void)
    2199             : {
    2200           5 :         return !list_empty(&khugepaged_scan.mm_head) ||
    2201           2 :                 kthread_should_stop();
    2202             : }
    2203             : 
    2204           6 : static void khugepaged_do_scan(void)
    2205             : {
    2206           6 :         struct page *hpage = NULL;
    2207           6 :         unsigned int progress = 0, pass_through_head = 0;
    2208           6 :         unsigned int pages = khugepaged_pages_to_scan;
    2209           6 :         bool wait = true;
    2210             : 
    2211           6 :         barrier(); /* write khugepaged_pages_to_scan to local stack */
    2212             : 
    2213           6 :         lru_add_drain_all();
    2214             : 
    2215           6 :         while (progress < pages) {
    2216          17 :                 if (!khugepaged_prealloc_page(&hpage, &wait))
    2217             :                         break;
    2218             : 
    2219          17 :                 cond_resched();
    2220             : 
    2221          17 :                 if (unlikely(kthread_should_stop() || try_to_freeze()))
    2222             :                         break;
    2223             : 
    2224          17 :                 spin_lock(&khugepaged_mm_lock);
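                     :                 /*
                     :                  * A NULL mm_slot means the scan is (re)starting from the
                     :                  * head of the mm list; stop once the head is reached a
                     :                  * second time so one call makes at most one full pass.
                     :                  */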
    2225          17 :                 if (!khugepaged_scan.mm_slot)
    2226           5 :                         pass_through_head++;
    2227          33 :                 if (khugepaged_has_work() &&
    2228             :                     pass_through_head < 2)
    2229          15 :                         progress += khugepaged_scan_mm_slot(pages - progress,
    2230          15 :                                                             &hpage);
    2231             :                 else
    2232             :                         progress = pages;
    2233          40 :                 spin_unlock(&khugepaged_mm_lock);
    2234             :         }
    2235             : 
    2236           6 :         if (!IS_ERR_OR_NULL(hpage))
    2237           0 :                 put_page(hpage);
    2238           6 : }
    2239             : 
    2240          14 : static bool khugepaged_should_wakeup(void)
    2241             : {
    2242          14 :         return kthread_should_stop() ||
    2243          14 :                time_after_eq(jiffies, khugepaged_sleep_expire);
    2244             : }
    2245             : 
    2246           6 : static void khugepaged_wait_work(void)
    2247             : {
    2248           6 :         if (khugepaged_has_work()) {
    2249           5 :                 const unsigned long scan_sleep_jiffies =
    2250           5 :                         msecs_to_jiffies(khugepaged_scan_sleep_millisecs);
    2251             : 
    2252           5 :                 if (!scan_sleep_jiffies)
    2253             :                         return;
    2254             : 
    2255           5 :                 khugepaged_sleep_expire = jiffies + scan_sleep_jiffies;
    2256           9 :                 wait_event_freezable_timeout(khugepaged_wait,
    2257             :                                              khugepaged_should_wakeup(),
    2258             :                                              scan_sleep_jiffies);
    2259           4 :                 return;
    2260             :         }
    2261             : 
    2262           1 :         if (khugepaged_enabled())
    2263           2 :                 wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
    2264             : }
    2265             : 
    2266           1 : static int khugepaged(void *none)
    2267             : {
    2268           1 :         struct mm_slot *mm_slot;
    2269             : 
    2270           1 :         set_freezable();
    2271           1 :         set_user_nice(current, MAX_NICE);
    2272             : 
    2273           6 :         while (!kthread_should_stop()) {
    2274           6 :                 khugepaged_do_scan();
    2275           6 :                 khugepaged_wait_work();
    2276             :         }
    2277             : 
    2278           0 :         spin_lock(&khugepaged_mm_lock);
    2279           0 :         mm_slot = khugepaged_scan.mm_slot;
    2280           0 :         khugepaged_scan.mm_slot = NULL;
    2281           0 :         if (mm_slot)
    2282           0 :                 collect_mm_slot(mm_slot);
    2283           0 :         spin_unlock(&khugepaged_mm_lock);
    2284           0 :         return 0;
    2285             : }
    2286             : 
    2287           1 : static void set_recommended_min_free_kbytes(void)
    2288             : {
    2289           1 :         struct zone *zone;
    2290           1 :         int nr_zones = 0;
    2291           1 :         unsigned long recommended_min;
    2292             : 
    2293           4 :         for_each_populated_zone(zone) {
    2294             :                 /*
    2295             :                  * We don't need to worry about fragmentation of
    2296             :                  * ZONE_MOVABLE since it only has movable pages.
    2297             :                  */
    2298           1 :                 if (zone_idx(zone) > gfp_zone(GFP_USER))
    2299           0 :                         continue;
    2300             : 
    2301           1 :                 nr_zones++;
    2302             :         }
    2303             : 
    2304             :         /* Ensure 2 pageblocks are free to assist fragmentation avoidance */
    2305           1 :         recommended_min = pageblock_nr_pages * nr_zones * 2;
    2306             : 
    2307             :         /*
    2308             :          * Make sure that on average at least two pageblocks are almost free
    2309             :          * of another type: one for a migratetype to fall back to, and a
    2310             :          * second to avoid subsequent fallbacks of other types. There are 3
    2311             :          * MIGRATE_TYPES we care about.
    2312             :          */
    2313           1 :         recommended_min += pageblock_nr_pages * nr_zones *
    2314             :                            MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
    2315             : 
    2316             :         /* never allow reserving more than 5% of lowmem */
    2317           1 :         recommended_min = min(recommended_min,
    2318             :                               (unsigned long) nr_free_buffer_pages() / 20);
    2319           1 :         recommended_min <<= (PAGE_SHIFT-10);
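                     :         /*
                     :          * For example (illustrative values, not taken from this run):
                     :          * with one populated zone, 2MB pageblocks (pageblock_nr_pages
                     :          * == 512), MIGRATE_PCPTYPES == 3 and 4kB pages:
                     :          *   512 * 1 * 2 + 512 * 1 * 3 * 3 = 5632 pages
                     :          *   5632 << (PAGE_SHIFT - 10)     = 22528 kB (~22MB),
                     :          * unless the 5%-of-lowmem clamp above yields less.
                     :          */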
    2320             : 
    2321           1 :         if (recommended_min > min_free_kbytes) {
    2322           1 :                 if (user_min_free_kbytes >= 0)
    2323           0 :                         pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
    2324             :                                 min_free_kbytes, recommended_min);
    2325             : 
    2326           1 :                 min_free_kbytes = recommended_min;
    2327             :         }
    2328           1 :         setup_per_zone_wmarks();
    2329           1 : }
    2330             : 
    2331           1 : int start_stop_khugepaged(void)
    2332             : {
    2333           1 :         int err = 0;
    2334             : 
    2335           1 :         mutex_lock(&khugepaged_mutex);
    2336           1 :         if (khugepaged_enabled()) {
    2337           1 :                 if (!khugepaged_thread)
    2338           1 :                         khugepaged_thread = kthread_run(khugepaged, NULL,
    2339             :                                                         "khugepaged");
    2340           1 :                 if (IS_ERR(khugepaged_thread)) {
    2341           0 :                         pr_err("khugepaged: kthread_run(khugepaged) failed\n");
    2342           0 :                         err = PTR_ERR(khugepaged_thread);
    2343           0 :                         khugepaged_thread = NULL;
    2344           0 :                         goto fail;
    2345             :                 }
    2346             : 
    2347           1 :                 if (!list_empty(&khugepaged_scan.mm_head))
    2348           0 :                         wake_up_interruptible(&khugepaged_wait);
    2349             : 
    2350           1 :                 set_recommended_min_free_kbytes();
    2351           0 :         } else if (khugepaged_thread) {
    2352           0 :                 kthread_stop(khugepaged_thread);
    2353           0 :                 khugepaged_thread = NULL;
    2354             :         }
    2355           0 : fail:
    2356           1 :         mutex_unlock(&khugepaged_mutex);
    2357           1 :         return err;
    2358             : }
    2359             : 
    2360           1 : void khugepaged_min_free_kbytes_update(void)
    2361             : {
    2362           1 :         mutex_lock(&khugepaged_mutex);
    2363           1 :         if (khugepaged_enabled() && khugepaged_thread)
    2364           0 :                 set_recommended_min_free_kbytes();
    2365           1 :         mutex_unlock(&khugepaged_mutex);
    2366           1 : }

Generated by: LCOV version 1.14