LCOV - code coverage report
Current view: top level - mm - huge_memory.c
Test: landlock.info
Date: 2021-04-22 12:43:58
                          Hit     Total   Coverage
Lines:                    360      1438     25.0 %
Functions:                 24        77     31.2 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : /*
       3             :  *  Copyright (C) 2009  Red Hat, Inc.
       4             :  */
       5             : 
       6             : #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
       7             : 
       8             : #include <linux/mm.h>
       9             : #include <linux/sched.h>
      10             : #include <linux/sched/coredump.h>
      11             : #include <linux/sched/numa_balancing.h>
      12             : #include <linux/highmem.h>
      13             : #include <linux/hugetlb.h>
      14             : #include <linux/mmu_notifier.h>
      15             : #include <linux/rmap.h>
      16             : #include <linux/swap.h>
      17             : #include <linux/shrinker.h>
      18             : #include <linux/mm_inline.h>
      19             : #include <linux/swapops.h>
      20             : #include <linux/dax.h>
      21             : #include <linux/khugepaged.h>
      22             : #include <linux/freezer.h>
      23             : #include <linux/pfn_t.h>
      24             : #include <linux/mman.h>
      25             : #include <linux/memremap.h>
      26             : #include <linux/pagemap.h>
      27             : #include <linux/debugfs.h>
      28             : #include <linux/migrate.h>
      29             : #include <linux/hashtable.h>
      30             : #include <linux/userfaultfd_k.h>
      31             : #include <linux/page_idle.h>
      32             : #include <linux/shmem_fs.h>
      33             : #include <linux/oom.h>
      34             : #include <linux/numa.h>
      35             : #include <linux/page_owner.h>
      36             : 
      37             : #include <asm/tlb.h>
      38             : #include <asm/pgalloc.h>
      39             : #include "internal.h"
      40             : 
      41             : /*
      42             :  * By default, transparent hugepage support is disabled in order to avoid
      43             :  * risking an increased memory footprint for applications that are not
      44             :  * guaranteed to benefit from it. When transparent hugepage support is
      45             :  * enabled, it is for all mappings, and khugepaged scans all mappings.
      46             :  * Defrag is invoked by khugepaged hugepage allocations and by page faults
      47             :  * for all hugepage allocations.
      48             :  */
      49             : unsigned long transparent_hugepage_flags __read_mostly =
      50             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
      51             :         (1<<TRANSPARENT_HUGEPAGE_FLAG)|
      52             : #endif
      53             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
      54             :         (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
      55             : #endif
      56             :         (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
      57             :         (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
      58             :         (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
      59             : 
      60             : static struct shrinker deferred_split_shrinker;
      61             : 
      62             : static atomic_t huge_zero_refcount;
      63             : struct page *huge_zero_page __read_mostly;
      64             : 
      65           0 : bool transparent_hugepage_enabled(struct vm_area_struct *vma)
      66             : {
      67             :         /* The addr is used to check if the vma size fits */
      68           0 :         unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE;
      69             : 
      70           0 :         if (!transhuge_vma_suitable(vma, addr))
      71             :                 return false;
      72           0 :         if (vma_is_anonymous(vma))
      73           0 :                 return __transparent_hugepage_enabled(vma);
      74           0 :         if (vma_is_shmem(vma))
      75           0 :                 return shmem_huge_enabled(vma);
      76             : 
      77             :         return false;
      78             : }
      79             : 
      80           0 : static struct page *get_huge_zero_page(void)
      81             : {
      82           0 :         struct page *zero_page;
      83           0 : retry:
      84           0 :         if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
      85           0 :                 return READ_ONCE(huge_zero_page);
      86             : 
      87           0 :         zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
      88             :                         HPAGE_PMD_ORDER);
      89           0 :         if (!zero_page) {
      90           0 :                 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
      91           0 :                 return NULL;
      92             :         }
      93           0 :         count_vm_event(THP_ZERO_PAGE_ALLOC);
      94           0 :         preempt_disable();
      95           0 :         if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
      96           0 :                 preempt_enable();
      97           0 :                 __free_pages(zero_page, compound_order(zero_page));
      98           0 :                 goto retry;
      99             :         }
     100             : 
      101             :         /* We take an additional reference here. It will be put back by the shrinker */
     102           0 :         atomic_set(&huge_zero_refcount, 2);
     103           0 :         preempt_enable();
     104           0 :         return READ_ONCE(huge_zero_page);
     105             : }
     106             : 
     107           0 : static void put_huge_zero_page(void)
     108             : {
     109             :         /*
     110             :          * Counter should never go to zero here. Only shrinker can put
     111             :          * last reference.
     112             :          */
     113           0 :         BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
     114           0 : }
     115             : 
     116           0 : struct page *mm_get_huge_zero_page(struct mm_struct *mm)
     117             : {
     118           0 :         if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
     119           0 :                 return READ_ONCE(huge_zero_page);
     120             : 
     121           0 :         if (!get_huge_zero_page())
     122             :                 return NULL;
     123             : 
     124           0 :         if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
     125           0 :                 put_huge_zero_page();
     126             : 
     127           0 :         return READ_ONCE(huge_zero_page);
     128             : }
     129             : 
     130        3976 : void mm_put_huge_zero_page(struct mm_struct *mm)
     131             : {
     132        3976 :         if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
     133           0 :                 put_huge_zero_page();
     134        3976 : }
     135             : 
     136           0 : static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
     137             :                                         struct shrink_control *sc)
     138             : {
     139             :         /* we can free zero page only if last reference remains */
     140           0 :         return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
     141             : }
     142             : 
     143           0 : static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
     144             :                                        struct shrink_control *sc)
     145             : {
     146           0 :         if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
     147           0 :                 struct page *zero_page = xchg(&huge_zero_page, NULL);
     148           0 :                 BUG_ON(zero_page == NULL);
     149           0 :                 __free_pages(zero_page, compound_order(zero_page));
     150           0 :                 return HPAGE_PMD_NR;
     151             :         }
     152             : 
     153             :         return 0;
     154             : }
     155             : 
     156             : static struct shrinker huge_zero_page_shrinker = {
     157             :         .count_objects = shrink_huge_zero_page_count,
     158             :         .scan_objects = shrink_huge_zero_page_scan,
     159             :         .seeks = DEFAULT_SEEKS,
     160             : };
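
/*
 * The routines above implement a lazily allocated, refcounted singleton:
 * atomic_inc_not_zero() is the fast path, cmpxchg() publishes the page
 * exactly once, and the count is primed to 2 so only the shrinker can
 * drop the final reference.  Below is a minimal userspace sketch of that
 * pattern using C11 atomics; it is an illustration only (the names and
 * the calloc'd "page" are made up), not kernel code.
 */
#include <stdatomic.h>
#include <stdlib.h>

static _Atomic(void *) shared_page;
static atomic_int shared_refcount;

static void *get_shared_page(void)
{
	void *page, *expected;
	int old;

retry:
	/* Fast path: take a reference only while the count is non-zero. */
	old = atomic_load(&shared_refcount);
	while (old != 0)
		if (atomic_compare_exchange_weak(&shared_refcount, &old, old + 1))
			return atomic_load(&shared_page);

	page = calloc(1, 4096);		/* stand-in for alloc_pages() */
	if (!page)
		return NULL;

	expected = NULL;
	if (!atomic_compare_exchange_strong(&shared_page, &expected, page)) {
		free(page);		/* lost the race; someone else published */
		goto retry;
	}

	/* One reference for the caller plus one held back for "the shrinker". */
	atomic_store(&shared_refcount, 2);
	return atomic_load(&shared_page);
}

static void put_shared_page(void)
{
	/* Must never reach zero here; only the reclaim path frees the page. */
	atomic_fetch_sub(&shared_refcount, 1);
}
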
     161             : 
     162             : #ifdef CONFIG_SYSFS
     163           0 : static ssize_t enabled_show(struct kobject *kobj,
     164             :                             struct kobj_attribute *attr, char *buf)
     165             : {
     166           0 :         const char *output;
     167             : 
     168           0 :         if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
     169             :                 output = "[always] madvise never";
     170           0 :         else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
     171             :                           &transparent_hugepage_flags))
     172             :                 output = "always [madvise] never";
     173             :         else
     174           0 :                 output = "always madvise [never]";
     175             : 
     176           0 :         return sysfs_emit(buf, "%s\n", output);
     177             : }
     178             : 
     179           0 : static ssize_t enabled_store(struct kobject *kobj,
     180             :                              struct kobj_attribute *attr,
     181             :                              const char *buf, size_t count)
     182             : {
     183           0 :         ssize_t ret = count;
     184             : 
     185           0 :         if (sysfs_streq(buf, "always")) {
     186           0 :                 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
     187           0 :                 set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
     188           0 :         } else if (sysfs_streq(buf, "madvise")) {
     189           0 :                 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
     190           0 :                 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
     191           0 :         } else if (sysfs_streq(buf, "never")) {
     192           0 :                 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
     193           0 :                 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
     194             :         } else
     195             :                 ret = -EINVAL;
     196             : 
     197           0 :         if (ret > 0) {
     198           0 :                 int err = start_stop_khugepaged();
     199           0 :                 if (err)
     200           0 :                         ret = err;
     201             :         }
     202           0 :         return ret;
     203             : }
     204             : static struct kobj_attribute enabled_attr =
     205             :         __ATTR(enabled, 0644, enabled_show, enabled_store);
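
/*
 * With CONFIG_SYSFS the knob above appears as
 * /sys/kernel/mm/transparent_hugepage/enabled and accepts the strings
 * "always", "madvise" and "never" (see enabled_store()).  A minimal
 * userspace sketch of reading and switching the policy; writing needs
 * root, and error handling is deliberately terse.
 */
#include <stdio.h>

#define THP_ENABLED "/sys/kernel/mm/transparent_hugepage/enabled"

int main(void)
{
	char cur[128];
	FILE *f = fopen(THP_ENABLED, "r");

	if (f) {
		if (fgets(cur, sizeof(cur), f))
			fputs(cur, stdout);	/* e.g. "always [madvise] never" */
		fclose(f);
	}

	f = fopen(THP_ENABLED, "w");		/* requires root */
	if (f) {
		fputs("madvise\n", f);		/* parsed by enabled_store() above */
		fclose(f);
	}
	return 0;
}
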
     206             : 
     207           0 : ssize_t single_hugepage_flag_show(struct kobject *kobj,
     208             :                                   struct kobj_attribute *attr, char *buf,
     209             :                                   enum transparent_hugepage_flag flag)
     210             : {
     211           0 :         return sysfs_emit(buf, "%d\n",
     212           0 :                           !!test_bit(flag, &transparent_hugepage_flags));
     213             : }
     214             : 
     215           0 : ssize_t single_hugepage_flag_store(struct kobject *kobj,
     216             :                                  struct kobj_attribute *attr,
     217             :                                  const char *buf, size_t count,
     218             :                                  enum transparent_hugepage_flag flag)
     219             : {
     220           0 :         unsigned long value;
     221           0 :         int ret;
     222             : 
     223           0 :         ret = kstrtoul(buf, 10, &value);
     224           0 :         if (ret < 0)
     225           0 :                 return ret;
     226           0 :         if (value > 1)
     227             :                 return -EINVAL;
     228             : 
     229           0 :         if (value)
     230           0 :                 set_bit(flag, &transparent_hugepage_flags);
     231             :         else
     232           0 :                 clear_bit(flag, &transparent_hugepage_flags);
     233             : 
     234           0 :         return count;
     235             : }
     236             : 
     237           0 : static ssize_t defrag_show(struct kobject *kobj,
     238             :                            struct kobj_attribute *attr, char *buf)
     239             : {
     240           0 :         const char *output;
     241             : 
     242           0 :         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
     243             :                      &transparent_hugepage_flags))
     244             :                 output = "[always] defer defer+madvise madvise never";
     245           0 :         else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
     246             :                           &transparent_hugepage_flags))
     247             :                 output = "always [defer] defer+madvise madvise never";
     248           0 :         else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
     249             :                           &transparent_hugepage_flags))
     250             :                 output = "always defer [defer+madvise] madvise never";
     251           0 :         else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
     252             :                           &transparent_hugepage_flags))
     253             :                 output = "always defer defer+madvise [madvise] never";
     254             :         else
     255           0 :                 output = "always defer defer+madvise madvise [never]";
     256             : 
     257           0 :         return sysfs_emit(buf, "%s\n", output);
     258             : }
     259             : 
     260           0 : static ssize_t defrag_store(struct kobject *kobj,
     261             :                             struct kobj_attribute *attr,
     262             :                             const char *buf, size_t count)
     263             : {
     264           0 :         if (sysfs_streq(buf, "always")) {
     265           0 :                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
     266           0 :                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
     267           0 :                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
     268           0 :                 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
     269           0 :         } else if (sysfs_streq(buf, "defer+madvise")) {
     270           0 :                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
     271           0 :                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
     272           0 :                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
     273           0 :                 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
     274           0 :         } else if (sysfs_streq(buf, "defer")) {
     275           0 :                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
     276           0 :                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
     277           0 :                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
     278           0 :                 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
     279           0 :         } else if (sysfs_streq(buf, "madvise")) {
     280           0 :                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
     281           0 :                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
     282           0 :                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
     283           0 :                 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
     284           0 :         } else if (sysfs_streq(buf, "never")) {
     285           0 :                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
     286           0 :                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
     287           0 :                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
     288           0 :                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
     289             :         } else
     290             :                 return -EINVAL;
     291             : 
     292           0 :         return count;
     293             : }
     294             : static struct kobj_attribute defrag_attr =
     295             :         __ATTR(defrag, 0644, defrag_show, defrag_store);
     296             : 
     297           0 : static ssize_t use_zero_page_show(struct kobject *kobj,
     298             :                                   struct kobj_attribute *attr, char *buf)
     299             : {
     300           0 :         return single_hugepage_flag_show(kobj, attr, buf,
     301             :                                          TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
     302             : }
     303           0 : static ssize_t use_zero_page_store(struct kobject *kobj,
     304             :                 struct kobj_attribute *attr, const char *buf, size_t count)
     305             : {
     306           0 :         return single_hugepage_flag_store(kobj, attr, buf, count,
     307             :                                  TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
     308             : }
     309             : static struct kobj_attribute use_zero_page_attr =
     310             :         __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
     311             : 
     312           0 : static ssize_t hpage_pmd_size_show(struct kobject *kobj,
     313             :                                    struct kobj_attribute *attr, char *buf)
     314             : {
     315           0 :         return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE);
     316             : }
     317             : static struct kobj_attribute hpage_pmd_size_attr =
     318             :         __ATTR_RO(hpage_pmd_size);
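
/*
 * hpage_pmd_size is read-only and reports HPAGE_PMD_SIZE in bytes
 * (2 MiB on x86-64 with 4 KiB base pages).  A small userspace sketch of
 * reading it so a program can size or align its mappings to the PMD
 * boundary.
 */
#include <stdio.h>

int main(void)
{
	unsigned long pmd_size = 0;
	FILE *f = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r");

	if (f) {
		if (fscanf(f, "%lu", &pmd_size) != 1)
			pmd_size = 0;
		fclose(f);
	}
	printf("PMD-sized huge page: %lu bytes\n", pmd_size);
	return 0;
}
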
     319             : 
     320             : static struct attribute *hugepage_attr[] = {
     321             :         &enabled_attr.attr,
     322             :         &defrag_attr.attr,
     323             :         &use_zero_page_attr.attr,
     324             :         &hpage_pmd_size_attr.attr,
     325             : #ifdef CONFIG_SHMEM
     326             :         &shmem_enabled_attr.attr,
     327             : #endif
     328             :         NULL,
     329             : };
     330             : 
     331             : static const struct attribute_group hugepage_attr_group = {
     332             :         .attrs = hugepage_attr,
     333             : };
     334             : 
     335           1 : static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
     336             : {
     337           1 :         int err;
     338             : 
     339           1 :         *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
     340           1 :         if (unlikely(!*hugepage_kobj)) {
     341           0 :                 pr_err("failed to create transparent hugepage kobject\n");
     342           0 :                 return -ENOMEM;
     343             :         }
     344             : 
     345           1 :         err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
     346           1 :         if (err) {
     347           0 :                 pr_err("failed to register transparent hugepage group\n");
     348           0 :                 goto delete_obj;
     349             :         }
     350             : 
     351           1 :         err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
     352           1 :         if (err) {
     353           0 :                 pr_err("failed to register transparent hugepage group\n");
     354           0 :                 goto remove_hp_group;
     355             :         }
     356             : 
     357             :         return 0;
     358             : 
     359           0 : remove_hp_group:
     360           0 :         sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
     361           0 : delete_obj:
     362           0 :         kobject_put(*hugepage_kobj);
     363           0 :         return err;
     364             : }
     365             : 
     366           0 : static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
     367             : {
     368           0 :         sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
     369           0 :         sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
     370           0 :         kobject_put(hugepage_kobj);
     371           0 : }
     372             : #else
     373             : static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
     374             : {
     375             :         return 0;
     376             : }
     377             : 
     378             : static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
     379             : {
     380             : }
     381             : #endif /* CONFIG_SYSFS */
     382             : 
     383           1 : static int __init hugepage_init(void)
     384             : {
     385           1 :         int err;
     386           1 :         struct kobject *hugepage_kobj;
     387             : 
     388           1 :         if (!has_transparent_hugepage()) {
     389             :                 /*
     390             :                  * Hardware doesn't support hugepages, hence disable
     391             :                  * DAX PMD support.
     392             :                  */
     393             :                 transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_NEVER_DAX;
     394             :                 return -EINVAL;
     395             :         }
     396             : 
     397             :         /*
     398             :          * hugepages can't be allocated by the buddy allocator
     399             :          */
     400           1 :         MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
     401             :         /*
     402             :          * we use page->mapping and page->index in second tail page
     403             :          * as list_head: assuming THP order >= 2
     404             :          */
     405           1 :         MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
     406             : 
     407           1 :         err = hugepage_init_sysfs(&hugepage_kobj);
     408           1 :         if (err)
     409           0 :                 goto err_sysfs;
     410             : 
     411           1 :         err = khugepaged_init();
     412           1 :         if (err)
     413           0 :                 goto err_slab;
     414             : 
     415           1 :         err = register_shrinker(&huge_zero_page_shrinker);
     416           1 :         if (err)
     417           0 :                 goto err_hzp_shrinker;
     418           1 :         err = register_shrinker(&deferred_split_shrinker);
     419           1 :         if (err)
     420           0 :                 goto err_split_shrinker;
     421             : 
     422             :         /*
     423             :          * By default disable transparent hugepages on smaller systems,
     424             :          * where the extra memory used could hurt more than TLB overhead
     425             :          * is likely to save.  The admin can still enable it through /sys.
     426             :          */
     427           1 :         if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
     428           0 :                 transparent_hugepage_flags = 0;
     429           0 :                 return 0;
     430             :         }
     431             : 
     432           1 :         err = start_stop_khugepaged();
     433           1 :         if (err)
     434           0 :                 goto err_khugepaged;
     435             : 
     436             :         return 0;
     437           0 : err_khugepaged:
     438           0 :         unregister_shrinker(&deferred_split_shrinker);
     439           0 : err_split_shrinker:
     440           0 :         unregister_shrinker(&huge_zero_page_shrinker);
     441           0 : err_hzp_shrinker:
     442           0 :         khugepaged_destroy();
     443           0 : err_slab:
     444           0 :         hugepage_exit_sysfs(hugepage_kobj);
     445             : err_sysfs:
     446             :         return err;
     447             : }
     448             : subsys_initcall(hugepage_init);
     449             : 
     450           0 : static int __init setup_transparent_hugepage(char *str)
     451             : {
     452           0 :         int ret = 0;
     453           0 :         if (!str)
     454           0 :                 goto out;
     455           0 :         if (!strcmp(str, "always")) {
     456           0 :                 set_bit(TRANSPARENT_HUGEPAGE_FLAG,
     457             :                         &transparent_hugepage_flags);
     458           0 :                 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
     459             :                           &transparent_hugepage_flags);
     460           0 :                 ret = 1;
     461           0 :         } else if (!strcmp(str, "madvise")) {
     462           0 :                 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
     463             :                           &transparent_hugepage_flags);
     464           0 :                 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
     465             :                         &transparent_hugepage_flags);
     466           0 :                 ret = 1;
     467           0 :         } else if (!strcmp(str, "never")) {
     468           0 :                 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
     469             :                           &transparent_hugepage_flags);
     470           0 :                 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
     471             :                           &transparent_hugepage_flags);
     472           0 :                 ret = 1;
     473             :         }
     474           0 : out:
     475           0 :         if (!ret)
     476           0 :                 pr_warn("transparent_hugepage= cannot parse, ignored\n");
     477           0 :         return ret;
     478             : }
     479             : __setup("transparent_hugepage=", setup_transparent_hugepage);
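
/*
 * The same three policies can be selected at boot, for example by adding
 * "transparent_hugepage=madvise" to the kernel command line;
 * setup_transparent_hugepage() above parses the value and flips the same
 * flag bits as the sysfs 'enabled' knob.
 */
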
     480             : 
     481          19 : pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
     482             : {
     483          19 :         if (likely(vma->vm_flags & VM_WRITE))
     484          19 :                 pmd = pmd_mkwrite(pmd);
     485          19 :         return pmd;
     486             : }
     487             : 
     488             : #ifdef CONFIG_MEMCG
     489             : static inline struct deferred_split *get_deferred_split_queue(struct page *page)
     490             : {
     491             :         struct mem_cgroup *memcg = page_memcg(compound_head(page));
     492             :         struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
     493             : 
     494             :         if (memcg)
     495             :                 return &memcg->deferred_split_queue;
     496             :         else
     497             :                 return &pgdat->deferred_split_queue;
     498             : }
     499             : #else
     500          18 : static inline struct deferred_split *get_deferred_split_queue(struct page *page)
     501             : {
     502          18 :         struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
     503             : 
     504          18 :         return &pgdat->deferred_split_queue;
     505             : }
     506             : #endif
     507             : 
     508          19 : void prep_transhuge_page(struct page *page)
     509             : {
     510             :         /*
      511             :          * we use page->mapping and page->index in second tail page
     512             :          * as list_head: assuming THP order >= 2
     513             :          */
     514             : 
     515           2 :         INIT_LIST_HEAD(page_deferred_list(page));
     516          19 :         set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
     517           2 : }
     518             : 
     519           0 : bool is_transparent_hugepage(struct page *page)
     520             : {
     521           0 :         if (!PageCompound(page))
     522             :                 return false;
     523             : 
     524           0 :         page = compound_head(page);
     525           0 :         return is_huge_zero_page(page) ||
     526           0 :                page[1].compound_dtor == TRANSHUGE_PAGE_DTOR;
     527             : }
     528             : EXPORT_SYMBOL_GPL(is_transparent_hugepage);
     529             : 
     530             : static unsigned long __thp_get_unmapped_area(struct file *filp,
     531             :                 unsigned long addr, unsigned long len,
     532             :                 loff_t off, unsigned long flags, unsigned long size)
     533             : {
     534             :         loff_t off_end = off + len;
     535             :         loff_t off_align = round_up(off, size);
     536             :         unsigned long len_pad, ret;
     537             : 
     538             :         if (off_end <= off_align || (off_end - off_align) < size)
     539             :                 return 0;
     540             : 
     541             :         len_pad = len + size;
     542             :         if (len_pad < len || (off + len_pad) < off)
     543             :                 return 0;
     544             : 
     545             :         ret = current->mm->get_unmapped_area(filp, addr, len_pad,
     546             :                                               off >> PAGE_SHIFT, flags);
     547             : 
     548             :         /*
     549             :          * The failure might be due to length padding. The caller will retry
     550             :          * without the padding.
     551             :          */
     552             :         if (IS_ERR_VALUE(ret))
     553             :                 return 0;
     554             : 
     555             :         /*
     556             :          * Do not try to align to THP boundary if allocation at the address
     557             :          * hint succeeds.
     558             :          */
     559             :         if (ret == addr)
     560             :                 return addr;
     561             : 
     562             :         ret += (off - ret) & (size - 1);
     563             :         return ret;
     564             : }
     565             : 
     566       25596 : unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
     567             :                 unsigned long len, unsigned long pgoff, unsigned long flags)
     568             : {
     569       25596 :         unsigned long ret;
     570       25596 :         loff_t off = (loff_t)pgoff << PAGE_SHIFT;
     571             : 
     572       25596 :         if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
     573       25596 :                 goto out;
     574             : 
     575             :         ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE);
     576             :         if (ret)
     577             :                 return ret;
     578       25596 : out:
     579       25596 :         return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
     580             : }
     581             : EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
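
/*
 * A worked userspace example of the alignment arithmetic used by
 * __thp_get_unmapped_area() above: given an area 'ret' that was padded
 * by 'size' bytes, the expression moves the start forward so that the
 * virtual address and the file offset agree in the bits below the PMD
 * boundary, which is what allows PMD-sized page cache mappings.  The
 * numbers below are made up for illustration.
 */
#include <stdio.h>

int main(void)
{
	unsigned long size = 2UL << 20;			/* PMD_SIZE: 2 MiB */
	unsigned long off  = 0x1fa000;			/* file offset being mapped */
	unsigned long ret  = 0x7f1234500000UL;		/* padded area from get_unmapped_area() */

	ret += (off - ret) & (size - 1);		/* same expression as in the kernel */

	printf("aligned addr %#lx: addr %% size = %#lx, off %% size = %#lx\n",
	       ret, ret & (size - 1), off & (size - 1));
	return 0;
}
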
     582             : 
     583          17 : static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
     584             :                         struct page *page, gfp_t gfp)
     585             : {
     586          17 :         struct vm_area_struct *vma = vmf->vma;
     587          17 :         pgtable_t pgtable;
     588          17 :         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
     589          17 :         vm_fault_t ret = 0;
     590             : 
     591          34 :         VM_BUG_ON_PAGE(!PageCompound(page), page);
     592             : 
     593          17 :         if (mem_cgroup_charge(page, vma->vm_mm, gfp)) {
     594             :                 put_page(page);
     595             :                 count_vm_event(THP_FAULT_FALLBACK);
     596             :                 count_vm_event(THP_FAULT_FALLBACK_CHARGE);
     597             :                 return VM_FAULT_FALLBACK;
     598             :         }
     599          17 :         cgroup_throttle_swaprate(page, gfp);
     600             : 
     601          17 :         pgtable = pte_alloc_one(vma->vm_mm);
     602          17 :         if (unlikely(!pgtable)) {
     603           0 :                 ret = VM_FAULT_OOM;
     604           0 :                 goto release;
     605             :         }
     606             : 
     607          17 :         clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
     608             :         /*
     609             :          * The memory barrier inside __SetPageUptodate makes sure that
     610             :          * clear_huge_page writes become visible before the set_pmd_at()
     611             :          * write.
     612             :          */
     613          17 :         __SetPageUptodate(page);
     614             : 
     615          17 :         vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
     616          17 :         if (unlikely(!pmd_none(*vmf->pmd))) {
     617           0 :                 goto unlock_release;
     618             :         } else {
     619          17 :                 pmd_t entry;
     620             : 
     621          17 :                 ret = check_stable_address_space(vma->vm_mm);
     622          17 :                 if (ret)
     623           0 :                         goto unlock_release;
     624             : 
     625             :                 /* Deliver the page fault to userland */
     626          17 :                 if (userfaultfd_missing(vma)) {
     627             :                         vm_fault_t ret2;
     628             : 
     629             :                         spin_unlock(vmf->ptl);
     630             :                         put_page(page);
     631             :                         pte_free(vma->vm_mm, pgtable);
     632             :                         ret2 = handle_userfault(vmf, VM_UFFD_MISSING);
     633             :                         VM_BUG_ON(ret2 & VM_FAULT_FALLBACK);
     634             :                         return ret2;
     635             :                 }
     636             : 
     637          17 :                 entry = mk_huge_pmd(page, vma->vm_page_prot);
     638          17 :                 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
     639          17 :                 page_add_new_anon_rmap(page, vma, haddr, true);
     640          17 :                 lru_cache_add_inactive_or_unevictable(page, vma);
     641          17 :                 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
     642          17 :                 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
     643          17 :                 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
     644          17 :                 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
     645          17 :                 mm_inc_nr_ptes(vma->vm_mm);
     646          17 :                 spin_unlock(vmf->ptl);
     647          17 :                 count_vm_event(THP_FAULT_ALLOC);
     648          17 :                 count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
     649             :         }
     650             : 
     651          17 :         return 0;
     652           0 : unlock_release:
     653           0 :         spin_unlock(vmf->ptl);
     654           0 : release:
     655           0 :         if (pgtable)
     656           0 :                 pte_free(vma->vm_mm, pgtable);
     657           0 :         put_page(page);
     658           0 :         return ret;
     659             : 
     660             : }
     661             : 
     662             : /*
     663             :  * always: directly stall for all thp allocations
     664             :  * defer: wake kswapd and fail if not immediately available
     665             :  * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
     666             :  *                fail if not immediately available
     667             :  * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
     668             :  *          available
     669             :  * never: never stall for any thp allocation
     670             :  */
     671          17 : gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
     672             : {
     673          17 :         const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);
     674             : 
     675             :         /* Always do synchronous compaction */
     676          17 :         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
     677           0 :                 return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
     678             : 
     679             :         /* Kick kcompactd and fail quickly */
     680          17 :         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
     681             :                 return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
     682             : 
     683             :         /* Synchronous compaction if madvised, otherwise kick kcompactd */
     684          17 :         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
     685           0 :                 return GFP_TRANSHUGE_LIGHT |
     686             :                         (vma_madvised ? __GFP_DIRECT_RECLAIM :
     687             :                                         __GFP_KSWAPD_RECLAIM);
     688             : 
     689             :         /* Only do synchronous compaction if madvised */
     690          17 :         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
     691          17 :                 return GFP_TRANSHUGE_LIGHT |
     692             :                        (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
     693             : 
     694             :         return GFP_TRANSHUGE_LIGHT;
     695             : }
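
/*
 * The 'madvise' and 'defer+madvise' policies above only stall (or only
 * kick kcompactd) for VMAs whose owner opted in with MADV_HUGEPAGE;
 * that is what the vma_madvised test checks via VM_HUGEPAGE.  A minimal
 * userspace sketch of opting an anonymous mapping in.
 */
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4UL << 20;		/* room for at least one aligned PMD range */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	if (madvise(p, len, MADV_HUGEPAGE))	/* sets VM_HUGEPAGE on the VMA */
		perror("madvise");
	return 0;
}
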
     696             : 
     697             : /* Caller must hold page table lock. */
     698           0 : static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
     699             :                 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
     700             :                 struct page *zero_page)
     701             : {
     702           0 :         pmd_t entry;
     703           0 :         if (!pmd_none(*pmd))
     704           0 :                 return;
     705           0 :         entry = mk_pmd(zero_page, vma->vm_page_prot);
     706           0 :         entry = pmd_mkhuge(entry);
     707           0 :         if (pgtable)
     708           0 :                 pgtable_trans_huge_deposit(mm, pmd, pgtable);
     709           0 :         set_pmd_at(mm, haddr, pmd, entry);
     710           0 :         mm_inc_nr_ptes(mm);
     711             : }
     712             : 
     713        1114 : vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
     714             : {
     715        1114 :         struct vm_area_struct *vma = vmf->vma;
     716        1114 :         gfp_t gfp;
     717        1114 :         struct page *page;
     718        1114 :         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
     719             : 
     720        1114 :         if (!transhuge_vma_suitable(vma, haddr))
     721             :                 return VM_FAULT_FALLBACK;
     722          17 :         if (unlikely(anon_vma_prepare(vma)))
     723             :                 return VM_FAULT_OOM;
     724          17 :         if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
     725             :                 return VM_FAULT_OOM;
     726          17 :         if (!(vmf->flags & FAULT_FLAG_WRITE) &&
     727           0 :                         !mm_forbids_zeropage(vma->vm_mm) &&
     728           0 :                         transparent_hugepage_use_zero_page()) {
     729           0 :                 pgtable_t pgtable;
     730           0 :                 struct page *zero_page;
     731           0 :                 vm_fault_t ret;
     732           0 :                 pgtable = pte_alloc_one(vma->vm_mm);
     733           0 :                 if (unlikely(!pgtable))
     734             :                         return VM_FAULT_OOM;
     735           0 :                 zero_page = mm_get_huge_zero_page(vma->vm_mm);
     736           0 :                 if (unlikely(!zero_page)) {
     737           0 :                         pte_free(vma->vm_mm, pgtable);
     738           0 :                         count_vm_event(THP_FAULT_FALLBACK);
     739           0 :                         return VM_FAULT_FALLBACK;
     740             :                 }
     741           0 :                 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
     742           0 :                 ret = 0;
     743           0 :                 if (pmd_none(*vmf->pmd)) {
     744           0 :                         ret = check_stable_address_space(vma->vm_mm);
     745           0 :                         if (ret) {
     746           0 :                                 spin_unlock(vmf->ptl);
     747           0 :                                 pte_free(vma->vm_mm, pgtable);
     748           0 :                         } else if (userfaultfd_missing(vma)) {
     749             :                                 spin_unlock(vmf->ptl);
     750             :                                 pte_free(vma->vm_mm, pgtable);
     751             :                                 ret = handle_userfault(vmf, VM_UFFD_MISSING);
     752             :                                 VM_BUG_ON(ret & VM_FAULT_FALLBACK);
     753             :                         } else {
     754           0 :                                 set_huge_zero_page(pgtable, vma->vm_mm, vma,
     755             :                                                    haddr, vmf->pmd, zero_page);
     756           0 :                                 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
     757           0 :                                 spin_unlock(vmf->ptl);
     758             :                         }
     759             :                 } else {
     760           0 :                         spin_unlock(vmf->ptl);
     761           0 :                         pte_free(vma->vm_mm, pgtable);
     762             :                 }
     763           0 :                 return ret;
     764             :         }
     765          17 :         gfp = vma_thp_gfp_mask(vma);
     766          17 :         page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
     767          17 :         if (unlikely(!page)) {
     768           0 :                 count_vm_event(THP_FAULT_FALLBACK);
     769           0 :                 return VM_FAULT_FALLBACK;
     770             :         }
     771          17 :         prep_transhuge_page(page);
     772          17 :         return __do_huge_pmd_anonymous_page(vmf, page, gfp);
     773             : }
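
/*
 * A userspace sketch that exercises the anonymous fault path above: map
 * a region large enough to contain an aligned PMD range, opt in with
 * MADV_HUGEPAGE, fault it with writes, then look for a non-zero
 * AnonHugePages counter in /proc/self/smaps.  Whether a huge page is
 * really used depends on the 'enabled'/'defrag' policies and on memory
 * fragmentation, so treat the check as best-effort.
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4UL << 20;			/* assumes a 2 MiB PMD size */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char line[256];
	FILE *f;

	if (p == MAP_FAILED)
		return 1;
	madvise(p, len, MADV_HUGEPAGE);		/* help the 'madvise' policy */
	memset(p, 0x5a, len);			/* write faults -> do_huge_pmd_anonymous_page() */

	f = fopen("/proc/self/smaps", "r");
	if (f) {
		while (fgets(line, sizeof(line), f)) {
			unsigned long kb;

			if (sscanf(line, "AnonHugePages: %lu kB", &kb) == 1 && kb)
				printf("THP-backed VMA: %lu kB\n", kb);
		}
		fclose(f);
	}
	return 0;
}
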
     774             : 
     775           0 : static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
     776             :                 pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
     777             :                 pgtable_t pgtable)
     778             : {
     779           0 :         struct mm_struct *mm = vma->vm_mm;
     780           0 :         pmd_t entry;
     781           0 :         spinlock_t *ptl;
     782             : 
     783           0 :         ptl = pmd_lock(mm, pmd);
     784           0 :         if (!pmd_none(*pmd)) {
     785           0 :                 if (write) {
     786           0 :                         if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
     787           0 :                                 WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
     788           0 :                                 goto out_unlock;
     789             :                         }
     790           0 :                         entry = pmd_mkyoung(*pmd);
     791           0 :                         entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
     792           0 :                         if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
     793           0 :                                 update_mmu_cache_pmd(vma, addr, pmd);
     794             :                 }
     795             : 
     796           0 :                 goto out_unlock;
     797             :         }
     798             : 
     799           0 :         entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
     800           0 :         if (pfn_t_devmap(pfn))
     801           0 :                 entry = pmd_mkdevmap(entry);
     802           0 :         if (write) {
     803           0 :                 entry = pmd_mkyoung(pmd_mkdirty(entry));
     804           0 :                 entry = maybe_pmd_mkwrite(entry, vma);
     805             :         }
     806             : 
     807           0 :         if (pgtable) {
     808           0 :                 pgtable_trans_huge_deposit(mm, pmd, pgtable);
     809           0 :                 mm_inc_nr_ptes(mm);
     810           0 :                 pgtable = NULL;
     811             :         }
     812             : 
     813           0 :         set_pmd_at(mm, addr, pmd, entry);
     814           0 :         update_mmu_cache_pmd(vma, addr, pmd);
     815             : 
     816           0 : out_unlock:
     817           0 :         spin_unlock(ptl);
     818           0 :         if (pgtable)
     819           0 :                 pte_free(mm, pgtable);
     820           0 : }
     821             : 
     822             : /**
     823             :  * vmf_insert_pfn_pmd_prot - insert a pmd size pfn
     824             :  * @vmf: Structure describing the fault
     825             :  * @pfn: pfn to insert
     826             :  * @pgprot: page protection to use
     827             :  * @write: whether it's a write fault
     828             :  *
     829             :  * Insert a pmd size pfn. See vmf_insert_pfn() for additional info and
     830             :  * also consult the vmf_insert_mixed_prot() documentation when
     831             :  * @pgprot != @vmf->vma->vm_page_prot.
     832             :  *
     833             :  * Return: vm_fault_t value.
     834             :  */
     835           0 : vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn,
     836             :                                    pgprot_t pgprot, bool write)
     837             : {
     838           0 :         unsigned long addr = vmf->address & PMD_MASK;
     839           0 :         struct vm_area_struct *vma = vmf->vma;
     840           0 :         pgtable_t pgtable = NULL;
     841             : 
     842             :         /*
     843             :          * If we had pmd_special, we could avoid all these restrictions,
     844             :          * but we need to be consistent with PTEs and architectures that
     845             :          * can't support a 'special' bit.
     846             :          */
     847           0 :         BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
     848             :                         !pfn_t_devmap(pfn));
     849           0 :         BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
     850             :                                                 (VM_PFNMAP|VM_MIXEDMAP));
     851           0 :         BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
     852             : 
     853           0 :         if (addr < vma->vm_start || addr >= vma->vm_end)
     854             :                 return VM_FAULT_SIGBUS;
     855             : 
     856           0 :         if (arch_needs_pgtable_deposit()) {
     857             :                 pgtable = pte_alloc_one(vma->vm_mm);
     858             :                 if (!pgtable)
     859             :                         return VM_FAULT_OOM;
     860             :         }
     861             : 
     862           0 :         track_pfn_insert(vma, &pgprot, pfn);
     863             : 
     864           0 :         insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
     865           0 :         return VM_FAULT_NOPAGE;
     866             : }
     867             : EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd_prot);
     868             : 
     869             : #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
     870           0 : static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
     871             : {
     872           0 :         if (likely(vma->vm_flags & VM_WRITE))
     873           0 :                 pud = pud_mkwrite(pud);
     874           0 :         return pud;
     875             : }
     876             : 
     877           0 : static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
     878             :                 pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
     879             : {
     880           0 :         struct mm_struct *mm = vma->vm_mm;
     881           0 :         pud_t entry;
     882           0 :         spinlock_t *ptl;
     883             : 
     884           0 :         ptl = pud_lock(mm, pud);
     885           0 :         if (!pud_none(*pud)) {
     886           0 :                 if (write) {
     887           0 :                         if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
     888           0 :                                 WARN_ON_ONCE(!is_huge_zero_pud(*pud));
     889           0 :                                 goto out_unlock;
     890             :                         }
     891           0 :                         entry = pud_mkyoung(*pud);
     892           0 :                         entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
     893           0 :                         if (pudp_set_access_flags(vma, addr, pud, entry, 1))
     894           0 :                                 update_mmu_cache_pud(vma, addr, pud);
     895             :                 }
     896           0 :                 goto out_unlock;
     897             :         }
     898             : 
     899           0 :         entry = pud_mkhuge(pfn_t_pud(pfn, prot));
     900           0 :         if (pfn_t_devmap(pfn))
     901           0 :                 entry = pud_mkdevmap(entry);
     902           0 :         if (write) {
     903           0 :                 entry = pud_mkyoung(pud_mkdirty(entry));
     904           0 :                 entry = maybe_pud_mkwrite(entry, vma);
     905             :         }
     906           0 :         set_pud_at(mm, addr, pud, entry);
     907           0 :         update_mmu_cache_pud(vma, addr, pud);
     908             : 
     909           0 : out_unlock:
     910           0 :         spin_unlock(ptl);
     911           0 : }
     912             : 
     913             : /**
     914             :  * vmf_insert_pfn_pud_prot - insert a pud size pfn
     915             :  * @vmf: Structure describing the fault
     916             :  * @pfn: pfn to insert
     917             :  * @pgprot: page protection to use
     918             :  * @write: whether it's a write fault
     919             :  *
     920             :  * Insert a pud size pfn. See vmf_insert_pfn() for additional info and
     921             :  * also consult the vmf_insert_mixed_prot() documentation when
     922             :  * @pgprot != @vmf->vma->vm_page_prot.
     923             :  *
     924             :  * Return: vm_fault_t value.
     925             :  */
     926           0 : vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn,
     927             :                                    pgprot_t pgprot, bool write)
     928             : {
     929           0 :         unsigned long addr = vmf->address & PUD_MASK;
     930           0 :         struct vm_area_struct *vma = vmf->vma;
     931             : 
     932             :         /*
     933             :          * If we had pud_special, we could avoid all these restrictions,
     934             :          * but we need to be consistent with PTEs and architectures that
     935             :          * can't support a 'special' bit.
     936             :          */
     937           0 :         BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
     938             :                         !pfn_t_devmap(pfn));
     939           0 :         BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
     940             :                                                 (VM_PFNMAP|VM_MIXEDMAP));
     941           0 :         BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
     942             : 
     943           0 :         if (addr < vma->vm_start || addr >= vma->vm_end)
     944             :                 return VM_FAULT_SIGBUS;
     945             : 
     946           0 :         track_pfn_insert(vma, &pgprot, pfn);
     947             : 
     948           0 :         insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write);
     949           0 :         return VM_FAULT_NOPAGE;
     950             : }
     951             : EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud_prot);
     952             : #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
     953             : 
     954           0 : static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
     955             :                 pmd_t *pmd, int flags)
     956             : {
     957           0 :         pmd_t _pmd;
     958             : 
     959           0 :         _pmd = pmd_mkyoung(*pmd);
     960           0 :         if (flags & FOLL_WRITE)
     961           0 :                 _pmd = pmd_mkdirty(_pmd);
     962           0 :         if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
     963             :                                 pmd, _pmd, flags & FOLL_WRITE))
     964           0 :                 update_mmu_cache_pmd(vma, addr, pmd);
     965           0 : }
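                     : 
                     : /*
                     :  * touch_pmd() above mirrors what a hardware access would do for a GUP caller
                     :  * that passed FOLL_TOUCH: mark the pmd young (and dirty for FOLL_WRITE), and
                     :  * flush the MMU cache for the pmd only if the access flags actually changed.
                     :  */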
     966             : 
     967           0 : struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
     968             :                 pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
     969             : {
     970           0 :         unsigned long pfn = pmd_pfn(*pmd);
     971           0 :         struct mm_struct *mm = vma->vm_mm;
     972           0 :         struct page *page;
     973             : 
     974           0 :         assert_spin_locked(pmd_lockptr(mm, pmd));
     975             : 
     976             :         /*
     977             :          * When we COW a devmap PMD entry, we split it into PTEs, so we should
     978             :          * not be in this function with `flags & FOLL_COW` set.
     979             :          */
     980           0 :         WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
     981             : 
     982             :         /* FOLL_GET and FOLL_PIN are mutually exclusive. */
     983           0 :         if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
     984             :                          (FOLL_PIN | FOLL_GET)))
     985             :                 return NULL;
     986             : 
     987           0 :         if (flags & FOLL_WRITE && !pmd_write(*pmd))
     988             :                 return NULL;
     989             : 
     990           0 :         if (pmd_present(*pmd) && pmd_devmap(*pmd))
     991             :                 /* pass */;
     992             :         else
     993             :                 return NULL;
     994             : 
     995           0 :         if (flags & FOLL_TOUCH)
     996           0 :                 touch_pmd(vma, addr, pmd, flags);
     997             : 
     998             :         /*
     999             :          * device mapped pages can only be returned if the
    1000             :          * caller will manage the page reference count.
    1001             :          */
    1002           0 :         if (!(flags & (FOLL_GET | FOLL_PIN)))
    1003           0 :                 return ERR_PTR(-EEXIST);
    1004             : 
    1005           0 :         pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
    1006           0 :         *pgmap = get_dev_pagemap(pfn, *pgmap);
    1007           0 :         if (!*pgmap)
    1008           0 :                 return ERR_PTR(-EFAULT);
    1009             :         page = pfn_to_page(pfn);
    1010             :         if (!try_grab_page(page, flags))
    1011             :                 page = ERR_PTR(-ENOMEM);
    1012             : 
    1013             :         return page;
    1014             : }
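                     : 
                     : /*
                     :  * Callers of follow_devmap_pmd() must hold the pmd lock (enforced by the
                     :  * assert_spin_locked() above) and must pass FOLL_GET or FOLL_PIN, since the
                     :  * reference taken via try_grab_page() is handed over for the caller to drop.
                     :  */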
    1015             : 
    1016           1 : int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
    1017             :                   pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
    1018             :                   struct vm_area_struct *vma)
    1019             : {
    1020           1 :         spinlock_t *dst_ptl, *src_ptl;
    1021           1 :         struct page *src_page;
    1022           1 :         pmd_t pmd;
    1023           1 :         pgtable_t pgtable = NULL;
    1024           1 :         int ret = -ENOMEM;
    1025             : 
    1026             :         /* Skip if can be re-fill on fault */
    1027             :         /* Skip if it can be refilled on fault */
    1028             :                 return 0;
    1029             : 
    1030           1 :         pgtable = pte_alloc_one(dst_mm);
    1031           1 :         if (unlikely(!pgtable))
    1032           0 :                 goto out;
    1033             : 
    1034           1 :         dst_ptl = pmd_lock(dst_mm, dst_pmd);
    1035           1 :         src_ptl = pmd_lockptr(src_mm, src_pmd);
    1036           1 :         spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
    1037             : 
    1038           1 :         ret = -EAGAIN;
    1039           1 :         pmd = *src_pmd;
    1040             : 
    1041             :         /*
    1042             :          * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA
    1043             :          * does not have VM_UFFD_WP set, which means that the uffd
    1044             :          * fork event is not enabled.
    1045             :          */
    1046           1 :         if (!(vma->vm_flags & VM_UFFD_WP))
    1047           1 :                 pmd = pmd_clear_uffd_wp(pmd);
    1048             : 
    1049             : #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
    1050           1 :         if (unlikely(is_swap_pmd(pmd))) {
    1051           0 :                 swp_entry_t entry = pmd_to_swp_entry(pmd);
    1052             : 
    1053           0 :                 VM_BUG_ON(!is_pmd_migration_entry(pmd));
    1054           0 :                 if (is_write_migration_entry(entry)) {
    1055           0 :                         make_migration_entry_read(&entry);
    1056           0 :                         pmd = swp_entry_to_pmd(entry);
    1057           0 :                         if (pmd_swp_soft_dirty(*src_pmd))
    1058             :                                 pmd = pmd_swp_mksoft_dirty(pmd);
    1059           0 :                         set_pmd_at(src_mm, addr, src_pmd, pmd);
    1060             :                 }
    1061           0 :                 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
    1062           0 :                 mm_inc_nr_ptes(dst_mm);
    1063           0 :                 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
    1064           0 :                 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
    1065           0 :                 ret = 0;
    1066           0 :                 goto out_unlock;
    1067             :         }
    1068             : #endif
    1069             : 
    1070           1 :         if (unlikely(!pmd_trans_huge(pmd))) {
    1071           0 :                 pte_free(dst_mm, pgtable);
    1072           0 :                 goto out_unlock;
    1073             :         }
    1074             :         /*
    1075             :          * When page table lock is held, the huge zero pmd should not be
    1076             :          * under splitting since we don't split the page itself, only pmd to
    1077             :          * a page table.
    1078             :          */
    1079           1 :         if (is_huge_zero_pmd(pmd)) {
    1080           0 :                 struct page *zero_page;
    1081             :                 /*
    1082             :                  * get_huge_zero_page() will never allocate a new page here,
    1083             :                  * since we already have a zero page to copy. It just takes a
    1084             :                  * reference.
    1085             :                  */
    1086           0 :                 zero_page = mm_get_huge_zero_page(dst_mm);
    1087           0 :                 set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
    1088             :                                 zero_page);
    1089           0 :                 ret = 0;
    1090           0 :                 goto out_unlock;
    1091             :         }
    1092             : 
    1093           1 :         src_page = pmd_page(pmd);
    1094           1 :         VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
    1095             : 
    1096             :         /*
    1097             :          * If this page is a potentially pinned page, split and retry the fault
    1098             :          * with a smaller page size.  Normally this should not happen because
    1099             :          * userspace should use MADV_DONTFORK on pinned regions.  This is a
    1100             :          * best-effort check so that the pinned pages won't be replaced by
    1101             :          * another random page during the coming copy-on-write.
    1102             :          */
    1103           1 :         if (unlikely(page_needs_cow_for_dma(vma, src_page))) {
    1104           0 :                 pte_free(dst_mm, pgtable);
    1105           0 :                 spin_unlock(src_ptl);
    1106           0 :                 spin_unlock(dst_ptl);
    1107           0 :                 __split_huge_pmd(vma, src_pmd, addr, false, NULL);
    1108           0 :                 return -EAGAIN;
    1109             :         }
    1110             : 
    1111           1 :         get_page(src_page);
    1112           1 :         page_dup_rmap(src_page, true);
    1113           1 :         add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
    1114           1 :         mm_inc_nr_ptes(dst_mm);
    1115           1 :         pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
    1116             : 
    1117           1 :         pmdp_set_wrprotect(src_mm, addr, src_pmd);
    1118           1 :         pmd = pmd_mkold(pmd_wrprotect(pmd));
    1119           1 :         set_pmd_at(dst_mm, addr, dst_pmd, pmd);
    1120             : 
    1121           1 :         ret = 0;
    1122           1 : out_unlock:
    1123           1 :         spin_unlock(src_ptl);
    1124           1 :         spin_unlock(dst_ptl);
    1125             : out:
    1126             :         return ret;
    1127             : }
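                     : 
                     : /*
                     :  * Userspace sketch of the MADV_DONTFORK advice referenced in the pinned-page
                     :  * comment in copy_huge_pmd() above: marking a pinned buffer MADV_DONTFORK
                     :  * before fork() keeps the child from mapping it at all, so no COW split of
                     :  * the huge pmd is needed. The helper name is hypothetical.
                     :  *
                     :  * #include <sys/mman.h>
                     :  * #include <unistd.h>
                     :  *
                     :  * static int fork_with_pinned_buffer(void *buf, size_t len)
                     :  * {
                     :  *         if (madvise(buf, len, MADV_DONTFORK))
                     :  *                 return -1;
                     :  *         return fork();
                     :  * }
                     :  */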
    1128             : 
    1129             : #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
    1130           0 : static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
    1131             :                 pud_t *pud, int flags)
    1132             : {
    1133           0 :         pud_t _pud;
    1134             : 
    1135           0 :         _pud = pud_mkyoung(*pud);
    1136           0 :         if (flags & FOLL_WRITE)
    1137           0 :                 _pud = pud_mkdirty(_pud);
    1138           0 :         if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
    1139             :                                 pud, _pud, flags & FOLL_WRITE))
    1140           0 :                 update_mmu_cache_pud(vma, addr, pud);
    1141           0 : }
    1142             : 
    1143           0 : struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
    1144             :                 pud_t *pud, int flags, struct dev_pagemap **pgmap)
    1145             : {
    1146           0 :         unsigned long pfn = pud_pfn(*pud);
    1147           0 :         struct mm_struct *mm = vma->vm_mm;
    1148           0 :         struct page *page;
    1149             : 
    1150           0 :         assert_spin_locked(pud_lockptr(mm, pud));
    1151             : 
    1152           0 :         if (flags & FOLL_WRITE && !pud_write(*pud))
    1153             :                 return NULL;
    1154             : 
    1155             :         /* FOLL_GET and FOLL_PIN are mutually exclusive. */
    1156           0 :         if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
    1157             :                          (FOLL_PIN | FOLL_GET)))
    1158             :                 return NULL;
    1159             : 
    1160           0 :         if (pud_present(*pud) && pud_devmap(*pud))
    1161             :                 /* pass */;
    1162             :         else
    1163             :                 return NULL;
    1164             : 
    1165           0 :         if (flags & FOLL_TOUCH)
    1166           0 :                 touch_pud(vma, addr, pud, flags);
    1167             : 
    1168             :         /*
    1169             :          * device mapped pages can only be returned if the
    1170             :          * caller will manage the page reference count.
    1171             :          *
    1172             :          * At least one of FOLL_GET | FOLL_PIN must be set, so assert that here:
    1173             :          */
    1174           0 :         if (!(flags & (FOLL_GET | FOLL_PIN)))
    1175           0 :                 return ERR_PTR(-EEXIST);
    1176             : 
    1177           0 :         pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
    1178           0 :         *pgmap = get_dev_pagemap(pfn, *pgmap);
    1179           0 :         if (!*pgmap)
    1180           0 :                 return ERR_PTR(-EFAULT);
    1181             :         page = pfn_to_page(pfn);
    1182             :         if (!try_grab_page(page, flags))
    1183             :                 page = ERR_PTR(-ENOMEM);
    1184             : 
    1185             :         return page;
    1186             : }
    1187             : 
    1188           0 : int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
    1189             :                   pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
    1190             :                   struct vm_area_struct *vma)
    1191             : {
    1192           0 :         spinlock_t *dst_ptl, *src_ptl;
    1193           0 :         pud_t pud;
    1194           0 :         int ret;
    1195             : 
    1196           0 :         dst_ptl = pud_lock(dst_mm, dst_pud);
    1197           0 :         src_ptl = pud_lockptr(src_mm, src_pud);
    1198           0 :         spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
    1199             : 
    1200           0 :         ret = -EAGAIN;
    1201           0 :         pud = *src_pud;
    1202           0 :         if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
    1203           0 :                 goto out_unlock;
    1204             : 
    1205             :         /*
    1206             :          * When page table lock is held, the huge zero pud should not be
    1207             :          * under splitting since we don't split the page itself, only the pud
    1208             :          * into a page table.
    1209             :          */
    1210           0 :         if (is_huge_zero_pud(pud)) {
    1211             :                 /* No huge zero pud yet */
    1212           0 :         }
    1213             : 
    1214             :         /* Please refer to comments in copy_huge_pmd() */
    1215           0 :         if (unlikely(page_needs_cow_for_dma(vma, pud_page(pud)))) {
    1216           0 :                 spin_unlock(src_ptl);
    1217           0 :                 spin_unlock(dst_ptl);
    1218           0 :                 __split_huge_pud(vma, src_pud, addr);
    1219           0 :                 return -EAGAIN;
    1220             :         }
    1221             : 
    1222           0 :         pudp_set_wrprotect(src_mm, addr, src_pud);
    1223           0 :         pud = pud_mkold(pud_wrprotect(pud));
    1224           0 :         set_pud_at(dst_mm, addr, dst_pud, pud);
    1225             : 
    1226           0 :         ret = 0;
    1227           0 : out_unlock:
    1228           0 :         spin_unlock(src_ptl);
    1229           0 :         spin_unlock(dst_ptl);
    1230           0 :         return ret;
    1231             : }
    1232             : 
    1233           0 : void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
    1234             : {
    1235           0 :         pud_t entry;
    1236           0 :         unsigned long haddr;
    1237           0 :         bool write = vmf->flags & FAULT_FLAG_WRITE;
    1238             : 
    1239           0 :         vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
    1240           0 :         if (unlikely(!pud_same(*vmf->pud, orig_pud)))
    1241           0 :                 goto unlock;
    1242             : 
    1243           0 :         entry = pud_mkyoung(orig_pud);
    1244           0 :         if (write)
    1245           0 :                 entry = pud_mkdirty(entry);
    1246           0 :         haddr = vmf->address & HPAGE_PUD_MASK;
    1247           0 :         if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write))
    1248           0 :                 update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud);
    1249             : 
    1250           0 : unlock:
    1251           0 :         spin_unlock(vmf->ptl);
    1252           0 : }
    1253             : #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
    1254             : 
    1255           0 : void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
    1256             : {
    1257           0 :         pmd_t entry;
    1258           0 :         unsigned long haddr;
    1259           0 :         bool write = vmf->flags & FAULT_FLAG_WRITE;
    1260             : 
    1261           0 :         vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
    1262           0 :         if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
    1263           0 :                 goto unlock;
    1264             : 
    1265           0 :         entry = pmd_mkyoung(orig_pmd);
    1266           0 :         if (write)
    1267           0 :                 entry = pmd_mkdirty(entry);
    1268           0 :         haddr = vmf->address & HPAGE_PMD_MASK;
    1269           0 :         if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write))
    1270           0 :                 update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);
    1271             : 
    1272           0 : unlock:
    1273           0 :         spin_unlock(vmf->ptl);
    1274           0 : }
    1275             : 
    1276           2 : vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
    1277             : {
    1278           2 :         struct vm_area_struct *vma = vmf->vma;
    1279           2 :         struct page *page;
    1280           2 :         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
    1281             : 
    1282           2 :         vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
    1283           2 :         VM_BUG_ON_VMA(!vma->anon_vma, vma);
    1284             : 
    1285           2 :         if (is_huge_zero_pmd(orig_pmd))
    1286           0 :                 goto fallback;
    1287             : 
    1288           2 :         spin_lock(vmf->ptl);
    1289             : 
    1290           2 :         if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
    1291           0 :                 spin_unlock(vmf->ptl);
    1292           0 :                 return 0;
    1293             :         }
    1294             : 
    1295           2 :         page = pmd_page(orig_pmd);
    1296           6 :         VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
    1297             : 
    1298             :         /* Lock page for reuse_swap_page() */
    1299           2 :         if (!trylock_page(page)) {
    1300           1 :                 get_page(page);
    1301           1 :                 spin_unlock(vmf->ptl);
    1302           1 :                 lock_page(page);
    1303           1 :                 spin_lock(vmf->ptl);
    1304           1 :                 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
    1305           0 :                         spin_unlock(vmf->ptl);
    1306           0 :                         unlock_page(page);
    1307           0 :                         put_page(page);
    1308           0 :                         return 0;
    1309             :                 }
    1310           1 :                 put_page(page);
    1311             :         }
    1312             : 
    1313             :         /*
    1314             :          * We can only reuse the page if nobody else maps the huge page or its
    1315             :          * part.
    1316             :          */
    1317           2 :         if (reuse_swap_page(page, NULL)) {
    1318           0 :                 pmd_t entry;
    1319           0 :                 entry = pmd_mkyoung(orig_pmd);
    1320           0 :                 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
    1321           0 :                 if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
    1322           0 :                         update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
    1323           0 :                 unlock_page(page);
    1324           0 :                 spin_unlock(vmf->ptl);
    1325           0 :                 return VM_FAULT_WRITE;
    1326             :         }
    1327             : 
    1328           2 :         unlock_page(page);
    1329           2 :         spin_unlock(vmf->ptl);
    1330           2 : fallback:
    1331           2 :         __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
    1332           2 :         return VM_FAULT_FALLBACK;
    1333             : }
    1334             : 
    1335             : /*
    1336             :  * FOLL_FORCE can write to even unwritable pmds, but only
    1337             :  * after we've gone through a COW cycle and they are dirty.
    1338             :  */
    1339           0 : static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
    1340             : {
    1341           0 :         return pmd_write(pmd) ||
    1342           0 :                ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
    1343             : }
    1344             : 
    1345           0 : struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
    1346             :                                    unsigned long addr,
    1347             :                                    pmd_t *pmd,
    1348             :                                    unsigned int flags)
    1349             : {
    1350           0 :         struct mm_struct *mm = vma->vm_mm;
    1351           0 :         struct page *page = NULL;
    1352             : 
    1353           0 :         assert_spin_locked(pmd_lockptr(mm, pmd));
    1354             : 
    1355           0 :         if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
    1356           0 :                 goto out;
    1357             : 
    1358             :         /* Avoid dumping huge zero page */
    1359           0 :         if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
    1360           0 :                 return ERR_PTR(-EFAULT);
    1361             : 
    1362             :         /* Full NUMA hinting faults to serialise migration in fault paths */
    1363           0 :         if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
    1364             :                 goto out;
    1365             : 
    1366           0 :         page = pmd_page(*pmd);
    1367           0 :         VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
    1368             : 
    1369           0 :         if (!try_grab_page(page, flags))
    1370           0 :                 return ERR_PTR(-ENOMEM);
    1371             : 
    1372           0 :         if (flags & FOLL_TOUCH)
    1373           0 :                 touch_pmd(vma, addr, pmd, flags);
    1374             : 
    1375           0 :         if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
    1376             :                 /*
    1377             :                  * We don't mlock() pte-mapped THPs. This way we can avoid
    1378             :                  * leaking mlocked pages into non-VM_LOCKED VMAs.
    1379             :                  *
    1380             :                  * For anon THP:
    1381             :                  *
    1382             :                  * In most cases the pmd is the only mapping of the page as we
    1383             :                  * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
    1384             :                  * writable private mappings in populate_vma_page_range().
    1385             :                  *
    1386             :                  * The only scenario when we have the page shared here is if we
    1387             :                  * The only scenario where the page is shared here is when we
    1388             :                  * are mlocking a read-only mapping shared over fork(). We skip
    1389             :                  *
    1390             :                  * For file THP:
    1391             :                  *
    1392             :                  * We can expect PageDoubleMap() to be stable under page lock:
    1393             :                  * for file pages we set it in page_add_file_rmap(), which
    1394             :                  * requires page to be locked.
    1395             :                  */
    1396             : 
    1397           0 :                 if (PageAnon(page) && compound_mapcount(page) != 1)
    1398           0 :                         goto skip_mlock;
    1399           0 :                 if (PageDoubleMap(page) || !page->mapping)
    1400           0 :                         goto skip_mlock;
    1401           0 :                 if (!trylock_page(page))
    1402           0 :                         goto skip_mlock;
    1403           0 :                 if (page->mapping && !PageDoubleMap(page))
    1404           0 :                         mlock_vma_page(page);
    1405           0 :                 unlock_page(page);
    1406             :         }
    1407           0 : skip_mlock:
    1408           0 :         page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
    1409           0 :         VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
    1410             : 
    1411           0 : out:
    1412             :         return page;
    1413             : }
    1414             : 
    1415             : /* NUMA hinting page fault entry point for trans huge pmds */
    1416           0 : vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
    1417             : {
    1418           0 :         struct vm_area_struct *vma = vmf->vma;
    1419           0 :         struct anon_vma *anon_vma = NULL;
    1420           0 :         struct page *page;
    1421           0 :         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
    1422           0 :         int page_nid = NUMA_NO_NODE, this_nid = numa_node_id();
    1423           0 :         int target_nid, last_cpupid = -1;
    1424           0 :         bool page_locked;
    1425           0 :         bool migrated = false;
    1426           0 :         bool was_writable;
    1427           0 :         int flags = 0;
    1428             : 
    1429           0 :         vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
    1430           0 :         if (unlikely(!pmd_same(pmd, *vmf->pmd)))
    1431           0 :                 goto out_unlock;
    1432             : 
    1433             :         /*
    1434             :          * If there are potential migrations, wait for completion and retry
    1435             :          * without disrupting NUMA hinting information. Do not relock and
    1436             :          * check_same as the page may no longer be mapped.
    1437             :          */
    1438           0 :         if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
    1439             :                 page = pmd_page(*vmf->pmd);
    1440             :                 if (!get_page_unless_zero(page))
    1441             :                         goto out_unlock;
    1442             :                 spin_unlock(vmf->ptl);
    1443             :                 put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
    1444             :                 goto out;
    1445             :         }
    1446             : 
    1447           0 :         page = pmd_page(pmd);
    1448           0 :         BUG_ON(is_huge_zero_page(page));
    1449           0 :         page_nid = page_to_nid(page);
    1450           0 :         last_cpupid = page_cpupid_last(page);
    1451           0 :         count_vm_numa_event(NUMA_HINT_FAULTS);
    1452           0 :         if (page_nid == this_nid) {
    1453             :                 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
    1454             :                 flags |= TNF_FAULT_LOCAL;
    1455             :         }
    1456             : 
    1457             :         /* See similar comment in do_numa_page for explanation */
    1458           0 :         if (!pmd_savedwrite(pmd))
    1459             :                 flags |= TNF_NO_GROUP;
    1460             : 
    1461             :         /*
    1462             :          * Acquire the page lock to serialise THP migrations but avoid dropping
    1463             :          * page_table_lock if at all possible
    1464             :          */
    1465           0 :         page_locked = trylock_page(page);
    1466           0 :         target_nid = mpol_misplaced(page, vma, haddr);
    1467           0 :         if (target_nid == NUMA_NO_NODE) {
    1468             :                 /* If the page was locked, there are no parallel migrations */
    1469           0 :                 if (page_locked)
    1470           0 :                         goto clear_pmdnuma;
    1471             :         }
    1472             : 
    1473             :         /* Migration could have started since the pmd_trans_migrating check */
    1474           0 :         if (!page_locked) {
    1475           0 :                 page_nid = NUMA_NO_NODE;
    1476           0 :                 if (!get_page_unless_zero(page))
    1477           0 :                         goto out_unlock;
    1478           0 :                 spin_unlock(vmf->ptl);
    1479           0 :                 put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
    1480           0 :                 goto out;
    1481             :         }
    1482             : 
    1483             :         /*
    1484             :          * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
    1485             :          * to serialise splits
    1486             :          */
    1487           0 :         get_page(page);
    1488           0 :         spin_unlock(vmf->ptl);
    1489           0 :         anon_vma = page_lock_anon_vma_read(page);
    1490             : 
    1491             :         /* Confirm the PMD did not change while page_table_lock was released */
    1492           0 :         spin_lock(vmf->ptl);
    1493           0 :         if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
    1494           0 :                 unlock_page(page);
    1495           0 :                 put_page(page);
    1496           0 :                 page_nid = NUMA_NO_NODE;
    1497           0 :                 goto out_unlock;
    1498             :         }
    1499             : 
    1500             :         /* Bail if we fail to protect against THP splits for any reason */
    1501           0 :         if (unlikely(!anon_vma)) {
    1502           0 :                 put_page(page);
    1503           0 :                 page_nid = NUMA_NO_NODE;
    1504           0 :                 goto clear_pmdnuma;
    1505             :         }
    1506             : 
    1507             :         /*
    1508             :          * Since we took the NUMA fault, we must have observed the !accessible
    1509             :          * bit. Make sure all other CPUs agree with that, to avoid them
    1510             :          * modifying the page we're about to migrate.
    1511             :          *
    1512             :          * Must be done under PTL such that we'll observe the relevant
    1513             :          * inc_tlb_flush_pending().
    1514             :          *
    1515             :          * We are not sure whether a pending tlb flush here is for a huge
    1516             :          * page mapping or not. Hence, use the tlb range variant.
    1517             :          */
    1518           0 :         if (mm_tlb_flush_pending(vma->vm_mm)) {
    1519           0 :                 flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
    1520             :                 /*
    1521             :                  * change_huge_pmd() released the pmd lock before
    1522             :                  * invalidating the secondary MMUs sharing the primary
    1523             :                  * MMU pagetables (with ->invalidate_range()). The
    1524             :                  * mmu_notifier_invalidate_range_end() (which
    1525             :                  * internally calls ->invalidate_range()) in
    1526             :                  * change_pmd_range() will run after us, so we can't
    1527             :                  * rely on it here and we need an explicit invalidate.
    1528             :                  */
    1529           0 :                 mmu_notifier_invalidate_range(vma->vm_mm, haddr,
    1530             :                                               haddr + HPAGE_PMD_SIZE);
    1531             :         }
    1532             : 
    1533             :         /*
    1534             :          * Migrate the THP to the requested node; this returns with the page
    1535             :          * unlocked and access rights restored.
    1536             :          */
    1537           0 :         spin_unlock(vmf->ptl);
    1538             : 
    1539           0 :         migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
    1540             :                                 vmf->pmd, pmd, vmf->address, page, target_nid);
    1541           0 :         if (migrated) {
    1542           0 :                 flags |= TNF_MIGRATED;
    1543           0 :                 page_nid = target_nid;
    1544             :         } else
    1545             :                 flags |= TNF_MIGRATE_FAIL;
    1546             : 
    1547           0 :         goto out;
    1548           0 : clear_pmdnuma:
    1549           0 :         BUG_ON(!PageLocked(page));
    1550           0 :         was_writable = pmd_savedwrite(pmd);
    1551           0 :         pmd = pmd_modify(pmd, vma->vm_page_prot);
    1552           0 :         pmd = pmd_mkyoung(pmd);
    1553           0 :         if (was_writable)
    1554           0 :                 pmd = pmd_mkwrite(pmd);
    1555           0 :         set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
    1556           0 :         update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
    1557           0 :         unlock_page(page);
    1558           0 : out_unlock:
    1559           0 :         spin_unlock(vmf->ptl);
    1560             : 
    1561           0 : out:
    1562           0 :         if (anon_vma)
    1563           0 :                 page_unlock_anon_vma_read(anon_vma);
    1564             : 
    1565           0 :         if (page_nid != NUMA_NO_NODE)
    1566           0 :                 task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
    1567             :                                 flags);
    1568             : 
    1569           0 :         return 0;
    1570             : }
    1571             : 
    1572             : /*
    1573             :  * Return true if we do MADV_FREE successfully on the entire pmd page.
    1574             :  * Otherwise, return false.
    1575             :  */
    1576           0 : bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
    1577             :                 pmd_t *pmd, unsigned long addr, unsigned long next)
    1578             : {
    1579           0 :         spinlock_t *ptl;
    1580           0 :         pmd_t orig_pmd;
    1581           0 :         struct page *page;
    1582           0 :         struct mm_struct *mm = tlb->mm;
    1583           0 :         bool ret = false;
    1584             : 
    1585           0 :         tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
    1586             : 
    1587           0 :         ptl = pmd_trans_huge_lock(pmd, vma);
    1588           0 :         if (!ptl)
    1589           0 :                 goto out_unlocked;
    1590             : 
    1591           0 :         orig_pmd = *pmd;
    1592           0 :         if (is_huge_zero_pmd(orig_pmd))
    1593           0 :                 goto out;
    1594             : 
    1595           0 :         if (unlikely(!pmd_present(orig_pmd))) {
    1596           0 :                 VM_BUG_ON(thp_migration_supported() &&
    1597             :                                   !is_pmd_migration_entry(orig_pmd));
    1598           0 :                 goto out;
    1599             :         }
    1600             : 
    1601           0 :         page = pmd_page(orig_pmd);
    1602             :         /*
    1603             :          * If other processes are mapping this page, we cannot discard
    1604             :          * the page unless they all do MADV_FREE, so let's skip the page.
    1605             :          */
    1606           0 :         if (page_mapcount(page) != 1)
    1607           0 :                 goto out;
    1608             : 
    1609           0 :         if (!trylock_page(page))
    1610           0 :                 goto out;
    1611             : 
    1612             :         /*
    1613             :          * If the user wants to discard only part of the THP's pages, split it
    1614             :          * so MADV_FREE will deactivate only those pages.
    1615             :          */
    1616           0 :         if (next - addr != HPAGE_PMD_SIZE) {
    1617           0 :                 get_page(page);
    1618           0 :                 spin_unlock(ptl);
    1619           0 :                 split_huge_page(page);
    1620           0 :                 unlock_page(page);
    1621           0 :                 put_page(page);
    1622           0 :                 goto out_unlocked;
    1623             :         }
    1624             : 
    1625           0 :         if (PageDirty(page))
    1626           0 :                 ClearPageDirty(page);
    1627           0 :         unlock_page(page);
    1628             : 
    1629           0 :         if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
    1630           0 :                 pmdp_invalidate(vma, addr, pmd);
    1631           0 :                 orig_pmd = pmd_mkold(orig_pmd);
    1632           0 :                 orig_pmd = pmd_mkclean(orig_pmd);
    1633             : 
    1634           0 :                 set_pmd_at(mm, addr, pmd, orig_pmd);
    1635           0 :                 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
    1636             :         }
    1637             : 
    1638           0 :         mark_page_lazyfree(page);
    1639           0 :         ret = true;
    1640           0 : out:
    1641           0 :         spin_unlock(ptl);
    1642           0 : out_unlocked:
    1643           0 :         return ret;
    1644             : }
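                     : 
                     : /*
                     :  * Userspace sketch: MADV_FREE over a full, PMD-aligned range keeps the THP
                     :  * intact (see the next - addr != HPAGE_PMD_SIZE check above), while a
                     :  * partial range forces a split_huge_page() first. The 2MB size assumes the
                     :  * common x86-64 HPAGE_PMD_SIZE and a THP-backed, aligned buffer.
                     :  *
                     :  * #include <sys/mman.h>
                     :  *
                     :  * static int lazy_free_thp(void *thp_aligned_buf)
                     :  * {
                     :  *         return madvise(thp_aligned_buf, 2UL << 20, MADV_FREE);
                     :  * }
                     :  */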
    1645             : 
    1646          17 : static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
    1647             : {
    1648          17 :         pgtable_t pgtable;
    1649             : 
    1650          17 :         pgtable = pgtable_trans_huge_withdraw(mm, pmd);
    1651          17 :         pte_free(mm, pgtable);
    1652          17 :         mm_dec_nr_ptes(mm);
    1653          17 : }
    1654             : 
    1655          17 : int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
    1656             :                  pmd_t *pmd, unsigned long addr)
    1657             : {
    1658          17 :         pmd_t orig_pmd;
    1659          17 :         spinlock_t *ptl;
    1660             : 
    1661          17 :         tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
    1662             : 
    1663          17 :         ptl = __pmd_trans_huge_lock(pmd, vma);
    1664          17 :         if (!ptl)
    1665             :                 return 0;
    1666             :         /*
    1667             :          * For architectures like ppc64 we look at deposited pgtable
    1668             :          * when calling pmdp_huge_get_and_clear. So do the
    1669             :          * pgtable_trans_huge_withdraw after finishing pmdp related
    1670             :          * operations.
    1671             :          */
    1672          34 :         orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
    1673          17 :                                                 tlb->fullmm);
    1674          17 :         tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
    1675          34 :         if (vma_is_special_huge(vma)) {
    1676           0 :                 if (arch_needs_pgtable_deposit())
    1677             :                         zap_deposited_table(tlb->mm, pmd);
    1678           0 :                 spin_unlock(ptl);
    1679           0 :                 if (is_huge_zero_pmd(orig_pmd))
    1680           0 :                         tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
    1681          17 :         } else if (is_huge_zero_pmd(orig_pmd)) {
    1682           0 :                 zap_deposited_table(tlb->mm, pmd);
    1683           0 :                 spin_unlock(ptl);
    1684           0 :                 tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
    1685             :         } else {
    1686          17 :                 struct page *page = NULL;
    1687          17 :                 int flush_needed = 1;
    1688             : 
    1689          17 :                 if (pmd_present(orig_pmd)) {
    1690          17 :                         page = pmd_page(orig_pmd);
    1691          17 :                         page_remove_rmap(page, true);
    1692          17 :                         VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
    1693          17 :                         VM_BUG_ON_PAGE(!PageHead(page), page);
    1694           0 :                 } else if (thp_migration_supported()) {
    1695           0 :                         swp_entry_t entry;
    1696             : 
    1697           0 :                         VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
    1698           0 :                         entry = pmd_to_swp_entry(orig_pmd);
    1699           0 :                         page = pfn_to_page(swp_offset(entry));
    1700           0 :                         flush_needed = 0;
    1701             :                 } else
    1702             :                         WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
    1703             : 
    1704          17 :                 if (PageAnon(page)) {
    1705          17 :                         zap_deposited_table(tlb->mm, pmd);
    1706          17 :                         add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
    1707             :                 } else {
    1708           0 :                         if (arch_needs_pgtable_deposit())
    1709             :                                 zap_deposited_table(tlb->mm, pmd);
    1710           0 :                         add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR);
    1711             :                 }
    1712             : 
    1713          17 :                 spin_unlock(ptl);
    1714          17 :                 if (flush_needed)
    1715          17 :                         tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
    1716             :         }
    1717             :         return 1;
    1718             : }
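                     : 
                     : /*
                     :  * Userspace sketch: MADV_DONTNEED on an anonymous THP range is one path into
                     :  * zap_huge_pmd() (madvise_dontneed() -> zap_pmd_range(), as the race comment
                     :  * in change_huge_pmd() below notes); the huge pmd is cleared and the page
                     :  * mapping torn down in one step. Helper name and alignment are illustrative.
                     :  *
                     :  * #include <sys/mman.h>
                     :  *
                     :  * static int drop_thp_range(void *thp_aligned_buf, size_t len)
                     :  * {
                     :  *         return madvise(thp_aligned_buf, len, MADV_DONTNEED);
                     :  * }
                     :  */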
    1719             : 
    1720             : #ifndef pmd_move_must_withdraw
    1721           0 : static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
    1722             :                                          spinlock_t *old_pmd_ptl,
    1723             :                                          struct vm_area_struct *vma)
    1724             : {
    1725             :         /*
    1726             :          * With split pmd lock we also need to move the preallocated
    1727             :          * PTE page table if new_pmd is on a different PMD page table.
    1728             :          *
    1729             :          * We also don't deposit and withdraw tables for file pages.
    1730             :          */
    1731           0 :         return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
    1732             : }
    1733             : #endif
    1734             : 
    1735           0 : static pmd_t move_soft_dirty_pmd(pmd_t pmd)
    1736             : {
    1737             : #ifdef CONFIG_MEM_SOFT_DIRTY
    1738             :         if (unlikely(is_pmd_migration_entry(pmd)))
    1739             :                 pmd = pmd_swp_mksoft_dirty(pmd);
    1740             :         else if (pmd_present(pmd))
    1741             :                 pmd = pmd_mksoft_dirty(pmd);
    1742             : #endif
    1743           0 :         return pmd;
    1744             : }
    1745             : 
    1746           0 : bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
    1747             :                   unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
    1748             : {
    1749           0 :         spinlock_t *old_ptl, *new_ptl;
    1750           0 :         pmd_t pmd;
    1751           0 :         struct mm_struct *mm = vma->vm_mm;
    1752           0 :         bool force_flush = false;
    1753             : 
    1754             :         /*
    1755             :          * The destination pmd shouldn't be established; free_pgtables()
    1756             :          * should have released it.
    1757             :          */
    1758           0 :         if (WARN_ON(!pmd_none(*new_pmd))) {
    1759           0 :                 VM_BUG_ON(pmd_trans_huge(*new_pmd));
    1760             :                 return false;
    1761             :         }
    1762             : 
    1763             :         /*
    1764             :          * We don't have to worry about the ordering of src and dst
    1765             :          * ptlocks because exclusive mmap_lock prevents deadlock.
    1766             :          */
    1767           0 :         old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
    1768           0 :         if (old_ptl) {
    1769           0 :                 new_ptl = pmd_lockptr(mm, new_pmd);
    1770           0 :                 if (new_ptl != old_ptl)
    1771           0 :                         spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
    1772           0 :                 pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
    1773           0 :                 if (pmd_present(pmd))
    1774           0 :                         force_flush = true;
    1775           0 :                 VM_BUG_ON(!pmd_none(*new_pmd));
    1776             : 
    1777           0 :                 if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
    1778           0 :                         pgtable_t pgtable;
    1779           0 :                         pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
    1780           0 :                         pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
    1781             :                 }
    1782           0 :                 pmd = move_soft_dirty_pmd(pmd);
    1783           0 :                 set_pmd_at(mm, new_addr, new_pmd, pmd);
    1784           0 :                 if (force_flush)
    1785           0 :                         flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
    1786           0 :                 if (new_ptl != old_ptl)
    1787           0 :                         spin_unlock(new_ptl);
    1788           0 :                 spin_unlock(old_ptl);
    1789           0 :                 return true;
    1790             :         }
    1791             :         return false;
    1792             : }
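                     : 
                     : /*
                     :  * Userspace sketch: an mremap() that relocates a PMD-aligned anonymous
                     :  * region is what exercises move_huge_pmd() on the mremap path, moving the
                     :  * huge pmd entry itself rather than copying the underlying 2MB of data.
                     :  * The helper name is hypothetical.
                     :  *
                     :  * #define _GNU_SOURCE
                     :  * #include <sys/mman.h>
                     :  *
                     :  * static void *relocate_mapping(void *old, size_t old_len, size_t new_len)
                     :  * {
                     :  *         return mremap(old, old_len, new_len, MREMAP_MAYMOVE);
                     :  * }
                     :  */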
    1793             : 
    1794             : /*
    1795             :  * Returns
    1796             :  *  - 0 if PMD could not be locked
    1797             :  *  - 1 if PMD was locked but protections are unchanged and no TLB flush is needed
    1798             :  *  - HPAGE_PMD_NR if protections changed and a TLB flush is necessary
    1799             :  */
    1800           0 : int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
    1801             :                 unsigned long addr, pgprot_t newprot, unsigned long cp_flags)
    1802             : {
    1803           0 :         struct mm_struct *mm = vma->vm_mm;
    1804           0 :         spinlock_t *ptl;
    1805           0 :         pmd_t entry;
    1806           0 :         bool preserve_write;
    1807           0 :         int ret;
    1808           0 :         bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
    1809           0 :         bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
    1810           0 :         bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
    1811             : 
    1812           0 :         ptl = __pmd_trans_huge_lock(pmd, vma);
    1813           0 :         if (!ptl)
    1814             :                 return 0;
    1815             : 
    1816           0 :         preserve_write = prot_numa && pmd_write(*pmd);
    1817           0 :         ret = 1;
    1818             : 
    1819             : #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
    1820           0 :         if (is_swap_pmd(*pmd)) {
    1821           0 :                 swp_entry_t entry = pmd_to_swp_entry(*pmd);
    1822             : 
    1823           0 :                 VM_BUG_ON(!is_pmd_migration_entry(*pmd));
    1824           0 :                 if (is_write_migration_entry(entry)) {
    1825           0 :                         pmd_t newpmd;
    1826             :                         /*
    1827             :                          * A protection check is difficult, so
    1828             :                          * just be safe and disable write access
    1829             :                          */
    1830           0 :                         make_migration_entry_read(&entry);
    1831           0 :                         newpmd = swp_entry_to_pmd(entry);
    1832           0 :                         if (pmd_swp_soft_dirty(*pmd))
    1833             :                                 newpmd = pmd_swp_mksoft_dirty(newpmd);
    1834           0 :                         set_pmd_at(mm, addr, pmd, newpmd);
    1835             :                 }
    1836           0 :                 goto unlock;
    1837             :         }
    1838             : #endif
    1839             : 
    1840             :         /*
    1841             :          * Avoid trapping faults against the zero page. The read-only
    1842             :          * data is likely to be read-cached on the local CPU and
    1843             :          * local/remote hits to the zero page are not interesting.
    1844             :          */
    1845           0 :         if (prot_numa && is_huge_zero_pmd(*pmd))
    1846           0 :                 goto unlock;
    1847             : 
    1848           0 :         if (prot_numa && pmd_protnone(*pmd))
    1849             :                 goto unlock;
    1850             : 
    1851             :         /*
    1852             :          * In the prot_numa case, we are under mmap_read_lock(mm). It's critical
    1853             :          * to not clear pmd intermittently to avoid race with MADV_DONTNEED
    1854             :          * which is also under mmap_read_lock(mm):
    1855             :          *
    1856             :          *      CPU0:                           CPU1:
    1857             :          *                              change_huge_pmd(prot_numa=1)
    1858             :          *                               pmdp_huge_get_and_clear_notify()
    1859             :          * madvise_dontneed()
    1860             :          *  zap_pmd_range()
    1861             :          *   pmd_trans_huge(*pmd) == 0 (without ptl)
    1862             :          *   // skip the pmd
    1863             :          *                               set_pmd_at();
    1864             :          *                               // pmd is re-established
    1865             :          *
    1866             :          * The race makes MADV_DONTNEED miss the huge pmd and not clear it,
    1867             :          * which may break userspace.
    1868             :          *
    1869             :          * pmdp_invalidate() is required to make sure we don't miss
    1870             :          * dirty/young flags set by hardware.
    1871             :          */
    1872           0 :         entry = pmdp_invalidate(vma, addr, pmd);
    1873             : 
    1874           0 :         entry = pmd_modify(entry, newprot);
    1875           0 :         if (preserve_write)
    1876           0 :                 entry = pmd_mk_savedwrite(entry);
    1877           0 :         if (uffd_wp) {
    1878           0 :                 entry = pmd_wrprotect(entry);
    1879           0 :                 entry = pmd_mkuffd_wp(entry);
    1880             :         } else if (uffd_wp_resolve) {
    1881             :                 /*
    1882             :                  * Leave the write bit to be handled by the page fault
    1883             :                  * handler, so that things like COW can be handled
    1884             :                  * properly.
    1885             :                  */
    1886           0 :                 entry = pmd_clear_uffd_wp(entry);
    1887             :         }
    1888           0 :         ret = HPAGE_PMD_NR;
    1889           0 :         set_pmd_at(mm, addr, pmd, entry);
    1890           0 :         BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
    1891           0 : unlock:
    1892           0 :         spin_unlock(ptl);
    1893           0 :         return ret;
    1894             : }
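                     : 
                     : /*
                     :  * Caller-side sketch for change_huge_pmd(): a hypothetical helper showing
                     :  * how the return values documented above can be consumed: 0 means the pmd
                     :  * was not a locked huge pmd, 1 means nothing changed, and HPAGE_PMD_NR
                     :  * means the whole huge page changed and the caller must flush the TLB.
                     :  * Real callers such as the mprotect pmd walk also batch those flushes.
                     :  *
                     :  * static unsigned long try_change_huge_pmd(struct vm_area_struct *vma,
                     :  *                 pmd_t *pmd, unsigned long addr, pgprot_t newprot,
                     :  *                 unsigned long cp_flags)
                     :  * {
                     :  *         int ret = change_huge_pmd(vma, pmd, addr, newprot, cp_flags);
                     :  *
                     :  *         return ret == HPAGE_PMD_NR ? HPAGE_PMD_NR : 0;
                     :  * }
                     :  */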
    1895             : 
    1896             : /*
    1897             :  * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
    1898             :  *
    1899             :  * Note that if it returns a page table lock pointer, this routine returns
    1900             :  * without unlocking the page table lock, so callers must unlock it.
    1901             :  */
    1902          17 : spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
    1903             : {
    1904          17 :         spinlock_t *ptl;
    1905          17 :         ptl = pmd_lock(vma->vm_mm, pmd);
    1906          17 :         if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
    1907             :                         pmd_devmap(*pmd)))
    1908             :                 return ptl;
    1909           0 :         spin_unlock(ptl);
    1910           0 :         return NULL;
    1911             : }
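                     : 
                     : /*
                     :  * Usage sketch for the locking contract documented above: a hypothetical
                     :  * helper that takes the lock through __pmd_trans_huge_lock() and drops it
                     :  * itself whenever a non-NULL pointer comes back.
                     :  *
                     :  * static bool huge_pmd_is_dirty(pmd_t *pmd, struct vm_area_struct *vma)
                     :  * {
                     :  *         spinlock_t *ptl = __pmd_trans_huge_lock(pmd, vma);
                     :  *         bool dirty = false;
                     :  *
                     :  *         if (ptl) {
                     :  *                 dirty = pmd_present(*pmd) && pmd_dirty(*pmd);
                     :  *                 spin_unlock(ptl);
                     :  *         }
                     :  *         return dirty;
                     :  * }
                     :  */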
    1912             : 
    1913             : /*
    1914             :  * Returns page table lock pointer if a given pud maps a thp, NULL otherwise.
    1915             :  *
    1916             :  * Note that if it returns a page table lock pointer, this routine returns
    1917             :  * without unlocking the page table lock, so callers must unlock it.
    1918             :  */
    1919           0 : spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
    1920             : {
    1921           0 :         spinlock_t *ptl;
    1922             : 
    1923           0 :         ptl = pud_lock(vma->vm_mm, pud);
    1924           0 :         if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
    1925             :                 return ptl;
    1926           0 :         spin_unlock(ptl);
    1927           0 :         return NULL;
    1928             : }
    1929             : 
    1930             : #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
    1931           0 : int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
    1932             :                  pud_t *pud, unsigned long addr)
    1933             : {
    1934           0 :         spinlock_t *ptl;
    1935             : 
    1936           0 :         ptl = __pud_trans_huge_lock(pud, vma);
    1937           0 :         if (!ptl)
    1938             :                 return 0;
    1939             :         /*
    1940             :          * For architectures like ppc64 we look at deposited pgtable
    1941             :          * when calling pudp_huge_get_and_clear. So do the
    1942             :          * pgtable_trans_huge_withdraw after finishing pudp related
    1943             :          * operations.
    1944             :          */
    1945           0 :         pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm);
    1946           0 :         tlb_remove_pud_tlb_entry(tlb, pud, addr);
    1947           0 :         if (vma_is_special_huge(vma)) {
    1948           0 :                 spin_unlock(ptl);
    1949             :                 /* No zero page support yet */
    1950             :         } else {
    1951             :                 /* No support for anonymous PUD pages yet */
    1952           0 :                 BUG();
    1953             :         }
    1954           0 :         return 1;
    1955             : }
    1956             : 
    1957           0 : static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
    1958             :                 unsigned long haddr)
    1959             : {
    1960           0 :         VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
    1961           0 :         VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
    1962           0 :         VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
    1963           0 :         VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
    1964             : 
    1965           0 :         count_vm_event(THP_SPLIT_PUD);
    1966             : 
    1967           0 :         pudp_huge_clear_flush_notify(vma, haddr, pud);
    1968           0 : }
    1969             : 
    1970        2004 : void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
    1971             :                 unsigned long address)
    1972             : {
    1973        2004 :         spinlock_t *ptl;
    1974        2004 :         struct mmu_notifier_range range;
    1975             : 
    1976        2004 :         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
    1977             :                                 address & HPAGE_PUD_MASK,
    1978             :                                 (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
    1979        2004 :         mmu_notifier_invalidate_range_start(&range);
    1980        2004 :         ptl = pud_lock(vma->vm_mm, pud);
    1981        2004 :         if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
    1982        2004 :                 goto out;
    1983           0 :         __split_huge_pud_locked(vma, pud, range.start);
    1984             : 
    1985        2004 : out:
    1986        2004 :         spin_unlock(ptl);
    1987             :         /*
    1988             :          * No need to double call mmu_notifier->invalidate_range() callback as
    1989             :          * the above pudp_huge_clear_flush_notify() did already call it.
    1990             :          */
    1991        2004 :         mmu_notifier_invalidate_range_only_end(&range);
    1992        2004 : }
    1993             : #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
    1994             : 
    1995           0 : static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
    1996             :                 unsigned long haddr, pmd_t *pmd)
    1997             : {
    1998           0 :         struct mm_struct *mm = vma->vm_mm;
    1999           0 :         pgtable_t pgtable;
    2000           0 :         pmd_t _pmd;
    2001           0 :         int i;
    2002             : 
    2003             :         /*
    2004             :          * Leave the pmd empty until the ptes are filled; note that it is fine
    2005             :          * to delay notification until mmu_notifier_invalidate_range_end() as
    2006             :          * we are replacing a zero pmd write protected page with a zero pte
    2007             :          * write protected page.
    2008             :          *
    2009             :          * See Documentation/vm/mmu_notifier.rst
    2010             :          */
    2011           0 :         pmdp_huge_clear_flush(vma, haddr, pmd);
    2012             : 
    2013           0 :         pgtable = pgtable_trans_huge_withdraw(mm, pmd);
    2014           0 :         pmd_populate(mm, &_pmd, pgtable);
    2015             : 
    2016           0 :         for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
    2017           0 :                 pte_t *pte, entry;
    2018           0 :                 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
    2019           0 :                 entry = pte_mkspecial(entry);
    2020           0 :                 pte = pte_offset_map(&_pmd, haddr);
    2021           0 :                 VM_BUG_ON(!pte_none(*pte));
    2022           0 :                 set_pte_at(mm, haddr, pte, entry);
    2023           0 :                 pte_unmap(pte);
    2024             :         }
    2025           0 :         smp_wmb(); /* make pte visible before pmd */
    2026           0 :         pmd_populate(mm, pmd, pgtable);
    2027           0 : }
    2028             : 
    2029           2 : static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
    2030             :                 unsigned long haddr, bool freeze)
    2031             : {
    2032           2 :         struct mm_struct *mm = vma->vm_mm;
    2033           2 :         struct page *page;
    2034           2 :         pgtable_t pgtable;
    2035           2 :         pmd_t old_pmd, _pmd;
    2036           2 :         bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
    2037           2 :         unsigned long addr;
    2038           2 :         int i;
    2039             : 
    2040           2 :         VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
    2041           2 :         VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
    2042           2 :         VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
    2043           2 :         VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
    2044             :                                 && !pmd_devmap(*pmd));
    2045             : 
    2046           2 :         count_vm_event(THP_SPLIT_PMD);
    2047             : 
    2048           2 :         if (!vma_is_anonymous(vma)) {
    2049           0 :                 _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
    2050             :                 /*
    2051             :                  * We are going to unmap this huge page. So
    2052             :                  * just go ahead and zap it
    2053             :                  */
    2054           0 :                 if (arch_needs_pgtable_deposit())
    2055             :                         zap_deposited_table(mm, pmd);
    2056           0 :                 if (vma_is_special_huge(vma))
    2057           0 :                         return;
    2058           0 :                 page = pmd_page(_pmd);
    2059           0 :                 if (!PageDirty(page) && pmd_dirty(_pmd))
    2060           0 :                         set_page_dirty(page);
    2061           0 :                 if (!PageReferenced(page) && pmd_young(_pmd))
    2062           0 :                         SetPageReferenced(page);
    2063           0 :                 page_remove_rmap(page, true);
    2064           0 :                 put_page(page);
    2065           0 :                 add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
    2066           0 :                 return;
    2067           2 :         } else if (pmd_trans_huge(*pmd) && is_huge_zero_pmd(*pmd)) {
    2068             :                 /*
    2069             :                  * FIXME: Do we want to invalidate secondary mmu by calling
    2070             :                  * mmu_notifier_invalidate_range() see comments below inside
    2071             :                  * __split_huge_pmd() ?
    2072             :                  *
    2073             :                  * We are going from a write protected zero huge page to write
    2074             :                  * protected zero small pages, so it does not seem useful to
    2075             :                  * invalidate the secondary mmu at this time.
    2076             :                  */
    2077           0 :                 return __split_huge_zero_page_pmd(vma, haddr, pmd);
    2078             :         }
    2079             : 
    2080             :         /*
    2081             :          * Up to this point the pmd is present and huge and userland has full
    2082             :          * access to the hugepage during the split (which happens in place).
    2083             :          * If we overwrite the pmd with the not-huge version pointing to the
    2084             :          * pte here (which of course we could if all CPUs were bug free),
    2085             :          * userland could trigger a small page size TLB miss on the small
    2086             :          * sized TLB while the hugepage TLB entry is still established in the
    2087             :          * huge TLB. Some CPUs don't like that.
    2088             :          * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
    2089             :          * 383 on page 105. Intel should be safe, but it also warns that it's
    2090             :          * only safe if the permission and cache attributes of the two entries
    2091             :          * loaded in the two TLBs are identical (which should be the case here).
    2092             :          * But it is generally safer to never allow small and huge TLB entries
    2093             :          * for the same virtual address to be loaded simultaneously. So instead
    2094             :          * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
    2095             :          * current pmd not present (atomically because here the pmd_trans_huge
    2096             :          * must remain set at all times on the pmd until the split is complete
    2097             :          * for this pmd), then we flush the SMP TLB and finally we write the
    2098             :          * non-huge version of the pmd entry with pmd_populate.
    2099             :          */
    2100           2 :         old_pmd = pmdp_invalidate(vma, haddr, pmd);
    2101             : 
    2102           2 :         pmd_migration = is_pmd_migration_entry(old_pmd);
    2103           2 :         if (unlikely(pmd_migration)) {
    2104           0 :                 swp_entry_t entry;
    2105             : 
    2106           0 :                 entry = pmd_to_swp_entry(old_pmd);
    2107           0 :                 page = pfn_to_page(swp_offset(entry));
    2108           0 :                 write = is_write_migration_entry(entry);
    2109           0 :                 young = false;
    2110           0 :                 soft_dirty = pmd_swp_soft_dirty(old_pmd);
    2111           0 :                 uffd_wp = pmd_swp_uffd_wp(old_pmd);
    2112             :         } else {
    2113           2 :                 page = pmd_page(old_pmd);
    2114           2 :                 if (pmd_dirty(old_pmd))
    2115           2 :                         SetPageDirty(page);
    2116           2 :                 write = pmd_write(old_pmd);
    2117           2 :                 young = pmd_young(old_pmd);
    2118           2 :                 soft_dirty = pmd_soft_dirty(old_pmd);
    2119           2 :                 uffd_wp = pmd_uffd_wp(old_pmd);
    2120             :         }
    2121           2 :         VM_BUG_ON_PAGE(!page_count(page), page);
    2122           2 :         page_ref_add(page, HPAGE_PMD_NR - 1);
    2123             : 
    2124             :         /*
    2125             :          * Withdraw the table only after we mark the pmd entry invalid.
    2126             :          * This is critical for some architectures (Power).
    2127             :          */
    2128           2 :         pgtable = pgtable_trans_huge_withdraw(mm, pmd);
    2129           2 :         pmd_populate(mm, &_pmd, pgtable);
    2130             : 
    2131        1028 :         for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
    2132        1024 :                 pte_t entry, *pte;
    2133             :                 /*
    2134             :                  * Note that NUMA hinting access restrictions are not
    2135             :                  * transferred to avoid any possibility of altering
    2136             :                  * permissions across VMAs.
    2137             :                  */
    2138        1024 :                 if (freeze || pmd_migration) {
    2139           0 :                         swp_entry_t swp_entry;
    2140           0 :                         swp_entry = make_migration_entry(page + i, write);
    2141           0 :                         entry = swp_entry_to_pte(swp_entry);
    2142           0 :                         if (soft_dirty)
    2143             :                                 entry = pte_swp_mksoft_dirty(entry);
    2144           0 :                         if (uffd_wp)
    2145             :                                 entry = pte_swp_mkuffd_wp(entry);
    2146             :                 } else {
    2147        1024 :                         entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
    2148        1024 :                         entry = maybe_mkwrite(entry, vma);
    2149        1024 :                         if (!write)
    2150        1024 :                                 entry = pte_wrprotect(entry);
    2151        1024 :                         if (!young)
    2152         512 :                                 entry = pte_mkold(entry);
    2153             :                         if (soft_dirty)
    2154             :                                 entry = pte_mksoft_dirty(entry);
    2155             :                         if (uffd_wp)
    2156             :                                 entry = pte_mkuffd_wp(entry);
    2157             :                 }
    2158        1024 :                 pte = pte_offset_map(&_pmd, addr);
    2159        1024 :                 BUG_ON(!pte_none(*pte));
    2160        1024 :                 set_pte_at(mm, addr, pte, entry);
    2161        1024 :                 if (!pmd_migration)
    2162        1024 :                         atomic_inc(&page[i]._mapcount);
    2163        1024 :                 pte_unmap(pte);
    2164             :         }
    2165             : 
    2166           2 :         if (!pmd_migration) {
    2167             :                 /*
    2168             :                  * Set PG_double_map before dropping compound_mapcount to avoid
    2169             :                  * false-negative page_mapped().
    2170             :                  */
    2171           2 :                 if (compound_mapcount(page) > 1 &&
    2172           1 :                     !TestSetPageDoubleMap(page)) {
    2173         513 :                         for (i = 0; i < HPAGE_PMD_NR; i++)
    2174        1024 :                                 atomic_inc(&page[i]._mapcount);
    2175             :                 }
    2176             : 
    2177           2 :                 lock_page_memcg(page);
    2178           4 :                 if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
    2179             :                         /* Last compound_mapcount is gone. */
    2180           1 :                         __mod_lruvec_page_state(page, NR_ANON_THPS,
    2181             :                                                 -HPAGE_PMD_NR);
    2182           1 :                         if (TestClearPageDoubleMap(page)) {
    2183             :                                 /* No need in mapcount reference anymore */
    2184         513 :                                 for (i = 0; i < HPAGE_PMD_NR; i++)
    2185        1024 :                                         atomic_dec(&page[i]._mapcount);
    2186             :                         }
    2187             :                 }
    2188           2 :                 unlock_page_memcg(page);
    2189             :         }
    2190             : 
    2191           2 :         smp_wmb(); /* make pte visible before pmd */
    2192           2 :         pmd_populate(mm, pmd, pgtable);
    2193             : 
    2194           2 :         if (freeze) {
    2195           0 :                 for (i = 0; i < HPAGE_PMD_NR; i++) {
    2196           0 :                         page_remove_rmap(page + i, false);
    2197           0 :                         put_page(page + i);
    2198             :                 }
    2199             :         }
    2200             : }
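
/*
 * Illustrative sketch, not part of the covered source: the ordering that
 * __split_huge_pmd_locked() relies on above, condensed.  The huge pmd is
 * first made non-present and flushed, the pte table is filled through a
 * temporary pmd value, and only then is the real pmd repopulated, so small
 * and huge TLB entries for the same address never coexist.
 * example_split_order() is a hypothetical helper; rmap, refcount transfer
 * and error handling are omitted.
 */
static void example_split_order(struct vm_area_struct *vma, pmd_t *pmd,
                                unsigned long haddr, pgtable_t pgtable)
{
        pmd_t _pmd;

        /* 1) invalidate the huge pmd and flush the TLB (notifies the mmu) */
        pmdp_invalidate(vma, haddr, pmd);

        /* 2) build the pte table off-line through a temporary pmd value */
        pmd_populate(vma->vm_mm, &_pmd, pgtable);
        /* ... fill HPAGE_PMD_NR ptes via pte_offset_map(&_pmd, addr) ... */

        /* 3) make the ptes visible before exposing the pte table */
        smp_wmb();
        pmd_populate(vma->vm_mm, pmd, pgtable);
}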
    2201             : 
    2202          90 : void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
    2203             :                 unsigned long address, bool freeze, struct page *page)
    2204             : {
    2205          90 :         spinlock_t *ptl;
    2206          90 :         struct mmu_notifier_range range;
    2207          90 :         bool do_unlock_page = false;
    2208          90 :         pmd_t _pmd;
    2209             : 
    2210          90 :         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
    2211             :                                 address & HPAGE_PMD_MASK,
    2212             :                                 (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
    2213          90 :         mmu_notifier_invalidate_range_start(&range);
    2214          90 :         ptl = pmd_lock(vma->vm_mm, pmd);
    2215             : 
    2216             :         /*
    2217             :          * If the caller asks us to set up migration entries, we need a page to
    2218             :          * check the pmd against. Otherwise we can end up replacing the wrong page.
    2219             :          */
    2220          90 :         VM_BUG_ON(freeze && !page);
    2221          90 :         if (page) {
    2222           0 :                 VM_WARN_ON_ONCE(!PageLocked(page));
    2223           0 :                 if (page != pmd_page(*pmd))
    2224           0 :                         goto out;
    2225             :         }
    2226             : 
    2227          90 : repeat:
    2228          90 :         if (pmd_trans_huge(*pmd)) {
    2229           2 :                 if (!page) {
    2230           2 :                         page = pmd_page(*pmd);
    2231             :                         /*
    2232             :                          * An anonymous page must be locked, to ensure that a
    2233             :                          * concurrent reuse_swap_page() sees stable mapcount;
    2234             :                          * but reuse_swap_page() is not used on shmem or file,
    2235             :                          * and page lock must not be taken when zap_pmd_range()
    2236             :                          * calls __split_huge_pmd() while i_mmap_lock is held.
    2237             :                          */
    2238           2 :                         if (PageAnon(page)) {
    2239           2 :                                 if (unlikely(!trylock_page(page))) {
    2240           0 :                                         get_page(page);
    2241           0 :                                         _pmd = *pmd;
    2242           0 :                                         spin_unlock(ptl);
    2243           0 :                                         lock_page(page);
    2244           0 :                                         spin_lock(ptl);
    2245           0 :                                         if (unlikely(!pmd_same(*pmd, _pmd))) {
    2246           0 :                                                 unlock_page(page);
    2247           0 :                                                 put_page(page);
    2248           0 :                                                 page = NULL;
    2249           0 :                                                 goto repeat;
    2250             :                                         }
    2251           0 :                                         put_page(page);
    2252             :                                 }
    2253             :                                 do_unlock_page = true;
    2254             :                         }
    2255             :                 }
    2256           4 :                 if (PageMlocked(page))
    2257           0 :                         clear_page_mlock(page);
    2258          88 :         } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
    2259          88 :                 goto out;
    2260           2 :         __split_huge_pmd_locked(vma, pmd, range.start, freeze);
    2261          90 : out:
    2262          90 :         spin_unlock(ptl);
    2263          90 :         if (do_unlock_page)
    2264           2 :                 unlock_page(page);
    2265             :         /*
    2266             :          * No need to double call the mmu_notifier->invalidate_range() callback.
    2267             :          * There are 3 cases to consider inside __split_huge_pmd_locked():
    2268             :          *  1) pmdp_huge_clear_flush_notify() already calls invalidate_range()
    2269             :          *  2) __split_huge_zero_page_pmd() only maps the read-only zero page,
    2270             :          *     and any write fault will trigger a flush_notify before pointing
    2271             :          *     to a new page (it is fine if the secondary mmu keeps pointing
    2272             :          *     to the old zero page in the meantime)
    2273             :          *  3) Splitting a huge pmd into ptes pointing to the same page needs
    2274             :          *     no invalidation of secondary tlb entries as they are all still
    2275             :          *     valid; any further change to an individual pte will notify, so
    2276             :          *     there is no need to call mmu_notifier->invalidate_range()
    2277             :          */
    2278          90 :         mmu_notifier_invalidate_range_only_end(&range);
    2279          90 : }
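
/*
 * Illustrative sketch, not part of the covered source: the lock-ordering
 * dance used above when the page lock (a sleeping lock) is needed while the
 * pmd spinlock is already held.  The spinlock is dropped, the page lock is
 * taken, the spinlock is retaken, and the pmd is revalidated with
 * pmd_same(); a NULL return tells the caller to retry.
 * example_lock_pmd_page() is a hypothetical helper.
 */
static struct page *example_lock_pmd_page(pmd_t *pmd, spinlock_t *ptl)
{
        struct page *page = pmd_page(*pmd);
        pmd_t saved;

        if (trylock_page(page))
                return page;                    /* fast path */

        get_page(page);                         /* keep it alive across unlock */
        saved = *pmd;
        spin_unlock(ptl);
        lock_page(page);                        /* may sleep */
        spin_lock(ptl);

        if (unlikely(!pmd_same(*pmd, saved))) {
                unlock_page(page);              /* pmd changed: caller retries */
                put_page(page);
                return NULL;
        }
        put_page(page);                         /* pmd still maps it */
        return page;
}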
    2280             : 
    2281          89 : void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
    2282             :                 bool freeze, struct page *page)
    2283             : {
    2284          89 :         pgd_t *pgd;
    2285          89 :         p4d_t *p4d;
    2286          89 :         pud_t *pud;
    2287          89 :         pmd_t *pmd;
    2288             : 
    2289          89 :         pgd = pgd_offset(vma->vm_mm, address);
    2290          89 :         if (!pgd_present(*pgd))
    2291             :                 return;
    2292             : 
    2293          89 :         p4d = p4d_offset(pgd, address);
    2294          89 :         if (!p4d_present(*p4d))
    2295             :                 return;
    2296             : 
    2297          88 :         pud = pud_offset(p4d, address);
    2298         176 :         if (!pud_present(*pud))
    2299             :                 return;
    2300             : 
    2301          88 :         pmd = pmd_offset(pud, address);
    2302             : 
    2303          88 :         __split_huge_pmd(vma, pmd, address, freeze, page);
    2304             : }
    2305             : 
    2306       31703 : void vma_adjust_trans_huge(struct vm_area_struct *vma,
    2307             :                              unsigned long start,
    2308             :                              unsigned long end,
    2309             :                              long adjust_next)
    2310             : {
    2311             :         /*
    2312             :          * If the new start address isn't hpage aligned and it could
    2313             :          * previously contain a hugepage: check if we need to split
    2314             :          * a huge pmd.
    2315             :          */
    2316       31703 :         if (start & ~HPAGE_PMD_MASK &&
    2317       31594 :             (start & HPAGE_PMD_MASK) >= vma->vm_start &&
    2318        1408 :             (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
    2319          68 :                 split_huge_pmd_address(vma, start, false, NULL);
    2320             : 
    2321             :         /*
    2322             :          * If the new end address isn't hpage aligned and it could
    2323             :          * previously contain a hugepage: check if we need to split
    2324             :          * a huge pmd.
    2325             :          */
    2326       31703 :         if (end & ~HPAGE_PMD_MASK &&
    2327       31636 :             (end & HPAGE_PMD_MASK) >= vma->vm_start &&
    2328        4742 :             (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
    2329          21 :                 split_huge_pmd_address(vma, end, false, NULL);
    2330             : 
    2331             :         /*
    2332             :          * If we're also updating vma->vm_next->vm_start, and the new
    2333             :          * vm_next->vm_start isn't hpage aligned and it could previously
    2334             :          * contain a hugepage: check if we need to split a huge pmd.
    2335             :          */
    2336       31703 :         if (adjust_next > 0) {
    2337          21 :                 struct vm_area_struct *next = vma->vm_next;
    2338          21 :                 unsigned long nstart = next->vm_start;
    2339          21 :                 nstart += adjust_next;
    2340          21 :                 if (nstart & ~HPAGE_PMD_MASK &&
    2341          21 :                     (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
    2342           0 :                     (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
    2343           0 :                         split_huge_pmd_address(next, nstart, false, NULL);
    2344             :         }
    2345       31703 : }
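
/*
 * Illustrative sketch, not part of the covered source: the alignment test
 * used three times above, spelled out.  With a 2MB HPAGE_PMD_SIZE, an
 * address such as 0x201000 is not pmd aligned, yet the 2MB region
 * [0x200000, 0x400000) may still lie entirely inside the vma and hold a
 * huge pmd, so it has to be split before the vma boundary moves there.
 * example_boundary_needs_split() is a hypothetical helper.
 */
static bool example_boundary_needs_split(struct vm_area_struct *vma,
                                         unsigned long addr)
{
        return (addr & ~HPAGE_PMD_MASK) &&
               (addr & HPAGE_PMD_MASK) >= vma->vm_start &&
               (addr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end;
}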
    2346             : 
    2347           0 : static void unmap_page(struct page *page)
    2348             : {
    2349           0 :         enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK |
    2350             :                 TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
    2351           0 :         bool unmap_success;
    2352             : 
    2353           0 :         VM_BUG_ON_PAGE(!PageHead(page), page);
    2354             : 
    2355           0 :         if (PageAnon(page))
    2356           0 :                 ttu_flags |= TTU_SPLIT_FREEZE;
    2357             : 
    2358           0 :         unmap_success = try_to_unmap(page, ttu_flags);
    2359           0 :         VM_BUG_ON_PAGE(!unmap_success, page);
    2360           0 : }
    2361             : 
    2362           0 : static void remap_page(struct page *page, unsigned int nr)
    2363             : {
    2364           0 :         int i;
    2365           0 :         if (PageTransHuge(page)) {
    2366           0 :                 remove_migration_ptes(page, page, true);
    2367             :         } else {
    2368           0 :                 for (i = 0; i < nr; i++)
    2369           0 :                         remove_migration_ptes(page + i, page + i, true);
    2370             :         }
    2371           0 : }
    2372             : 
    2373           0 : static void lru_add_page_tail(struct page *head, struct page *tail,
    2374             :                 struct lruvec *lruvec, struct list_head *list)
    2375             : {
    2376           0 :         VM_BUG_ON_PAGE(!PageHead(head), head);
    2377           0 :         VM_BUG_ON_PAGE(PageCompound(tail), head);
    2378           0 :         VM_BUG_ON_PAGE(PageLRU(tail), head);
    2379           0 :         lockdep_assert_held(&lruvec->lru_lock);
    2380             : 
    2381           0 :         if (list) {
    2382             :                 /* page reclaim is reclaiming a huge page */
    2383           0 :                 VM_WARN_ON(PageLRU(head));
    2384           0 :                 get_page(tail);
    2385           0 :                 list_add_tail(&tail->lru, list);
    2386             :         } else {
    2387             :                 /* head is still on lru (and we have it frozen) */
    2388           0 :                 VM_WARN_ON(!PageLRU(head));
    2389           0 :                 SetPageLRU(tail);
    2390           0 :                 list_add_tail(&tail->lru, &head->lru);
    2391             :         }
    2392           0 : }
    2393             : 
    2394           0 : static void __split_huge_page_tail(struct page *head, int tail,
    2395             :                 struct lruvec *lruvec, struct list_head *list)
    2396             : {
    2397           0 :         struct page *page_tail = head + tail;
    2398             : 
    2399           0 :         VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
    2400             : 
    2401             :         /*
    2402             :          * Clone page flags before unfreezing refcount.
    2403             :          *
    2404             :          * A flags change might follow a successful get_page_unless_zero(),
    2405             :          * for example lock_page(), which sets PG_waiters.
    2406             :          */
    2407           0 :         page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
    2408           0 :         page_tail->flags |= (head->flags &
    2409             :                         ((1L << PG_referenced) |
    2410             :                          (1L << PG_swapbacked) |
    2411             :                          (1L << PG_swapcache) |
    2412             :                          (1L << PG_mlocked) |
    2413             :                          (1L << PG_uptodate) |
    2414             :                          (1L << PG_active) |
    2415             :                          (1L << PG_workingset) |
    2416             :                          (1L << PG_locked) |
    2417             :                          (1L << PG_unevictable) |
    2418             : #ifdef CONFIG_64BIT
    2419             :                          (1L << PG_arch_2) |
    2420             : #endif
    2421             :                          (1L << PG_dirty)));
    2422             : 
    2423             :         /* ->mapping in first tail page is compound_mapcount */
    2424           0 :         VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
    2425             :                         page_tail);
    2426           0 :         page_tail->mapping = head->mapping;
    2427           0 :         page_tail->index = head->index + tail;
    2428             : 
    2429             :         /* Page flags must be visible before we make the page non-compound. */
    2430           0 :         smp_wmb();
    2431             : 
    2432             :         /*
    2433             :          * Clear PageTail before unfreezing page refcount.
    2434             :          *
    2435             :          * A put_page() might follow a successful get_page_unless_zero(),
    2436             :          * and it needs a correct compound_head().
    2437             :          */
    2438           0 :         clear_compound_head(page_tail);
    2439             : 
    2440             :         /* Finally unfreeze refcount. Additional reference from page cache. */
    2441           0 :         page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) ||
    2442           0 :                                           PageSwapCache(head)));
    2443             : 
    2444           0 :         if (page_is_young(head))
    2445           0 :                 set_page_young(page_tail);
    2446           0 :         if (page_is_idle(head))
    2447           0 :                 set_page_idle(page_tail);
    2448             : 
    2449           0 :         page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
    2450             : 
    2451             :         /*
    2452             :          * Always add to the tail because some iterators expect new
    2453             :          * pages to show up after the currently processed elements,
    2454             :          * e.g. migrate_pages().
    2455             :          */
    2456           0 :         lru_add_page_tail(head, page_tail, lruvec, list);
    2457           0 : }
    2458             : 
    2459           0 : static void __split_huge_page(struct page *page, struct list_head *list,
    2460             :                 pgoff_t end)
    2461             : {
    2462           0 :         struct page *head = compound_head(page);
    2463           0 :         struct lruvec *lruvec;
    2464           0 :         struct address_space *swap_cache = NULL;
    2465           0 :         unsigned long offset = 0;
    2466           0 :         unsigned int nr = thp_nr_pages(head);
    2467           0 :         int i;
    2468             : 
    2469             :         /* complete memcg works before add pages to LRU */
    2470           0 :         split_page_memcg(head, nr);
    2471             : 
    2472           0 :         if (PageAnon(head) && PageSwapCache(head)) {
    2473             :                 swp_entry_t entry = { .val = page_private(head) };
    2474             : 
    2475             :                 offset = swp_offset(entry);
    2476             :                 swap_cache = swap_address_space(entry);
    2477             :                 xa_lock(&swap_cache->i_pages);
    2478             :         }
    2479             : 
    2480             :         /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
    2481           0 :         lruvec = lock_page_lruvec(head);
    2482             : 
    2483           0 :         for (i = nr - 1; i >= 1; i--) {
    2484           0 :                 __split_huge_page_tail(head, i, lruvec, list);
    2485             :                 /* Some pages can be beyond i_size: drop them from page cache */
    2486           0 :                 if (head[i].index >= end) {
    2487           0 :                         ClearPageDirty(head + i);
    2488           0 :                         __delete_from_page_cache(head + i, NULL);
    2489           0 :                         if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
    2490           0 :                                 shmem_uncharge(head->mapping->host, 1);
    2491           0 :                         put_page(head + i);
    2492           0 :                 } else if (!PageAnon(page)) {
    2493           0 :                         __xa_store(&head->mapping->i_pages, head[i].index,
    2494             :                                         head + i, 0);
    2495             :                 } else if (swap_cache) {
    2496             :                         __xa_store(&swap_cache->i_pages, offset + i,
    2497             :                                         head + i, 0);
    2498             :                 }
    2499             :         }
    2500             : 
    2501           0 :         ClearPageCompound(head);
    2502           0 :         unlock_page_lruvec(lruvec);
    2503             :         /* Caller disabled irqs, so they are still disabled here */
    2504             : 
    2505           0 :         split_page_owner(head, nr);
    2506             : 
    2507             :         /* See comment in __split_huge_page_tail() */
    2508           0 :         if (PageAnon(head)) {
    2509             :                 /* Additional pin to swap cache */
    2510           0 :                 if (PageSwapCache(head)) {
    2511             :                         page_ref_add(head, 2);
    2512           0 :                         xa_unlock(&swap_cache->i_pages);
    2513             :                 } else {
    2514           0 :                         page_ref_inc(head);
    2515             :                 }
    2516             :         } else {
    2517             :                 /* Additional pin to page cache */
    2518           0 :                 page_ref_add(head, 2);
    2519           0 :                 xa_unlock(&head->mapping->i_pages);
    2520             :         }
    2521           0 :         local_irq_enable();
    2522             : 
    2523           0 :         remap_page(head, nr);
    2524             : 
    2525           0 :         if (PageSwapCache(head)) {
    2526             :                 swp_entry_t entry = { .val = page_private(head) };
    2527             : 
    2528             :                 split_swap_cluster(entry);
    2529             :         }
    2530             : 
    2531           0 :         for (i = 0; i < nr; i++) {
    2532           0 :                 struct page *subpage = head + i;
    2533           0 :                 if (subpage == page)
    2534           0 :                         continue;
    2535           0 :                 unlock_page(subpage);
    2536             : 
    2537             :                 /*
    2538             :                  * Subpages may be freed if there wasn't any mapping
    2539             :                  * left, e.g. if add_to_swap() is running on an lru page
    2540             :                  * that had its mapping zapped. Freeing these pages
    2541             :                  * requires taking the lru_lock, so we do the put_page
    2542             :                  * of the tail pages after the split is complete.
    2543             :                  */
    2544           0 :                 put_page(subpage);
    2545             :         }
    2546           0 : }
    2547             : 
    2548          20 : int total_mapcount(struct page *page)
    2549             : {
    2550          20 :         int i, compound, nr, ret;
    2551             : 
    2552          20 :         VM_BUG_ON_PAGE(PageTail(page), page);
    2553             : 
    2554          40 :         if (likely(!PageCompound(page)))
    2555          20 :                 return atomic_read(&page->_mapcount) + 1;
    2556             : 
    2557           0 :         compound = compound_mapcount(page);
    2558           0 :         nr = compound_nr(page);
    2559           0 :         if (PageHuge(page))
    2560             :                 return compound;
    2561           0 :         ret = compound;
    2562           0 :         for (i = 0; i < nr; i++)
    2563           0 :                 ret += atomic_read(&page[i]._mapcount) + 1;
    2564             :         /* File pages have compound_mapcount included in _mapcount */
    2565           0 :         if (!PageAnon(page))
    2566           0 :                 return ret - compound * nr;
    2567           0 :         if (PageDoubleMap(page))
    2568           0 :                 ret -= nr;
    2569             :         return ret;
    2570             : }
    2571             : 
    2572             : /*
    2573             :  * This calculates accurately how many mappings a transparent hugepage
    2574             :  * has (unlike page_mapcount() which isn't fully accurate). This full
    2575             :  * accuracy is primarily needed to know if copy-on-write faults can
    2576             :  * reuse the page and change the mapping to read-write instead of
    2577             :  * copying it. At the same time this returns the total_mapcount too.
    2578             :  *
    2579             :  * The function returns the highest mapcount any one of the subpages
    2580             :  * has. If the return value is one, even if different processes are
    2581             :  * mapping different subpages of the transparent hugepage, they can
    2582             :  * all reuse it, because each process is reusing a different subpage.
    2583             :  *
    2584             :  * The total_mapcount is instead counting all virtual mappings of the
    2585             :  * subpages. If the total_mapcount is equal to "one", it tells the
    2586             :  * caller all mappings belong to the same "mm" and in turn the
    2587             :  * anon_vma of the transparent hugepage can become the vma->anon_vma
    2588             :  * local one as no other process may be mapping any of the subpages.
    2589             :  *
    2590             :  * It would be more accurate to replace page_mapcount() with
    2591             :  * page_trans_huge_mapcount(), however we only use
    2592             :  * page_trans_huge_mapcount() in the copy-on-write faults where we
    2593             :  * need full accuracy to avoid breaking page pinning, because
    2594             :  * page_trans_huge_mapcount() is slower than page_mapcount().
    2595             :  */
    2596           2 : int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
    2597             : {
    2598           2 :         int i, ret, _total_mapcount, mapcount;
    2599             : 
    2600             :         /* hugetlbfs shouldn't call it */
    2601           2 :         VM_BUG_ON_PAGE(PageHuge(page), page);
    2602             : 
    2603           2 :         if (likely(!PageTransCompound(page))) {
    2604           0 :                 mapcount = atomic_read(&page->_mapcount) + 1;
    2605           0 :                 if (total_mapcount)
    2606           0 :                         *total_mapcount = mapcount;
    2607           0 :                 return mapcount;
    2608             :         }
    2609             : 
    2610           2 :         page = compound_head(page);
    2611             : 
    2612           2 :         _total_mapcount = ret = 0;
    2613        2052 :         for (i = 0; i < thp_nr_pages(page); i++) {
    2614        1024 :                 mapcount = atomic_read(&page[i]._mapcount) + 1;
    2615        1024 :                 ret = max(ret, mapcount);
    2616        1024 :                 _total_mapcount += mapcount;
    2617             :         }
    2618           2 :         if (PageDoubleMap(page)) {
    2619           1 :                 ret -= 1;
    2620           2 :                 _total_mapcount -= thp_nr_pages(page);
    2621             :         }
    2622           2 :         mapcount = compound_mapcount(page);
    2623           2 :         ret += mapcount;
    2624           2 :         _total_mapcount += mapcount;
    2625           2 :         if (total_mapcount)
    2626           0 :                 *total_mapcount = _total_mapcount;
    2627             :         return ret;
    2628             : }
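
/*
 * Illustrative sketch, not part of the covered source: how a copy-on-write
 * path can use page_trans_huge_mapcount().  A freshly faulted anonymous THP
 * mapped by a single huge pmd has compound_mapcount == 1 and no pte
 * mappings, so the function returns 1 and the fault may reuse the page in
 * place; after fork() it returns 2 and the fault must copy.
 * example_can_reuse_thp() is a hypothetical helper, not the kernel's actual
 * reuse decision.
 */
static bool example_can_reuse_thp(struct page *page)
{
        int total_mapcount;

        return page_trans_huge_mapcount(page, &total_mapcount) == 1 &&
               total_mapcount == 1;
}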
    2629             : 
    2630             : /* Racy check whether the huge page can be split */
    2631           0 : bool can_split_huge_page(struct page *page, int *pextra_pins)
    2632             : {
    2633           0 :         int extra_pins;
    2634             : 
    2635             :         /* Additional pins from page cache */
    2636           0 :         if (PageAnon(page))
    2637           0 :                 extra_pins = PageSwapCache(page) ? thp_nr_pages(page) : 0;
    2638             :         else
    2639           0 :                 extra_pins = thp_nr_pages(page);
    2640           0 :         if (pextra_pins)
    2641           0 :                 *pextra_pins = extra_pins;
    2642           0 :         return total_mapcount(page) == page_count(page) - extra_pins - 1;
    2643             : }
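
/*
 * Illustrative numbers, not from the covered source: for an anonymous THP
 * that is not in swap cache, extra_pins is 0, so can_split_huge_page()
 * reduces to the check below -- the only reference beyond the page's
 * mappings must be the single pin the split caller itself holds.  A
 * page-cache THP additionally carries one reference per subpage, hence
 * extra_pins == thp_nr_pages(page) there.  example_only_caller_pin() is a
 * hypothetical helper.
 */
static bool example_only_caller_pin(struct page *head)
{
        return page_count(head) == total_mapcount(head) + 1;
}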
    2644             : 
    2645             : /*
    2646             :  * This function splits huge page into normal pages. @page can point to any
    2647             :  * subpage of huge page to split. Split doesn't change the position of @page.
    2648             :  *
    2649             :  * Only the caller may hold a pin on the @page, otherwise the split fails with -EBUSY.
    2650             :  * The huge page must be locked.
    2651             :  *
    2652             :  * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
    2653             :  *
    2654             :  * Both head page and tail pages will inherit mapping, flags, and so on from
    2655             :  * the hugepage.
    2656             :  *
    2657             :  * The GUP pin and PG_locked are transferred to @page. The rest of the subpages
    2658             :  * can be freed if they are not mapped.
    2659             :  *
    2660             :  * Returns 0 if the hugepage is split successfully.
    2661             :  * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
    2662             :  * us.
    2663             :  */
    2664           0 : int split_huge_page_to_list(struct page *page, struct list_head *list)
    2665             : {
    2666           0 :         struct page *head = compound_head(page);
    2667           0 :         struct deferred_split *ds_queue = get_deferred_split_queue(head);
    2668           0 :         struct anon_vma *anon_vma = NULL;
    2669           0 :         struct address_space *mapping = NULL;
    2670           0 :         int count, mapcount, extra_pins, ret;
    2671           0 :         pgoff_t end;
    2672             : 
    2673           0 :         VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
    2674           0 :         VM_BUG_ON_PAGE(!PageLocked(head), head);
    2675           0 :         VM_BUG_ON_PAGE(!PageCompound(head), head);
    2676             : 
    2677           0 :         if (PageWriteback(head))
    2678             :                 return -EBUSY;
    2679             : 
    2680           0 :         if (PageAnon(head)) {
    2681             :                 /*
    2682             :                  * The caller does not necessarily hold an mmap_lock that would
    2683             :                  * prevent the anon_vma from disappearing, so we first take a
    2684             :                  * reference to it and then lock the anon_vma for write. This
    2685             :                  * is similar to page_lock_anon_vma_read except the write lock
    2686             :                  * is taken to serialise against parallel split or collapse
    2687             :                  * operations.
    2688             :                  */
    2689           0 :                 anon_vma = page_get_anon_vma(head);
    2690           0 :                 if (!anon_vma) {
    2691           0 :                         ret = -EBUSY;
    2692           0 :                         goto out;
    2693             :                 }
    2694           0 :                 end = -1;
    2695           0 :                 mapping = NULL;
    2696           0 :                 anon_vma_lock_write(anon_vma);
    2697             :         } else {
    2698           0 :                 mapping = head->mapping;
    2699             : 
    2700             :                 /* Truncated ? */
    2701           0 :                 if (!mapping) {
    2702           0 :                         ret = -EBUSY;
    2703           0 :                         goto out;
    2704             :                 }
    2705             : 
    2706           0 :                 anon_vma = NULL;
    2707           0 :                 i_mmap_lock_read(mapping);
    2708             : 
    2709             :                 /*
    2710             :                  * __split_huge_page() may need to trim off pages beyond EOF:
    2711             :                  * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
    2712             :                  * which cannot be nested inside the page tree lock. So note
    2713             :                  * end now: i_size itself may be changed at any moment, but
    2714             :                  * head page lock is good enough to serialize the trimming.
    2715             :                  */
    2716           0 :                 end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
    2717             :         }
    2718             : 
    2719             :         /*
    2720             :          * Racy check if we can split the page, before unmap_page() will
    2721             :          * split PMDs
    2722             :          */
    2723           0 :         if (!can_split_huge_page(head, &extra_pins)) {
    2724           0 :                 ret = -EBUSY;
    2725           0 :                 goto out_unlock;
    2726             :         }
    2727             : 
    2728           0 :         unmap_page(head);
    2729           0 :         VM_BUG_ON_PAGE(compound_mapcount(head), head);
    2730             : 
    2731             :         /* block interrupt reentry in xa_lock and spinlock */
    2732           0 :         local_irq_disable();
    2733           0 :         if (mapping) {
    2734           0 :                 XA_STATE(xas, &mapping->i_pages, page_index(head));
    2735             : 
    2736             :                 /*
    2737             :                  * Check if the head page is present in page cache.
    2738             :                  * We assume all tail pages are present too, if the head is there.
    2739             :                  */
    2740           0 :                 xa_lock(&mapping->i_pages);
    2741           0 :                 if (xas_load(&xas) != head)
    2742           0 :                         goto fail;
    2743             :         }
    2744             : 
    2745             :         /* Prevent deferred_split_scan() touching ->_refcount */
    2746           0 :         spin_lock(&ds_queue->split_queue_lock);
    2747           0 :         count = page_count(head);
    2748           0 :         mapcount = total_mapcount(head);
    2749           0 :         if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
    2750           0 :                 if (!list_empty(page_deferred_list(head))) {
    2751           0 :                         ds_queue->split_queue_len--;
    2752           0 :                         list_del(page_deferred_list(head));
    2753             :                 }
    2754           0 :                 spin_unlock(&ds_queue->split_queue_lock);
    2755           0 :                 if (mapping) {
    2756           0 :                         int nr = thp_nr_pages(head);
    2757             : 
    2758           0 :                         if (PageSwapBacked(head))
    2759           0 :                                 __mod_lruvec_page_state(head, NR_SHMEM_THPS,
    2760             :                                                         -nr);
    2761             :                         else
    2762           0 :                                 __mod_lruvec_page_state(head, NR_FILE_THPS,
    2763             :                                                         -nr);
    2764             :                 }
    2765             : 
    2766           0 :                 __split_huge_page(page, list, end);
    2767           0 :                 ret = 0;
    2768             :         } else {
    2769           0 :                 if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
    2770           0 :                         pr_alert("total_mapcount: %u, page_count(): %u\n",
    2771             :                                         mapcount, count);
    2772           0 :                         if (PageTail(page))
    2773           0 :                                 dump_page(head, NULL);
    2774           0 :                         dump_page(page, "total_mapcount(head) > 0");
    2775           0 :                         BUG();
    2776             :                 }
    2777           0 :                 spin_unlock(&ds_queue->split_queue_lock);
    2778           0 : fail:           if (mapping)
    2779           0 :                         xa_unlock(&mapping->i_pages);
    2780           0 :                 local_irq_enable();
    2781           0 :                 remap_page(head, thp_nr_pages(head));
    2782           0 :                 ret = -EBUSY;
    2783             :         }
    2784             : 
    2785           0 : out_unlock:
    2786           0 :         if (anon_vma) {
    2787           0 :                 anon_vma_unlock_write(anon_vma);
    2788           0 :                 put_anon_vma(anon_vma);
    2789             :         }
    2790           0 :         if (mapping)
    2791           0 :                 i_mmap_unlock_read(mapping);
    2792           0 : out:
    2793           0 :         count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
    2794           0 :         return ret;
    2795             : }
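
/*
 * Illustrative sketch, not part of the covered source: the documented
 * contract from the caller's side.  Hold a reference, hold the page lock,
 * then a zero return from split_huge_page() (the list == NULL wrapper for
 * split_huge_page_to_list()) means the compound page is now base pages.
 * example_try_split() is a hypothetical helper.
 */
static int example_try_split(struct page *page)
{
        int ret = -EBUSY;

        if (!get_page_unless_zero(page))
                return ret;                     /* page is already being freed */
        if (trylock_page(page)) {
                ret = split_huge_page(page);    /* 0 on success, -EBUSY if pinned */
                unlock_page(page);
        }
        put_page(page);
        return ret;
}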
    2796             : 
    2797          17 : void free_transhuge_page(struct page *page)
    2798             : {
    2799          17 :         struct deferred_split *ds_queue = get_deferred_split_queue(page);
    2800          17 :         unsigned long flags;
    2801             : 
    2802          17 :         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
    2803          17 :         if (!list_empty(page_deferred_list(page))) {
    2804           0 :                 ds_queue->split_queue_len--;
    2805           0 :                 list_del(page_deferred_list(page));
    2806             :         }
    2807          17 :         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
    2808          17 :         free_compound_page(page);
    2809          17 : }
    2810             : 
    2811           1 : void deferred_split_huge_page(struct page *page)
    2812             : {
    2813           1 :         struct deferred_split *ds_queue = get_deferred_split_queue(page);
    2814             : #ifdef CONFIG_MEMCG
    2815             :         struct mem_cgroup *memcg = page_memcg(compound_head(page));
    2816             : #endif
    2817           1 :         unsigned long flags;
    2818             : 
    2819           1 :         VM_BUG_ON_PAGE(!PageTransHuge(page), page);
    2820             : 
    2821             :         /*
    2822             :          * The try_to_unmap() in the page reclaim path might reach here too;
    2823             :          * this may cause a race condition that corrupts the deferred split
    2824             :          * queue. And, if page reclaim is already handling the same page, it
    2825             :          * is unnecessary to handle it again in the shrinker.
    2826             :          *
    2827             :          * Check PageSwapCache to determine if the page is being
    2828             :          * handled by page reclaim since THP swap would add the page into
    2829             :          * swap cache before calling try_to_unmap().
    2830             :          */
    2831           1 :         if (PageSwapCache(page))
    2832             :                 return;
    2833             : 
    2834           1 :         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
    2835           1 :         if (list_empty(page_deferred_list(page))) {
    2836           1 :                 count_vm_event(THP_DEFERRED_SPLIT_PAGE);
    2837           1 :                 list_add_tail(page_deferred_list(page), &ds_queue->split_queue);
    2838           1 :                 ds_queue->split_queue_len++;
    2839             : #ifdef CONFIG_MEMCG
    2840             :                 if (memcg)
    2841             :                         memcg_set_shrinker_bit(memcg, page_to_nid(page),
    2842             :                                                deferred_split_shrinker.id);
    2843             : #endif
    2844             :         }
    2845           1 :         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
    2846             : }
    2847             : 
    2848           0 : static unsigned long deferred_split_count(struct shrinker *shrink,
    2849             :                 struct shrink_control *sc)
    2850             : {
    2851           0 :         struct pglist_data *pgdata = NODE_DATA(sc->nid);
    2852           0 :         struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
    2853             : 
    2854             : #ifdef CONFIG_MEMCG
    2855             :         if (sc->memcg)
    2856             :                 ds_queue = &sc->memcg->deferred_split_queue;
    2857             : #endif
    2858           0 :         return READ_ONCE(ds_queue->split_queue_len);
    2859             : }
    2860             : 
    2861           0 : static unsigned long deferred_split_scan(struct shrinker *shrink,
    2862             :                 struct shrink_control *sc)
    2863             : {
    2864           0 :         struct pglist_data *pgdata = NODE_DATA(sc->nid);
    2865           0 :         struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
    2866           0 :         unsigned long flags;
    2867           0 :         LIST_HEAD(list), *pos, *next;
    2868           0 :         struct page *page;
    2869           0 :         int split = 0;
    2870             : 
    2871             : #ifdef CONFIG_MEMCG
    2872             :         if (sc->memcg)
    2873             :                 ds_queue = &sc->memcg->deferred_split_queue;
    2874             : #endif
    2875             : 
    2876           0 :         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
    2877             :         /* Take pin on all head pages to avoid freeing them under us */
    2878           0 :         list_for_each_safe(pos, next, &ds_queue->split_queue) {
    2879           0 :                 page = list_entry((void *)pos, struct page, mapping);
    2880           0 :                 page = compound_head(page);
    2881           0 :                 if (get_page_unless_zero(page)) {
    2882           0 :                         list_move(page_deferred_list(page), &list);
    2883             :                 } else {
    2884             :                         /* We lost the race with put_compound_page() */
    2885           0 :                         list_del_init(page_deferred_list(page));
    2886           0 :                         ds_queue->split_queue_len--;
    2887             :                 }
    2888           0 :                 if (!--sc->nr_to_scan)
    2889             :                         break;
    2890             :         }
    2891           0 :         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
    2892             : 
    2893           0 :         list_for_each_safe(pos, next, &list) {
    2894           0 :                 page = list_entry((void *)pos, struct page, mapping);
    2895           0 :                 if (!trylock_page(page))
    2896           0 :                         goto next;
    2897             :                 /* split_huge_page() removes page from list on success */
    2898           0 :                 if (!split_huge_page(page))
    2899           0 :                         split++;
    2900           0 :                 unlock_page(page);
    2901           0 : next:
    2902           0 :                 put_page(page);
    2903             :         }
    2904             : 
    2905           0 :         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
    2906           0 :         list_splice_tail(&list, &ds_queue->split_queue);
    2907           0 :         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
    2908             : 
    2909             :         /*
    2910             :          * Stop the shrinker if we did not split any page but the queue is
    2911             :          * empty anyway; this can happen when the pages were freed under us.
    2912             :          */
    2913           0 :         if (!split && list_empty(&ds_queue->split_queue))
    2914             :                 return SHRINK_STOP;
    2915           0 :         return split;
    2916             : }
    2917             : 
    2918             : static struct shrinker deferred_split_shrinker = {
    2919             :         .count_objects = deferred_split_count,
    2920             :         .scan_objects = deferred_split_scan,
    2921             :         .seeks = DEFAULT_SEEKS,
    2922             :         .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
    2923             :                  SHRINKER_NONSLAB,
    2924             : };
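
Editor's note: the four pieces above form one mechanism. deferred_split_huge_page() parks a partially unmapped THP on a per-node (or, with CONFIG_MEMCG, per-memcg) queue, deferred_split_count() reports the queue length to the reclaim core, deferred_split_scan() detaches a bounded batch and attempts the actual split with the lock dropped, and the deferred_split_shrinker registration ties the pair into memory reclaim. The sketch below is a hypothetical userspace model of that shape only: the names node, split_queue, deferred_split() and scan(), the "even ids split" rule, and the pthread mutex standing in for the IRQ-safe spinlock are all invented for illustration and are not the kernel's implementation.

/* Hypothetical userspace model of the deferred-split pattern above: a pthread
 * mutex stands in for the IRQ-safe spinlock and "even ids split successfully"
 * stands in for split_huge_page(). Illustration only, not kernel code. */
#include <pthread.h>
#include <stdio.h>

struct node {                     /* stands in for a queued THP head page */
	struct node *prev, *next; /* intrusive link, as page_deferred_list() */
	int id;
};

struct split_queue {              /* stands in for struct deferred_split */
	struct node head;         /* circular list; self-linked head == empty */
	unsigned long len;        /* what .count_objects would report */
	pthread_mutex_t lock;
};

static int list_empty(const struct node *h) { return h->next == h; }

static void list_del_init(struct node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	n->next = n->prev = n;    /* self-linked again means "not queued" */
}

static void list_add_tail(struct node *n, struct node *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

/* Enqueue only if not already queued, mirroring the list_empty() check. */
static void deferred_split(struct split_queue *q, struct node *n)
{
	pthread_mutex_lock(&q->lock);
	if (list_empty(n)) {
		list_add_tail(n, &q->head);
		q->len++;
	}
	pthread_mutex_unlock(&q->lock);
}

/* Scan: move a bounded batch off under the lock, do the expensive work
 * unlocked, then splice the failures back, as deferred_split_scan() does. */
static unsigned long scan(struct split_queue *q, unsigned long nr_to_scan)
{
	struct node batch = { &batch, &batch, 0 }, *n, *next;
	unsigned long split = 0;

	pthread_mutex_lock(&q->lock);
	while (nr_to_scan-- && !list_empty(&q->head)) {
		n = q->head.next;
		list_del_init(n);
		list_add_tail(n, &batch);
		q->len--;
	}
	pthread_mutex_unlock(&q->lock);

	for (n = batch.next; n != &batch; n = next) {
		next = n->next;
		if (n->id % 2 == 0) {     /* pretend even ids split successfully */
			list_del_init(n); /* success: off the queue for good */
			split++;
		}
	}

	pthread_mutex_lock(&q->lock);
	for (n = batch.next; n != &batch; n = next) {  /* splice failures back */
		next = n->next;
		list_del_init(n);
		list_add_tail(n, &q->head);
		q->len++;
	}
	pthread_mutex_unlock(&q->lock);
	return split;
}

int main(void)
{
	static struct split_queue q = { { &q.head, &q.head, 0 }, 0,
					PTHREAD_MUTEX_INITIALIZER };
	struct node n[4];
	unsigned long queued, split;

	for (int i = 0; i < 4; i++) {
		n[i].prev = n[i].next = &n[i];    /* start "not queued" */
		n[i].id = i;
		deferred_split(&q, &n[i]);
	}
	queued = q.len;
	split = scan(&q, 4);
	printf("queued %lu, split %lu, left %lu\n", queued, split, q.len);
	return 0;
}

The point of the two-phase scan is the middle section: detaching a private batch under the lock lets the heavyweight split work, which may take sleeping locks, run without split_queue_lock held.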
    2925             : 
    2926             : #ifdef CONFIG_DEBUG_FS
    2927           0 : static int split_huge_pages_set(void *data, u64 val)
    2928             : {
    2929           0 :         struct zone *zone;
    2930           0 :         struct page *page;
    2931           0 :         unsigned long pfn, max_zone_pfn;
    2932           0 :         unsigned long total = 0, split = 0;
    2933             : 
    2934           0 :         if (val != 1)
    2935             :                 return -EINVAL;
    2936             : 
    2937           0 :         for_each_populated_zone(zone) {
    2938           0 :                 max_zone_pfn = zone_end_pfn(zone);
    2939           0 :                 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
    2940           0 :                         if (!pfn_valid(pfn))
    2941           0 :                                 continue;
    2942             : 
    2943           0 :                         page = pfn_to_page(pfn);
    2944           0 :                         if (!get_page_unless_zero(page))
    2945           0 :                                 continue;
    2946             : 
    2947           0 :                         if (zone != page_zone(page))
    2948           0 :                                 goto next;
    2949             : 
    2950           0 :                         if (!PageHead(page) || PageHuge(page) || !PageLRU(page))
    2951           0 :                                 goto next;
    2952             : 
    2953           0 :                         total++;
    2954           0 :                         lock_page(page);
    2955           0 :                         if (!split_huge_page(page))
    2956           0 :                                 split++;
    2957           0 :                         unlock_page(page);
    2958           0 : next:
    2959           0 :                         put_page(page);
    2960             :                 }
    2961             :         }
    2962             : 
    2963           0 :         pr_info("%lu of %lu THP split\n", split, total);
    2964             : 
    2965           0 :         return 0;
    2966             : }
    2967           0 : DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
    2968             :                 "%llu\n");
    2969             : 
    2970           1 : static int __init split_huge_pages_debugfs(void)
    2971             : {
    2972           1 :         debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
    2973             :                             &split_huge_pages_fops);
    2974           1 :         return 0;
    2975             : }
    2976             : late_initcall(split_huge_pages_debugfs);
    2977             : #endif
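
Editor's note: split_huge_pages_debugfs() registers a write-only (mode 0200) debugfs file, and split_huge_pages_set() accepts only the value 1, then walks every populated zone, splitting each THP head page it can lock and reporting the outcome with pr_info(). A minimal userspace sketch of poking that interface, assuming debugfs is mounted at the conventional /sys/kernel/debug and the caller is privileged:

/* Minimal sketch: trigger split_huge_pages_set() by writing "1" to the
 * debugfs file registered above. Assumes debugfs is mounted at
 * /sys/kernel/debug (the conventional mount point) and root privileges. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/kernel/debug/split_huge_pages", O_WRONLY);

	if (fd < 0) {
		perror("open split_huge_pages"); /* no debugfs or no permission */
		return 1;
	}
	/* split_huge_pages_set() accepts only the value 1; anything else is
	 * rejected with -EINVAL. */
	if (write(fd, "1", 1) != 1) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}

The result appears in the kernel log as the "%lu of %lu THP split" line printed by split_huge_pages_set().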
    2978             : 
    2979             : #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
    2980           0 : void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
    2981             :                 struct page *page)
    2982             : {
    2983           0 :         struct vm_area_struct *vma = pvmw->vma;
    2984           0 :         struct mm_struct *mm = vma->vm_mm;
    2985           0 :         unsigned long address = pvmw->address;
    2986           0 :         pmd_t pmdval;
    2987           0 :         swp_entry_t entry;
    2988           0 :         pmd_t pmdswp;
    2989             : 
    2990           0 :         if (!(pvmw->pmd && !pvmw->pte))
    2991           0 :                 return;
    2992             : 
    2993           0 :         flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
    2994           0 :         pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
    2995           0 :         if (pmd_dirty(pmdval))
    2996           0 :                 set_page_dirty(page);
    2997           0 :         entry = make_migration_entry(page, pmd_write(pmdval));
    2998           0 :         pmdswp = swp_entry_to_pmd(entry);
    2999           0 :         if (pmd_soft_dirty(pmdval))
    3000             :                 pmdswp = pmd_swp_mksoft_dirty(pmdswp);
    3001           0 :         set_pmd_at(mm, address, pvmw->pmd, pmdswp);
    3002           0 :         page_remove_rmap(page, true);
    3003           0 :         put_page(page);
    3004             : }
    3005             : 
    3006           0 : void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
    3007             : {
    3008           0 :         struct vm_area_struct *vma = pvmw->vma;
    3009           0 :         struct mm_struct *mm = vma->vm_mm;
    3010           0 :         unsigned long address = pvmw->address;
    3011           0 :         unsigned long mmun_start = address & HPAGE_PMD_MASK;
    3012           0 :         pmd_t pmde;
    3013           0 :         swp_entry_t entry;
    3014             : 
    3015           0 :         if (!(pvmw->pmd && !pvmw->pte))
    3016           0 :                 return;
    3017             : 
    3018           0 :         entry = pmd_to_swp_entry(*pvmw->pmd);
    3019           0 :         get_page(new);
    3020           0 :         pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot));
    3021           0 :         if (pmd_swp_soft_dirty(*pvmw->pmd))
    3022             :                 pmde = pmd_mksoft_dirty(pmde);
    3023           0 :         if (is_write_migration_entry(entry))
    3024           0 :                 pmde = maybe_pmd_mkwrite(pmde, vma);
    3025             : 
    3026           0 :         flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
    3027           0 :         if (PageAnon(new))
    3028           0 :                 page_add_anon_rmap(new, vma, mmun_start, true);
    3029             :         else
    3030           0 :                 page_add_file_rmap(new, true);
    3031           0 :         set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
    3032           0 :         if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new))
    3033           0 :                 mlock_vma_page(new);
    3034           0 :         update_mmu_cache_pmd(vma, address, pvmw->pmd);
    3035             : }
    3036             : #endif
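
Editor's note: under CONFIG_ARCH_ENABLE_THP_MIGRATION, set_pmd_migration_entry() replaces a mapped huge PMD with a non-present migration entry that remembers the page and whether the mapping was writable (plus soft-dirty), and remove_migration_pmd() rebuilds a huge PMD for the new page from that entry once migration completes. The fragment below is a purely hypothetical userspace model of that encode/decode round trip; the bit layout, mig_entry_t, make_entry() and the helper names are invented, and the kernel's real swp_entry_to_pmd()/pmd_to_swp_entry() encoding is architecture-specific and differs from this.

/* Invented encoding for illustration only: pfn in the high bits, one
 * "was writable" flag in bit 0. The kernel's migration-entry layout
 * (swp_entry_to_pmd()/pmd_to_swp_entry()) is architecture-specific. */
#include <assert.h>
#include <stdio.h>

typedef unsigned long mig_entry_t;

#define MIG_WRITE_BIT 0x1UL

static mig_entry_t make_entry(unsigned long pfn, int writable)
{
	return (pfn << 1) | (writable ? MIG_WRITE_BIT : 0);
}

static unsigned long entry_pfn(mig_entry_t e) { return e >> 1; }
static int entry_writable(mig_entry_t e)      { return (int)(e & MIG_WRITE_BIT); }

int main(void)
{
	/* "Unmap": record the page and its write permission, the role of
	 * make_migration_entry() in set_pmd_migration_entry(). */
	mig_entry_t e = make_entry(0x12345UL, 1);

	/* "Remap": recover both when installing the new mapping, the role
	 * of pmd_to_swp_entry()/is_write_migration_entry() above. */
	assert(entry_pfn(e) == 0x12345UL);
	assert(entry_writable(e));
	printf("pfn %#lx writable %d\n", entry_pfn(e), entry_writable(e));
	return 0;
}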

Generated by: LCOV version 1.14