LCOV - code coverage report
Current view: top level - mm - page_alloc.c (source / functions)
Test:         landlock.info
Date:         2021-04-22 12:43:58
                        Hit      Total   Coverage
  Lines:               1494       2688     55.6 %
  Functions:            130        200     65.0 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : /*
       3             :  *  linux/mm/page_alloc.c
       4             :  *
        5             :  *  Manages the free list; the system allocates free pages here.
       6             :  *  Note that kmalloc() lives in slab.c
       7             :  *
       8             :  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
       9             :  *  Swap reorganised 29.12.95, Stephen Tweedie
      10             :  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
      11             :  *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
      12             :  *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
      13             :  *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
      14             :  *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
      15             :  *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
      16             :  */
      17             : 
      18             : #include <linux/stddef.h>
      19             : #include <linux/mm.h>
      20             : #include <linux/highmem.h>
      21             : #include <linux/swap.h>
      22             : #include <linux/interrupt.h>
      23             : #include <linux/pagemap.h>
      24             : #include <linux/jiffies.h>
      25             : #include <linux/memblock.h>
      26             : #include <linux/compiler.h>
      27             : #include <linux/kernel.h>
      28             : #include <linux/kasan.h>
      29             : #include <linux/module.h>
      30             : #include <linux/suspend.h>
      31             : #include <linux/pagevec.h>
      32             : #include <linux/blkdev.h>
      33             : #include <linux/slab.h>
      34             : #include <linux/ratelimit.h>
      35             : #include <linux/oom.h>
      36             : #include <linux/topology.h>
      37             : #include <linux/sysctl.h>
      38             : #include <linux/cpu.h>
      39             : #include <linux/cpuset.h>
      40             : #include <linux/memory_hotplug.h>
      41             : #include <linux/nodemask.h>
      42             : #include <linux/vmalloc.h>
      43             : #include <linux/vmstat.h>
      44             : #include <linux/mempolicy.h>
      45             : #include <linux/memremap.h>
      46             : #include <linux/stop_machine.h>
      47             : #include <linux/random.h>
      48             : #include <linux/sort.h>
      49             : #include <linux/pfn.h>
      50             : #include <linux/backing-dev.h>
      51             : #include <linux/fault-inject.h>
      52             : #include <linux/page-isolation.h>
      53             : #include <linux/debugobjects.h>
      54             : #include <linux/kmemleak.h>
      55             : #include <linux/compaction.h>
      56             : #include <trace/events/kmem.h>
      57             : #include <trace/events/oom.h>
      58             : #include <linux/prefetch.h>
      59             : #include <linux/mm_inline.h>
      60             : #include <linux/mmu_notifier.h>
      61             : #include <linux/migrate.h>
      62             : #include <linux/hugetlb.h>
      63             : #include <linux/sched/rt.h>
      64             : #include <linux/sched/mm.h>
      65             : #include <linux/page_owner.h>
      66             : #include <linux/kthread.h>
      67             : #include <linux/memcontrol.h>
      68             : #include <linux/ftrace.h>
      69             : #include <linux/lockdep.h>
      70             : #include <linux/nmi.h>
      71             : #include <linux/psi.h>
      72             : #include <linux/padata.h>
      73             : #include <linux/khugepaged.h>
      74             : #include <linux/buffer_head.h>
      75             : 
      76             : #include <asm/sections.h>
      77             : #include <asm/tlbflush.h>
      78             : #include <asm/div64.h>
      79             : #include "internal.h"
      80             : #include "shuffle.h"
      81             : #include "page_reporting.h"
      82             : 
      83             : /* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
      84             : typedef int __bitwise fpi_t;
      85             : 
      86             : /* No special request */
      87             : #define FPI_NONE                ((__force fpi_t)0)
      88             : 
      89             : /*
      90             :  * Skip free page reporting notification for the (possibly merged) page.
      91             :  * This does not hinder free page reporting from grabbing the page,
      92             :  * reporting it and marking it "reported" -  it only skips notifying
      93             :  * the free page reporting infrastructure about a newly freed page. For
      94             :  * example, used when temporarily pulling a page from a freelist and
      95             :  * putting it back unmodified.
      96             :  */
      97             : #define FPI_SKIP_REPORT_NOTIFY  ((__force fpi_t)BIT(0))
      98             : 
      99             : /*
      100             :  * Place the (possibly merged) page at the tail of the freelist. Will ignore
     101             :  * page shuffling (relevant code - e.g., memory onlining - is expected to
     102             :  * shuffle the whole zone).
     103             :  *
     104             :  * Note: No code should rely on this flag for correctness - it's purely
     105             :  *       to allow for optimizations when handing back either fresh pages
     106             :  *       (memory onlining) or untouched pages (page isolation, free page
     107             :  *       reporting).
     108             :  */
     109             : #define FPI_TO_TAIL             ((__force fpi_t)BIT(1))
     110             : 
     111             : /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
     112             : static DEFINE_MUTEX(pcp_batch_high_lock);
     113             : #define MIN_PERCPU_PAGELIST_FRACTION    (8)
     114             : 
     115             : #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
     116             : DEFINE_PER_CPU(int, numa_node);
     117             : EXPORT_PER_CPU_SYMBOL(numa_node);
     118             : #endif
     119             : 
     120             : DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);
     121             : 
     122             : #ifdef CONFIG_HAVE_MEMORYLESS_NODES
     123             : /*
     124             :  * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
     125             :  * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
     126             :  * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
     127             :  * defined in <linux/topology.h>.
     128             :  */
     129             : DEFINE_PER_CPU(int, _numa_mem_);                /* Kernel "local memory" node */
     130             : EXPORT_PER_CPU_SYMBOL(_numa_mem_);
     131             : #endif
     132             : 
     133             : /* work_structs for global per-cpu drains */
     134             : struct pcpu_drain {
     135             :         struct zone *zone;
     136             :         struct work_struct work;
     137             : };
     138             : static DEFINE_MUTEX(pcpu_drain_mutex);
     139             : static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
     140             : 
     141             : #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
     142             : volatile unsigned long latent_entropy __latent_entropy;
     143             : EXPORT_SYMBOL(latent_entropy);
     144             : #endif
     145             : 
     146             : /*
     147             :  * Array of node states.
     148             :  */
     149             : nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
     150             :         [N_POSSIBLE] = NODE_MASK_ALL,
     151             :         [N_ONLINE] = { { [0] = 1UL } },
     152             : #ifndef CONFIG_NUMA
     153             :         [N_NORMAL_MEMORY] = { { [0] = 1UL } },
     154             : #ifdef CONFIG_HIGHMEM
     155             :         [N_HIGH_MEMORY] = { { [0] = 1UL } },
     156             : #endif
     157             :         [N_MEMORY] = { { [0] = 1UL } },
     158             :         [N_CPU] = { { [0] = 1UL } },
     159             : #endif  /* NUMA */
     160             : };
     161             : EXPORT_SYMBOL(node_states);
     162             : 
     163             : atomic_long_t _totalram_pages __read_mostly;
     164             : EXPORT_SYMBOL(_totalram_pages);
     165             : unsigned long totalreserve_pages __read_mostly;
     166             : unsigned long totalcma_pages __read_mostly;
     167             : 
     168             : int percpu_pagelist_fraction;
     169             : gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
     170             : DEFINE_STATIC_KEY_FALSE(init_on_alloc);
     171             : EXPORT_SYMBOL(init_on_alloc);
     172             : 
     173             : DEFINE_STATIC_KEY_FALSE(init_on_free);
     174             : EXPORT_SYMBOL(init_on_free);
     175             : 
     176             : static bool _init_on_alloc_enabled_early __read_mostly
     177             :                                 = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
     178           0 : static int __init early_init_on_alloc(char *buf)
     179             : {
     180             : 
     181           0 :         return kstrtobool(buf, &_init_on_alloc_enabled_early);
     182             : }
     183             : early_param("init_on_alloc", early_init_on_alloc);
     184             : 
     185             : static bool _init_on_free_enabled_early __read_mostly
     186             :                                 = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
     187           0 : static int __init early_init_on_free(char *buf)
     188             : {
     189           0 :         return kstrtobool(buf, &_init_on_free_enabled_early);
     190             : }
     191             : early_param("init_on_free", early_init_on_free);
     192             : 
     193             : /*
     194             :  * A cached value of the page's pageblock's migratetype, used when the page is
     195             :  * put on a pcplist. Used to avoid the pageblock migratetype lookup when
     196             :  * freeing from pcplists in most cases, at the cost of possibly becoming stale.
     197             :  * Also the migratetype set in the page does not necessarily match the pcplist
     198             :  * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
     199             :  * other index - this ensures that it will be put on the correct CMA freelist.
     200             :  */
     201      179298 : static inline int get_pcppage_migratetype(struct page *page)
     202             : {
     203      179298 :         return page->index;
     204             : }
     205             : 
     206      216485 : static inline void set_pcppage_migratetype(struct page *page, int migratetype)
     207             : {
     208      216485 :         page->index = migratetype;
     209       83160 : }
     210             : 
     211             : #ifdef CONFIG_PM_SLEEP
     212             : /*
     213             :  * The following functions are used by the suspend/hibernate code to temporarily
     214             :  * change gfp_allowed_mask in order to avoid using I/O during memory allocations
     215             :  * while devices are suspended.  To avoid races with the suspend/hibernate code,
     216             :  * they should always be called with system_transition_mutex held
     217             :  * (gfp_allowed_mask also should only be modified with system_transition_mutex
     218             :  * held, unless the suspend/hibernate code is guaranteed not to run in parallel
     219             :  * with that modification).
     220             :  */
     221             : 
     222             : static gfp_t saved_gfp_mask;
     223             : 
     224             : void pm_restore_gfp_mask(void)
     225             : {
     226             :         WARN_ON(!mutex_is_locked(&system_transition_mutex));
     227             :         if (saved_gfp_mask) {
     228             :                 gfp_allowed_mask = saved_gfp_mask;
     229             :                 saved_gfp_mask = 0;
     230             :         }
     231             : }
     232             : 
     233             : void pm_restrict_gfp_mask(void)
     234             : {
     235             :         WARN_ON(!mutex_is_locked(&system_transition_mutex));
     236             :         WARN_ON(saved_gfp_mask);
     237             :         saved_gfp_mask = gfp_allowed_mask;
     238             :         gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
     239             : }
     240             : 
     241             : bool pm_suspended_storage(void)
     242             : {
     243             :         if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
     244             :                 return false;
     245             :         return true;
     246             : }
     247             : #endif /* CONFIG_PM_SLEEP */
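
For context, a minimal sketch of how the suspend/hibernate side is expected to use the pair above, per the locking rule in the comment: the transition code holds system_transition_mutex around the restrict/restore calls. The function name below is illustrative only, not a real kernel entry point.

        /* Illustrative only: bracketing a suspend transition, per the comment above. */
        static int example_suspend_transition(void)
        {
                mutex_lock(&system_transition_mutex);
                pm_restrict_gfp_mask();         /* drop __GFP_IO and __GFP_FS */

                /* ... devices are suspended; allocations here must not start I/O ... */

                pm_restore_gfp_mask();          /* put the saved mask back, if any */
                mutex_unlock(&system_transition_mutex);
                return 0;
        }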
     248             : 
     249             : #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
     250             : unsigned int pageblock_order __read_mostly;
     251             : #endif
     252             : 
     253             : static void __free_pages_ok(struct page *page, unsigned int order,
     254             :                             fpi_t fpi_flags);
     255             : 
     256             : /*
     257             :  * results with 256, 32 in the lowmem_reserve sysctl:
     258             :  *      1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
     259             :  *      1G machine -> (16M dma, 784M normal, 224M high)
     260             :  *      NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
     261             :  *      HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
     262             :  *      HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
     263             :  *
     264             :  * TBD: should special case ZONE_DMA32 machines here - in those we normally
     265             :  * don't need any ZONE_NORMAL reservation
     266             :  */
     267             : int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
     268             : #ifdef CONFIG_ZONE_DMA
     269             :         [ZONE_DMA] = 256,
     270             : #endif
     271             : #ifdef CONFIG_ZONE_DMA32
     272             :         [ZONE_DMA32] = 256,
     273             : #endif
     274             :         [ZONE_NORMAL] = 32,
     275             : #ifdef CONFIG_HIGHMEM
     276             :         [ZONE_HIGHMEM] = 0,
     277             : #endif
     278             :         [ZONE_MOVABLE] = 0,
     279             : };
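
To make the ratio arithmetic in the comment above concrete, a small standalone example; the helper name and figures follow the 1G machine described there, and nothing below is taken from the kernel's actual reserve calculation.

        /* Standalone illustration of the lowmem_reserve ratio arithmetic. */
        #include <stdio.h>

        /* Reserve kept in a lower zone = size of the zones above it / that zone's ratio. */
        static unsigned long example_lowmem_reserve_kb(unsigned long higher_zones_kb,
                                                       unsigned long ratio)
        {
                return ratio ? higher_zones_kb / ratio : 0;     /* ratio 0 disables the reserve */
        }

        int main(void)
        {
                /* The 1G machine from the comment: 784M normal + 224M high above ZONE_DMA. */
                unsigned long higher_kb = (784UL + 224UL) * 1024;

                printf("HIGHMEM allocation keeps ~%lu KiB reserved in ZONE_DMA\n",
                       example_lowmem_reserve_kb(higher_kb, 256));      /* ~4032 KiB */
                return 0;
        }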
     280             : 
     281             : static char * const zone_names[MAX_NR_ZONES] = {
     282             : #ifdef CONFIG_ZONE_DMA
     283             :          "DMA",
     284             : #endif
     285             : #ifdef CONFIG_ZONE_DMA32
     286             :          "DMA32",
     287             : #endif
     288             :          "Normal",
     289             : #ifdef CONFIG_HIGHMEM
     290             :          "HighMem",
     291             : #endif
     292             :          "Movable",
     293             : #ifdef CONFIG_ZONE_DEVICE
     294             :          "Device",
     295             : #endif
     296             : };
     297             : 
     298             : const char * const migratetype_names[MIGRATE_TYPES] = {
     299             :         "Unmovable",
     300             :         "Movable",
     301             :         "Reclaimable",
     302             :         "HighAtomic",
     303             : #ifdef CONFIG_CMA
     304             :         "CMA",
     305             : #endif
     306             : #ifdef CONFIG_MEMORY_ISOLATION
     307             :         "Isolate",
     308             : #endif
     309             : };
     310             : 
     311             : compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
     312             :         [NULL_COMPOUND_DTOR] = NULL,
     313             :         [COMPOUND_PAGE_DTOR] = free_compound_page,
     314             : #ifdef CONFIG_HUGETLB_PAGE
     315             :         [HUGETLB_PAGE_DTOR] = free_huge_page,
     316             : #endif
     317             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
     318             :         [TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
     319             : #endif
     320             : };
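
A hedged sketch of how this table is consumed: when a compound page is destroyed, the index stored in the first tail page's compound_dtor field (set by prep_compound_page() further down) selects the destructor. The wrapper name below is invented for illustration.

        /* Illustrative only: dispatch through compound_page_dtors[]. */
        static inline void example_call_compound_dtor(struct page *page)
        {
                VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page);
                (*compound_page_dtors[page[1].compound_dtor])(page);
        }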
     321             : 
     322             : int min_free_kbytes = 1024;
     323             : int user_min_free_kbytes = -1;
     324             : #ifdef CONFIG_DISCONTIGMEM
     325             : /*
     326             :  * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges
     327             :  * are not on separate NUMA nodes. Functionally this works but with
     328             :  * watermark_boost_factor, it can reclaim prematurely as the ranges can be
     329             :  * quite small. By default, do not boost watermarks on discontigmem as in
     330             :  * many cases very high-order allocations like THP are likely to be
     331             :  * unsupported and the premature reclaim offsets the advantage of long-term
     332             :  * fragmentation avoidance.
     333             :  */
     334             : int watermark_boost_factor __read_mostly;
     335             : #else
     336             : int watermark_boost_factor __read_mostly = 15000;
     337             : #endif
     338             : int watermark_scale_factor = 10;
     339             : 
     340             : static unsigned long nr_kernel_pages __initdata;
     341             : static unsigned long nr_all_pages __initdata;
     342             : static unsigned long dma_reserve __initdata;
     343             : 
     344             : static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
     345             : static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
     346             : static unsigned long required_kernelcore __initdata;
     347             : static unsigned long required_kernelcore_percent __initdata;
     348             : static unsigned long required_movablecore __initdata;
     349             : static unsigned long required_movablecore_percent __initdata;
     350             : static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
     351             : static bool mirrored_kernelcore __meminitdata;
     352             : 
     353             : /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
     354             : int movable_zone;
     355             : EXPORT_SYMBOL(movable_zone);
     356             : 
     357             : #if MAX_NUMNODES > 1
     358             : unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
     359             : unsigned int nr_online_nodes __read_mostly = 1;
     360             : EXPORT_SYMBOL(nr_node_ids);
     361             : EXPORT_SYMBOL(nr_online_nodes);
     362             : #endif
     363             : 
     364             : int page_group_by_mobility_disabled __read_mostly;
     365             : 
     366             : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
     367             : /*
     368             :  * During boot we initialize deferred pages on-demand, as needed, but once
     369             :  * page_alloc_init_late() has finished, the deferred pages are all initialized,
     370             :  * and we can permanently disable that path.
     371             :  */
     372             : static DEFINE_STATIC_KEY_TRUE(deferred_pages);
     373             : 
     374             : /*
      375             :  * Call kasan_free_pages() only after deferred memory initialization
      376             :  * has completed. Poisoning pages during deferred memory init would greatly
      377             :  * lengthen the process and cause problems on large memory systems, as the
      378             :  * deferred page initialization is done with interrupts disabled.
     379             :  *
     380             :  * Assuming that there will be no reference to those newly initialized
     381             :  * pages before they are ever allocated, this should have no effect on
     382             :  * KASAN memory tracking as the poison will be properly inserted at page
     383             :  * allocation time. The only corner case is when pages are allocated by
     384             :  * on-demand allocation and then freed again before the deferred pages
     385             :  * initialization is done, but this is not likely to happen.
     386             :  */
     387             : static inline void kasan_free_nondeferred_pages(struct page *page, int order)
     388             : {
     389             :         if (!static_branch_unlikely(&deferred_pages))
     390             :                 kasan_free_pages(page, order);
     391             : }
     392             : 
     393             : /* Returns true if the struct page for the pfn is uninitialised */
     394             : static inline bool __meminit early_page_uninitialised(unsigned long pfn)
     395             : {
     396             :         int nid = early_pfn_to_nid(pfn);
     397             : 
     398             :         if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
     399             :                 return true;
     400             : 
     401             :         return false;
     402             : }
     403             : 
     404             : /*
     405             :  * Returns true when the remaining initialisation should be deferred until
     406             :  * later in the boot cycle when it can be parallelised.
     407             :  */
     408             : static bool __meminit
     409             : defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
     410             : {
     411             :         static unsigned long prev_end_pfn, nr_initialised;
     412             : 
     413             :         /*
      414             :          * prev_end_pfn is a static that holds the end of the previous zone.
      415             :          * No need to protect it: this runs very early in boot, before smp_init.
     416             :          */
     417             :         if (prev_end_pfn != end_pfn) {
     418             :                 prev_end_pfn = end_pfn;
     419             :                 nr_initialised = 0;
     420             :         }
     421             : 
     422             :         /* Always populate low zones for address-constrained allocations */
     423             :         if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
     424             :                 return false;
     425             : 
     426             :         if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
     427             :                 return true;
     428             :         /*
      429             :          * We start with only one section of pages; more pages are added as
      430             :          * needed until the rest of the deferred pages are initialized.
     431             :          */
     432             :         nr_initialised++;
     433             :         if ((nr_initialised > PAGES_PER_SECTION) &&
     434             :             (pfn & (PAGES_PER_SECTION - 1)) == 0) {
     435             :                 NODE_DATA(nid)->first_deferred_pfn = pfn;
     436             :                 return true;
     437             :         }
     438             :         return false;
     439             : }
     440             : #else
     441             : #define kasan_free_nondeferred_pages(p, o)      kasan_free_pages(p, o)
     442             : 
     443         233 : static inline bool early_page_uninitialised(unsigned long pfn)
     444             : {
     445         233 :         return false;
     446             : }
     447             : 
     448             : static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
     449             : {
     450             :         return false;
     451             : }
     452             : #endif
     453             : 
     454             : /* Return a pointer to the bitmap storing bits affecting a block of pages */
     455      148615 : static inline unsigned long *get_pageblock_bitmap(struct page *page,
     456             :                                                         unsigned long pfn)
     457             : {
     458             : #ifdef CONFIG_SPARSEMEM
     459      148615 :         return section_to_usemap(__pfn_to_section(pfn));
     460             : #else
     461             :         return page_zone(page)->pageblock_flags;
     462             : #endif /* CONFIG_SPARSEMEM */
     463             : }
     464             : 
     465      148615 : static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
     466             : {
     467             : #ifdef CONFIG_SPARSEMEM
     468      148615 :         pfn &= (PAGES_PER_SECTION-1);
     469             : #else
     470             :         pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
     471             : #endif /* CONFIG_SPARSEMEM */
     472      148615 :         return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
     473             : }
     474             : 
     475             : static __always_inline
     476      148322 : unsigned long __get_pfnblock_flags_mask(struct page *page,
     477             :                                         unsigned long pfn,
     478             :                                         unsigned long mask)
     479             : {
     480      148322 :         unsigned long *bitmap;
     481      148322 :         unsigned long bitidx, word_bitidx;
     482      148322 :         unsigned long word;
     483             : 
     484      148322 :         bitmap = get_pageblock_bitmap(page, pfn);
     485      148322 :         bitidx = pfn_to_bitidx(page, pfn);
     486      148322 :         word_bitidx = bitidx / BITS_PER_LONG;
     487      148322 :         bitidx &= (BITS_PER_LONG-1);
     488             : 
     489      148322 :         word = bitmap[word_bitidx];
     490      148322 :         return (word >> bitidx) & mask;
     491             : }
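
A standalone worked example of the index arithmetic above, under assumed constants (PAGES_PER_SECTION = 32768, pageblock_order = 9, NR_PAGEBLOCK_BITS = 4, 64-bit longs); the real values depend on the configuration and none are taken from this build.

        /* Standalone walk-through of pfn_to_bitidx()/__get_pfnblock_flags_mask(). */
        #include <stdio.h>

        #define EX_PAGES_PER_SECTION    32768UL         /* assumed */
        #define EX_PAGEBLOCK_ORDER      9               /* assumed: 2MiB pageblocks, 4KiB pages */
        #define EX_NR_PAGEBLOCK_BITS    4
        #define EX_BITS_PER_LONG        64

        int main(void)
        {
                unsigned long pfn = 0x12345;
                unsigned long off = pfn & (EX_PAGES_PER_SECTION - 1);           /* 0x2345 */
                unsigned long bitidx = (off >> EX_PAGEBLOCK_ORDER) * EX_NR_PAGEBLOCK_BITS;
                unsigned long word_bitidx = bitidx / EX_BITS_PER_LONG;

                bitidx &= (EX_BITS_PER_LONG - 1);
                /* pageblock 17 of the section -> bitidx 68 -> word 1, bit 4 of that word */
                printf("flags for pfn %#lx live in word %lu at bit %lu\n",
                       pfn, word_bitidx, bitidx);
                return 0;
        }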
     492             : 
     493             : /**
     494             :  * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
     495             :  * @page: The page within the block of interest
     496             :  * @pfn: The target page frame number
     497             :  * @mask: mask of bits that the caller is interested in
     498             :  *
     499             :  * Return: pageblock_bits flags
     500             :  */
     501          38 : unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
     502             :                                         unsigned long mask)
     503             : {
     504          38 :         return __get_pfnblock_flags_mask(page, pfn, mask);
     505             : }
     506             : 
     507      148284 : static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
     508             : {
     509      148284 :         return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
     510             : }
     511             : 
     512             : /**
     513             :  * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
     514             :  * @page: The page within the block of interest
     515             :  * @flags: The flags to set
     516             :  * @pfn: The target page frame number
     517             :  * @mask: mask of bits that the caller is interested in
     518             :  */
     519         293 : void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
     520             :                                         unsigned long pfn,
     521             :                                         unsigned long mask)
     522             : {
     523         293 :         unsigned long *bitmap;
     524         293 :         unsigned long bitidx, word_bitidx;
     525         293 :         unsigned long old_word, word;
     526             : 
     527         293 :         BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
     528         293 :         BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
     529             : 
     530         293 :         bitmap = get_pageblock_bitmap(page, pfn);
     531         293 :         bitidx = pfn_to_bitidx(page, pfn);
     532         293 :         word_bitidx = bitidx / BITS_PER_LONG;
     533         293 :         bitidx &= (BITS_PER_LONG-1);
     534             : 
     535         586 :         VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
     536             : 
     537         293 :         mask <<= bitidx;
     538         293 :         flags <<= bitidx;
     539             : 
     540         293 :         word = READ_ONCE(bitmap[word_bitidx]);
     541         293 :         for (;;) {
     542         293 :                 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
     543         293 :                 if (word == old_word)
     544             :                         break;
     545             :                 word = old_word;
     546             :         }
     547         293 : }
     548             : 
     549         293 : void set_pageblock_migratetype(struct page *page, int migratetype)
     550             : {
     551         293 :         if (unlikely(page_group_by_mobility_disabled &&
     552             :                      migratetype < MIGRATE_PCPTYPES))
     553           0 :                 migratetype = MIGRATE_UNMOVABLE;
     554             : 
     555         293 :         set_pfnblock_flags_mask(page, (unsigned long)migratetype,
     556         293 :                                 page_to_pfn(page), MIGRATETYPE_MASK);
     557         293 : }
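
A hedged round-trip sketch showing that the setter above and the __get_pfnblock_flags_mask() reader earlier in the file share the same bitmap and mask; the wrapper name is invented for illustration.

        /* Illustrative only: store a pageblock's migratetype, then read it back. */
        static inline int example_migratetype_roundtrip(struct page *page, int mt)
        {
                unsigned long pfn = page_to_pfn(page);

                set_pageblock_migratetype(page, mt);
                return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
        }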
     558             : 
     559             : #ifdef CONFIG_DEBUG_VM
     560      290654 : static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
     561             : {
     562      290654 :         int ret = 0;
     563      290654 :         unsigned seq;
     564      290654 :         unsigned long pfn = page_to_pfn(page);
     565      290654 :         unsigned long sp, start_pfn;
     566             : 
     567      290654 :         do {
     568      290654 :                 seq = zone_span_seqbegin(zone);
     569      290654 :                 start_pfn = zone->zone_start_pfn;
     570      290654 :                 sp = zone->spanned_pages;
     571      581316 :                 if (!zone_spans_pfn(zone, pfn))
     572           0 :                         ret = 1;
     573           0 :         } while (zone_span_seqretry(zone, seq));
     574             : 
     575           0 :         if (ret)
     576           0 :                 pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
     577             :                         pfn, zone_to_nid(zone), zone->name,
     578             :                         start_pfn, start_pfn + sp);
     579             : 
     580      290654 :         return ret;
     581             : }
     582             : 
     583      290662 : static int page_is_consistent(struct zone *zone, struct page *page)
     584             : {
     585      290662 :         if (!pfn_valid_within(page_to_pfn(page)))
     586             :                 return 0;
     587      290662 :         if (zone != page_zone(page))
     588           0 :                 return 0;
     589             : 
     590             :         return 1;
     591             : }
     592             : /*
     593             :  * Temporary debugging check for pages not lying within a given zone.
     594             :  */
     595      290653 : static int __maybe_unused bad_range(struct zone *zone, struct page *page)
     596             : {
     597      290653 :         if (page_outside_zone_boundaries(zone, page))
     598             :                 return 1;
     599      290662 :         if (!page_is_consistent(zone, page))
     600           0 :                 return 1;
     601             : 
     602             :         return 0;
     603             : }
     604             : #else
     605             : static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
     606             : {
     607             :         return 0;
     608             : }
     609             : #endif
     610             : 
     611           0 : static void bad_page(struct page *page, const char *reason)
     612             : {
     613           0 :         static unsigned long resume;
     614           0 :         static unsigned long nr_shown;
     615           0 :         static unsigned long nr_unshown;
     616             : 
     617             :         /*
     618             :          * Allow a burst of 60 reports, then keep quiet for that minute;
     619             :          * or allow a steady drip of one report per second.
     620             :          */
     621           0 :         if (nr_shown == 60) {
     622           0 :                 if (time_before(jiffies, resume)) {
     623           0 :                         nr_unshown++;
     624           0 :                         goto out;
     625             :                 }
     626           0 :                 if (nr_unshown) {
     627           0 :                         pr_alert(
     628             :                               "BUG: Bad page state: %lu messages suppressed\n",
     629             :                                 nr_unshown);
     630           0 :                         nr_unshown = 0;
     631             :                 }
     632           0 :                 nr_shown = 0;
     633             :         }
     634           0 :         if (nr_shown++ == 0)
     635           0 :                 resume = jiffies + 60 * HZ;
     636             : 
     637           0 :         pr_alert("BUG: Bad page state in process %s  pfn:%05lx\n",
     638             :                 current->comm, page_to_pfn(page));
     639           0 :         __dump_page(page, reason);
     640           0 :         dump_page_owner(page);
     641             : 
     642           0 :         print_modules();
     643           0 :         dump_stack();
     644           0 : out:
     645             :         /* Leave bad fields for debug, except PageBuddy could make trouble */
     646           0 :         page_mapcount_reset(page); /* remove PageBuddy */
     647           0 :         add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
     648           0 : }
     649             : 
     650             : /*
     651             :  * Higher-order pages are called "compound pages".  They are structured thusly:
     652             :  *
      653             :  * The first PAGE_SIZE page is called the "head page" and has PG_head set.
     654             :  *
     655             :  * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
      656             :  * in bit 0 of page->compound_head. The remaining bits point to the head page.
     657             :  *
     658             :  * The first tail page's ->compound_dtor holds the offset in array of compound
     659             :  * page destructors. See compound_page_dtors.
     660             :  *
     661             :  * The first tail page's ->compound_order holds the order of allocation.
     662             :  * This usage means that zero-order pages may not be compound.
     663             :  */
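
A minimal sketch of the tail-page encoding described above (the names below are invented; the real helpers live in <linux/page-flags.h>): the head pointer is stored with bit 0 set, so a tail test and head recovery are each a single operation.

        /* Illustrative only: bit 0 of ->compound_head marks a tail page. */
        static inline bool example_page_is_tail(const struct page *page)
        {
                return READ_ONCE(page->compound_head) & 1;
        }

        static inline struct page *example_compound_head(const struct page *page)
        {
                unsigned long head = READ_ONCE(page->compound_head);

                if (head & 1)
                        return (struct page *)(head - 1);       /* strip the tail bit */
                return (struct page *)page;                     /* already a head page */
        }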
     664             : 
     665          63 : void free_compound_page(struct page *page)
     666             : {
     667          63 :         mem_cgroup_uncharge(page);
     668          63 :         __free_pages_ok(page, compound_order(page), FPI_NONE);
     669          63 : }
     670             : 
     671       21783 : void prep_compound_page(struct page *page, unsigned int order)
     672             : {
     673       21783 :         int i;
     674       21783 :         int nr_pages = 1 << order;
     675             : 
     676       21783 :         __SetPageHead(page);
     677      129557 :         for (i = 1; i < nr_pages; i++) {
     678       85991 :                 struct page *p = page + i;
     679       85991 :                 set_page_count(p, 0);
     680       85991 :                 p->mapping = TAIL_MAPPING;
     681       85991 :                 set_compound_head(p, page);
     682             :         }
     683             : 
     684       21783 :         set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
     685       21783 :         set_compound_order(page, order);
     686       21783 :         atomic_set(compound_mapcount_ptr(page), -1);
     687       21783 :         if (hpage_pincount_available(page))
     688       10160 :                 atomic_set(compound_pincount_ptr(page), 0);
     689       21783 : }
     690             : 
     691             : #ifdef CONFIG_DEBUG_PAGEALLOC
     692             : unsigned int _debug_guardpage_minorder;
     693             : 
     694             : bool _debug_pagealloc_enabled_early __read_mostly
     695             :                         = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
     696             : EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
     697             : DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
     698             : EXPORT_SYMBOL(_debug_pagealloc_enabled);
     699             : 
     700             : DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
     701             : 
     702             : static int __init early_debug_pagealloc(char *buf)
     703             : {
     704             :         return kstrtobool(buf, &_debug_pagealloc_enabled_early);
     705             : }
     706             : early_param("debug_pagealloc", early_debug_pagealloc);
     707             : 
     708             : static int __init debug_guardpage_minorder_setup(char *buf)
     709             : {
     710             :         unsigned long res;
     711             : 
     712             :         if (kstrtoul(buf, 10, &res) < 0 ||  res > MAX_ORDER / 2) {
     713             :                 pr_err("Bad debug_guardpage_minorder value\n");
     714             :                 return 0;
     715             :         }
     716             :         _debug_guardpage_minorder = res;
     717             :         pr_info("Setting debug_guardpage_minorder to %lu\n", res);
     718             :         return 0;
     719             : }
     720             : early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
     721             : 
     722             : static inline bool set_page_guard(struct zone *zone, struct page *page,
     723             :                                 unsigned int order, int migratetype)
     724             : {
     725             :         if (!debug_guardpage_enabled())
     726             :                 return false;
     727             : 
     728             :         if (order >= debug_guardpage_minorder())
     729             :                 return false;
     730             : 
     731             :         __SetPageGuard(page);
     732             :         INIT_LIST_HEAD(&page->lru);
     733             :         set_page_private(page, order);
     734             :         /* Guard pages are not available for any usage */
     735             :         __mod_zone_freepage_state(zone, -(1 << order), migratetype);
     736             : 
     737             :         return true;
     738             : }
     739             : 
     740             : static inline void clear_page_guard(struct zone *zone, struct page *page,
     741             :                                 unsigned int order, int migratetype)
     742             : {
     743             :         if (!debug_guardpage_enabled())
     744             :                 return;
     745             : 
     746             :         __ClearPageGuard(page);
     747             : 
     748             :         set_page_private(page, 0);
     749             :         if (!is_migrate_isolate(migratetype))
     750             :                 __mod_zone_freepage_state(zone, (1 << order), migratetype);
     751             : }
     752             : #else
     753       60148 : static inline bool set_page_guard(struct zone *zone, struct page *page,
     754       60148 :                         unsigned int order, int migratetype) { return false; }
     755             : static inline void clear_page_guard(struct zone *zone, struct page *page,
     756             :                                 unsigned int order, int migratetype) {}
     757             : #endif
     758             : 
     759             : /*
     760             :  * Enable static keys related to various memory debugging and hardening options.
     761             :  * Some override others, and depend on early params that are evaluated in the
     762             :  * order of appearance. So we need to first gather the full picture of what was
     763             :  * enabled, and then make decisions.
     764             :  */
     765           1 : void init_mem_debugging_and_hardening(void)
     766             : {
     767           1 :         if (_init_on_alloc_enabled_early) {
     768           0 :                 if (page_poisoning_enabled())
     769             :                         pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
     770             :                                 "will take precedence over init_on_alloc\n");
     771             :                 else
     772           0 :                         static_branch_enable(&init_on_alloc);
     773             :         }
     774           1 :         if (_init_on_free_enabled_early) {
     775           0 :                 if (page_poisoning_enabled())
     776             :                         pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
     777             :                                 "will take precedence over init_on_free\n");
     778             :                 else
     779           0 :                         static_branch_enable(&init_on_free);
     780             :         }
     781             : 
     782             : #ifdef CONFIG_PAGE_POISONING
     783             :         /*
     784             :          * Page poisoning is debug page alloc for some arches. If
     785             :          * either of those options are enabled, enable poisoning.
     786             :          */
     787             :         if (page_poisoning_enabled() ||
     788             :              (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
     789             :               debug_pagealloc_enabled()))
     790             :                 static_branch_enable(&_page_poisoning_enabled);
     791             : #endif
     792             : 
     793             : #ifdef CONFIG_DEBUG_PAGEALLOC
     794             :         if (!debug_pagealloc_enabled())
     795             :                 return;
     796             : 
     797             :         static_branch_enable(&_debug_pagealloc_enabled);
     798             : 
     799             :         if (!debug_guardpage_minorder())
     800             :                 return;
     801             : 
     802             :         static_branch_enable(&_debug_guardpage_enabled);
     803             : #endif
     804           1 : }
     805             : 
     806       97598 : static inline void set_buddy_order(struct page *page, unsigned int order)
     807             : {
     808       97598 :         set_page_private(page, order);
     809       97598 :         __SetPageBuddy(page);
     810       97598 : }
     811             : 
     812             : /*
      813             :  * This function checks whether a page is free && is the buddy of another page.
      814             :  * We can coalesce a page and its buddy if
     815             :  * (a) the buddy is not in a hole (check before calling!) &&
     816             :  * (b) the buddy is in the buddy system &&
     817             :  * (c) a page and its buddy have the same order &&
     818             :  * (d) a page and its buddy are in the same zone.
     819             :  *
     820             :  * For recording whether a page is in the buddy system, we set PageBuddy.
     821             :  * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
     822             :  *
     823             :  * For recording page's order, we use page_private(page).
     824             :  */
     825       88465 : static inline bool page_is_buddy(struct page *page, struct page *buddy,
     826             :                                                         unsigned int order)
     827             : {
     828       88465 :         if (!page_is_guard(buddy) && !PageBuddy(buddy))
     829             :                 return false;
     830             : 
     831       22768 :         if (buddy_order(buddy) != order)
     832             :                 return false;
     833             : 
     834             :         /*
     835             :          * zone check is done late to avoid uselessly calculating
     836             :          * zone/node ids for pages that could never merge.
     837             :          */
     838       19774 :         if (page_zone_id(page) != page_zone_id(buddy))
     839             :                 return false;
     840             : 
     841       19774 :         VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
     842             : 
     843             :         return true;
     844             : }
     845             : 
     846             : #ifdef CONFIG_COMPACTION
     847       37450 : static inline struct capture_control *task_capc(struct zone *zone)
     848             : {
     849       37450 :         struct capture_control *capc = current->capture_control;
     850             : 
     851       37450 :         return unlikely(capc) &&
     852           0 :                 !(current->flags & PF_KTHREAD) &&
     853           0 :                 !capc->page &&
     854       37450 :                 capc->cc->zone == zone ? capc : NULL;
     855             : }
     856             : 
     857             : static inline bool
     858       51272 : compaction_capture(struct capture_control *capc, struct page *page,
     859             :                    int order, int migratetype)
     860             : {
     861           0 :         if (!capc || order != capc->cc->order)
     862             :                 return false;
     863             : 
      864             :         /* Do not accidentally pollute CMA or isolated regions */
     865           0 :         if (is_migrate_cma(migratetype) ||
     866           0 :             is_migrate_isolate(migratetype))
     867             :                 return false;
     868             : 
     869             :         /*
      870             :          * Do not let lower order allocations pollute a movable pageblock.
     871             :          * This might let an unmovable request use a reclaimable pageblock
     872             :          * and vice-versa but no more than normal fallback logic which can
     873             :          * have trouble finding a high-order free page.
     874             :          */
     875           0 :         if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
     876             :                 return false;
     877             : 
     878           0 :         capc->page = page;
     879           0 :         return true;
     880             : }
     881             : 
     882             : #else
     883             : static inline struct capture_control *task_capc(struct zone *zone)
     884             : {
     885             :         return NULL;
     886             : }
     887             : 
     888             : static inline bool
     889             : compaction_capture(struct capture_control *capc, struct page *page,
     890             :                    int order, int migratetype)
     891             : {
     892             :         return false;
     893             : }
     894             : #endif /* CONFIG_COMPACTION */
     895             : 
     896             : /* Used for pages not on another list */
     897       91611 : static inline void add_to_free_list(struct page *page, struct zone *zone,
     898             :                                     unsigned int order, int migratetype)
     899             : {
     900       91611 :         struct free_area *area = &zone->free_area[order];
     901             : 
     902       91611 :         list_add(&page->lru, &area->free_list[migratetype]);
     903       91611 :         area->nr_free++;
     904       31463 : }
     905             : 
     906             : /* Used for pages not on another list */
     907        5987 : static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
     908             :                                          unsigned int order, int migratetype)
     909             : {
     910        5987 :         struct free_area *area = &zone->free_area[order];
     911             : 
     912        5987 :         list_add_tail(&page->lru, &area->free_list[migratetype]);
     913        5987 :         area->nr_free++;
     914        5987 : }
     915             : 
     916             : /*
     917             :  * Used for pages which are on another list. Move the pages to the tail
     918             :  * of the list - so the moved pages won't immediately be considered for
     919             :  * allocation again (e.g., optimization for memory onlining).
     920             :  */
     921          38 : static inline void move_to_free_list(struct page *page, struct zone *zone,
     922             :                                      unsigned int order, int migratetype)
     923             : {
     924          38 :         struct free_area *area = &zone->free_area[order];
     925             : 
     926          38 :         list_move_tail(&page->lru, &area->free_list[migratetype]);
     927             : }
     928             : 
     929       97180 : static inline void del_page_from_free_list(struct page *page, struct zone *zone,
     930             :                                            unsigned int order)
     931             : {
     932             :         /* clear reported state and update reported page count */
     933       97180 :         if (page_reported(page))
     934           0 :                 __ClearPageReported(page);
     935             : 
     936       97180 :         list_del(&page->lru);
     937       97180 :         __ClearPageBuddy(page);
     938       97180 :         set_page_private(page, 0);
     939       97180 :         zone->free_area[order].nr_free--;
     940       97180 : }
     941             : 
     942             : /*
     943             :  * If this is not the largest possible page, check if the buddy
     944             :  * of the next-highest order is free. If it is, it's possible
     945             :  * that pages are being freed that will coalesce soon. In case,
      946             :  * that pages are being freed that will coalesce soon. In case
      947             :  * that is happening, add the free page to the tail of the list
     948             :  * as a higher order page
     949             :  */
     950             : static inline bool
     951       37217 : buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
     952             :                    struct page *page, unsigned int order)
     953             : {
     954       37217 :         struct page *higher_page, *higher_buddy;
     955       37217 :         unsigned long combined_pfn;
     956             : 
     957       37217 :         if (order >= MAX_ORDER - 2)
     958             :                 return false;
     959             : 
     960       37193 :         if (!pfn_valid_within(buddy_pfn))
     961             :                 return false;
     962             : 
     963       37193 :         combined_pfn = buddy_pfn & pfn;
     964       37193 :         higher_page = page + (combined_pfn - pfn);
     965       37193 :         buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
     966       37193 :         higher_buddy = higher_page + (buddy_pfn - combined_pfn);
     967             : 
     968       37193 :         return pfn_valid_within(buddy_pfn) &&
     969       37193 :                page_is_buddy(higher_page, higher_buddy, order + 1);
     970             : }
     971             : 
     972             : /*
     973             :  * Freeing function for a buddy system allocator.
     974             :  *
     975             :  * The concept of a buddy system is to maintain direct-mapped table
     976             :  * (containing bit values) for memory blocks of various "orders".
     977             :  * The bottom level table contains the map for the smallest allocatable
     978             :  * units of memory (here, pages), and each level above it describes
     979             :  * pairs of units from the levels below, hence, "buddies".
     980             :  * At a high level, all that happens here is marking the table entry
     981             :  * at the bottom level available, and propagating the changes upward
     982             :  * as necessary, plus some accounting needed to play nicely with other
     983             :  * parts of the VM system.
     984             :  * At each level, we keep a list of pages, which are heads of continuous
     985             :  * free pages of length of (1 << order) and marked with PageBuddy.
     986             :  * Page's order is recorded in page_private(page) field.
     987             :  * So when we are allocating or freeing one, we can derive the state of the
     988             :  * other.  That is, if we allocate a small block, and both were
     989             :  * free, the remainder of the region must be split into blocks.
     990             :  * If a block is freed, and its buddy is also free, then this
     991             :  * triggers coalescing into a block of larger size.
     992             :  *
     993             :  * -- nyc
     994             :  */
     995             : 
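
Before the free path itself, a standalone worked example of the buddy arithmetic that __free_one_page() relies on, assuming __find_buddy_pfn() is the usual XOR of bit 'order'; the pfn values are illustrative.

        /* Standalone illustration of the buddy/merge pfn arithmetic. */
        #include <stdio.h>

        /* Assumed to match the usual definition: flip bit 'order' of the pfn. */
        static unsigned long example_find_buddy_pfn(unsigned long pfn, unsigned int order)
        {
                return pfn ^ (1UL << order);
        }

        int main(void)
        {
                unsigned int order = 2;
                unsigned long pfn = 0x1234;                             /* order-2 block */
                unsigned long buddy_pfn = example_find_buddy_pfn(pfn, order);  /* 0x1230 */
                unsigned long combined_pfn = buddy_pfn & pfn;           /* merged block start */

                printf("buddy at %#lx, merged order-%u block starts at %#lx\n",
                       buddy_pfn, order + 1, combined_pfn);
                return 0;
        }
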
     996       37450 : static inline void __free_one_page(struct page *page,
     997             :                 unsigned long pfn,
     998             :                 struct zone *zone, unsigned int order,
     999             :                 int migratetype, fpi_t fpi_flags)
    1000             : {
    1001       37450 :         struct capture_control *capc = task_capc(zone);
    1002       37450 :         unsigned long buddy_pfn;
    1003       37450 :         unsigned long combined_pfn;
    1004       37450 :         unsigned int max_order;
    1005       37450 :         struct page *buddy;
    1006       37450 :         bool to_tail;
    1007             : 
    1008       37450 :         max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order);
    1009             : 
    1010       37450 :         VM_BUG_ON(!zone_is_initialized(zone));
    1011       37450 :         VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
    1012             : 
    1013       37450 :         VM_BUG_ON(migratetype == -1);
    1014       37450 :         if (likely(!is_migrate_isolate(migratetype)))
    1015       37450 :                 __mod_zone_freepage_state(zone, 1 << order, migratetype);
    1016             : 
    1017       37450 :         VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
    1018       37450 :         VM_BUG_ON_PAGE(bad_range(zone, page), page);
    1019             : 
    1020       37450 : continue_merging:
    1021       51470 :         while (order < max_order) {
    1022       51272 :                 if (compaction_capture(capc, page, order, migratetype)) {
    1023           0 :                         __mod_zone_freepage_state(zone, -(1 << order),
    1024             :                                                                 migratetype);
    1025           0 :                         return;
    1026             :                 }
    1027       51272 :                 buddy_pfn = __find_buddy_pfn(pfn, order);
    1028       51272 :                 buddy = page + (buddy_pfn - pfn);
    1029             : 
    1030       51272 :                 if (!pfn_valid_within(buddy_pfn))
    1031             :                         goto done_merging;
    1032       51272 :                 if (!page_is_buddy(page, buddy, order))
    1033       37252 :                         goto done_merging;
    1034             :                 /*
    1035             :                  * Our buddy is free or it is a CONFIG_DEBUG_PAGEALLOC guard
    1036             :                  * page; merge with it and move up one order.
    1037             :                  */
    1038       14020 :                 if (page_is_guard(buddy))
    1039       14020 :                         clear_page_guard(zone, buddy, order, migratetype);
    1040             :                 else
    1041       14020 :                         del_page_from_free_list(buddy, zone, order);
    1042       14020 :                 combined_pfn = buddy_pfn & pfn;
    1043       14020 :                 page = page + (combined_pfn - pfn);
    1044       14020 :                 pfn = combined_pfn;
    1045       14020 :                 order++;
    1046             :         }
    1047         198 :         if (order < MAX_ORDER - 1) {
    1048             :                 /* If we are here, it means order is >= pageblock_order.
    1049             :                  * We want to prevent merging between free pages on an isolate
    1050             :                  * pageblock and a normal pageblock. Without this, pageblock
    1051             :                  * isolation could cause incorrect free page or CMA accounting.
    1052             :                  *
    1053             :                  * We don't want to hit this code for the more frequent
    1054             :                  * low-order merging.
    1055             :                  */
    1056           0 :                 if (unlikely(has_isolate_pageblock(zone))) {
    1057             :                         int buddy_mt;
    1058             : 
    1059             :                         buddy_pfn = __find_buddy_pfn(pfn, order);
    1060             :                         buddy = page + (buddy_pfn - pfn);
    1061             :                         buddy_mt = get_pageblock_migratetype(buddy);
    1062             : 
    1063             :                         if (migratetype != buddy_mt
    1064             :                                         && (is_migrate_isolate(migratetype) ||
    1065             :                                                 is_migrate_isolate(buddy_mt)))
    1066             :                                 goto done_merging;
    1067             :                 }
    1068           0 :                 max_order = order + 1;
    1069           0 :                 goto continue_merging;
    1070             :         }
    1071             : 
    1072         198 : done_merging:
    1073       37450 :         set_buddy_order(page, order);
    1074             : 
    1075       37450 :         if (fpi_flags & FPI_TO_TAIL)
    1076             :                 to_tail = true;
    1077       37217 :         else if (is_shuffle_order(order))
    1078             :                 to_tail = shuffle_pick_tail();
    1079             :         else
    1080       37217 :                 to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
    1081             : 
    1082       37217 :         if (to_tail)
    1083        5987 :                 add_to_free_list_tail(page, zone, order, migratetype);
    1084             :         else
    1085       31463 :                 add_to_free_list(page, zone, order, migratetype);
    1086             : 
    1087             :         /* Notify page reporting subsystem of freed page */
    1088       37450 :         if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
    1089       37450 :                 page_reporting_notify_free(order);
    1090             : }
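
To make the buddy arithmetic in the comment above __free_one_page() concrete, here is a minimal, self-contained sketch (editorial illustration, not kernel code; the pfn and order values are made up). The buddy of a 2^order block is found by flipping the order-th bit of its pfn, and after a merge the combined block starts at the lower of the two pfns:

#include <stdio.h>

/* Toy model of the buddy arithmetic: mirrors the effect of
 * __find_buddy_pfn() (flip the order-th bit) and the
 * "combined_pfn = buddy_pfn & pfn" step in the listing above. */
unsigned long toy_find_buddy_pfn(unsigned long pfn, unsigned int order)
{
        return pfn ^ (1UL << order);
}

int main(void)
{
        unsigned long pfn = 0x148;      /* hypothetical order-3 aligned block */
        unsigned int order = 3;

        unsigned long buddy_pfn = toy_find_buddy_pfn(pfn, order); /* 0x140 */
        unsigned long combined  = buddy_pfn & pfn;                /* 0x140 */

        /* If both halves are free they coalesce into one order-4 block. */
        printf("buddy=%#lx merged=%#lx new order=%u\n",
               buddy_pfn, combined, order + 1);
        return 0;
}
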
    1091             : 
    1092             : /*
    1093             :  * A page can be bad due to any of a number of fields. Instead of multiple
    1094             :  * branches, check several fields with one combined test. The caller must do a
    1095             :  * detailed per-field check if necessary.
    1096             :  */
    1097      719859 : static inline bool page_expected_state(struct page *page,
    1098             :                                         unsigned long check_flags)
    1099             : {
    1100      719859 :         if (unlikely(atomic_read(&page->_mapcount) != -1))
    1101             :                 return false;
    1102             : 
    1103      719857 :         if (unlikely((unsigned long)page->mapping |
    1104             :                         page_ref_count(page) |
    1105             : #ifdef CONFIG_MEMCG
    1106             :                         (unsigned long)page_memcg(page) |
    1107             : #endif
    1108             :                         (page->flags & check_flags)))
    1109           0 :                 return false;
    1110             : 
    1111             :         return true;
    1112             : }
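
The single-branch trick used by page_expected_state() generalizes: OR together every field that must be zero/NULL so the common clean path costs one test. A minimal sketch with an invented struct (the type and field names below are assumptions for illustration, not kernel code):

struct thing {
        void *owner;            /* must be NULL  (cf. page->mapping)      */
        unsigned long refs;     /* must be 0     (cf. page_ref_count())   */
        unsigned long flags;    /* checked bits must be clear             */
};

int thing_expected_state(const struct thing *t, unsigned long check_flags)
{
        /* One combined test instead of three separate branches. */
        return ((unsigned long)t->owner |
                t->refs |
                (t->flags & check_flags)) == 0;
}
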
    1113             : 
    1114           0 : static const char *page_bad_reason(struct page *page, unsigned long flags)
    1115             : {
    1116           0 :         const char *bad_reason = NULL;
    1117             : 
    1118           0 :         if (unlikely(atomic_read(&page->_mapcount) != -1))
    1119           0 :                 bad_reason = "nonzero mapcount";
    1120           0 :         if (unlikely(page->mapping != NULL))
    1121           0 :                 bad_reason = "non-NULL mapping";
    1122           0 :         if (unlikely(page_ref_count(page) != 0))
    1123           0 :                 bad_reason = "nonzero _refcount";
    1124           0 :         if (unlikely(page->flags & flags)) {
    1125           0 :                 if (flags == PAGE_FLAGS_CHECK_AT_PREP)
    1126             :                         bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
    1127             :                 else
    1128           0 :                         bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
    1129             :         }
    1130             : #ifdef CONFIG_MEMCG
    1131             :         if (unlikely(page_memcg(page)))
    1132             :                 bad_reason = "page still charged to cgroup";
    1133             : #endif
    1134           0 :         return bad_reason;
    1135             : }
    1136             : 
    1137           0 : static void check_free_page_bad(struct page *page)
    1138             : {
    1139           0 :         bad_page(page,
    1140             :                  page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
    1141           0 : }
    1142             : 
    1143      428426 : static inline int check_free_page(struct page *page)
    1144             : {
    1145      428426 :         if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
    1146             :                 return 0;
    1147             : 
    1148             :         /* Something has gone sideways, find it */
    1149           0 :         check_free_page_bad(page);
    1150           0 :         return 1;
    1151             : }
    1152             : 
    1153       68885 : static int free_tail_pages_check(struct page *head_page, struct page *page)
    1154             : {
    1155       68885 :         int ret = 1;
    1156             : 
    1157             :         /*
    1158             :          * We rely on page->lru.next never having bit 0 set, unless the page
    1159             :          * is PageTail(). Let's make sure that's true even for a poisoned ->lru.
    1160             :          */
    1161       68885 :         BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
    1162             : 
    1163       68885 :         if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
    1164             :                 ret = 0;
    1165             :                 goto out;
    1166             :         }
    1167       68885 :         switch (page - head_page) {
    1168       13475 :         case 1:
    1169             :                 /* the first tail page: ->mapping may be compound_mapcount() */
    1170       13475 :                 if (unlikely(compound_mapcount(page))) {
    1171           0 :                         bad_page(page, "nonzero compound_mapcount");
    1172           0 :                         goto out;
    1173             :                 }
    1174             :                 break;
    1175             :         case 2:
    1176             :                 /*
    1177             :                  * the second tail page: ->mapping is
    1178             :                  * deferred_list.next -- ignore value.
    1179             :                  */
    1180             :                 break;
    1181       46729 :         default:
    1182       46729 :                 if (page->mapping != TAIL_MAPPING) {
    1183           0 :                         bad_page(page, "corrupted mapping in tail page");
    1184           0 :                         goto out;
    1185             :                 }
    1186             :                 break;
    1187             :         }
    1188       68885 :         if (unlikely(!PageTail(page))) {
    1189           0 :                 bad_page(page, "PageTail not set");
    1190           0 :                 goto out;
    1191             :         }
    1192      137770 :         if (unlikely(compound_head(page) != head_page)) {
    1193           0 :                 bad_page(page, "compound_head not consistent");
    1194           0 :                 goto out;
    1195             :         }
    1196             :         ret = 0;
    1197       68885 : out:
    1198       68885 :         page->mapping = NULL;
    1199       68885 :         clear_compound_head(page);
    1200       68885 :         return ret;
    1201             : }
    1202             : 
    1203       73017 : static void kernel_init_free_pages(struct page *page, int numpages)
    1204             : {
    1205       73017 :         int i;
    1206             : 
    1207             :         /* s390's use of memset() could override KASAN redzones. */
    1208       73017 :         kasan_disable_current();
    1209      230719 :         for (i = 0; i < numpages; i++) {
    1210       84685 :                 u8 tag = page_kasan_tag(page + i);
    1211       84685 :                 page_kasan_tag_reset(page + i);
    1212       84685 :                 clear_highpage(page + i);
    1213       84685 :                 page_kasan_tag_set(page + i, tag);
    1214             :         }
    1215       73017 :         kasan_enable_current();
    1216       73017 : }
    1217             : 
    1218      148328 : static __always_inline bool free_pages_prepare(struct page *page,
    1219             :                                         unsigned int order, bool check_free)
    1220             : {
    1221      148328 :         int bad = 0;
    1222             : 
    1223           0 :         VM_BUG_ON_PAGE(PageTail(page), page);
    1224             : 
    1225      148328 :         trace_mm_page_free(page, order);
    1226             : 
    1227      148341 :         if (unlikely(PageHWPoison(page)) && !order) {
    1228             :                 /*
    1229             :                  * Do not let hwpoison pages hit pcplists/buddy
    1230             :                  * Untie memcg state and reset page's owner
    1231             :                  */
    1232             :                 if (memcg_kmem_enabled() && PageMemcgKmem(page))
    1233             :                         __memcg_kmem_uncharge_page(page, order);
    1234             :                 reset_page_owner(page, order);
    1235             :                 return false;
    1236             :         }
    1237             : 
    1238             :         /*
    1239             :          * Check tail pages before head page information is cleared to
    1240             :          * avoid checking PageCompound for order-0 pages.
    1241             :          */
    1242       14959 :         if (unlikely(order)) {
    1243       14957 :                 bool compound = PageCompound(page);
    1244       14957 :                 int i;
    1245             : 
    1246       14957 :                 VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
    1247             : 
    1248       14957 :                 if (compound)
    1249       13475 :                         ClearPageDoubleMap(page);
    1250      295050 :                 for (i = 1; i < (1 << order); i++) {
    1251      280093 :                         if (compound)
    1252       68885 :                                 bad += free_tail_pages_check(page, page + i);
    1253      280093 :                         if (unlikely(check_free_page(page + i))) {
    1254           0 :                                 bad++;
    1255           0 :                                 continue;
    1256             :                         }
    1257      280093 :                         (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
    1258             :                 }
    1259             :         }
    1260      148341 :         if (PageMappingFlags(page))
    1261       65808 :                 page->mapping = NULL;
    1262      148341 :         if (memcg_kmem_enabled() && PageMemcgKmem(page))
    1263             :                 __memcg_kmem_uncharge_page(page, order);
    1264      148341 :         if (check_free)
    1265      148341 :                 bad += check_free_page(page);
    1266      148340 :         if (bad)
    1267             :                 return false;
    1268             : 
    1269      148340 :         page_cpupid_reset_last(page);
    1270      148340 :         page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
    1271      148340 :         reset_page_owner(page, order);
    1272             : 
    1273      148340 :         if (!PageHighMem(page)) {
    1274      148340 :                 debug_check_no_locks_freed(page_address(page),
    1275             :                                            PAGE_SIZE << order);
    1276      148343 :                 debug_check_no_obj_freed(page_address(page),
    1277             :                                            PAGE_SIZE << order);
    1278             :         }
    1279      148329 :         if (want_init_on_free())
    1280           0 :                 kernel_init_free_pages(page, 1 << order);
    1281             : 
    1282      148329 :         kernel_poison_pages(page, 1 << order);
    1283             : 
    1284             :         /*
    1285             :          * With hardware tag-based KASAN, memory tags must be set before the
    1286             :          * page becomes unavailable via debug_pagealloc or arch_free_page.
    1287             :          */
    1288      148329 :         kasan_free_nondeferred_pages(page, order);
    1289             : 
    1290             :         /*
    1291             :          * arch_free_page() can make the page's contents inaccessible.  s390
    1292             :          * does this.  So nothing which can access the page's contents should
    1293             :          * happen after this.
    1294             :          */
    1295      148329 :         arch_free_page(page, order);
    1296             : 
    1297      148329 :         debug_pagealloc_unmap_pages(page, 1 << order);
    1298             : 
    1299      133370 :         return true;
    1300             : }
    1301             : 
    1302             : #ifdef CONFIG_DEBUG_VM
    1303             : /*
    1304             :  * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed
    1305             :  * to pcp lists. With debug_pagealloc also enabled, they are rechecked when
    1306             :  * moved from pcp lists to free lists.
    1307             :  */
    1308      133369 : static bool free_pcp_prepare(struct page *page)
    1309             : {
    1310      133369 :         return free_pages_prepare(page, 0, true);
    1311             : }
    1312             : 
    1313       22491 : static bool bulkfree_pcp_prepare(struct page *page)
    1314             : {
    1315       22491 :         if (debug_pagealloc_enabled_static())
    1316             :                 return check_free_page(page);
    1317             :         else
    1318       22491 :                 return false;
    1319             : }
    1320             : #else
    1321             : /*
    1322             :  * With DEBUG_VM disabled, order-0 pages being freed are checked only when
    1323             :  * moving from pcp lists to free lists in order to reduce overhead. With
    1324             :  * debug_pagealloc enabled, they are also checked immediately when being freed
    1325             :  * to the pcp lists.
    1326             :  */
    1327             : static bool free_pcp_prepare(struct page *page)
    1328             : {
    1329             :         if (debug_pagealloc_enabled_static())
    1330             :                 return free_pages_prepare(page, 0, true);
    1331             :         else
    1332             :                 return free_pages_prepare(page, 0, false);
    1333             : }
    1334             : 
    1335             : static bool bulkfree_pcp_prepare(struct page *page)
    1336             : {
    1337             :         return check_free_page(page);
    1338             : }
    1339             : #endif /* CONFIG_DEBUG_VM */
    1340             : 
    1341       22491 : static inline void prefetch_buddy(struct page *page)
    1342             : {
    1343       22491 :         unsigned long pfn = page_to_pfn(page);
    1344       22491 :         unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0);
    1345       22491 :         struct page *buddy = page + (buddy_pfn - pfn);
    1346             : 
    1347       22491 :         prefetch(buddy);
    1348       22491 : }
    1349             : 
    1350             : /*
    1351             :  * Frees a number of pages from the PCP lists.
    1352             :  * Assumes all pages on the list are in the same zone and of the same order.
    1353             :  * count is the number of pages to free.
    1354             :  *
    1355             :  * If the zone was previously in an "all pages pinned" state then look to
    1356             :  * see if this freeing clears that state.
    1357             :  *
    1358             :  * And clear the zone's pages_scanned counter, to hold off the "all pages are
    1359             :  * pinned" detection logic.
    1360             :  */
    1361         357 : static void free_pcppages_bulk(struct zone *zone, int count,
    1362             :                                         struct per_cpu_pages *pcp)
    1363             : {
    1364         357 :         int migratetype = 0;
    1365         357 :         int batch_free = 0;
    1366         357 :         int prefetch_nr = READ_ONCE(pcp->batch);
    1367         357 :         bool isolated_pageblocks;
    1368         357 :         struct page *page, *tmp;
    1369         357 :         LIST_HEAD(head);
    1370             : 
    1371             :         /*
    1372             :          * Ensure a proper count is passed; otherwise we would get stuck in
    1373             :          * the while (list_empty(list)) loop below.
    1374             :          */
    1375         357 :         count = min(pcp->count, count);
    1376       14201 :         while (count) {
    1377       17133 :                 struct list_head *list;
    1378             : 
    1379             :                 /*
    1380             :                  * Remove pages from lists in a round-robin fashion. A
    1381             :                  * batch_free count is maintained that is incremented when an
    1382             :                  * empty list is encountered.  This is so more pages are freed
    1383             :                  * empty list is encountered.  This is so that more pages are
    1384             :                  * freed off fuller lists instead of spinning excessively around
    1385             :                  * empty lists.
    1386       17133 :                 do {
    1387       17133 :                         batch_free++;
    1388       17133 :                         if (++migratetype == MIGRATE_PCPTYPES)
    1389        5672 :                                 migratetype = 0;
    1390       17133 :                         list = &pcp->lists[migratetype];
    1391       17133 :                 } while (list_empty(list));
    1392             : 
    1393             :                 /* This is the only non-empty list. Free them all. */
    1394       13844 :                 if (batch_free == MIGRATE_PCPTYPES)
    1395         114 :                         batch_free = count;
    1396             : 
    1397       22491 :                 do {
    1398       22491 :                         page = list_last_entry(list, struct page, lru);
    1399             :                         /* must delete to avoid corrupting pcp list */
    1400       22491 :                         list_del(&page->lru);
    1401       22491 :                         pcp->count--;
    1402             : 
    1403       22491 :                         if (bulkfree_pcp_prepare(page))
    1404             :                                 continue;
    1405             : 
    1406       22491 :                         list_add_tail(&page->lru, &head);
    1407             : 
    1408             :                         /*
    1409             :                          * We are going to put the page back into the global
    1410             :                          * pool, so prefetch its buddy to speed up later access
    1411             :                          * under zone->lock. It is believed the overhead of
    1412             :                          * an additional test and calculating buddy_pfn here
    1413             :                          * can be offset by reduced memory latency later. To
    1414             :                          * avoid excessive prefetching due to a large count,
    1415             :                          * only prefetch the buddy for the first pcp->batch pages.
    1416             :                          */
    1417       22491 :                         if (prefetch_nr) {
    1418       22491 :                                 prefetch_buddy(page);
    1419       22491 :                                 prefetch_nr--;
    1420             :                         }
    1421       22491 :                 } while (--count && --batch_free && !list_empty(list));
    1422             :         }
    1423             : 
    1424         357 :         spin_lock(&zone->lock);
    1425         357 :         isolated_pageblocks = has_isolate_pageblock(zone);
    1426             : 
    1427             :         /*
    1428             :          * Use safe version since after __free_one_page(),
    1429             :          * page->lru.next will not point to original list.
    1430             :          */
    1431       22848 :         list_for_each_entry_safe(page, tmp, &head, lru) {
    1432       22491 :                 int mt = get_pcppage_migratetype(page);
    1433             :                 /* MIGRATE_ISOLATE page should not go to pcplists */
    1434       22491 :                 VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
    1435             :                 /* Pageblock could have been isolated meanwhile */
    1436       22491 :                 if (unlikely(isolated_pageblocks))
    1437             :                         mt = get_pageblock_migratetype(page);
    1438             : 
    1439       22491 :                 __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
    1440       22491 :                 trace_mm_page_pcpu_drain(page, 0, mt);
    1441             :         }
    1442         357 :         spin_unlock(&zone->lock);
    1443         357 : }
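
The round-robin draining described in the comment inside free_pcppages_bulk() can be sketched on its own. This is an illustrative model, not kernel code: three fake lists stand in for the per-migratetype pcp lists, and to_free is assumed to be clamped to the total page count across the lists, mirroring the min(pcp->count, count) step above.

#define NR_LISTS 3                      /* stands in for MIGRATE_PCPTYPES */

struct fake_list { int count; };

void drain_round_robin(struct fake_list lists[NR_LISTS], int to_free)
{
        int mt = 0, batch_free = 0;

        while (to_free) {
                struct fake_list *list;

                /* Skip empty lists, accumulating credit in batch_free so
                 * fuller lists give up proportionally more pages. */
                do {
                        batch_free++;
                        if (++mt == NR_LISTS)
                                mt = 0;
                        list = &lists[mt];
                } while (list->count == 0);

                /* Only one list is non-empty: just drain it. */
                if (batch_free == NR_LISTS)
                        batch_free = to_free;

                do {
                        list->count--;          /* "free" one page */
                } while (--to_free && --batch_free && list->count);
        }
}
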
    1444             : 
    1445       14959 : static void free_one_page(struct zone *zone,
    1446             :                                 struct page *page, unsigned long pfn,
    1447             :                                 unsigned int order,
    1448             :                                 int migratetype, fpi_t fpi_flags)
    1449             : {
    1450       14959 :         spin_lock(&zone->lock);
    1451       14959 :         if (unlikely(has_isolate_pageblock(zone) ||
    1452             :                 is_migrate_isolate(migratetype))) {
    1453             :                 migratetype = get_pfnblock_migratetype(page, pfn);
    1454             :         }
    1455       14959 :         __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
    1456       14959 :         spin_unlock(&zone->lock);
    1457       14959 : }
    1458             : 
    1459      262144 : static void __meminit __init_single_page(struct page *page, unsigned long pfn,
    1460             :                                 unsigned long zone, int nid)
    1461             : {
    1462      262144 :         mm_zero_struct_page(page);
    1463      262144 :         set_page_links(page, zone, nid, pfn);
    1464      262144 :         init_page_count(page);
    1465      262144 :         page_mapcount_reset(page);
    1466      262144 :         page_cpupid_reset_last(page);
    1467      262144 :         page_kasan_tag_reset(page);
    1468             : 
    1469      262144 :         INIT_LIST_HEAD(&page->lru);
    1470             : #ifdef WANT_PAGE_VIRTUAL
    1471             :         /* The shift won't overflow because ZONE_NORMAL is below 4G. */
    1472             :         if (!is_highmem_idx(zone))
    1473             :                 set_page_address(page, __va(pfn << PAGE_SHIFT));
    1474             : #endif
    1475      262144 : }
    1476             : 
    1477             : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
    1478             : static void __meminit init_reserved_page(unsigned long pfn)
    1479             : {
    1480             :         pg_data_t *pgdat;
    1481             :         int nid, zid;
    1482             : 
    1483             :         if (!early_page_uninitialised(pfn))
    1484             :                 return;
    1485             : 
    1486             :         nid = early_pfn_to_nid(pfn);
    1487             :         pgdat = NODE_DATA(nid);
    1488             : 
    1489             :         for (zid = 0; zid < MAX_NR_ZONES; zid++) {
    1490             :                 struct zone *zone = &pgdat->node_zones[zid];
    1491             : 
    1492             :                 if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
    1493             :                         break;
    1494             :         }
    1495             :         __init_single_page(pfn_to_page(pfn), pfn, zid, nid);
    1496             : }
    1497             : #else
    1498       59461 : static inline void init_reserved_page(unsigned long pfn)
    1499             : {
    1500       59461 : }
    1501             : #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
    1502             : 
    1503             : /*
    1504             :  * Initialised pages do not have PageReserved set. This function is
    1505             :  * called for each range allocated by the bootmem allocator and
    1506             :  * marks the pages PageReserved. The remaining valid pages are later
    1507             :  * sent to the buddy page allocator.
    1508             :  */
    1509          20 : void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
    1510             : {
    1511          20 :         unsigned long start_pfn = PFN_DOWN(start);
    1512          20 :         unsigned long end_pfn = PFN_UP(end);
    1513             : 
    1514       59481 :         for (; start_pfn < end_pfn; start_pfn++) {
    1515       59461 :                 if (pfn_valid(start_pfn)) {
    1516       59461 :                         struct page *page = pfn_to_page(start_pfn);
    1517             : 
    1518       59461 :                         init_reserved_page(start_pfn);
    1519             : 
    1520             :                         /* Avoid false-positive PageTail() */
    1521       59461 :                         INIT_LIST_HEAD(&page->lru);
    1522             : 
    1523             :                         /*
    1524             :                          * no need for atomic set_bit because the struct
    1525             :                          * page is not visible yet so nobody should
    1526             :                          * access it yet.
    1527             :                          */
    1528      118922 :                         __SetPageReserved(page);
    1529             :                 }
    1530             :         }
    1531          20 : }
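
The PFN_DOWN()/PFN_UP() pair above converts a byte range into the half-open pfn range that fully covers it. A quick sketch assuming a 4 KiB page size (the macro names below are local stand-ins for illustration, not the kernel's):

#include <stdio.h>

#define TOY_PAGE_SHIFT 12                               /* assume 4 KiB pages */
#define TOY_PFN_DOWN(x) ((x) >> TOY_PAGE_SHIFT)
#define TOY_PFN_UP(x)   (((x) + (1UL << TOY_PAGE_SHIFT) - 1) >> TOY_PAGE_SHIFT)

int main(void)
{
        unsigned long start = 0x1800, end = 0x5400;     /* made-up byte range */

        /* Pages 1..5 are covered; end_pfn is one past the last page. */
        printf("start_pfn=%lu end_pfn=%lu\n",
               TOY_PFN_DOWN(start), TOY_PFN_UP(end));   /* 1 and 6 */
        return 0;
}
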
    1532             : 
    1533       14959 : static void __free_pages_ok(struct page *page, unsigned int order,
    1534             :                             fpi_t fpi_flags)
    1535             : {
    1536       14959 :         unsigned long flags;
    1537       14959 :         int migratetype;
    1538       14959 :         unsigned long pfn = page_to_pfn(page);
    1539             : 
    1540       14959 :         if (!free_pages_prepare(page, order, true))
    1541             :                 return;
    1542             : 
    1543       14959 :         migratetype = get_pfnblock_migratetype(page, pfn);
    1544       29918 :         local_irq_save(flags);
    1545       14959 :         __count_vm_events(PGFREE, 1 << order);
    1546       14959 :         free_one_page(page_zone(page), page, pfn, order, migratetype,
    1547             :                       fpi_flags);
    1548       14959 :         local_irq_restore(flags);
    1549             : }
    1550             : 
    1551         233 : void __free_pages_core(struct page *page, unsigned int order)
    1552             : {
    1553         233 :         unsigned int nr_pages = 1 << order;
    1554         233 :         struct page *p = page;
    1555         233 :         unsigned int loop;
    1556             : 
    1557             :         /*
    1558             :          * When initializing the memmap, __init_single_page() sets the refcount
    1559             :          * of all pages to 1 ("allocated"/"not free"). We have to set the
    1560             :          * refcount of all involved pages to 0.
    1561             :          */
    1562         233 :         prefetchw(p);
    1563      202925 :         for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
    1564      202459 :                 prefetchw(p + 1);
    1565      202459 :                 __ClearPageReserved(p);
    1566      202459 :                 set_page_count(p, 0);
    1567             :         }
    1568         233 :         __ClearPageReserved(p);
    1569         233 :         set_page_count(p, 0);
    1570             : 
    1571         233 :         atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
    1572             : 
    1573             :         /*
    1574             :          * Bypass PCP and place fresh pages right to the tail, primarily
    1575             :          * relevant for memory onlining.
    1576             :          */
    1577         233 :         __free_pages_ok(page, order, FPI_TO_TAIL);
    1578         233 : }
    1579             : 
    1580             : #ifdef CONFIG_NEED_MULTIPLE_NODES
    1581             : 
    1582             : /*
    1583             :  * During memory init, memblocks map pfns to nids. The search is expensive, so
    1584             :  * this cache holds recent lookups. The implementation of __early_pfn_to_nid()
    1585             :  * treats start/end as pfns.
    1586             :  */
    1587             : struct mminit_pfnnid_cache {
    1588             :         unsigned long last_start;
    1589             :         unsigned long last_end;
    1590             :         int last_nid;
    1591             : };
    1592             : 
    1593             : static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
    1594             : 
    1595             : /*
    1596             :  * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
    1597             :  */
    1598           3 : static int __meminit __early_pfn_to_nid(unsigned long pfn,
    1599             :                                         struct mminit_pfnnid_cache *state)
    1600             : {
    1601           3 :         unsigned long start_pfn, end_pfn;
    1602           3 :         int nid;
    1603             : 
    1604           3 :         if (state->last_start <= pfn && pfn < state->last_end)
    1605           0 :                 return state->last_nid;
    1606             : 
    1607           3 :         nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
    1608           3 :         if (nid != NUMA_NO_NODE) {
    1609           1 :                 state->last_start = start_pfn;
    1610           1 :                 state->last_end = end_pfn;
    1611           1 :                 state->last_nid = nid;
    1612             :         }
    1613             : 
    1614             :         return nid;
    1615             : }
    1616             : 
    1617           3 : int __meminit early_pfn_to_nid(unsigned long pfn)
    1618             : {
    1619           3 :         static DEFINE_SPINLOCK(early_pfn_lock);
    1620           3 :         int nid;
    1621             : 
    1622           3 :         spin_lock(&early_pfn_lock);
    1623           3 :         nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
    1624           3 :         if (nid < 0)
    1625           2 :                 nid = first_online_node;
    1626           3 :         spin_unlock(&early_pfn_lock);
    1627             : 
    1628           3 :         return nid;
    1629             : }
    1630             : #endif /* CONFIG_NEED_MULTIPLE_NODES */
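
The mminit_pfnnid_cache above is a one-entry "remember the last range" cache. Stripped of the memblock details, the pattern looks like the following sketch (the helper signature and names are invented for illustration, not kernel API):

struct range_cache {
        unsigned long last_start;       /* half-open range [last_start, last_end) */
        unsigned long last_end;
        int last_val;
};

int cached_range_lookup(struct range_cache *c, unsigned long key,
                        int (*slow_lookup)(unsigned long key,
                                           unsigned long *start,
                                           unsigned long *end))
{
        unsigned long start, end;
        int val;

        /* Fast path: the key falls inside the range found last time. */
        if (c->last_start <= key && key < c->last_end)
                return c->last_val;

        val = slow_lookup(key, &start, &end);
        if (val >= 0) {                 /* cache only successful lookups */
                c->last_start = start;
                c->last_end = end;
                c->last_val = val;
        }
        return val;
}
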
    1631             : 
    1632         233 : void __init memblock_free_pages(struct page *page, unsigned long pfn,
    1633             :                                                         unsigned int order)
    1634             : {
    1635         233 :         if (early_page_uninitialised(pfn))
    1636             :                 return;
    1637         233 :         __free_pages_core(page, order);
    1638             : }
    1639             : 
    1640             : /*
    1641             :  * Check that the whole (or subset of) a pageblock given by the interval of
    1642             :  * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
    1643             :  * with the migration or free compaction scanner. The scanners then need to
    1644             :  * use only pfn_valid_within() check for arches that allow holes within
    1645             :  * pageblocks.
    1646             :  *
    1647             :  * Return struct page pointer of start_pfn, or NULL if checks were not passed.
    1648             :  *
    1649             :  * It's possible on some configurations to have a setup like node0 node1 node0
    1650             :  * i.e. it's possible that all pages within a zone's range of pages do not
    1651             :  * belong to a single zone. We assume that a border between node0 and node1
    1652             :  * can occur within a single pageblock, but not a node0 node1 node0
    1653             :  * interleaving within a single pageblock. It is therefore sufficient to check
    1654             :  * the first and last page of a pageblock and avoid checking each individual
    1655             :  * page in a pageblock.
    1656             :  */
    1657         256 : struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
    1658             :                                      unsigned long end_pfn, struct zone *zone)
    1659             : {
    1660         256 :         struct page *start_page;
    1661         256 :         struct page *end_page;
    1662             : 
    1663             :         /* end_pfn is one past the range we are checking */
    1664         256 :         end_pfn--;
    1665             : 
    1666         256 :         if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
    1667             :                 return NULL;
    1668             : 
    1669         256 :         start_page = pfn_to_online_page(start_pfn);
    1670         256 :         if (!start_page)
    1671             :                 return NULL;
    1672             : 
    1673         256 :         if (page_zone(start_page) != zone)
    1674             :                 return NULL;
    1675             : 
    1676         256 :         end_page = pfn_to_page(end_pfn);
    1677             : 
    1678             :         /* This gives a shorter code than deriving page_zone(end_page) */
    1679         256 :         if (page_zone_id(start_page) != page_zone_id(end_page))
    1680           0 :                 return NULL;
    1681             : 
    1682             :         return start_page;
    1683             : }
    1684             : 
    1685           1 : void set_zone_contiguous(struct zone *zone)
    1686             : {
    1687           1 :         unsigned long block_start_pfn = zone->zone_start_pfn;
    1688           1 :         unsigned long block_end_pfn;
    1689             : 
    1690           1 :         block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
    1691         257 :         for (; block_start_pfn < zone_end_pfn(zone);
    1692         256 :                         block_start_pfn = block_end_pfn,
    1693         256 :                          block_end_pfn += pageblock_nr_pages) {
    1694             : 
    1695         256 :                 block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
    1696             : 
    1697         256 :                 if (!__pageblock_pfn_to_page(block_start_pfn,
    1698             :                                              block_end_pfn, zone))
    1699             :                         return;
    1700         256 :                 cond_resched();
    1701             :         }
    1702             : 
    1703             :         /* We confirm that there is no hole */
    1704           1 :         zone->contiguous = true;
    1705             : }
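
The ALIGN(block_start_pfn + 1, pageblock_nr_pages) step above computes the next pageblock boundary strictly after the starting pfn, even when that pfn is already aligned. A small sketch, assuming pageblock_nr_pages is 512; the macro below mirrors the kernel's power-of-two ALIGN() but is a local stand-in:

#include <stdio.h>

#define TOY_ALIGN(x, a) (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
        unsigned long pageblock_nr_pages = 512;         /* assumed value */

        /* Start mid-block: advance to the next boundary. */
        printf("%lu\n", TOY_ALIGN(1000 + 1, pageblock_nr_pages)); /* 1024 */
        /* Start exactly on a boundary: the "+ 1" still advances. */
        printf("%lu\n", TOY_ALIGN(1024 + 1, pageblock_nr_pages)); /* 1536 */
        return 0;
}
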
    1706             : 
    1707           0 : void clear_zone_contiguous(struct zone *zone)
    1708             : {
    1709           0 :         zone->contiguous = false;
    1710           0 : }
    1711             : 
    1712             : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
    1713             : static void __init deferred_free_range(unsigned long pfn,
    1714             :                                        unsigned long nr_pages)
    1715             : {
    1716             :         struct page *page;
    1717             :         unsigned long i;
    1718             : 
    1719             :         if (!nr_pages)
    1720             :                 return;
    1721             : 
    1722             :         page = pfn_to_page(pfn);
    1723             : 
    1724             :         /* Free a large naturally-aligned chunk if possible */
    1725             :         if (nr_pages == pageblock_nr_pages &&
    1726             :             (pfn & (pageblock_nr_pages - 1)) == 0) {
    1727             :                 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
    1728             :                 __free_pages_core(page, pageblock_order);
    1729             :                 return;
    1730             :         }
    1731             : 
    1732             :         for (i = 0; i < nr_pages; i++, page++, pfn++) {
    1733             :                 if ((pfn & (pageblock_nr_pages - 1)) == 0)
    1734             :                         set_pageblock_migratetype(page, MIGRATE_MOVABLE);
    1735             :                 __free_pages_core(page, 0);
    1736             :         }
    1737             : }
    1738             : 
    1739             : /* Completion tracking for deferred_init_memmap() threads */
    1740             : static atomic_t pgdat_init_n_undone __initdata;
    1741             : static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
    1742             : 
    1743             : static inline void __init pgdat_init_report_one_done(void)
    1744             : {
    1745             :         if (atomic_dec_and_test(&pgdat_init_n_undone))
    1746             :                 complete(&pgdat_init_all_done_comp);
    1747             : }
    1748             : 
    1749             : /*
    1750             :  * Returns true if page needs to be initialized or freed to buddy allocator.
    1751             :  *
    1752             :  * First we check if pfn is valid on architectures where it is possible to have
    1753             :  * holes within pageblock_nr_pages. On systems where it is not possible, this
    1754             :  * function is optimized out.
    1755             :  *
    1756             :  * Then, we check if the current large page is valid by checking only the
    1757             :  * validity of the head pfn.
    1758             :  */
    1759             : static inline bool __init deferred_pfn_valid(unsigned long pfn)
    1760             : {
    1761             :         if (!pfn_valid_within(pfn))
    1762             :                 return false;
    1763             :         if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
    1764             :                 return false;
    1765             :         return true;
    1766             : }
    1767             : 
    1768             : /*
    1769             :  * Free pages to buddy allocator. Try to free aligned pages in
    1770             :  * pageblock_nr_pages sizes.
    1771             :  */
    1772             : static void __init deferred_free_pages(unsigned long pfn,
    1773             :                                        unsigned long end_pfn)
    1774             : {
    1775             :         unsigned long nr_pgmask = pageblock_nr_pages - 1;
    1776             :         unsigned long nr_free = 0;
    1777             : 
    1778             :         for (; pfn < end_pfn; pfn++) {
    1779             :                 if (!deferred_pfn_valid(pfn)) {
    1780             :                         deferred_free_range(pfn - nr_free, nr_free);
    1781             :                         nr_free = 0;
    1782             :                 } else if (!(pfn & nr_pgmask)) {
    1783             :                         deferred_free_range(pfn - nr_free, nr_free);
    1784             :                         nr_free = 1;
    1785             :                 } else {
    1786             :                         nr_free++;
    1787             :                 }
    1788             :         }
    1789             :         /* Free the last block of pages to allocator */
    1790             :         deferred_free_range(pfn - nr_free, nr_free);
    1791             : }
    1792             : 
    1793             : /*
    1794             :  * Initialize struct pages.  We minimize pfn page lookups and scheduler checks
    1795             :  * by performing them only once every pageblock_nr_pages pages.
    1796             :  * Return the number of pages initialized.
    1797             :  */
    1798             : static unsigned long  __init deferred_init_pages(struct zone *zone,
    1799             :                                                  unsigned long pfn,
    1800             :                                                  unsigned long end_pfn)
    1801             : {
    1802             :         unsigned long nr_pgmask = pageblock_nr_pages - 1;
    1803             :         int nid = zone_to_nid(zone);
    1804             :         unsigned long nr_pages = 0;
    1805             :         int zid = zone_idx(zone);
    1806             :         struct page *page = NULL;
    1807             : 
    1808             :         for (; pfn < end_pfn; pfn++) {
    1809             :                 if (!deferred_pfn_valid(pfn)) {
    1810             :                         page = NULL;
    1811             :                         continue;
    1812             :                 } else if (!page || !(pfn & nr_pgmask)) {
    1813             :                         page = pfn_to_page(pfn);
    1814             :                 } else {
    1815             :                         page++;
    1816             :                 }
    1817             :                 __init_single_page(page, pfn, zid, nid);
    1818             :                 nr_pages++;
    1819             :         }
    1820             :         return (nr_pages);
    1821             : }
    1822             : 
    1823             : /*
    1824             :  * This function is meant to pre-load the iterator for the zone init.
    1825             :  * Specifically, it walks through the ranges until we are caught up to the
    1826             :  * first_init_pfn value and exits there. If we never encounter the value, we
    1827             :  * return false, indicating there are no valid ranges left.
    1828             :  */
    1829             : static bool __init
    1830             : deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
    1831             :                                     unsigned long *spfn, unsigned long *epfn,
    1832             :                                     unsigned long first_init_pfn)
    1833             : {
    1834             :         u64 j;
    1835             : 
    1836             :         /*
    1837             :          * Start out by walking through the ranges in this zone that have
    1838             :          * already been initialized. We don't need to do anything with them
    1839             :          * so we just need to flush them out of the system.
    1840             :          */
    1841             :         for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
    1842             :                 if (*epfn <= first_init_pfn)
    1843             :                         continue;
    1844             :                 if (*spfn < first_init_pfn)
    1845             :                         *spfn = first_init_pfn;
    1846             :                 *i = j;
    1847             :                 return true;
    1848             :         }
    1849             : 
    1850             :         return false;
    1851             : }
    1852             : 
    1853             : /*
    1854             :  * Initialize and free pages. We do it in two loops: first we initialize
    1855             :  * struct page, then free them to the buddy allocator, because while we are
    1856             :  * freeing pages we can access pages that are ahead (computing the buddy
    1857             :  * page in __free_one_page()).
    1858             :  *
    1859             :  * In order to try to keep some memory in the cache, we have the loop
    1860             :  * broken along max page order boundaries. This way we will not cause
    1861             :  * any issues with the buddy page computation.
    1862             :  */
    1863             : static unsigned long __init
    1864             : deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
    1865             :                        unsigned long *end_pfn)
    1866             : {
    1867             :         unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
    1868             :         unsigned long spfn = *start_pfn, epfn = *end_pfn;
    1869             :         unsigned long nr_pages = 0;
    1870             :         u64 j = *i;
    1871             : 
    1872             :         /* First we loop through and initialize the page values */
    1873             :         for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
    1874             :                 unsigned long t;
    1875             : 
    1876             :                 if (mo_pfn <= *start_pfn)
    1877             :                         break;
    1878             : 
    1879             :                 t = min(mo_pfn, *end_pfn);
    1880             :                 nr_pages += deferred_init_pages(zone, *start_pfn, t);
    1881             : 
    1882             :                 if (mo_pfn < *end_pfn) {
    1883             :                         *start_pfn = mo_pfn;
    1884             :                         break;
    1885             :                 }
    1886             :         }
    1887             : 
    1888             :         /* Reset values and now loop through freeing pages as needed */
    1889             :         swap(j, *i);
    1890             : 
    1891             :         for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
    1892             :                 unsigned long t;
    1893             : 
    1894             :                 if (mo_pfn <= spfn)
    1895             :                         break;
    1896             : 
    1897             :                 t = min(mo_pfn, epfn);
    1898             :                 deferred_free_pages(spfn, t);
    1899             : 
    1900             :                 if (mo_pfn <= epfn)
    1901             :                         break;
    1902             :         }
    1903             : 
    1904             :         return nr_pages;
    1905             : }
    1906             : 
    1907             : static void __init
    1908             : deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
    1909             :                            void *arg)
    1910             : {
    1911             :         unsigned long spfn, epfn;
    1912             :         struct zone *zone = arg;
    1913             :         u64 i;
    1914             : 
    1915             :         deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
    1916             : 
    1917             :         /*
    1918             :          * Initialize and free pages in MAX_ORDER sized increments so that we
    1919             :          * can avoid introducing any issues with the buddy allocator.
    1920             :          */
    1921             :         while (spfn < end_pfn) {
    1922             :                 deferred_init_maxorder(&i, zone, &spfn, &epfn);
    1923             :                 cond_resched();
    1924             :         }
    1925             : }
    1926             : 
    1927             : /* An arch may override for more concurrency. */
    1928             : __weak int __init
    1929             : deferred_page_init_max_threads(const struct cpumask *node_cpumask)
    1930             : {
    1931             :         return 1;
    1932             : }
    1933             : 
    1934             : /* Initialise remaining memory on a node */
    1935             : static int __init deferred_init_memmap(void *data)
    1936             : {
    1937             :         pg_data_t *pgdat = data;
    1938             :         const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
    1939             :         unsigned long spfn = 0, epfn = 0;
    1940             :         unsigned long first_init_pfn, flags;
    1941             :         unsigned long start = jiffies;
    1942             :         struct zone *zone;
    1943             :         int zid, max_threads;
    1944             :         u64 i;
    1945             : 
    1946             :         /* Bind memory initialisation thread to a local node if possible */
    1947             :         if (!cpumask_empty(cpumask))
    1948             :                 set_cpus_allowed_ptr(current, cpumask);
    1949             : 
    1950             :         pgdat_resize_lock(pgdat, &flags);
    1951             :         first_init_pfn = pgdat->first_deferred_pfn;
    1952             :         if (first_init_pfn == ULONG_MAX) {
    1953             :                 pgdat_resize_unlock(pgdat, &flags);
    1954             :                 pgdat_init_report_one_done();
    1955             :                 return 0;
    1956             :         }
    1957             : 
    1958             :         /* Sanity check boundaries */
    1959             :         BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
    1960             :         BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
    1961             :         pgdat->first_deferred_pfn = ULONG_MAX;
    1962             : 
    1963             :         /*
    1964             :          * Once we unlock here, the zone cannot be grown anymore, thus if an
    1965             :          * interrupt thread must allocate this early in boot, zone must be
    1966             :          * pre-grown prior to start of deferred page initialization.
    1967             :          */
    1968             :         pgdat_resize_unlock(pgdat, &flags);
    1969             : 
    1970             :         /* Only the highest zone is deferred so find it */
    1971             :         for (zid = 0; zid < MAX_NR_ZONES; zid++) {
    1972             :                 zone = pgdat->node_zones + zid;
    1973             :                 if (first_init_pfn < zone_end_pfn(zone))
    1974             :                         break;
    1975             :         }
    1976             : 
    1977             :         /* If the zone is empty somebody else may have cleared out the zone */
    1978             :         if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
    1979             :                                                  first_init_pfn))
    1980             :                 goto zone_empty;
    1981             : 
    1982             :         max_threads = deferred_page_init_max_threads(cpumask);
    1983             : 
    1984             :         while (spfn < epfn) {
    1985             :                 unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
    1986             :                 struct padata_mt_job job = {
    1987             :                         .thread_fn   = deferred_init_memmap_chunk,
    1988             :                         .fn_arg      = zone,
    1989             :                         .start       = spfn,
    1990             :                         .size        = epfn_align - spfn,
    1991             :                         .align       = PAGES_PER_SECTION,
    1992             :                         .min_chunk   = PAGES_PER_SECTION,
    1993             :                         .max_threads = max_threads,
    1994             :                 };
    1995             : 
    1996             :                 padata_do_multithreaded(&job);
    1997             :                 deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
    1998             :                                                     epfn_align);
    1999             :         }
    2000             : zone_empty:
    2001             :         /* Sanity check that the next zone really is unpopulated */
    2002             :         WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
    2003             : 
    2004             :         pr_info("node %d deferred pages initialised in %ums\n",
    2005             :                 pgdat->node_id, jiffies_to_msecs(jiffies - start));
    2006             : 
    2007             :         pgdat_init_report_one_done();
    2008             :         return 0;
    2009             : }
    2010             : 
    2011             : /*
    2012             :  * If this zone has deferred pages, try to grow it by initializing enough
    2013             :  * deferred pages to satisfy the allocation specified by order, rounded up to
    2014             :  * the nearest PAGES_PER_SECTION boundary.  So we're adding memory in increments
    2015             :  * of SECTION_SIZE bytes by initializing struct pages in increments of
    2016             :  * PAGES_PER_SECTION * sizeof(struct page) bytes.
    2017             :  *
    2018             :  * Return true when zone was grown, otherwise return false. We return true even
    2019             :  * when we grow less than requested, to let the caller decide if there are
    2020             :  * enough pages to satisfy the allocation.
    2021             :  *
    2022             :  * Note: We use noinline because this function is needed only during boot, and
    2023             :  * it is called from a __ref function _deferred_grow_zone. This way we are
    2024             :  * making sure that it is not inlined into the permanent text section.
    2025             :  */
    2026             : static noinline bool __init
    2027             : deferred_grow_zone(struct zone *zone, unsigned int order)
    2028             : {
    2029             :         unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
    2030             :         pg_data_t *pgdat = zone->zone_pgdat;
    2031             :         unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
    2032             :         unsigned long spfn, epfn, flags;
    2033             :         unsigned long nr_pages = 0;
    2034             :         u64 i;
    2035             : 
    2036             :         /* Only the last zone may have deferred pages */
    2037             :         if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
    2038             :                 return false;
    2039             : 
    2040             :         pgdat_resize_lock(pgdat, &flags);
    2041             : 
    2042             :         /*
    2043             :          * If someone grew this zone while we were waiting for spinlock, return
    2044             :          * true, as there might be enough pages already.
    2045             :          */
    2046             :         if (first_deferred_pfn != pgdat->first_deferred_pfn) {
    2047             :                 pgdat_resize_unlock(pgdat, &flags);
    2048             :                 return true;
    2049             :         }
    2050             : 
    2051             :         /* If the zone is empty somebody else may have cleared out the zone */
    2052             :         if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
    2053             :                                                  first_deferred_pfn)) {
    2054             :                 pgdat->first_deferred_pfn = ULONG_MAX;
    2055             :                 pgdat_resize_unlock(pgdat, &flags);
    2056             :                 /* Retry only once. */
    2057             :                 return first_deferred_pfn != ULONG_MAX;
    2058             :         }
    2059             : 
    2060             :         /*
    2061             :          * Initialize and free pages in MAX_ORDER sized increments so
    2062             :          * that we can avoid introducing any issues with the buddy
    2063             :          * allocator.
    2064             :          */
    2065             :         while (spfn < epfn) {
    2066             :                 /* update our first deferred PFN for this section */
    2067             :                 first_deferred_pfn = spfn;
    2068             : 
    2069             :                 nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
    2070             :                 touch_nmi_watchdog();
    2071             : 
    2072             :                 /* We should only stop along section boundaries */
    2073             :                 if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
    2074             :                         continue;
    2075             : 
    2076             :                 /* If our quota has been met we can stop here */
    2077             :                 if (nr_pages >= nr_pages_needed)
    2078             :                         break;
    2079             :         }
    2080             : 
    2081             :         pgdat->first_deferred_pfn = spfn;
    2082             :         pgdat_resize_unlock(pgdat, &flags);
    2083             : 
    2084             :         return nr_pages > 0;
    2085             : }
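
As a rough standalone sketch of the rounding described above: a request of 1 << order pages is rounded up to a whole section, so the zone only ever grows in PAGES_PER_SECTION steps. This is plain userspace C, and the PAGES_PER_SECTION value is an assumed example, not taken from any particular kernel configuration.

/* Sketch of deferred_grow_zone()'s section rounding; constants are assumed. */
#include <stdio.h>

#define PAGES_PER_SECTION 32768UL              /* assumed: 128 MiB sections, 4 KiB pages */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        for (unsigned int order = 0; order <= 10; order += 5) {
                unsigned long requested = 1UL << order;
                unsigned long needed = ALIGN(requested, PAGES_PER_SECTION);

                printf("order %2u: %6lu pages requested -> grow by %lu pages\n",
                       order, requested, needed);
        }
        return 0;
}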
    2086             : 
    2087             : /*
    2088             :  * deferred_grow_zone() is __init, but it is called from
    2089             :  * get_page_from_freelist() during early boot until deferred_pages permanently
    2090             :  * disables this call. This is why we have the __ref wrapper: it avoids the
    2091             :  * section mismatch warning while still letting the function body be freed after boot.
    2092             :  */
    2093             : static bool __ref
    2094             : _deferred_grow_zone(struct zone *zone, unsigned int order)
    2095             : {
    2096             :         return deferred_grow_zone(zone, order);
    2097             : }
    2098             : 
    2099             : #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
    2100             : 
    2101           1 : void __init page_alloc_init_late(void)
    2102             : {
    2103           1 :         struct zone *zone;
    2104           1 :         int nid;
    2105             : 
    2106             : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
    2107             : 
    2108             :         /* There will be num_node_state(N_MEMORY) threads */
    2109             :         atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
    2110             :         for_each_node_state(nid, N_MEMORY) {
    2111             :                 kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
    2112             :         }
    2113             : 
    2114             :         /* Block until all are initialised */
    2115             :         wait_for_completion(&pgdat_init_all_done_comp);
    2116             : 
    2117             :         /*
    2118             :          * The number of managed pages has changed due to the initialisation
    2119             :          * so the pcpu batch and high limits need to be updated or the limits
    2120             :          * will be artificially small.
    2121             :          */
    2122             :         for_each_populated_zone(zone)
    2123             :                 zone_pcp_update(zone);
    2124             : 
    2125             :         /*
    2126             :          * We initialized the rest of the deferred pages.  Permanently disable
    2127             :          * on-demand struct page initialization.
    2128             :          */
    2129             :         static_branch_disable(&deferred_pages);
    2130             : 
    2131             :         /* Reinit limits that are based on free pages after the kernel is up */
    2132             :         files_maxfiles_init();
    2133             : #endif
    2134             : 
    2135           1 :         buffer_init();
    2136             : 
    2137             :         /* Discard memblock private memory */
    2138           1 :         memblock_discard();
    2139             : 
    2140           2 :         for_each_node_state(nid, N_MEMORY)
    2141           1 :                 shuffle_free_memory(NODE_DATA(nid));
    2142             : 
    2143           4 :         for_each_populated_zone(zone)
    2144           1 :                 set_zone_contiguous(zone);
    2145           1 : }
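
The deferred-init kickoff above follows a simple rendezvous pattern: one worker per memory node, a shared countdown, and a single waiter. A minimal userspace model of that pattern (pthreads, invented node count; not the kernel's kthread/completion code) might look like:

/* Userspace model of the pgdatinit rendezvous; node count is made up. */
#include <pthread.h>
#include <stdio.h>

#define NR_NODES 4

static int undone = NR_NODES;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t all_done = PTHREAD_COND_INITIALIZER;

static void *node_init(void *arg)
{
        long nid = (long)arg;

        printf("node %ld: deferred pages initialised\n", nid);

        pthread_mutex_lock(&lock);
        if (--undone == 0)                      /* models pgdat_init_report_one_done() */
                pthread_cond_signal(&all_done);
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t threads[NR_NODES];

        for (long nid = 0; nid < NR_NODES; nid++)
                pthread_create(&threads[nid], NULL, node_init, (void *)nid);

        pthread_mutex_lock(&lock);              /* models wait_for_completion() */
        while (undone)
                pthread_cond_wait(&all_done, &lock);
        pthread_mutex_unlock(&lock);

        for (long nid = 0; nid < NR_NODES; nid++)
                pthread_join(threads[nid], NULL);

        printf("all nodes initialised\n");
        return 0;
}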
    2146             : 
    2147             : #ifdef CONFIG_CMA
    2148             : /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
    2149             : void __init init_cma_reserved_pageblock(struct page *page)
    2150             : {
    2151             :         unsigned i = pageblock_nr_pages;
    2152             :         struct page *p = page;
    2153             : 
    2154             :         do {
    2155             :                 __ClearPageReserved(p);
    2156             :                 set_page_count(p, 0);
    2157             :         } while (++p, --i);
    2158             : 
    2159             :         set_pageblock_migratetype(page, MIGRATE_CMA);
    2160             : 
    2161             :         if (pageblock_order >= MAX_ORDER) {
    2162             :                 i = pageblock_nr_pages;
    2163             :                 p = page;
    2164             :                 do {
    2165             :                         set_page_refcounted(p);
    2166             :                         __free_pages(p, MAX_ORDER - 1);
    2167             :                         p += MAX_ORDER_NR_PAGES;
    2168             :                 } while (i -= MAX_ORDER_NR_PAGES);
    2169             :         } else {
    2170             :                 set_page_refcounted(page);
    2171             :                 __free_pages(page, pageblock_order);
    2172             :         }
    2173             : 
    2174             :         adjust_managed_page_count(page, pageblock_nr_pages);
    2175             :         page_zone(page)->cma_pages += pageblock_nr_pages;
    2176             : }
    2177             : #endif
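
For configurations where a pageblock is larger than the biggest buddy order, the loop above hands the block to the allocator in MAX_ORDER - 1 sized pieces. A standalone sketch of just that chunking arithmetic, with assumed example constants (a hypothetical pageblock four times the largest buddy block), is:

/* Chunking sketch for init_cma_reserved_pageblock(); sizes are assumed. */
#include <stdio.h>

#define MAX_ORDER          11
#define MAX_ORDER_NR_PAGES (1UL << (MAX_ORDER - 1))
#define PAGEBLOCK_NR_PAGES (4 * MAX_ORDER_NR_PAGES)   /* assumed oversized pageblock */

int main(void)
{
        unsigned long pfn = 0, remaining = PAGEBLOCK_NR_PAGES;

        do {
                printf("free pfn %5lu..%5lu as one order-%d block\n",
                       pfn, pfn + MAX_ORDER_NR_PAGES - 1, MAX_ORDER - 1);
                pfn += MAX_ORDER_NR_PAGES;
        } while (remaining -= MAX_ORDER_NR_PAGES);

        return 0;
}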
    2178             : 
    2179             : /*
    2180             :  * The order of subdivision here is critical for the IO subsystem.
    2181             :  * Please do not alter this order without good reasons and regression
    2182             :  * testing. Specifically, as large blocks of memory are subdivided,
    2183             :  * the order in which smaller blocks are delivered depends on the order
    2184             :  * they're subdivided in this function. This is the primary factor
    2185             :  * influencing the order in which pages are delivered to the IO
    2186             :  * subsystem according to empirical testing, and this is also justified
    2187             :  * by considering the behavior of a buddy system containing a single
    2188             :  * large block of memory acted on by a series of small allocations.
    2189             :  * This behavior is a critical factor in sglist merging's success.
    2190             :  *
    2191             :  * -- nyc
    2192             :  */
    2193       83160 : static inline void expand(struct zone *zone, struct page *page,
    2194             :         int low, int high, int migratetype)
    2195             : {
    2196       83160 :         unsigned long size = 1 << high;
    2197             : 
    2198      143308 :         while (high > low) {
    2199       60148 :                 high--;
    2200       60148 :                 size >>= 1;
    2201       60148 :                 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
    2202             : 
    2203             :                 /*
    2204             :                  * Mark as guard pages (or page), which allows them to be
    2205             :                  * merged back into the allocator when their buddy is freed.
    2206             :                  * The corresponding page table entries are not touched;
    2207             :                  * the pages stay not present in the virtual address space.
    2208             :                  */
    2209       60148 :                 if (set_page_guard(zone, &page[size], high, migratetype))
    2210             :                         continue;
    2211             : 
    2212       60148 :                 add_to_free_list(&page[size], zone, high, migratetype);
    2213       60148 :                 set_buddy_order(&page[size], high);
    2214             :         }
    2215       83160 : }
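
A minimal userspace model of the split performed by expand() may help: each halving peels off the upper buddy of the next lower order, and only the bottom piece of the requested order is returned. The pfn values here are invented and nothing is actually queued:

/* Model of expand(): split an order-`high` block down to order-`low`. */
#include <stdio.h>

static void expand_model(unsigned long base_pfn, int low, int high)
{
        unsigned long size = 1UL << high;

        while (high > low) {
                high--;
                size >>= 1;
                /* the upper half becomes a free buddy of order `high` */
                printf("queue pfn %lu as free order-%d buddy\n",
                       base_pfn + size, high);
        }
        printf("return pfn %lu as the order-%d allocation\n", base_pfn, low);
}

int main(void)
{
        expand_model(0, 0, 3);  /* order-3 block split to satisfy an order-0 request */
        return 0;
}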
    2216             : 
    2217           0 : static void check_new_page_bad(struct page *page)
    2218             : {
    2219           0 :         if (unlikely(page->flags & __PG_HWPOISON)) {
    2220             :                 /* Don't complain about hwpoisoned pages */
    2221             :                 page_mapcount_reset(page); /* remove PageBuddy */
    2222             :                 return;
    2223             :         }
    2224             : 
    2225           0 :         bad_page(page,
    2226             :                  page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
    2227             : }
    2228             : 
    2229             : /*
    2230             :  * This page is about to be returned from the page allocator
    2231             :  */
    2232      291457 : static inline int check_new_page(struct page *page)
    2233             : {
    2234      291457 :         if (likely(page_expected_state(page,
    2235             :                                 PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
    2236             :                 return 0;
    2237             : 
    2238           0 :         check_new_page_bad(page);
    2239           0 :         return 1;
    2240             : }
    2241             : 
    2242             : #ifdef CONFIG_DEBUG_VM
    2243             : /*
    2244             :  * With DEBUG_VM enabled, order-0 pages are checked for expected state when
    2245             :  * being allocated from pcp lists. With debug_pagealloc also enabled, they are
    2246             :  * also checked when pcp lists are refilled from the free lists.
    2247             :  */
    2248       59738 : static inline bool check_pcp_refill(struct page *page)
    2249             : {
    2250       59738 :         if (debug_pagealloc_enabled_static())
    2251             :                 return check_new_page(page);
    2252             :         else
    2253       59738 :                 return false;
    2254             : }
    2255             : 
    2256      169567 : static inline bool check_new_pcp(struct page *page)
    2257             : {
    2258      169567 :         return check_new_page(page);
    2259             : }
    2260             : #else
    2261             : /*
    2262             :  * With DEBUG_VM disabled, free order-0 pages are checked for expected state
    2263             :  * when pcp lists are being refilled from the free lists. With debug_pagealloc
    2264             :  * enabled, they are also checked when being allocated from the pcp lists.
    2265             :  */
    2266             : static inline bool check_pcp_refill(struct page *page)
    2267             : {
    2268             :         return check_new_page(page);
    2269             : }
    2270             : static inline bool check_new_pcp(struct page *page)
    2271             : {
    2272             :         if (debug_pagealloc_enabled_static())
    2273             :                 return check_new_page(page);
    2274             :         else
    2275             :                 return false;
    2276             : }
    2277             : #endif /* CONFIG_DEBUG_VM */
    2278             : 
    2279       23422 : static bool check_new_pages(struct page *page, unsigned int order)
    2280             : {
    2281       23422 :         int i;
    2282      145284 :         for (i = 0; i < (1 << order); i++) {
    2283      121862 :                 struct page *p = page + i;
    2284             : 
    2285      121862 :                 if (unlikely(check_new_page(p)))
    2286             :                         return true;
    2287             :         }
    2288             : 
    2289             :         return false;
    2290             : }
    2291             : 
    2292      193027 : inline void post_alloc_hook(struct page *page, unsigned int order,
    2293             :                                 gfp_t gfp_flags)
    2294             : {
    2295      193027 :         set_page_private(page, 0);
    2296      193027 :         set_page_refcounted(page);
    2297             : 
    2298      193031 :         arch_alloc_page(page, order);
    2299      193031 :         debug_pagealloc_map_pages(page, 1 << order);
    2300      193031 :         kasan_alloc_pages(page, order);
    2301      192995 :         kernel_unpoison_pages(page, 1 << order);
    2302      192995 :         set_page_owner(page, order, gfp_flags);
    2303             : 
    2304      192995 :         if (!want_init_on_free() && want_init_on_alloc(gfp_flags))
    2305       73017 :                 kernel_init_free_pages(page, 1 << order);
    2306      193015 : }
    2307             : 
    2308      193027 : static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
    2309             :                                                         unsigned int alloc_flags)
    2310             : {
    2311      193027 :         post_alloc_hook(page, order, gfp_flags);
    2312             : 
    2313      193025 :         if (order && (gfp_flags & __GFP_COMP))
    2314       21783 :                 prep_compound_page(page, order);
    2315             : 
    2316             :         /*
    2317             :          * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
    2318             :          * allocate the page. The expectation is that the caller is taking
    2319             :          * steps that will free more memory. The caller should avoid the page
    2320             :          * being used for !PFMEMALLOC purposes.
    2321             :          */
    2322      193025 :         if (alloc_flags & ALLOC_NO_WATERMARKS)
    2323           0 :                 set_page_pfmemalloc(page);
    2324             :         else
    2325      193025 :                 clear_page_pfmemalloc(page);
    2326      193025 : }
    2327             : 
    2328             : /*
    2329             :  * Go through the free lists for the given migratetype and remove
    2330             :  * the smallest available page from the freelists
    2331             :  */
    2332             : static __always_inline
    2333       83198 : struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
    2334             :                                                 int migratetype)
    2335             : {
    2336       83198 :         unsigned int current_order;
    2337       83198 :         struct free_area *area;
    2338       83198 :         struct page *page;
    2339             : 
    2340             :         /* Find a page of the appropriate size in the preferred list */
    2341      143685 :         for (current_order = order; current_order < MAX_ORDER; ++current_order) {
    2342      143647 :                 area = &(zone->free_area[current_order]);
    2343      143647 :                 page = get_page_from_free_area(area, migratetype);
    2344       83160 :                 if (!page)
    2345       60487 :                         continue;
    2346       83160 :                 del_page_from_free_list(page, zone, current_order);
    2347       83160 :                 expand(zone, page, order, current_order, migratetype);
    2348       83160 :                 set_pcppage_migratetype(page, migratetype);
    2349             :                 return page;
    2350             :         }
    2351             : 
    2352             :         return NULL;
    2353             : }
    2354             : 
    2355             : 
    2356             : /*
    2357             :  * This array describes the order lists are fallen back to when
    2358             :  * the free lists for the desirable migrate type are depleted
    2359             :  */
    2360             : static int fallbacks[MIGRATE_TYPES][3] = {
    2361             :         [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_TYPES },
    2362             :         [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
    2363             :         [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
    2364             : #ifdef CONFIG_CMA
    2365             :         [MIGRATE_CMA]         = { MIGRATE_TYPES }, /* Never used */
    2366             : #endif
    2367             : #ifdef CONFIG_MEMORY_ISOLATION
    2368             :         [MIGRATE_ISOLATE]     = { MIGRATE_TYPES }, /* Never used */
    2369             : #endif
    2370             : };
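
To make the table concrete, the sketch below walks one row of a fallbacks-style array until the MIGRATE_TYPES sentinel, the same way the fallback scan later in this file consumes it. The enum here is a stripped-down stand-in, not the kernel's full migratetype list:

/* Walking one row of a fallbacks[] table; simplified migratetype enum. */
#include <stdio.h>

enum { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_TYPES };

static const int fallbacks[MIGRATE_TYPES][3] = {
        [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_TYPES },
        [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
        [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
};

int main(void)
{
        int start = MIGRATE_MOVABLE;

        for (int i = 0; fallbacks[start][i] != MIGRATE_TYPES; i++)
                printf("movable allocation may fall back to migratetype %d\n",
                       fallbacks[start][i]);
        return 0;
}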
    2371             : 
    2372             : #ifdef CONFIG_CMA
    2373             : static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
    2374             :                                         unsigned int order)
    2375             : {
    2376             :         return __rmqueue_smallest(zone, order, MIGRATE_CMA);
    2377             : }
    2378             : #else
    2379             : static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
    2380             :                                         unsigned int order) { return NULL; }
    2381             : #endif
    2382             : 
    2383             : /*
    2384             :  * Move the free pages in a range to the freelist tail of the requested type.
    2385             :  * Note that start_page and end_page are not aligned on a pageblock
    2386             :  * boundary. If alignment is required, use move_freepages_block().
    2387             :  */
    2388           0 : static int move_freepages(struct zone *zone,
    2389             :                           struct page *start_page, struct page *end_page,
    2390             :                           int migratetype, int *num_movable)
    2391             : {
    2392           0 :         struct page *page;
    2393           0 :         unsigned int order;
    2394           0 :         int pages_moved = 0;
    2395             : 
    2396           0 :         for (page = start_page; page <= end_page;) {
    2397           0 :                 if (!pfn_valid_within(page_to_pfn(page))) {
    2398             :                         page++;
    2399             :                         continue;
    2400             :                 }
    2401             : 
    2402           0 :                 if (!PageBuddy(page)) {
    2403             :                         /*
    2404             :                          * We assume that pages that could be isolated for
    2405             :                          * migration are movable. But we don't actually try
    2406             :                          * isolating, as that would be expensive.
    2407             :                          */
    2408           0 :                         if (num_movable &&
    2409           0 :                                         (PageLRU(page) || __PageMovable(page)))
    2410           0 :                                 (*num_movable)++;
    2411             : 
    2412           0 :                         page++;
    2413           0 :                         continue;
    2414             :                 }
    2415             : 
    2416             :                 /* Make sure we are not inadvertently changing nodes */
    2417           0 :                 VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
    2418           0 :                 VM_BUG_ON_PAGE(page_zone(page) != zone, page);
    2419             : 
    2420           0 :                 order = buddy_order(page);
    2421           0 :                 move_to_free_list(page, zone, order, migratetype);
    2422           0 :                 page += 1 << order;
    2423           0 :                 pages_moved += 1 << order;
    2424             :         }
    2425             : 
    2426           0 :         return pages_moved;
    2427             : }
    2428             : 
    2429           0 : int move_freepages_block(struct zone *zone, struct page *page,
    2430             :                                 int migratetype, int *num_movable)
    2431             : {
    2432           0 :         unsigned long start_pfn, end_pfn;
    2433           0 :         struct page *start_page, *end_page;
    2434             : 
    2435           0 :         if (num_movable)
    2436           0 :                 *num_movable = 0;
    2437             : 
    2438           0 :         start_pfn = page_to_pfn(page);
    2439           0 :         start_pfn = start_pfn & ~(pageblock_nr_pages-1);
    2440           0 :         start_page = pfn_to_page(start_pfn);
    2441           0 :         end_page = start_page + pageblock_nr_pages - 1;
    2442           0 :         end_pfn = start_pfn + pageblock_nr_pages - 1;
    2443             : 
    2444             :         /* Do not cross zone boundaries */
    2445           0 :         if (!zone_spans_pfn(zone, start_pfn))
    2446           0 :                 start_page = page;
    2447           0 :         if (!zone_spans_pfn(zone, end_pfn))
    2448             :                 return 0;
    2449             : 
    2450           0 :         return move_freepages(zone, start_page, end_page, migratetype,
    2451             :                                                                 num_movable);
    2452             : }
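
The pageblock rounding used above is ordinary power-of-two masking. A tiny standalone illustration, with an assumed pageblock size of 512 pages and an arbitrary pfn:

/* Pageblock rounding as in move_freepages_block(); sizes are assumed. */
#include <stdio.h>

#define pageblock_nr_pages 512UL   /* assumed: order-9 pageblocks, 4 KiB pages */

int main(void)
{
        unsigned long pfn = 123456;
        unsigned long start_pfn = pfn & ~(pageblock_nr_pages - 1);
        unsigned long end_pfn = start_pfn + pageblock_nr_pages - 1;

        printf("pfn %lu lives in pageblock [%lu, %lu]\n", pfn, start_pfn, end_pfn);
        return 0;
}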
    2453             : 
    2454          38 : static void change_pageblock_range(struct page *pageblock_page,
    2455             :                                         int start_order, int migratetype)
    2456             : {
    2457          38 :         int nr_pageblocks = 1 << (start_order - pageblock_order);
    2458             : 
    2459          76 :         while (nr_pageblocks--) {
    2460          38 :                 set_pageblock_migratetype(pageblock_page, migratetype);
    2461          38 :                 pageblock_page += pageblock_nr_pages;
    2462             :         }
    2463          38 : }
    2464             : 
    2465             : /*
    2466             :  * When we are falling back to another migratetype during allocation, try to
    2467             :  * steal extra free pages from the same pageblocks to satisfy further
    2468             :  * allocations, instead of polluting multiple pageblocks.
    2469             :  *
    2470             :  * If we are stealing a relatively large buddy page, it is likely there will
    2471             :  * be more free pages in the pageblock, so try to steal them all. For
    2472             :  * reclaimable and unmovable allocations, we steal regardless of page size,
    2473             :  * as fragmentation caused by those allocations polluting movable pageblocks
    2474             :  * is worse than movable allocations stealing from unmovable and reclaimable
    2475             :  * pageblocks.
    2476             :  */
    2477          38 : static bool can_steal_fallback(unsigned int order, int start_mt)
    2478             : {
    2479             :         /*
    2480             :          * This order check is intentional even though the check below uses
    2481             :          * a more relaxed order threshold. The reason is that we can actually
    2482             :          * steal the whole pageblock if this condition is met, whereas the
    2483             :          * check below does not guarantee that; it is just a heuristic and
    2484             :          * could be changed at any time.
    2485             :          */
    2486          38 :         if (order >= pageblock_order)
    2487             :                 return true;
    2488             : 
    2489           0 :         if (order >= pageblock_order / 2 ||
    2490           0 :                 start_mt == MIGRATE_RECLAIMABLE ||
    2491           0 :                 start_mt == MIGRATE_UNMOVABLE ||
    2492             :                 page_group_by_mobility_disabled)
    2493           0 :                 return true;
    2494             : 
    2495             :         return false;
    2496             : }
    2497             : 
    2498           0 : static inline bool boost_watermark(struct zone *zone)
    2499             : {
    2500           0 :         unsigned long max_boost;
    2501             : 
    2502           0 :         if (!watermark_boost_factor)
    2503             :                 return false;
    2504             :         /*
    2505             :          * Don't bother in zones that are unlikely to produce results.
    2506             :          * On small machines, including kdump capture kernels running
    2507             :          * in a small area, boosting the watermark can cause an out of
    2508             :          * memory situation immediately.
    2509             :          */
    2510           0 :         if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
    2511             :                 return false;
    2512             : 
    2513           0 :         max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
    2514             :                         watermark_boost_factor, 10000);
    2515             : 
    2516             :         /*
    2517             :          * high watermark may be uninitialised if fragmentation occurs
    2518             :          * very early in boot so do not boost. We do not fall
    2519             :          * through and boost by pageblock_nr_pages as failing
    2520             :          * allocations that early means that reclaim is not going
    2521             :          * to help and it may even be impossible to reclaim the
    2522             :          * boosted watermark resulting in a hang.
    2523             :          */
    2524           0 :         if (!max_boost)
    2525             :                 return false;
    2526             : 
    2527           0 :         max_boost = max(pageblock_nr_pages, max_boost);
    2528             : 
    2529           0 :         zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
    2530             :                 max_boost);
    2531             : 
    2532           0 :         return true;
    2533             : }
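
A back-of-the-envelope model of the boost arithmetic may be useful: the cap is high watermark * watermark_boost_factor / 10000, never less than one pageblock, and each fallback event adds one pageblock worth of pages up to that cap. The numbers below are invented except the 15000 default for watermark_boost_factor, which is the upstream default at the time of writing:

/* Model of boost_watermark()'s arithmetic; watermark and sizes are assumed. */
#include <stdio.h>

#define mult_frac(x, num, den) ((x) / (den) * (num) + ((x) % (den)) * (num) / (den))

int main(void)
{
        unsigned long high_wmark = 12800;          /* assumed zone high watermark, in pages */
        unsigned long boost_factor = 15000;        /* watermark_boost_factor default (150%) */
        unsigned long pageblock_nr_pages = 512;    /* assumed */
        unsigned long boost = 0;

        unsigned long max_boost = mult_frac(high_wmark, boost_factor, 10000);

        if (max_boost < pageblock_nr_pages)
                max_boost = pageblock_nr_pages;

        for (int event = 0; event < 50; event++) {
                boost += pageblock_nr_pages;       /* one fallback event */
                if (boost > max_boost)
                        boost = max_boost;
        }
        printf("max_boost=%lu pages, boost after many fallbacks=%lu pages\n",
               max_boost, boost);
        return 0;
}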
    2534             : 
    2535             : /*
    2536             :  * This function implements the actual steal behaviour. If the order is large
    2537             :  * enough, we can steal the whole pageblock. If not, we first move the free
    2538             :  * pages in this pageblock to our migratetype and determine how many
    2539             :  * already-allocated pages in the pageblock have a compatible migratetype. If
    2540             :  * at least half of the pages are free or compatible, we can change the
    2541             :  * migratetype of the pageblock itself, so pages freed in the future go to the correct free list.
    2542             :  */
    2543          38 : static void steal_suitable_fallback(struct zone *zone, struct page *page,
    2544             :                 unsigned int alloc_flags, int start_type, bool whole_block)
    2545             : {
    2546          38 :         unsigned int current_order = buddy_order(page);
    2547          38 :         int free_pages, movable_pages, alike_pages;
    2548          38 :         int old_block_type;
    2549             : 
    2550          38 :         old_block_type = get_pageblock_migratetype(page);
    2551             : 
    2552             :         /*
    2553             :          * This can happen due to races and we want to prevent broken
    2554             :          * highatomic accounting.
    2555             :          */
    2556          38 :         if (is_migrate_highatomic(old_block_type))
    2557           0 :                 goto single_page;
    2558             : 
    2559             :         /* Take ownership for orders >= pageblock_order */
    2560          38 :         if (current_order >= pageblock_order) {
    2561          38 :                 change_pageblock_range(page, current_order, start_type);
    2562          38 :                 goto single_page;
    2563             :         }
    2564             : 
    2565             :         /*
    2566             :          * Boost watermarks to increase reclaim pressure to reduce the
    2567             :          * likelihood of future fallbacks. Wake kswapd now as the node
    2568             :          * may be balanced overall and kswapd will not wake naturally.
    2569             :          */
    2570           0 :         if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
    2571           0 :                 set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
    2572             : 
    2573             :         /* We are not allowed to try stealing from the whole block */
    2574           0 :         if (!whole_block)
    2575           0 :                 goto single_page;
    2576             : 
    2577           0 :         free_pages = move_freepages_block(zone, page, start_type,
    2578             :                                                 &movable_pages);
    2579             :         /*
    2580             :          * Determine how many pages are compatible with our allocation.
    2581             :          * For movable allocation, it's the number of movable pages which
    2582             :          * we just obtained. For other types it's a bit more tricky.
    2583             :          */
    2584           0 :         if (start_type == MIGRATE_MOVABLE) {
    2585           0 :                 alike_pages = movable_pages;
    2586             :         } else {
    2587             :                 /*
    2588             :                  * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
    2589             :                  * to MOVABLE pageblock, consider all non-movable pages as
    2590             :                  * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
    2591             :                  * vice versa, be conservative since we can't distinguish the
    2592             :                  * exact migratetype of non-movable pages.
    2593             :                  */
    2594           0 :                 if (old_block_type == MIGRATE_MOVABLE)
    2595           0 :                         alike_pages = pageblock_nr_pages
    2596           0 :                                                 - (free_pages + movable_pages);
    2597             :                 else
    2598             :                         alike_pages = 0;
    2599             :         }
    2600             : 
    2601             :         /* moving whole block can fail due to zone boundary conditions */
    2602           0 :         if (!free_pages)
    2603           0 :                 goto single_page;
    2604             : 
    2605             :         /*
    2606             :          * If a sufficient number of pages in the block are either free or of
    2607             :          * comparable migratability to our allocation, claim the whole block.
    2608             :          */
    2609           0 :         if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
    2610             :                         page_group_by_mobility_disabled)
    2611           0 :                 set_pageblock_migratetype(page, start_type);
    2612             : 
    2613           0 :         return;
    2614             : 
    2615          38 : single_page:
    2616          38 :         move_to_free_list(page, zone, current_order, start_type);
    2617             : }
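
The "at least half" rule at the end of steal_suitable_fallback() is easy to check numerically. A small sketch, with an assumed pageblock_order of 9 (512-page blocks) and made-up page counts:

/* Numeric check of the claim threshold; pageblock_order and counts assumed. */
#include <stdbool.h>
#include <stdio.h>

#define pageblock_order 9   /* assumed: 512-page pageblocks */

static bool should_claim_block(int free_pages, int alike_pages)
{
        return free_pages + alike_pages >= (1 << (pageblock_order - 1));
}

int main(void)
{
        printf("200 free + 80 alike -> claim? %d\n", should_claim_block(200, 80)); /* 280 >= 256 */
        printf("100 free + 50 alike -> claim? %d\n", should_claim_block(100, 50)); /* 150 <  256 */
        return 0;
}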
    2618             : 
    2619             : /*
    2620             :  * Check whether there is a suitable fallback freepage with requested order.
    2621             :  * If only_stealable is true, this function returns fallback_mt only if
    2622             :  * we can steal the other freepages altogether. This helps to reduce
    2623             :  * fragmentation due to mixed-migratetype pages in one pageblock.
    2624             :  */
    2625          38 : int find_suitable_fallback(struct free_area *area, unsigned int order,
    2626             :                         int migratetype, bool only_stealable, bool *can_steal)
    2627             : {
    2628          38 :         int i;
    2629          38 :         int fallback_mt;
    2630             : 
    2631          38 :         if (area->nr_free == 0)
    2632             :                 return -1;
    2633             : 
    2634          38 :         *can_steal = false;
    2635          76 :         for (i = 0;; i++) {
    2636          76 :                 fallback_mt = fallbacks[migratetype][i];
    2637          76 :                 if (fallback_mt == MIGRATE_TYPES)
    2638             :                         break;
    2639             : 
    2640          76 :                 if (free_area_empty(area, fallback_mt))
    2641          38 :                         continue;
    2642             : 
    2643          38 :                 if (can_steal_fallback(order, migratetype))
    2644          38 :                         *can_steal = true;
    2645             : 
    2646          38 :                 if (!only_stealable)
    2647          38 :                         return fallback_mt;
    2648             : 
    2649           0 :                 if (*can_steal)
    2650           0 :                         return fallback_mt;
    2651             :         }
    2652             : 
    2653             :         return -1;
    2654             : }
    2655             : 
    2656             : /*
    2657             :  * Reserve a pageblock for exclusive use of high-order atomic allocations if
    2658             :  * there are no empty page blocks that contain a page with a suitable order
    2659             :  */
    2660           0 : static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
    2661             :                                 unsigned int alloc_order)
    2662             : {
    2663           0 :         int mt;
    2664           0 :         unsigned long max_managed, flags;
    2665             : 
    2666             :         /*
    2667             :          * Limit the number reserved to roughly 1% of the zone plus one pageblock.
    2668             :          * The check is race-prone but harmless.
    2669             :          */
    2670           0 :         max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
    2671           0 :         if (zone->nr_reserved_highatomic >= max_managed)
    2672             :                 return;
    2673             : 
    2674           0 :         spin_lock_irqsave(&zone->lock, flags);
    2675             : 
    2676             :         /* Recheck the nr_reserved_highatomic limit under the lock */
    2677           0 :         if (zone->nr_reserved_highatomic >= max_managed)
    2678           0 :                 goto out_unlock;
    2679             : 
    2680             :         /* Yoink! */
    2681           0 :         mt = get_pageblock_migratetype(page);
    2682           0 :         if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt)
    2683             :             && !is_migrate_cma(mt)) {
    2684           0 :                 zone->nr_reserved_highatomic += pageblock_nr_pages;
    2685           0 :                 set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
    2686           0 :                 move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
    2687             :         }
    2688             : 
    2689           0 : out_unlock:
    2690           0 :         spin_unlock_irqrestore(&zone->lock, flags);
    2691             : }
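
The reservation cap above works out to roughly 1% of the zone plus one pageblock. A quick standalone calculation with assumed zone and pageblock sizes:

/* Back-of-the-envelope cap for highatomic reservations; sizes are assumed. */
#include <stdio.h>

int main(void)
{
        unsigned long managed_pages = 1UL << 20;    /* assumed: 4 GiB zone, 4 KiB pages */
        unsigned long pageblock_nr_pages = 512;     /* assumed */
        unsigned long reserved = 0;

        unsigned long max_managed = managed_pages / 100 + pageblock_nr_pages;

        while (reserved < max_managed)
                reserved += pageblock_nr_pages;     /* one pageblock per reservation */

        printf("cap = %lu pages, reservations stop at %lu pages (%lu blocks)\n",
               max_managed, reserved, reserved / pageblock_nr_pages);
        return 0;
}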
    2692             : 
    2693             : /*
    2694             :  * Used when an allocation is about to fail under memory pressure. This
    2695             :  * potentially hurts the reliability of high-order allocations when under
    2696             :  * intense memory pressure but failed atomic allocations should be easier
    2697             :  * to recover from than an OOM.
    2698             :  *
    2699             :  * If @force is true, try to unreserve pageblocks even when that would
    2700             :  * exhaust the last reserved highatomic pageblock.
    2701             :  */
    2702           0 : static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
    2703             :                                                 bool force)
    2704             : {
    2705           0 :         struct zonelist *zonelist = ac->zonelist;
    2706           0 :         unsigned long flags;
    2707           0 :         struct zoneref *z;
    2708           0 :         struct zone *zone;
    2709           0 :         struct page *page;
    2710           0 :         int order;
    2711           0 :         bool ret;
    2712             : 
    2713           0 :         for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
    2714             :                                                                 ac->nodemask) {
    2715             :                 /*
    2716             :                  * Preserve at least one pageblock unless memory pressure
    2717             :                  * is really high.
    2718             :                  */
    2719           0 :                 if (!force && zone->nr_reserved_highatomic <=
    2720             :                                         pageblock_nr_pages)
    2721           0 :                         continue;
    2722             : 
    2723           0 :                 spin_lock_irqsave(&zone->lock, flags);
    2724           0 :                 for (order = 0; order < MAX_ORDER; order++) {
    2725           0 :                         struct free_area *area = &(zone->free_area[order]);
    2726             : 
    2727           0 :                         page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
    2728           0 :                         if (!page)
    2729           0 :                                 continue;
    2730             : 
    2731             :                         /*
    2732             :                          * In the page freeing path, the migratetype change is
    2733             :                          * racy, so we can encounter several free pages in a
    2734             :                          * pageblock in this loop although we changed the
    2735             :                          * pageblock type from highatomic to ac->migratetype.
    2736             :                          * So we should adjust the count only once.
    2737             :                          */
    2738           0 :                         if (is_migrate_highatomic_page(page)) {
    2739             :                                 /*
    2740             :                                  * It should never happen but changes to
    2741             :                                  * locking could inadvertently allow a per-cpu
    2742             :                                  * drain to add pages to MIGRATE_HIGHATOMIC
    2743             :                                  * while unreserving so be safe and watch for
    2744             :                                  * underflows.
    2745             :                                  */
    2746           0 :                                 zone->nr_reserved_highatomic -= min(
    2747             :                                                 pageblock_nr_pages,
    2748             :                                                 zone->nr_reserved_highatomic);
    2749             :                         }
    2750             : 
    2751             :                         /*
    2752             :                          * Convert to ac->migratetype and avoid the normal
    2753             :                          * pageblock stealing heuristics. Minimally, the caller
    2754             :                          * is doing the work and needs the pages. More
    2755             :                          * importantly, if the block was always converted to
    2756             :                          * MIGRATE_UNMOVABLE or another type then the number
    2757             :                          * of pageblocks that cannot be completely freed
    2758             :                          * may increase.
    2759             :                          */
    2760           0 :                         set_pageblock_migratetype(page, ac->migratetype);
    2761           0 :                         ret = move_freepages_block(zone, page, ac->migratetype,
    2762             :                                                                         NULL);
    2763           0 :                         if (ret) {
    2764           0 :                                 spin_unlock_irqrestore(&zone->lock, flags);
    2765           0 :                                 return ret;
    2766             :                         }
    2767             :                 }
    2768           0 :                 spin_unlock_irqrestore(&zone->lock, flags);
    2769             :         }
    2770             : 
    2771             :         return false;
    2772             : }
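
The min()-based decrement above is a standard underflow guard: even if the racy accounting left the counter smaller than one pageblock, the subtraction cannot wrap. A tiny illustration with invented values:

/* Underflow-safe decrement as used when unreserving; values are invented. */
#include <stdio.h>

#define min(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
        unsigned long pageblock_nr_pages = 512;
        unsigned long nr_reserved_highatomic = 300;   /* racy leftover smaller than a block */

        nr_reserved_highatomic -= min(pageblock_nr_pages, nr_reserved_highatomic);

        printf("nr_reserved_highatomic after unreserve: %lu\n", nr_reserved_highatomic);
        return 0;
}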
    2773             : 
    2774             : /*
    2775             :  * Try finding a free buddy page on the fallback list and put it on the free
    2776             :  * list of requested migratetype, possibly along with other pages from the same
    2777             :  * block, depending on fragmentation avoidance heuristics. Returns true if
    2778             :  * fallback was found so that __rmqueue_smallest() can grab it.
    2779             :  *
    2780             :  * The use of signed ints for order and current_order is a deliberate
    2781             :  * deviation from the rest of this file, to make the for loop
    2782             :  * condition simpler.
    2783             :  */
    2784             : static __always_inline bool
    2785          38 : __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
    2786             :                                                 unsigned int alloc_flags)
    2787             : {
    2788          38 :         struct free_area *area;
    2789          38 :         int current_order;
    2790          38 :         int min_order = order;
    2791          38 :         struct page *page;
    2792          38 :         int fallback_mt;
    2793          38 :         bool can_steal;
    2794             : 
    2795             :         /*
    2796             :          * Do not steal pages from freelists belonging to other pageblocks
    2797             :          * i.e. orders < pageblock_order. If there are no local zones free,
    2798             :          * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
    2799             :          */
    2800          38 :         if (alloc_flags & ALLOC_NOFRAGMENT)
    2801           0 :                 min_order = pageblock_order;
    2802             : 
    2803             :         /*
    2804             :          * Find the largest available free page in the other list. This roughly
    2805             :          * approximates finding the pageblock with the most free pages, which
    2806             :          * would be too costly to do exactly.
    2807             :          */
    2808          38 :         for (current_order = MAX_ORDER - 1; current_order >= min_order;
    2809           0 :                                 --current_order) {
    2810          38 :                 area = &(zone->free_area[current_order]);
    2811          38 :                 fallback_mt = find_suitable_fallback(area, current_order,
    2812             :                                 start_migratetype, false, &can_steal);
    2813          38 :                 if (fallback_mt == -1)
    2814           0 :                         continue;
    2815             : 
    2816             :                 /*
    2817             :                  * If we cannot steal all free pages from the pageblock and
    2818             :                  * the requested migratetype is movable, it is better to
    2819             :                  * steal and split the smallest available page instead of the
    2820             :                  * largest one, because even if the next movable allocation
    2821             :                  * falls back into a different pageblock than this one, it
    2822             :                  * won't cause permanent fragmentation.
    2823             :                  */
    2824          38 :                 if (!can_steal && start_migratetype == MIGRATE_MOVABLE
    2825           0 :                                         && current_order > order)
    2826           0 :                         goto find_smallest;
    2827             : 
    2828          38 :                 goto do_steal;
    2829             :         }
    2830             : 
    2831             :         return false;
    2832             : 
    2833           0 : find_smallest:
    2834           0 :         for (current_order = order; current_order < MAX_ORDER;
    2835           0 :                                                         current_order++) {
    2836           0 :                 area = &(zone->free_area[current_order]);
    2837           0 :                 fallback_mt = find_suitable_fallback(area, current_order,
    2838             :                                 start_migratetype, false, &can_steal);
    2839           0 :                 if (fallback_mt != -1)
    2840             :                         break;
    2841             :         }
    2842             : 
    2843             :         /*
    2844             :          * This should not happen - we already found a suitable fallback
    2845             :          * when looking for the largest page.
    2846             :          */
    2847           0 :         VM_BUG_ON(current_order == MAX_ORDER);
    2848             : 
    2849           0 : do_steal:
    2850          38 :         page = get_page_from_free_area(area, fallback_mt);
    2851             : 
    2852          38 :         steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
    2853             :                                                                 can_steal);
    2854             : 
    2855          38 :         trace_mm_page_alloc_extfrag(page, order, current_order,
    2856             :                 start_migratetype, fallback_mt);
    2857             : 
    2858          38 :         return true;
    2859             : 
    2860             : }
    2861             : 
    2862             : /*
    2863             :  * Do the hard work of removing an element from the buddy allocator.
    2864             :  * Call me with the zone->lock already held.
    2865             :  */
    2866             : static __always_inline struct page *
    2867             : __rmqueue(struct zone *zone, unsigned int order, int migratetype,
    2868             :                                                 unsigned int alloc_flags)
    2869             : {
    2870             :         struct page *page;
    2871             : 
    2872             :         if (IS_ENABLED(CONFIG_CMA)) {
    2873             :                 /*
    2874             :                  * Balance movable allocations between regular and CMA areas by
    2875             :                  * allocating from CMA when over half of the zone's free memory
    2876             :                  * is in the CMA area.
    2877             :                  */
    2878             :                 if (alloc_flags & ALLOC_CMA &&
    2879             :                     zone_page_state(zone, NR_FREE_CMA_PAGES) >
    2880             :                     zone_page_state(zone, NR_FREE_PAGES) / 2) {
    2881             :                         page = __rmqueue_cma_fallback(zone, order);
    2882             :                         if (page)
    2883             :                                 goto out;
    2884             :                 }
    2885             :         }
    2886       59742 : retry:
    2887      166396 :         page = __rmqueue_smallest(zone, order, migratetype);
    2888       83198 :         if (unlikely(!page)) {
    2889          38 :                 if (alloc_flags & ALLOC_CMA)
    2890          38 :                         page = __rmqueue_cma_fallback(zone, order);
    2891             : 
    2892          38 :                 if (!page && __rmqueue_fallback(zone, order, migratetype,
    2893             :                                                                 alloc_flags))
    2894          38 :                         goto retry;
    2895             :         }
    2896       83160 : out:
    2897       83160 :         if (page)
    2898       83160 :                 trace_mm_page_alloc_zone_locked(page, order, migratetype);
    2899       59738 :         return page;
    2900             : }
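
The CMA balancing rule in __rmqueue() is just a comparison of two counters: movable allocations are steered to CMA once CMA holds more than half of the zone's free pages. A minimal numeric check, with made-up free-page counts:

/* Numeric check of the CMA balancing rule; counts are invented. */
#include <stdbool.h>
#include <stdio.h>

static bool prefer_cma(unsigned long free_cma, unsigned long free_total)
{
        return free_cma > free_total / 2;
}

int main(void)
{
        printf("free=1000, cma=600 -> try CMA first? %d\n", prefer_cma(600, 1000));
        printf("free=1000, cma=300 -> try CMA first? %d\n", prefer_cma(300, 1000));
        return 0;
}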
    2901             : 
    2902             : /*
    2903             :  * Obtain a specified number of elements from the buddy allocator, all under
    2904             :  * a single hold of the lock, for efficiency.  Add them to the supplied list.
    2905             :  * Returns the number of new pages which were placed at *list.
    2906             :  */
    2907        1086 : static int rmqueue_bulk(struct zone *zone, unsigned int order,
    2908             :                         unsigned long count, struct list_head *list,
    2909             :                         int migratetype, unsigned int alloc_flags)
    2910             : {
    2911        1086 :         int i, alloced = 0;
    2912             : 
    2913        1086 :         spin_lock(&zone->lock);
    2914       61910 :         for (i = 0; i < count; ++i) {
    2915       59742 :                 struct page *page = __rmqueue(zone, order, migratetype,
    2916             :                                                                 alloc_flags);
    2917       59738 :                 if (unlikely(page == NULL))
    2918             :                         break;
    2919             : 
    2920       59738 :                 if (unlikely(check_pcp_refill(page)))
    2921             :                         continue;
    2922             : 
    2923             :                 /*
    2924             :                  * Split buddy pages returned by expand() are received here in
    2925             :                  * physical page order. The page is added to the tail of the
    2926             :                  * caller's list. From the caller's perspective, the linked
    2927             :                  * list is therefore ordered by page number under some
    2928             :                  * conditions. This is useful for IO devices that walk the
    2929             :                  * list from the head, thus also in physical page order, and
    2930             :                  * for IO devices that can merge IO requests when the physical
    2931             :                  * pages are ordered properly.
    2932             :                  */
    2933       59738 :                 list_add_tail(&page->lru, list);
    2934       59738 :                 alloced++;
    2935       59738 :                 if (is_migrate_cma(get_pcppage_migratetype(page)))
    2936             :                         __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
    2937             :                                               -(1 << order));
    2938             :         }
    2939             : 
    2940             :         /*
    2941             :          * i pages were removed from the buddy list even if some leaked due
    2942             :          * to check_pcp_refill() failing, so adjust NR_FREE_PAGES based
    2943             :          * on i. Do not confuse this with 'alloced', which is the number of
    2944             :          * pages added to the pcp list.
    2945             :          */
    2946        1086 :         __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
    2947        1086 :         spin_unlock(&zone->lock);
    2948        1086 :         return alloced;
    2949             : }
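
A minimal userspace sketch of the same batching idea, pulling several elements in one critical section instead of taking the lock once per element; the pool structure and the pool_take_one()/pool_take_bulk() helpers are illustrative stand-ins, not kernel APIs:

#include <pthread.h>
#include <stddef.h>

struct item { struct item *next; };

struct pool {
        pthread_mutex_t lock;
        struct item *free;              /* singly linked free list */
        long nr_free;
};

/* Pop one element; the caller must already hold pool->lock. */
static struct item *pool_take_one(struct pool *p)
{
        struct item *it = p->free;

        if (it) {
                p->free = it->next;
                p->nr_free--;
        }
        return it;
}

/*
 * Grab up to 'count' items under a single lock hold and push them onto
 * '*out'; return how many were moved.  One lock round-trip per refill,
 * not per item, mirrors the purpose of rmqueue_bulk().
 */
static int pool_take_bulk(struct pool *p, int count, struct item **out)
{
        int i;

        pthread_mutex_lock(&p->lock);
        for (i = 0; i < count; i++) {
                struct item *it = pool_take_one(p);

                if (!it)
                        break;
                it->next = *out;
                *out = it;
        }
        pthread_mutex_unlock(&p->lock);
        return i;
}
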
    2950             : 
    2951             : #ifdef CONFIG_NUMA
    2952             : /*
    2953             :  * Called from the vmstat counter updater to drain pagesets of this
    2954             :  * currently executing processor on remote nodes after they have
    2955             :  * expired.
    2956             :  *
    2957             :  * Note that this function must be called with the thread pinned to
    2958             :  * a single processor.
    2959             :  */
    2960           0 : void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
    2961             : {
    2962           0 :         unsigned long flags;
    2963           0 :         int to_drain, batch;
    2964             : 
    2965           0 :         local_irq_save(flags);
    2966           0 :         batch = READ_ONCE(pcp->batch);
    2967           0 :         to_drain = min(pcp->count, batch);
    2968           0 :         if (to_drain > 0)
    2969           0 :                 free_pcppages_bulk(zone, to_drain, pcp);
    2970           0 :         local_irq_restore(flags);
    2971           0 : }
    2972             : #endif
    2973             : 
    2974             : /*
    2975             :  * Drain pcplists of the indicated processor and zone.
    2976             :  *
    2977             :  * The processor must either be the current processor and the
    2978             :  * thread pinned to the current processor or a processor that
    2979             :  * is not online.
    2980             :  */
    2981           0 : static void drain_pages_zone(unsigned int cpu, struct zone *zone)
    2982             : {
    2983           0 :         unsigned long flags;
    2984           0 :         struct per_cpu_pageset *pset;
    2985           0 :         struct per_cpu_pages *pcp;
    2986             : 
    2987           0 :         local_irq_save(flags);
    2988           0 :         pset = per_cpu_ptr(zone->pageset, cpu);
    2989             : 
    2990           0 :         pcp = &pset->pcp;
    2991           0 :         if (pcp->count)
    2992           0 :                 free_pcppages_bulk(zone, pcp->count, pcp);
    2993           0 :         local_irq_restore(flags);
    2994           0 : }
    2995             : 
    2996             : /*
    2997             :  * Drain pcplists of all zones on the indicated processor.
    2998             :  *
    2999             :  * The processor must either be the current processor and the
    3000             :  * thread pinned to the current processor or a processor that
    3001             :  * is not online.
    3002             :  */
    3003           0 : static void drain_pages(unsigned int cpu)
    3004             : {
    3005           0 :         struct zone *zone;
    3006             : 
    3007           0 :         for_each_populated_zone(zone) {
    3008           0 :                 drain_pages_zone(cpu, zone);
    3009             :         }
    3010           0 : }
    3011             : 
    3012             : /*
    3013             :  * Spill all of this CPU's per-cpu pages back into the buddy allocator.
    3014             :  *
    3015             :  * The CPU has to be pinned. When zone parameter is non-NULL, spill just
    3016             :  * the single zone's pages.
    3017             :  */
    3018           0 : void drain_local_pages(struct zone *zone)
    3019             : {
    3020           0 :         int cpu = smp_processor_id();
    3021             : 
    3022           0 :         if (zone)
    3023           0 :                 drain_pages_zone(cpu, zone);
    3024             :         else
    3025           0 :                 drain_pages(cpu);
    3026           0 : }
    3027             : 
    3028           0 : static void drain_local_pages_wq(struct work_struct *work)
    3029             : {
    3030           0 :         struct pcpu_drain *drain;
    3031             : 
    3032           0 :         drain = container_of(work, struct pcpu_drain, work);
    3033             : 
    3034             :         /*
    3035             :          * drain_all_pages doesn't use proper cpu hotplug protection, so
    3036             :          * we can race with cpu offline and the WQ may move this work from
    3037             :          * a cpu-pinned worker to an unbound one. Running on a different
    3038             :          * cpu than the one the work was queued on is all right, but we
    3039             :          * must not migrate to yet another cpu while draining.
    3040             :          */
    3041           0 :         preempt_disable();
    3042           0 :         drain_local_pages(drain->zone);
    3043           0 :         preempt_enable();
    3044           0 : }
    3045             : 
    3046             : /*
    3047             :  * The implementation of drain_all_pages(), exposing an extra parameter to
    3048             :  * drain on all cpus.
    3049             :  *
    3050             :  * drain_all_pages() is optimized to only execute on cpus where pcplists are
    3051             :  * not empty. The check for non-emptiness can however race with a free to
    3052             :  * pcplist that has not yet increased the pcp->count from 0 to 1. Callers
    3053             :  * that need the guarantee that every CPU has drained can disable the
    3054             :  * optimizing racy check.
    3055             :  */
    3056           0 : static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
    3057             : {
    3058           0 :         int cpu;
    3059             : 
    3060             :         /*
    3061             :          * Allocate in the BSS so we won't require allocation in the
    3062             :          * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y.
    3063             :          */
    3064           0 :         static cpumask_t cpus_with_pcps;
    3065             : 
    3066             :         /*
    3067             :          * Make sure nobody triggers this path before mm_percpu_wq is fully
    3068             :          * initialized.
    3069             :          */
    3070           0 :         if (WARN_ON_ONCE(!mm_percpu_wq))
    3071             :                 return;
    3072             : 
    3073             :         /*
    3074             :          * Do not drain if one is already in progress unless it's specific to
    3075             :          * a zone. Such callers are primarily CMA and memory hotplug and need
    3076             :          * the drain to be complete when the call returns.
    3077             :          */
    3078           0 :         if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
    3079           0 :                 if (!zone)
    3080             :                         return;
    3081           0 :                 mutex_lock(&pcpu_drain_mutex);
    3082             :         }
    3083             : 
    3084             :         /*
    3085             :          * We don't care about racing with a CPU hotplug event,
    3086             :          * as the offline notification will cause the notified
    3087             :          * cpu to drain its own pcps, and on_each_cpu_mask
    3088             :          * disables preemption as part of its processing.
    3089             :          */
    3090           0 :         for_each_online_cpu(cpu) {
    3091           0 :                 struct per_cpu_pageset *pcp;
    3092           0 :                 struct zone *z;
    3093           0 :                 bool has_pcps = false;
    3094             : 
    3095           0 :                 if (force_all_cpus) {
    3096             :                         /*
    3097             :                          * The pcp.count check is racy; some callers need a
    3098             :                          * guarantee that no cpu is missed.
    3099             :                          */
    3100             :                         has_pcps = true;
    3101           0 :                 } else if (zone) {
    3102           0 :                         pcp = per_cpu_ptr(zone->pageset, cpu);
    3103           0 :                         if (pcp->pcp.count)
    3104             :                                 has_pcps = true;
    3105             :                 } else {
    3106           0 :                         for_each_populated_zone(z) {
    3107           0 :                                 pcp = per_cpu_ptr(z->pageset, cpu);
    3108           0 :                                 if (pcp->pcp.count) {
    3109             :                                         has_pcps = true;
    3110             :                                         break;
    3111             :                                 }
    3112             :                         }
    3113             :                 }
    3114             : 
    3115           0 :                 if (has_pcps)
    3116           0 :                         cpumask_set_cpu(cpu, &cpus_with_pcps);
    3117             :                 else
    3118           0 :                         cpumask_clear_cpu(cpu, &cpus_with_pcps);
    3119             :         }
    3120             : 
    3121           0 :         for_each_cpu(cpu, &cpus_with_pcps) {
    3122           0 :                 struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
    3123             : 
    3124           0 :                 drain->zone = zone;
    3125           0 :                 INIT_WORK(&drain->work, drain_local_pages_wq);
    3126           0 :                 queue_work_on(cpu, mm_percpu_wq, &drain->work);
    3127             :         }
    3128           0 :         for_each_cpu(cpu, &cpus_with_pcps)
    3129           0 :                 flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
    3130             : 
    3131           0 :         mutex_unlock(&pcpu_drain_mutex);
    3132             : }
    3133             : 
    3134             : /*
    3135             :  * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
    3136             :  *
    3137             :  * When zone parameter is non-NULL, spill just the single zone's pages.
    3138             :  *
    3139             :  * Note that this can be extremely slow as the draining happens in a workqueue.
    3140             :  */
    3141           0 : void drain_all_pages(struct zone *zone)
    3142             : {
    3143           0 :         __drain_all_pages(zone, false);
    3144           0 : }
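
The optimization described above boils down to: scan every cpu's cache, remember which ones look non-empty, and only drain those, unless the caller forces a full drain because the emptiness check is racy. A single-threaded sketch of that shape, with a plain array standing in for the per-cpu pagesets and drain_one() for the queued work item (all names here are hypothetical):

#include <stdbool.h>

#define NR_CPUS_SKETCH 8

static int pcp_count[NR_CPUS_SKETCH];   /* per-cpu cached-object counts */

static void drain_one(int cpu)
{
        /* The kernel queues this as a work item on that cpu instead. */
        pcp_count[cpu] = 0;
}

/*
 * Drain the per-cpu caches.  The non-empty check may race with a
 * concurrent free, so callers that must not miss any cpu pass
 * force_all = true, mirroring __drain_all_pages(zone, force_all_cpus).
 */
static void drain_all_sketch(bool force_all)
{
        bool want[NR_CPUS_SKETCH];
        int cpu;

        for (cpu = 0; cpu < NR_CPUS_SKETCH; cpu++)
                want[cpu] = force_all || pcp_count[cpu] != 0;

        for (cpu = 0; cpu < NR_CPUS_SKETCH; cpu++)
                if (want[cpu])
                        drain_one(cpu);
}
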
    3145             : 
    3146             : #ifdef CONFIG_HIBERNATION
    3147             : 
    3148             : /*
    3149             :  * Touch the watchdog for every WD_PAGE_COUNT pages.
    3150             :  */
    3151             : #define WD_PAGE_COUNT   (128*1024)
    3152             : 
    3153             : void mark_free_pages(struct zone *zone)
    3154             : {
    3155             :         unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
    3156             :         unsigned long flags;
    3157             :         unsigned int order, t;
    3158             :         struct page *page;
    3159             : 
    3160             :         if (zone_is_empty(zone))
    3161             :                 return;
    3162             : 
    3163             :         spin_lock_irqsave(&zone->lock, flags);
    3164             : 
    3165             :         max_zone_pfn = zone_end_pfn(zone);
    3166             :         for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
    3167             :                 if (pfn_valid(pfn)) {
    3168             :                         page = pfn_to_page(pfn);
    3169             : 
    3170             :                         if (!--page_count) {
    3171             :                                 touch_nmi_watchdog();
    3172             :                                 page_count = WD_PAGE_COUNT;
    3173             :                         }
    3174             : 
    3175             :                         if (page_zone(page) != zone)
    3176             :                                 continue;
    3177             : 
    3178             :                         if (!swsusp_page_is_forbidden(page))
    3179             :                                 swsusp_unset_page_free(page);
    3180             :                 }
    3181             : 
    3182             :         for_each_migratetype_order(order, t) {
    3183             :                 list_for_each_entry(page,
    3184             :                                 &zone->free_area[order].free_list[t], lru) {
    3185             :                         unsigned long i;
    3186             : 
    3187             :                         pfn = page_to_pfn(page);
    3188             :                         for (i = 0; i < (1UL << order); i++) {
    3189             :                                 if (!--page_count) {
    3190             :                                         touch_nmi_watchdog();
    3191             :                                         page_count = WD_PAGE_COUNT;
    3192             :                                 }
    3193             :                                 swsusp_set_page_free(pfn_to_page(pfn + i));
    3194             :                         }
    3195             :                 }
    3196             :         }
    3197             :         spin_unlock_irqrestore(&zone->lock, flags);
    3198             : }
    3199             : #endif /* CONFIG_HIBERNATION */
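
mark_free_pages() walks every pfn in the zone with the zone lock held, so it pokes the NMI watchdog every WD_PAGE_COUNT iterations to avoid lockup warnings. The countdown pattern on its own, with poke_watchdog() as a hypothetical stand-in for touch_nmi_watchdog():

#define WD_COUNT_SKETCH (128 * 1024)

static void poke_watchdog(void) { /* stand-in for touch_nmi_watchdog() */ }

static void visit(unsigned long i) { (void)i; /* per-element work */ }

/* Long linear walk that resets its countdown every WD_COUNT_SKETCH steps. */
static void long_walk(unsigned long n)
{
        unsigned long i, budget = WD_COUNT_SKETCH;

        for (i = 0; i < n; i++) {
                if (!--budget) {
                        poke_watchdog();
                        budget = WD_COUNT_SKETCH;
                }
                visit(i);
        }
}
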
    3200             : 
    3201      133369 : static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
    3202             : {
    3203      133369 :         int migratetype;
    3204             : 
    3205      133369 :         if (!free_pcp_prepare(page))
    3206             :                 return false;
    3207             : 
    3208      133325 :         migratetype = get_pfnblock_migratetype(page, pfn);
    3209      133325 :         set_pcppage_migratetype(page, migratetype);
    3210      133325 :         return true;
    3211             : }
    3212             : 
    3213      133385 : static void free_unref_page_commit(struct page *page, unsigned long pfn)
    3214             : {
    3215      133385 :         struct zone *zone = page_zone(page);
    3216      133385 :         struct per_cpu_pages *pcp;
    3217      133385 :         int migratetype;
    3218             : 
    3219      133385 :         migratetype = get_pcppage_migratetype(page);
    3220      133385 :         __count_vm_event(PGFREE);
    3221             : 
    3222             :         /*
    3223             :          * We only track unmovable, reclaimable and movable on pcp lists.
    3224             :          * Free ISOLATE pages back to the allocator because they are being
    3225             :          * offlined, but treat HIGHATOMIC pages as movable so we can get
    3226             :          * those areas back if necessary; otherwise we may have to free
    3227             :          * excessively into the page allocator.
    3228             :          */
    3229      133385 :         if (migratetype >= MIGRATE_PCPTYPES) {
    3230           0 :                 if (unlikely(is_migrate_isolate(migratetype))) {
    3231             :                         free_one_page(zone, page, pfn, 0, migratetype,
    3232             :                                       FPI_NONE);
    3233             :                         return;
    3234             :                 }
    3235           0 :                 migratetype = MIGRATE_MOVABLE;
    3236             :         }
    3237             : 
    3238      133385 :         pcp = &this_cpu_ptr(zone->pageset)->pcp;
    3239      133388 :         list_add(&page->lru, &pcp->lists[migratetype]);
    3240      133388 :         pcp->count++;
    3241      133388 :         if (pcp->count >= READ_ONCE(pcp->high))
    3242         357 :                 free_pcppages_bulk(zone, READ_ONCE(pcp->batch), pcp);
    3243             : }
    3244             : 
    3245             : /*
    3246             :  * Free a 0-order page
    3247             :  */
    3248       65686 : void free_unref_page(struct page *page)
    3249             : {
    3250       65686 :         unsigned long flags;
    3251       65686 :         unsigned long pfn = page_to_pfn(page);
    3252             : 
    3253       65686 :         if (!free_unref_page_prepare(page, pfn))
    3254             :                 return;
    3255             : 
    3256      131363 :         local_irq_save(flags);
    3257       65684 :         free_unref_page_commit(page, pfn);
    3258       65685 :         local_irq_restore(flags);
    3259             : }
    3260             : 
    3261             : /*
    3262             :  * Free a list of 0-order pages
    3263             :  */
    3264       27546 : void free_unref_page_list(struct list_head *list)
    3265             : {
    3266       27546 :         struct page *page, *next;
    3267       27546 :         unsigned long flags, pfn;
    3268       27546 :         int batch_count = 0;
    3269             : 
    3270             :         /* Prepare pages for freeing */
    3271       95205 :         list_for_each_entry_safe(page, next, list, lru) {
    3272       67659 :                 pfn = page_to_pfn(page);
    3273       67659 :                 if (!free_unref_page_prepare(page, pfn))
    3274           0 :                         list_del(&page->lru);
    3275       67659 :                 set_page_private(page, pfn);
    3276             :         }
    3277             : 
    3278       55092 :         local_irq_save(flags);
    3279       95247 :         list_for_each_entry_safe(page, next, list, lru) {
    3280       67701 :                 unsigned long pfn = page_private(page);
    3281             : 
    3282       67701 :                 set_page_private(page, 0);
    3283       67701 :                 trace_mm_page_free_batched(page);
    3284       67702 :                 free_unref_page_commit(page, pfn);
    3285             : 
    3286             :                 /*
    3287             :                  * Guard against excessive IRQ disabled times when we get
    3288             :                  * a large list of pages to free.
    3289             :                  */
    3290       67701 :                 if (++batch_count == SWAP_CLUSTER_MAX) {
    3291         662 :                         local_irq_restore(flags);
    3292         662 :                         batch_count = 0;
    3293       68363 :                         local_irq_save(flags);
    3294             :                 }
    3295             :         }
    3296       27546 :         local_irq_restore(flags);
    3297       27546 : }
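
The batch_count logic above bounds how long interrupts stay disabled: after SWAP_CLUSTER_MAX pages the function briefly re-enables them before continuing. The same shape with a pthread mutex in place of local_irq_save/restore; the batch size and helper names are illustrative only:

#include <pthread.h>

#define BATCH_MAX 32                    /* stands in for SWAP_CLUSTER_MAX */

struct node { struct node *next; };

static void release_one(struct node *n) { (void)n; /* cheap per-item work */ }

/*
 * Walk a whole list while bounding how long the lock is held: drop and
 * immediately retake it every BATCH_MAX items so other contenders
 * (interrupts, in the kernel case) get a chance to run.
 */
static void release_list_bounded(struct node *head, pthread_mutex_t *lock)
{
        int batch = 0;
        struct node *n, *next;

        pthread_mutex_lock(lock);
        for (n = head; n; n = next) {
                next = n->next;
                release_one(n);
                if (++batch == BATCH_MAX) {
                        pthread_mutex_unlock(lock);
                        batch = 0;
                        pthread_mutex_lock(lock);
                }
        }
        pthread_mutex_unlock(lock);
}
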
    3298             : 
    3299             : /*
    3300             :  * split_page takes a non-compound higher-order page, and splits it into
    3301             :  * n (1<<order) sub-pages: page[0..n-1]
    3302             :  * Each sub-page must be freed individually.
    3303             :  *
    3304             :  * Note: this is probably too low level an operation for use in drivers.
    3305             :  * Please consult with lkml before using this in your driver.
    3306             :  */
    3307          12 : void split_page(struct page *page, unsigned int order)
    3308             : {
    3309          12 :         int i;
    3310             : 
    3311          24 :         VM_BUG_ON_PAGE(PageCompound(page), page);
    3312          12 :         VM_BUG_ON_PAGE(!page_count(page), page);
    3313             : 
    3314         424 :         for (i = 1; i < (1 << order); i++)
    3315         412 :                 set_page_refcounted(page + i);
    3316          12 :         split_page_owner(page, 1 << order);
    3317          12 :         split_page_memcg(page, 1 << order);
    3318          12 : }
    3319             : EXPORT_SYMBOL_GPL(split_page);
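
split_page() turns one order-N head page into 1 << N independently refcounted order-0 pages; only the tail pages need a new reference because the head already holds one. A tiny arithmetic sketch of that fan-out, using a plain refcount array as a hypothetical page table:

#include <stdio.h>

#define NPAGES_SKETCH 1024

static int refcount[NPAGES_SKETCH];

/* Give every tail page of the order-'order' block at 'base' its own ref. */
static void split_block(unsigned long base, unsigned int order)
{
        unsigned long i, n = 1UL << order;

        for (i = 1; i < n; i++)
                refcount[base + i] = 1;
}

int main(void)
{
        refcount[64] = 1;               /* order-3 head page, already counted */
        split_block(64, 3);             /* pages 64..71 each hold one ref now */
        printf("page 71 refcount = %d\n", refcount[71]);
        return 0;
}
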
    3320             : 
    3321           0 : int __isolate_free_page(struct page *page, unsigned int order)
    3322             : {
    3323           0 :         unsigned long watermark;
    3324           0 :         struct zone *zone;
    3325           0 :         int mt;
    3326             : 
    3327           0 :         BUG_ON(!PageBuddy(page));
    3328             : 
    3329           0 :         zone = page_zone(page);
    3330           0 :         mt = get_pageblock_migratetype(page);
    3331             : 
    3332           0 :         if (!is_migrate_isolate(mt)) {
    3333             :                 /*
    3334             :                  * Obey watermarks as if the page was being allocated. We can
    3335             :                  * emulate a high-order watermark check with a raised order-0
    3336             :                  * watermark, because we already know our high-order page
    3337             :                  * exists.
    3338             :                  */
    3339           0 :                 watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
    3340           0 :                 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
    3341             :                         return 0;
    3342             : 
    3343           0 :                 __mod_zone_freepage_state(zone, -(1UL << order), mt);
    3344             :         }
    3345             : 
    3346             :         /* Remove page from free list */
    3347             : 
    3348           0 :         del_page_from_free_list(page, zone, order);
    3349             : 
    3350             :         /*
    3351             :          * Set the pageblock's migratetype to MIGRATE_MOVABLE if the
    3352             :          * isolated page covers at least half of a pageblock.
    3353             :          */
    3354           0 :         if (order >= pageblock_order - 1) {
    3355           0 :                 struct page *endpage = page + (1 << order) - 1;
    3356           0 :                 for (; page < endpage; page += pageblock_nr_pages) {
    3357           0 :                         int mt = get_pageblock_migratetype(page);
    3358           0 :                         if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
    3359           0 :                             && !is_migrate_highatomic(mt))
    3360           0 :                                 set_pageblock_migratetype(page,
    3361             :                                                           MIGRATE_MOVABLE);
    3362             :                 }
    3363             :         }
    3364             : 
    3365             : 
    3366           0 :         return 1UL << order;
    3367             : }
    3368             : 
    3369             : /**
    3370             :  * __putback_isolated_page - Return a now-isolated page back where we got it
    3371             :  * @page: Page that was isolated
    3372             :  * @order: Order of the isolated page
    3373             :  * @mt: The page's pageblock's migratetype
    3374             :  *
    3375             :  * This function is meant to return a page pulled from the free lists via
    3376             :  * __isolate_free_page() back to the free list it was pulled from.
    3377             :  */
    3378           0 : void __putback_isolated_page(struct page *page, unsigned int order, int mt)
    3379             : {
    3380           0 :         struct zone *zone = page_zone(page);
    3381             : 
    3382             :         /* zone lock should be held when this function is called */
    3383           0 :         lockdep_assert_held(&zone->lock);
    3384             : 
    3385             :         /* Return isolated page to tail of freelist. */
    3386           0 :         __free_one_page(page, page_to_pfn(page), zone, order, mt,
    3387             :                         FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL);
    3388           0 : }
    3389             : 
    3390             : /*
    3391             :  * Update NUMA hit/miss statistics
    3392             :  *
    3393             :  * Must be called with interrupts disabled.
    3394             :  */
    3395      193003 : static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
    3396             : {
    3397             : #ifdef CONFIG_NUMA
    3398      193003 :         enum numa_stat_item local_stat = NUMA_LOCAL;
    3399             : 
    3400             :         /* skip numa counters update if numa stats is disabled */
    3401      193003 :         if (!static_branch_likely(&vm_numa_stat_key))
    3402             :                 return;
    3403             : 
    3404      193004 :         if (zone_to_nid(z) != numa_node_id())
    3405           0 :                 local_stat = NUMA_OTHER;
    3406             : 
    3407      193004 :         if (zone_to_nid(z) == zone_to_nid(preferred_zone))
    3408      193004 :                 __inc_numa_state(z, NUMA_HIT);
    3409             :         else {
    3410           0 :                 __inc_numa_state(z, NUMA_MISS);
    3411           0 :                 __inc_numa_state(preferred_zone, NUMA_FOREIGN);
    3412             :         }
    3413      193006 :         __inc_numa_state(z, local_stat);
    3414             : #endif
    3415             : }
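
zone_statistics() classifies each allocation twice: hit/miss/foreign relative to the preferred node, and local/other relative to the node the task is running on. A compact sketch of that bookkeeping with plain counters; the node count and counter layout are made up for illustration:

enum { SK_HIT, SK_MISS, SK_FOREIGN, SK_LOCAL, SK_OTHER, SK_NR };

static unsigned long numa_stat[4][SK_NR];       /* [node][counter], 4 fake nodes */

/*
 * Account one allocation: 'pref' is the node the caller preferred, 'got'
 * is the node the page actually came from, 'running' is the node the
 * allocating task is executing on.
 */
static void account_numa(int pref, int got, int running)
{
        if (got == pref) {
                numa_stat[got][SK_HIT]++;
        } else {
                numa_stat[got][SK_MISS]++;
                numa_stat[pref][SK_FOREIGN]++;
        }
        numa_stat[got][got == running ? SK_LOCAL : SK_OTHER]++;
}
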
    3416             : 
    3417             : /* Remove page from the per-cpu list, caller must protect the list */
    3418      169567 : static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
    3419             :                         unsigned int alloc_flags,
    3420             :                         struct per_cpu_pages *pcp,
    3421             :                         struct list_head *list)
    3422             : {
    3423      169567 :         struct page *page;
    3424             : 
    3425      169567 :         do {
    3426      169567 :                 if (list_empty(list)) {
    3427        2170 :                         pcp->count += rmqueue_bulk(zone, 0,
    3428        1085 :                                         READ_ONCE(pcp->batch), list,
    3429             :                                         migratetype, alloc_flags);
    3430        1085 :                         if (unlikely(list_empty(list)))
    3431             :                                 return NULL;
    3432             :                 }
    3433             : 
    3434      169567 :                 page = list_first_entry(list, struct page, lru);
    3435      169567 :                 list_del(&page->lru);
    3436      169567 :                 pcp->count--;
    3437      169567 :         } while (check_new_pcp(page));
    3438             : 
    3439             :         return page;
    3440             : }
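
The fast path above only falls back to the locked bulk refill when the per-cpu list is empty, then keeps popping from the local list. The refill-then-pop shape in isolation; the cache struct, backing array and helper names are hypothetical:

#include <stddef.h>

struct obj { struct obj *next; };

struct cache {
        struct obj *head;               /* per-cpu style fast-path list */
        int count;
        int batch;                      /* how many to pull per refill */
};

static struct obj backing[256];         /* stands in for the buddy allocator */
static int backing_used;

/* Slow path: hand out up to 'batch' objects from the shared backing store. */
static int refill_bulk(struct obj **head, int batch)
{
        int i;

        for (i = 0; i < batch && backing_used < 256; i++) {
                struct obj *o = &backing[backing_used++];

                o->next = *head;
                *head = o;
        }
        return i;
}

/* Fast path: pop locally, refilling in bulk only when the list runs dry. */
static struct obj *cache_get(struct cache *c)
{
        struct obj *o;

        if (!c->head) {
                c->count += refill_bulk(&c->head, c->batch);
                if (!c->head)
                        return NULL;    /* backing store exhausted */
        }
        o = c->head;
        c->head = o->next;
        c->count--;
        return o;
}
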
    3441             : 
    3442             : /* Lock and remove page from the per-cpu list */
    3443      169570 : static struct page *rmqueue_pcplist(struct zone *preferred_zone,
    3444             :                         struct zone *zone, gfp_t gfp_flags,
    3445             :                         int migratetype, unsigned int alloc_flags)
    3446             : {
    3447      169570 :         struct per_cpu_pages *pcp;
    3448      169570 :         struct list_head *list;
    3449      169570 :         struct page *page;
    3450      169570 :         unsigned long flags;
    3451             : 
    3452      339146 :         local_irq_save(flags);
    3453      169566 :         pcp = &this_cpu_ptr(zone->pageset)->pcp;
    3454      169567 :         list = &pcp->lists[migratetype];
    3455      169567 :         page = __rmqueue_pcplist(zone,  migratetype, alloc_flags, pcp, list);
    3456      169584 :         if (page) {
    3457      169584 :                 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
    3458      169584 :                 zone_statistics(preferred_zone, zone);
    3459             :         }
    3460      169587 :         local_irq_restore(flags);
    3461      169577 :         return page;
    3462             : }
    3463             : 
    3464             : /*
    3465             :  * Allocate a page from the given zone. Use pcplists for order-0 allocations.
    3466             :  */
    3467             : static inline
    3468      192984 : struct page *rmqueue(struct zone *preferred_zone,
    3469             :                         struct zone *zone, unsigned int order,
    3470             :                         gfp_t gfp_flags, unsigned int alloc_flags,
    3471             :                         int migratetype)
    3472             : {
    3473      192984 :         unsigned long flags;
    3474      192984 :         struct page *page;
    3475             : 
    3476      192984 :         if (likely(order == 0)) {
    3477             :                 /*
    3478             :                  * The MIGRATE_MOVABLE pcplist can hold pages from the CMA
    3479             :                  * area, so we need to skip it when CMA allocations aren't allowed.
    3480             :                  */
    3481      169568 :                 if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
    3482             :                                 migratetype != MIGRATE_MOVABLE) {
    3483      169568 :                         page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
    3484             :                                         migratetype, alloc_flags);
    3485      169576 :                         goto out;
    3486             :                 }
    3487             :         }
    3488             : 
    3489             :         /*
    3490             :          * We most definitely don't want callers attempting to
    3491             :          * allocate greater than order-1 page units with __GFP_NOFAIL.
    3492             :          */
    3493       46832 :         WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
    3494       23416 :         spin_lock_irqsave(&zone->lock, flags);
    3495             : 
    3496       23422 :         do {
    3497       23422 :                 page = NULL;
    3498             :                  * An order-0 request can reach here when the pcplist is
    3499             :                  * skipped due to a non-CMA allocation context. The HIGHATOMIC
    3500             :                  * area is reserved for high-order atomic allocations, so an
    3501             :                  * order-0 request should skip it.
    3502             :                  * request should skip it.
    3503             :                  */
    3504       23422 :                 if (order > 0 && alloc_flags & ALLOC_HARDER) {
    3505           0 :                         page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
    3506           0 :                         if (page)
    3507           0 :                                 trace_mm_page_alloc_zone_locked(page, order, migratetype);
    3508             :                 }
    3509           0 :                 if (!page)
    3510       23456 :                         page = __rmqueue(zone, order, migratetype, alloc_flags);
    3511       23422 :         } while (page && check_new_pages(page, order));
    3512       23422 :         spin_unlock(&zone->lock);
    3513       23422 :         if (!page)
    3514           0 :                 goto failed;
    3515       23422 :         __mod_zone_freepage_state(zone, -(1 << order),
    3516             :                                   get_pcppage_migratetype(page));
    3517             : 
    3518       23422 :         __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
    3519       23422 :         zone_statistics(preferred_zone, zone);
    3520       23422 :         local_irq_restore(flags);
    3521             : 
    3522      192998 : out:
    3523             :         /* Separate test+clear to avoid unnecessary atomics */
    3524      192998 :         if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
    3525           0 :                 clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
    3526           0 :                 wakeup_kswapd(zone, 0, 0, zone_idx(zone));
    3527             :         }
    3528             : 
    3529      192989 :         VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
    3530             :         return page;
    3531             : 
    3532           0 : failed:
    3533           0 :         local_irq_restore(flags);
    3534             :         return NULL;
    3535             : }
    3536             : 
    3537             : #ifdef CONFIG_FAIL_PAGE_ALLOC
    3538             : 
    3539             : static struct {
    3540             :         struct fault_attr attr;
    3541             : 
    3542             :         bool ignore_gfp_highmem;
    3543             :         bool ignore_gfp_reclaim;
    3544             :         u32 min_order;
    3545             : } fail_page_alloc = {
    3546             :         .attr = FAULT_ATTR_INITIALIZER,
    3547             :         .ignore_gfp_reclaim = true,
    3548             :         .ignore_gfp_highmem = true,
    3549             :         .min_order = 1,
    3550             : };
    3551             : 
    3552             : static int __init setup_fail_page_alloc(char *str)
    3553             : {
    3554             :         return setup_fault_attr(&fail_page_alloc.attr, str);
    3555             : }
    3556             : __setup("fail_page_alloc=", setup_fail_page_alloc);
    3557             : 
    3558             : static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
    3559             : {
    3560             :         if (order < fail_page_alloc.min_order)
    3561             :                 return false;
    3562             :         if (gfp_mask & __GFP_NOFAIL)
    3563             :                 return false;
    3564             :         if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
    3565             :                 return false;
    3566             :         if (fail_page_alloc.ignore_gfp_reclaim &&
    3567             :                         (gfp_mask & __GFP_DIRECT_RECLAIM))
    3568             :                 return false;
    3569             : 
    3570             :         return should_fail(&fail_page_alloc.attr, 1 << order);
    3571             : }
    3572             : 
    3573             : #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
    3574             : 
    3575             : static int __init fail_page_alloc_debugfs(void)
    3576             : {
    3577             :         umode_t mode = S_IFREG | 0600;
    3578             :         struct dentry *dir;
    3579             : 
    3580             :         dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
    3581             :                                         &fail_page_alloc.attr);
    3582             : 
    3583             :         debugfs_create_bool("ignore-gfp-wait", mode, dir,
    3584             :                             &fail_page_alloc.ignore_gfp_reclaim);
    3585             :         debugfs_create_bool("ignore-gfp-highmem", mode, dir,
    3586             :                             &fail_page_alloc.ignore_gfp_highmem);
    3587             :         debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
    3588             : 
    3589             :         return 0;
    3590             : }
    3591             : 
    3592             : late_initcall(fail_page_alloc_debugfs);
    3593             : 
    3594             : #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
    3595             : 
    3596             : #else /* CONFIG_FAIL_PAGE_ALLOC */
    3597             : 
    3598      192976 : static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
    3599             : {
    3600      192976 :         return false;
    3601             : }
    3602             : 
    3603             : #endif /* CONFIG_FAIL_PAGE_ALLOC */
    3604             : 
    3605      192976 : noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
    3606             : {
    3607      192976 :         return __should_fail_alloc_page(gfp_mask, order);
    3608             : }
    3609             : ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
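
The fault-injection hook above lets test setups fail eligible allocations on purpose while exempting __GFP_NOFAIL, low orders and (optionally) highmem or reclaim-capable requests. A userspace sketch of the same filter-then-maybe-fail shape; the flag bits, config struct and use of rand() are stand-ins for the kernel's fault_attr machinery:

#include <stdbool.h>
#include <stdlib.h>

#define SK_GFP_NOFAIL   0x1u            /* hypothetical gfp flag bits */
#define SK_GFP_RECLAIM  0x2u

struct fail_cfg {
        unsigned int min_order;
        bool ignore_reclaim;
        unsigned int probability;       /* percent of eligible calls to fail */
};

static bool should_fail_sketch(const struct fail_cfg *cfg,
                               unsigned int gfp, unsigned int order)
{
        if (order < cfg->min_order)
                return false;           /* small orders are never failed */
        if (gfp & SK_GFP_NOFAIL)
                return false;           /* caller cannot tolerate failure */
        if (cfg->ignore_reclaim && (gfp & SK_GFP_RECLAIM))
                return false;
        return (unsigned int)(rand() % 100) < cfg->probability;
}
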
    3610             : 
    3611      192976 : static inline long __zone_watermark_unusable_free(struct zone *z,
    3612             :                                 unsigned int order, unsigned int alloc_flags)
    3613             : {
    3614      192976 :         const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
    3615      192976 :         long unusable_free = (1 << order) - 1;
    3616             : 
    3617             :         /*
    3618             :          * If the caller does not have rights to ALLOC_HARDER then subtract
    3619             :          * the high-atomic reserves. This will over-estimate the size of the
    3620             :          * atomic reserve but it avoids a search.
    3621             :          */
    3622      192976 :         if (likely(!alloc_harder))
    3623      192976 :                 unusable_free += z->nr_reserved_highatomic;
    3624             : 
    3625             : #ifdef CONFIG_CMA
    3626             :         /* If allocation can't use CMA areas don't use free CMA pages */
    3627             :         if (!(alloc_flags & ALLOC_CMA))
    3628             :                 unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
    3629             : #endif
    3630             : 
    3631      192976 :         return unusable_free;
    3632             : }
    3633             : 
    3634             : /*
    3635             :  * Return true if free base pages are above 'mark'. For high-order checks it
    3636             :  * will return true if the order-0 watermark is reached and there is at least
    3637             :  * one free page of a suitable size. Checking now avoids taking the zone lock
    3638             :  * to check in the allocation paths if no pages are free.
    3639             :  */
    3640       23417 : bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
    3641             :                          int highest_zoneidx, unsigned int alloc_flags,
    3642             :                          long free_pages)
    3643             : {
    3644       23417 :         long min = mark;
    3645       23417 :         int o;
    3646       23417 :         const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
    3647             : 
    3648             :         /* free_pages may go negative - that's OK */
    3649       23417 :         free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
    3650             : 
    3651       23417 :         if (alloc_flags & ALLOC_HIGH)
    3652           0 :                 min -= min / 2;
    3653             : 
    3654       23417 :         if (unlikely(alloc_harder)) {
    3655             :                 /*
    3656             :                  * OOM victims can try even harder than normal ALLOC_HARDER
    3657             :                  * users on the grounds that it's definitely going to be in
    3658             :                  * the exit path shortly and free memory. Any allocation it
    3659             :                  * makes during the free path will be small and short-lived.
    3660             :                  */
    3661           0 :                 if (alloc_flags & ALLOC_OOM)
    3662           0 :                         min -= min / 2;
    3663             :                 else
    3664           0 :                         min -= min / 4;
    3665             :         }
    3666             : 
    3667             :         /*
    3668             :          * Check watermarks for an order-0 allocation request. If these
    3669             :          * are not met, then a high-order request also cannot go ahead
    3670             :          * even if a suitable page happened to be free.
    3671             :          */
    3672       23417 :         if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
    3673             :                 return false;
    3674             : 
    3675             :         /* If this is an order-0 request then the watermark is fine */
    3676       23417 :         if (!order)
    3677             :                 return true;
    3678             : 
    3679             :         /* For a high-order request, check at least one suitable page is free */
    3680       24342 :         for (o = order; o < MAX_ORDER; o++) {
    3681       24342 :                 struct free_area *area = &z->free_area[o];
    3682       24342 :                 int mt;
    3683             : 
    3684       24342 :                 if (!area->nr_free)
    3685         927 :                         continue;
    3686             : 
    3687       30466 :                 for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
    3688       30466 :                         if (!free_area_empty(area, mt))
    3689             :                                 return true;
    3690             :                 }
    3691             : 
    3692             : #ifdef CONFIG_CMA
    3693             :                 if ((alloc_flags & ALLOC_CMA) &&
    3694             :                     !free_area_empty(area, MIGRATE_CMA)) {
    3695             :                         return true;
    3696             :                 }
    3697             : #endif
    3698           0 :                 if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC))
    3699             :                         return true;
    3700             :         }
    3701             :         return false;
    3702             : }
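
Stripped of the CMA, highatomic and OOM adjustments, the check above says: the usable free pages must exceed the mark plus the lowmem reserve, and a high-order request additionally needs at least one free block of sufficient order. A simplified standalone version of that arithmetic; the struct layout and order limit are placeholders:

#include <stdbool.h>

#define SK_MAX_ORDER 11

struct zone_sketch {
        long nr_free;                   /* total free base pages */
        long lowmem_reserve;            /* pages held back for lower zones */
        long nr_free_at[SK_MAX_ORDER];  /* free blocks per order */
};

/* Return true when an order-'order' allocation may proceed against 'mark'. */
static bool watermark_ok_sketch(const struct zone_sketch *z,
                                unsigned int order, long mark)
{
        long unusable = (1L << order) - 1;      /* partial blocks can't help us */
        unsigned int o;

        if (z->nr_free - unusable <= mark + z->lowmem_reserve)
                return false;
        if (!order)
                return true;
        for (o = order; o < SK_MAX_ORDER; o++)
                if (z->nr_free_at[o])
                        return true;            /* one suitable block is enough */
        return false;
}
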
    3703             : 
    3704           0 : bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
    3705             :                       int highest_zoneidx, unsigned int alloc_flags)
    3706             : {
    3707           0 :         return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
    3708           0 :                                         zone_page_state(z, NR_FREE_PAGES));
    3709             : }
    3710             : 
    3711      192968 : static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
    3712             :                                 unsigned long mark, int highest_zoneidx,
    3713             :                                 unsigned int alloc_flags, gfp_t gfp_mask)
    3714             : {
    3715      192968 :         long free_pages;
    3716             : 
    3717      192968 :         free_pages = zone_page_state(z, NR_FREE_PAGES);
    3718             : 
    3719             :         /*
    3720             :          * Fast check for order-0 only. If this fails then the reserves
    3721             :          * need to be calculated.
    3722             :          */
    3723      192975 :         if (!order) {
    3724      169559 :                 long fast_free;
    3725             : 
    3726      169559 :                 fast_free = free_pages;
    3727      169559 :                 fast_free -= __zone_watermark_unusable_free(z, 0, alloc_flags);
    3728      169559 :                 if (fast_free > mark + z->lowmem_reserve[highest_zoneidx])
    3729             :                         return true;
    3730             :         }
    3731             : 
    3732       23410 :         if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
    3733             :                                         free_pages))
    3734             :                 return true;
    3735             :         /*
    3736             :          * Ignore watermark boosting for GFP_ATOMIC order-0 allocations
    3737             :          * when checking the min watermark. The min watermark is the
    3738             :          * point where boosting is ignored so that kswapd is woken up
    3739             :          * when below the low watermark.
    3740             :          */
    3741           0 :         if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost
    3742             :                 && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
    3743           0 :                 mark = z->_watermark[WMARK_MIN];
    3744           0 :                 return __zone_watermark_ok(z, order, mark, highest_zoneidx,
    3745             :                                         alloc_flags, free_pages);
    3746             :         }
    3747             : 
    3748             :         return false;
    3749             : }
    3750             : 
    3751           2 : bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
    3752             :                         unsigned long mark, int highest_zoneidx)
    3753             : {
    3754           2 :         long free_pages = zone_page_state(z, NR_FREE_PAGES);
    3755             : 
    3756           2 :         if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
    3757           0 :                 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
    3758             : 
    3759           2 :         return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
    3760             :                                                                 free_pages);
    3761             : }
    3762             : 
    3763             : #ifdef CONFIG_NUMA
    3764           0 : static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
    3765             : {
    3766           0 :         return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
    3767             :                                 node_reclaim_distance;
    3768             : }
    3769             : #else   /* CONFIG_NUMA */
    3770             : static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
    3771             : {
    3772             :         return true;
    3773             : }
    3774             : #endif  /* CONFIG_NUMA */
    3775             : 
    3776             : /*
    3777             :  * Restricting fragmentation avoidance to ZONE_DMA32 as the one suitable
    3778             :  * fallback zone is subtle. If the preferred zone was HIGHMEM then
    3779             :  * premature use of a lower zone may cause lowmem pressure problems that
    3780             :  * are worse than fragmentation. If the next zone is ZONE_DMA then it is
    3781             :  * probably too small. It only makes sense to spread allocations to avoid
    3782             :  * fragmentation between the Normal and DMA32 zones.
    3783             :  */
    3784             : static inline unsigned int
    3785      192987 : alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
    3786             : {
    3787      192987 :         unsigned int alloc_flags;
    3788             : 
    3789             :         /*
    3790             :          * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
    3791             :          * to save a branch.
    3792             :          */
    3793      192987 :         alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
    3794             : 
    3795             : #ifdef CONFIG_ZONE_DMA32
    3796      192987 :         if (!zone)
    3797             :                 return alloc_flags;
    3798             : 
    3799      192987 :         if (zone_idx(zone) != ZONE_NORMAL)
    3800             :                 return alloc_flags;
    3801             : 
    3802             :         /*
    3803             :          * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
    3804             :          * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
    3805             :          * on UMA that if Normal is populated then so is DMA32.
    3806             :          */
    3807           0 :         BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
    3808           0 :         if (nr_online_nodes > 1 && !populated_zone(--zone))
    3809             :                 return alloc_flags;
    3810             : 
    3811           0 :         alloc_flags |= ALLOC_NOFRAGMENT;
    3812             : #endif /* CONFIG_ZONE_DMA32 */
    3813           0 :         return alloc_flags;
    3814             : }
    3815             : 
    3816      192945 : static inline unsigned int current_alloc_flags(gfp_t gfp_mask,
    3817             :                                         unsigned int alloc_flags)
    3818             : {
    3819             : #ifdef CONFIG_CMA
    3820             :         unsigned int pflags = current->flags;
    3821             : 
    3822             :         if (!(pflags & PF_MEMALLOC_NOCMA) &&
    3823             :                         gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
    3824             :                 alloc_flags |= ALLOC_CMA;
    3825             : 
    3826             : #endif
    3827      192945 :         return alloc_flags;
    3828             : }
    3829             : 
    3830             : /*
    3831             :  * get_page_from_freelist goes through the zonelist trying to allocate
    3832             :  * a page.
    3833             :  */
    3834             : static struct page *
    3835      192979 : get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
    3836             :                                                 const struct alloc_context *ac)
    3837             : {
    3838      192979 :         struct zoneref *z;
    3839      192979 :         struct zone *zone;
    3840      192979 :         struct pglist_data *last_pgdat_dirty_limit = NULL;
    3841      192979 :         bool no_fallback;
    3842             : 
    3843             : retry:
    3844             :         /*
    3845             :          * Scan zonelist, looking for a zone with enough free.
    3846             :          * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
    3847             :          */
    3848      192979 :         no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
    3849      192979 :         z = ac->preferred_zoneref;
    3850      192978 :         for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
    3851             :                                         ac->nodemask) {
    3852      192978 :                 struct page *page;
    3853      192978 :                 unsigned long mark;
    3854             : 
    3855      192978 :                 if (cpusets_enabled() &&
    3856             :                         (alloc_flags & ALLOC_CPUSET) &&
    3857             :                         !__cpuset_zone_allowed(zone, gfp_mask))
    3858             :                                 continue;
    3859             :                 /*
    3860             :                  * When allocating a page cache page for writing, we
    3861             :                  * want to get it from a node that is within its dirty
    3862             :                  * limit, such that no single node holds more than its
    3863             :                  * proportional share of globally allowed dirty pages.
    3864             :                  * The dirty limits take into account the node's
    3865             :                  * lowmem reserves and high watermark so that kswapd
    3866             :                  * should be able to balance it without having to
    3867             :                  * write pages from its LRU list.
    3868             :                  *
    3869             :                  * XXX: For now, allow allocations to potentially
    3870             :                  * exceed the per-node dirty limit in the slowpath
    3871             :                  * (spread_dirty_pages unset) before going into reclaim,
    3872             :                  * which is important when on a NUMA setup the allowed
    3873             :                  * nodes are together not big enough to reach the
    3874             :                  * global limit.  The proper fix for these situations
    3875             :                  * will require awareness of nodes in the
    3876             :                  * dirty-throttling and the flusher threads.
    3877             :                  */
    3878      192978 :                 if (ac->spread_dirty_pages) {
    3879        1583 :                         if (last_pgdat_dirty_limit == zone->zone_pgdat)
    3880           0 :                                 continue;
    3881             : 
    3882        1583 :                         if (!node_dirty_ok(zone->zone_pgdat)) {
    3883           0 :                                 last_pgdat_dirty_limit = zone->zone_pgdat;
    3884           0 :                                 continue;
    3885             :                         }
    3886             :                 }
    3887             : 
    3888      192978 :                 if (no_fallback && nr_online_nodes > 1 &&
    3889           0 :                     zone != ac->preferred_zoneref->zone) {
    3890           0 :                         int local_nid;
    3891             : 
    3892             :                         /*
    3893             :                          * If moving to a remote node, retry but allow
    3894             :                          * fragmenting fallbacks. Locality is more important
    3895             :                          * than fragmentation avoidance.
    3896             :                          */
    3897           0 :                         local_nid = zone_to_nid(ac->preferred_zoneref->zone);
    3898           0 :                         if (zone_to_nid(zone) != local_nid) {
    3899           0 :                                 alloc_flags &= ~ALLOC_NOFRAGMENT;
    3900           0 :                                 goto retry;
    3901             :                         }
    3902             :                 }
    3903             : 
    3904      192978 :                 mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
    3905      192971 :                 if (!zone_watermark_fast(zone, order, mark,
    3906      192978 :                                        ac->highest_zoneidx, alloc_flags,
    3907             :                                        gfp_mask)) {
    3908           0 :                         int ret;
    3909             : 
    3910             : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
    3911             :                         /*
    3912             :                          * Watermark failed for this zone, but see if we can
    3913             :                          * grow this zone if it contains deferred pages.
    3914             :                          */
    3915             :                         if (static_branch_unlikely(&deferred_pages)) {
    3916             :                                 if (_deferred_grow_zone(zone, order))
    3917             :                                         goto try_this_zone;
    3918             :                         }
    3919             : #endif
    3920             :                         /* Checked here to keep the fast path fast */
    3921           0 :                         BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
    3922           0 :                         if (alloc_flags & ALLOC_NO_WATERMARKS)
    3923           0 :                                 goto try_this_zone;
    3924             : 
    3925           0 :                         if (node_reclaim_mode == 0 ||
    3926           0 :                             !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
    3927           0 :                                 continue;
    3928             : 
    3929           0 :                         ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
    3930           0 :                         switch (ret) {
    3931           0 :                         case NODE_RECLAIM_NOSCAN:
    3932             :                                 /* did not scan */
    3933           0 :                                 continue;
    3934           0 :                         case NODE_RECLAIM_FULL:
    3935             :                                 /* scanned but unreclaimable */
    3936           0 :                                 continue;
    3937           0 :                         default:
    3938             :                                 /* did we reclaim enough? */
    3939           0 :                                 if (zone_watermark_ok(zone, order, mark,
    3940           0 :                                         ac->highest_zoneidx, alloc_flags))
    3941           0 :                                         goto try_this_zone;
    3942             : 
    3943           0 :                                 continue;
    3944             :                         }
    3945             :                 }
    3946             : 
    3947      192971 : try_this_zone:
    3948      192971 :                 page = rmqueue(ac->preferred_zoneref->zone, zone, order,
    3949             :                                 gfp_mask, alloc_flags, ac->migratetype);
    3950      192983 :                 if (page) {
    3951      192984 :                         prep_new_page(page, order, gfp_mask, alloc_flags);
    3952             : 
    3953             :                         /*
    3954             :                          * If this is a high-order atomic allocation then check
    3955             :                          * if the pageblock should be reserved for the future
    3956             :                          */
    3957      192980 :                         if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
    3958           0 :                                 reserve_highatomic_pageblock(page, zone, order);
    3959             : 
    3960      192980 :                         return page;
    3961             :                 } else {
    3962             : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
    3963             :                         /* Try again if zone has deferred pages */
    3964             :                         if (static_branch_unlikely(&deferred_pages)) {
    3965             :                                 if (_deferred_grow_zone(zone, order))
    3966             :                                         goto try_this_zone;
    3967             :                         }
    3968             : #endif
    3969           0 :                 }
    3970             :         }
    3971             : 
    3972             :         /*
    3973             :          * It's possible on a UMA machine to get through all zones that are
    3974             :          * fragmented. If avoiding fragmentation, reset and try again.
    3975             :          */
    3976           0 :         if (no_fallback) {
    3977           0 :                 alloc_flags &= ~ALLOC_NOFRAGMENT;
    3978           0 :                 goto retry;
    3979             :         }
    3980             : 
    3981             :         return NULL;
    3982             : }
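
The scan above prefers zones that can be served without stealing from another pageblock (ALLOC_NOFRAGMENT) and only allows fragmenting fallbacks on a full retry, because locality is valued over fragmentation avoidance. A minimal userspace sketch of that retry structure; struct fake_zone, scan_zones() and the node/zone layout are made up for illustration and are not the kernel's zonelist walk:

        #include <stdbool.h>
        #include <stdio.h>

        struct fake_zone {
                int nid;                /* node this zone belongs to */
                bool has_free_block;    /* can satisfy without splitting a foreign pageblock */
                bool has_any_block;     /* can satisfy at all */
        };

        static int scan_zones(const struct fake_zone *zones, int nr, int local_nid,
                              bool no_fragment)
        {
                for (int i = 0; i < nr; i++) {
                        /* Prefer locality over fragmentation avoidance. */
                        if (no_fragment && zones[i].nid != local_nid)
                                return -1;      /* caller retries with no_fragment = false */
                        if (no_fragment ? zones[i].has_free_block : zones[i].has_any_block)
                                return i;
                }
                return -1;
        }

        int main(void)
        {
                const struct fake_zone zones[] = {
                        { .nid = 0, .has_free_block = false, .has_any_block = true },
                        { .nid = 1, .has_free_block = true,  .has_any_block = true },
                };
                bool no_fragment = true;        /* analogous to ALLOC_NOFRAGMENT */
                int picked;

        retry:
                picked = scan_zones(zones, 2, /*local_nid=*/0, no_fragment);
                if (picked < 0 && no_fragment) {
                        no_fragment = false;    /* allow fragmenting fallbacks */
                        goto retry;
                }
                printf("allocated from zone %d\n", picked);
                return 0;
        }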
    3983             : 
    3984           0 : static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
    3985             : {
    3986           0 :         unsigned int filter = SHOW_MEM_FILTER_NODES;
    3987             : 
    3988             :         /*
    3989             :          * This documents exceptions given to allocations in certain
    3990             :          * contexts that are allowed to allocate outside current's set
    3991             :          * of allowed nodes.
    3992             :          */
    3993           0 :         if (!(gfp_mask & __GFP_NOMEMALLOC))
    3994           0 :                 if (tsk_is_oom_victim(current) ||
    3995           0 :                     (current->flags & (PF_MEMALLOC | PF_EXITING)))
    3996             :                         filter &= ~SHOW_MEM_FILTER_NODES;
    3997           0 :         if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
    3998           0 :                 filter &= ~SHOW_MEM_FILTER_NODES;
    3999             : 
    4000           0 :         show_mem(filter, nodemask);
    4001           0 : }
    4002             : 
    4003           0 : void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
    4004             : {
    4005           0 :         struct va_format vaf;
    4006           0 :         va_list args;
    4007           0 :         static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1);
    4008             : 
    4009           0 :         if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
    4010           0 :                 return;
    4011             : 
    4012           0 :         va_start(args, fmt);
    4013           0 :         vaf.fmt = fmt;
    4014           0 :         vaf.va = &args;
    4015           0 :         pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
    4016             :                         current->comm, &vaf, gfp_mask, &gfp_mask,
    4017             :                         nodemask_pr_args(nodemask));
    4018           0 :         va_end(args);
    4019             : 
    4020           0 :         cpuset_print_current_mems_allowed();
    4021           0 :         pr_cont("\n");
    4022           0 :         dump_stack();
    4023           0 :         warn_alloc_show_mem(gfp_mask, nodemask);
    4024             : }
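
warn_alloc() suppresses the report when the caller passed __GFP_NOWARN and otherwise rate-limits it (roughly one burst per 10*HZ). A small userspace sketch of the same gating, using time()-based throttling instead of the kernel's ratelimit state; struct ratelimit and warn_alloc_failure() are illustrative names only:

        #include <stdbool.h>
        #include <stdio.h>
        #include <time.h>

        /* Allow at most one message per "interval" seconds. */
        struct ratelimit {
                time_t last;
                int interval;
        };

        static bool ratelimit_ok(struct ratelimit *rl)
        {
                time_t now = time(NULL);

                if (rl->last && now - rl->last < rl->interval)
                        return false;
                rl->last = now;
                return true;
        }

        static void warn_alloc_failure(struct ratelimit *rl, bool nowarn, unsigned order)
        {
                if (nowarn || !ratelimit_ok(rl))        /* mirrors the __GFP_NOWARN/__ratelimit gate */
                        return;
                fprintf(stderr, "page allocation failure: order:%u\n", order);
        }

        int main(void)
        {
                struct ratelimit rl = { .last = 0, .interval = 10 };

                for (int i = 0; i < 3; i++)
                        warn_alloc_failure(&rl, false, 2);      /* only the first prints */
                return 0;
        }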
    4025             : 
    4026             : static inline struct page *
    4027           0 : __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
    4028             :                               unsigned int alloc_flags,
    4029             :                               const struct alloc_context *ac)
    4030             : {
    4031           0 :         struct page *page;
    4032             : 
    4033           0 :         page = get_page_from_freelist(gfp_mask, order,
    4034           0 :                         alloc_flags|ALLOC_CPUSET, ac);
    4035             :         /*
    4036             :          * fallback to ignore cpuset restriction if our nodes
    4037             :          * are depleted
    4038             :          */
    4039           0 :         if (!page)
    4040           0 :                 page = get_page_from_freelist(gfp_mask, order,
    4041             :                                 alloc_flags, ac);
    4042             : 
    4043           0 :         return page;
    4044             : }
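
The helper above encodes a simple two-step policy: attempt the allocation under the cpuset restriction first, and only if the allowed nodes are depleted retry without it. A trivial userspace sketch of that pattern; try_alloc() is a hypothetical stand-in for get_page_from_freelist() and "restricted" plays the role of ALLOC_CPUSET:

        #include <stdbool.h>
        #include <stdio.h>

        static bool try_alloc(bool restricted, bool allowed_nodes_depleted)
        {
                if (restricted && allowed_nodes_depleted)
                        return false;
                return true;
        }

        static bool alloc_with_cpuset_fallback(bool allowed_nodes_depleted)
        {
                /* First attempt honours the cpuset-style restriction ... */
                if (try_alloc(true, allowed_nodes_depleted))
                        return true;
                /* ... and only if that fails do we ignore it. */
                return try_alloc(false, allowed_nodes_depleted);
        }

        int main(void)
        {
                printf("depleted nodes -> %s\n",
                       alloc_with_cpuset_fallback(true) ? "fallback succeeded" : "failed");
                return 0;
        }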
    4045             : 
    4046             : static inline struct page *
    4047           0 : __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
    4048             :         const struct alloc_context *ac, unsigned long *did_some_progress)
    4049             : {
    4050           0 :         struct oom_control oc = {
    4051           0 :                 .zonelist = ac->zonelist,
    4052           0 :                 .nodemask = ac->nodemask,
    4053             :                 .memcg = NULL,
    4054             :                 .gfp_mask = gfp_mask,
    4055             :                 .order = order,
    4056             :         };
    4057           0 :         struct page *page;
    4058             : 
    4059           0 :         *did_some_progress = 0;
    4060             : 
    4061             :         /*
    4062             :          * Acquire the oom lock.  If that fails, somebody else is
    4063             :          * making progress for us.
    4064             :          */
    4065           0 :         if (!mutex_trylock(&oom_lock)) {
    4066           0 :                 *did_some_progress = 1;
    4067           0 :                 schedule_timeout_uninterruptible(1);
    4068           0 :                 return NULL;
    4069             :         }
    4070             : 
    4071             :         /*
    4072             :          * Go through the zonelist one more time, keeping a very high watermark
    4073             :          * here; this is only to catch a parallel oom killing, and we must fail
    4074             :          * if we're still under heavy pressure. Also make sure that this reclaim
    4075             :          * attempt does not depend on a __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
    4076             :          * allocation, which would never fail because oom_lock is already held.
    4077             :          */
    4078           0 :         page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
    4079             :                                       ~__GFP_DIRECT_RECLAIM, order,
    4080             :                                       ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
    4081           0 :         if (page)
    4082           0 :                 goto out;
    4083             : 
    4084             :         /* Coredumps can quickly deplete all memory reserves */
    4085           0 :         if (current->flags & PF_DUMPCORE)
    4086           0 :                 goto out;
    4087             :         /* The OOM killer will not help higher order allocs */
    4088           0 :         if (order > PAGE_ALLOC_COSTLY_ORDER)
    4089           0 :                 goto out;
    4090             :         /*
    4091             :          * We have already exhausted all our reclaim opportunities without any
    4092             :          * success so it is time to admit defeat. We will skip the OOM killer
    4093             :          * because it is very likely that the caller has a more reasonable
    4094             :          * fallback than shooting a random task.
    4095             :          *
    4096             :          * The OOM killer may not free memory on a specific node.
    4097             :          */
    4098           0 :         if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE))
    4099           0 :                 goto out;
    4100             :         /* The OOM killer does not needlessly kill tasks for lowmem */
    4101           0 :         if (ac->highest_zoneidx < ZONE_NORMAL)
    4102           0 :                 goto out;
    4103           0 :         if (pm_suspended_storage())
    4104             :                 goto out;
    4105             :         /*
    4106             :          * XXX: GFP_NOFS allocations should rather fail than rely on
    4107             :          * other requests to make forward progress.
    4108             :          * We are in an unfortunate situation where out_of_memory cannot
    4109             :          * do much for this context, but let's try it to at least get
    4110             :          * access to memory reserves if the current task is killed (see
    4111             :          * out_of_memory). Once filesystems are ready to handle allocation
    4112             :          * failures more gracefully we should just bail out here.
    4113             :          */
    4114             : 
    4115             :         /* Exhausted what can be done so it's blame time */
    4116           0 :         if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
    4117           0 :                 *did_some_progress = 1;
    4118             : 
    4119             :                 /*
    4120             :                  * Help non-failing allocations by giving them access to memory
    4121             :                  * reserves
    4122             :                  */
    4123           0 :                 if (gfp_mask & __GFP_NOFAIL)
    4124           0 :                         page = __alloc_pages_cpuset_fallback(gfp_mask, order,
    4125             :                                         ALLOC_NO_WATERMARKS, ac);
    4126             :         }
    4127           0 : out:
    4128           0 :         mutex_unlock(&oom_lock);
    4129           0 :         return page;
    4130             : }
    4131             : 
    4132             : /*
    4133             :  * Maximum number of compaction retries with progress before the OOM
    4134             :  * killer is considered the only way to move forward.
    4135             :  */
    4136             : #define MAX_COMPACT_RETRIES 16
    4137             : 
    4138             : #ifdef CONFIG_COMPACTION
    4139             : /* Try memory compaction for high-order allocations before reclaim */
    4140             : static struct page *
    4141           0 : __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
    4142             :                 unsigned int alloc_flags, const struct alloc_context *ac,
    4143             :                 enum compact_priority prio, enum compact_result *compact_result)
    4144             : {
    4145           0 :         struct page *page = NULL;
    4146           0 :         unsigned long pflags;
    4147           0 :         unsigned int noreclaim_flag;
    4148             : 
    4149           0 :         if (!order)
    4150             :                 return NULL;
    4151             : 
    4152           0 :         psi_memstall_enter(&pflags);
    4153           0 :         noreclaim_flag = memalloc_noreclaim_save();
    4154             : 
    4155           0 :         *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
    4156             :                                                                 prio, &page);
    4157             : 
    4158           0 :         memalloc_noreclaim_restore(noreclaim_flag);
    4159           0 :         psi_memstall_leave(&pflags);
    4160             : 
    4161             :         /*
    4162             :          * In at least one zone compaction wasn't deferred or skipped, so
    4163             :          * count a compaction stall
    4164             :          */
    4165           0 :         count_vm_event(COMPACTSTALL);
    4166             : 
    4167             :         /* Prep a captured page if available */
    4168           0 :         if (page)
    4169           0 :                 prep_new_page(page, order, gfp_mask, alloc_flags);
    4170             : 
    4171             :         /* Try to get a page from the freelist if available */
    4172           0 :         if (!page)
    4173           0 :                 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
    4174             : 
    4175           0 :         if (page) {
    4176           0 :                 struct zone *zone = page_zone(page);
    4177             : 
    4178           0 :                 zone->compact_blockskip_flush = false;
    4179           0 :                 compaction_defer_reset(zone, order, true);
    4180           0 :                 count_vm_event(COMPACTSUCCESS);
    4181           0 :                 return page;
    4182             :         }
    4183             : 
    4184             :         /*
    4185             :          * It's bad if a compaction run occurs and fails. The most likely reason
    4186             :          * is that pages exist, but not enough to satisfy watermarks.
    4187             :          */
    4188           0 :         count_vm_event(COMPACTFAIL);
    4189             : 
    4190           0 :         cond_resched();
    4191             : 
    4192           0 :         return NULL;
    4193             : }
    4194             : 
    4195             : static inline bool
    4196           0 : should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
    4197             :                      enum compact_result compact_result,
    4198             :                      enum compact_priority *compact_priority,
    4199             :                      int *compaction_retries)
    4200             : {
    4201           0 :         int max_retries = MAX_COMPACT_RETRIES;
    4202           0 :         int min_priority;
    4203           0 :         bool ret = false;
    4204           0 :         int retries = *compaction_retries;
    4205           0 :         enum compact_priority priority = *compact_priority;
    4206             : 
    4207           0 :         if (!order)
    4208             :                 return false;
    4209             : 
    4210           0 :         if (compaction_made_progress(compact_result))
    4211           0 :                 (*compaction_retries)++;
    4212             : 
    4213             :         /*
    4214             :          * compaction considers all the zones desperately out of memory,
    4215             :          * so it doesn't really make much sense to retry except when the
    4216             :          * failure could be caused by insufficient priority
    4217             :          */
    4218           0 :         if (compaction_failed(compact_result))
    4219           0 :                 goto check_priority;
    4220             : 
    4221             :         /*
    4222             :          * compaction was skipped because there are not enough order-0 pages
    4223             :          * to work with, so we retry only if it looks like reclaim can help.
    4224             :          */
    4225           0 :         if (compaction_needs_reclaim(compact_result)) {
    4226           0 :                 ret = compaction_zonelist_suitable(ac, order, alloc_flags);
    4227           0 :                 goto out;
    4228             :         }
    4229             : 
    4230             :         /*
    4231             :          * Make sure the compaction wasn't deferred or didn't bail out early
    4232             :          * due to lock contention before we declare that we should give up.
    4233             :          * But the next retry should use a higher priority if allowed, so
    4234             :          * we don't just keep bailing out endlessly.
    4235             :          */
    4236           0 :         if (compaction_withdrawn(compact_result)) {
    4237           0 :                 goto check_priority;
    4238             :         }
    4239             : 
    4240             :         /*
    4241             :          * !costly requests are much more important than __GFP_RETRY_MAYFAIL
    4242             :          * costly ones because they are de facto nofail and invoke the OOM
    4243             :          * killer to move on, while costly ones can fail and users are ready
    4244             :          * to cope with that. 1/4 of the retries is rather arbitrary but we
    4245             :          * would need much more detailed feedback from compaction to
    4246             :          * make a better decision.
    4247             :          */
    4248           0 :         if (order > PAGE_ALLOC_COSTLY_ORDER)
    4249           0 :                 max_retries /= 4;
    4250           0 :         if (*compaction_retries <= max_retries) {
    4251           0 :                 ret = true;
    4252           0 :                 goto out;
    4253             :         }
    4254             : 
    4255             :         /*
    4256             :          * Make sure there are attempts at the highest priority if we exhausted
    4257             :          * all retries or failed at the lower priorities.
    4258             :          */
    4259           0 : check_priority:
    4260           0 :         min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
    4261           0 :                         MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
    4262             : 
    4263           0 :         if (*compact_priority > min_priority) {
    4264           0 :                 (*compact_priority)--;
    4265           0 :                 *compaction_retries = 0;
    4266           0 :                 ret = true;
    4267             :         }
    4268           0 : out:
    4269           0 :         trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
    4270           0 :         return ret;
    4271             : }
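
should_compact_retry() is essentially a retry budget plus a priority escalator: costly orders get only a quarter of MAX_COMPACT_RETRIES, and when the budget runs out the priority is raised once more before giving up. The userspace sketch below models only that budget/escalation logic, ignores the progress, skipped and withdrawn cases, and uses assumed constants rather than the kernel's:

        #include <stdbool.h>
        #include <stdio.h>

        #define MAX_RETRIES     16
        #define MIN_PRIORITY    0       /* highest priority, assumed encoding */
        #define COSTLY_ORDER    3       /* stand-in for PAGE_ALLOC_COSTLY_ORDER */

        static bool should_retry(int order, int *retries, int *priority)
        {
                int max_retries = MAX_RETRIES;

                if (order > COSTLY_ORDER)
                        max_retries /= 4;

                if (++(*retries) <= max_retries)
                        return true;

                if (*priority > MIN_PRIORITY) {
                        (*priority)--;
                        *retries = 0;
                        return true;
                }
                return false;           /* give up, let the caller consider OOM */
        }

        int main(void)
        {
                int retries = 0, priority = 2, attempts = 0;

                while (should_retry(/*order=*/4, &retries, &priority))
                        attempts++;
                printf("gave up after %d attempts\n", attempts);
                return 0;
        }

Cutting the budget rather than refusing outright matches the comment above: costly, failure-tolerant requests still get some compaction attempts without monopolizing them.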
    4272             : #else
    4273             : static inline struct page *
    4274             : __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
    4275             :                 unsigned int alloc_flags, const struct alloc_context *ac,
    4276             :                 enum compact_priority prio, enum compact_result *compact_result)
    4277             : {
    4278             :         *compact_result = COMPACT_SKIPPED;
    4279             :         return NULL;
    4280             : }
    4281             : 
    4282             : static inline bool
    4283             : should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
    4284             :                      enum compact_result compact_result,
    4285             :                      enum compact_priority *compact_priority,
    4286             :                      int *compaction_retries)
    4287             : {
    4288             :         struct zone *zone;
    4289             :         struct zoneref *z;
    4290             : 
    4291             :         if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
    4292             :                 return false;
    4293             : 
    4294             :         /*
    4295             :          * There are setups with compaction disabled which would prefer to loop
    4296             :          * inside the allocator rather than hit the oom killer prematurely.
    4297             :          * Let's give them some hope and keep retrying while the order-0
    4298             :          * watermarks are OK.
    4299             :          */
    4300             :         for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
    4301             :                                 ac->highest_zoneidx, ac->nodemask) {
    4302             :                 if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
    4303             :                                         ac->highest_zoneidx, alloc_flags))
    4304             :                         return true;
    4305             :         }
    4306             :         return false;
    4307             : }
    4308             : #endif /* CONFIG_COMPACTION */
    4309             : 
    4310             : #ifdef CONFIG_LOCKDEP
    4311             : static struct lockdep_map __fs_reclaim_map =
    4312             :         STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
    4313             : 
    4314     3295452 : static bool __need_reclaim(gfp_t gfp_mask)
    4315             : {
    4316             :         /* no reclaim without waiting on it */
    4317     3295452 :         if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
    4318             :                 return false;
    4319             : 
    4320             :         /* this guy won't enter reclaim */
    4321     2852458 :         if (current->flags & PF_MEMALLOC)
    4322             :                 return false;
    4323             : 
    4324     2852458 :         if (gfp_mask & __GFP_NOLOCKDEP)
    4325           0 :                 return false;
    4326             : 
    4327             :         return true;
    4328             : }
    4329             : 
    4330     1319244 : void __fs_reclaim_acquire(void)
    4331             : {
    4332     1319244 :         lock_map_acquire(&__fs_reclaim_map);
    4333     1319445 : }
    4334             : 
    4335     1319507 : void __fs_reclaim_release(void)
    4336             : {
    4337           0 :         lock_map_release(&__fs_reclaim_map);
    4338     1319224 : }
    4339             : 
    4340     1649009 : void fs_reclaim_acquire(gfp_t gfp_mask)
    4341             : {
    4342     1649009 :         gfp_mask = current_gfp_context(gfp_mask);
    4343             : 
    4344     1649047 :         if (__need_reclaim(gfp_mask)) {
    4345     1427604 :                 if (gfp_mask & __GFP_FS)
    4346     1319226 :                         __fs_reclaim_acquire();
    4347             : 
    4348             : #ifdef CONFIG_MMU_NOTIFIER
    4349             :                 lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
    4350             :                 lock_map_release(&__mmu_notifier_invalidate_range_start_map);
    4351             : #endif
    4352             : 
    4353             :         }
    4354     1649239 : }
    4355             : EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
    4356             : 
    4357     1648965 : void fs_reclaim_release(gfp_t gfp_mask)
    4358             : {
    4359     1648965 :         gfp_mask = current_gfp_context(gfp_mask);
    4360             : 
    4361     1649007 :         if (__need_reclaim(gfp_mask)) {
    4362     1427774 :                 if (gfp_mask & __GFP_FS)
    4363     1319507 :                         __fs_reclaim_release();
    4364             :         }
    4365     1648724 : }
    4366             : EXPORT_SYMBOL_GPL(fs_reclaim_release);
    4367             : #endif
    4368             : 
    4369             : /* Perform direct synchronous page reclaim */
    4370             : static unsigned long
    4371           0 : __perform_reclaim(gfp_t gfp_mask, unsigned int order,
    4372             :                                         const struct alloc_context *ac)
    4373             : {
    4374           0 :         unsigned int noreclaim_flag;
    4375           0 :         unsigned long pflags, progress;
    4376             : 
    4377           0 :         cond_resched();
    4378             : 
    4379             :         /* We now go into synchronous reclaim */
    4380           0 :         cpuset_memory_pressure_bump();
    4381           0 :         psi_memstall_enter(&pflags);
    4382           0 :         fs_reclaim_acquire(gfp_mask);
    4383           0 :         noreclaim_flag = memalloc_noreclaim_save();
    4384             : 
    4385           0 :         progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
    4386             :                                                                 ac->nodemask);
    4387             : 
    4388           0 :         memalloc_noreclaim_restore(noreclaim_flag);
    4389           0 :         fs_reclaim_release(gfp_mask);
    4390           0 :         psi_memstall_leave(&pflags);
    4391             : 
    4392           0 :         cond_resched();
    4393             : 
    4394           0 :         return progress;
    4395             : }
    4396             : 
    4397             : /* The really slow allocator path where we enter direct reclaim */
    4398             : static inline struct page *
    4399           0 : __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
    4400             :                 unsigned int alloc_flags, const struct alloc_context *ac,
    4401             :                 unsigned long *did_some_progress)
    4402             : {
    4403           0 :         struct page *page = NULL;
    4404           0 :         bool drained = false;
    4405             : 
    4406           0 :         *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
    4407           0 :         if (unlikely(!(*did_some_progress)))
    4408             :                 return NULL;
    4409             : 
    4410           0 : retry:
    4411           0 :         page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
    4412             : 
    4413             :         /*
    4414             :          * If an allocation failed after direct reclaim, it could be because
    4415             :          * pages are pinned on the per-cpu lists or in high alloc reserves.
    4416             :          * Shrink them and try again
    4417             :          */
    4418           0 :         if (!page && !drained) {
    4419           0 :                 unreserve_highatomic_pageblock(ac, false);
    4420           0 :                 drain_all_pages(NULL);
    4421           0 :                 drained = true;
    4422           0 :                 goto retry;
    4423             :         }
    4424             : 
    4425             :         return page;
    4426             : }
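
__alloc_pages_direct_reclaim() retries the freelist exactly once after unreserving the highatomic pageblocks and draining per-cpu lists. A compact userspace sketch of that "drain once, then retry" structure; alloc_attempt() and drain_cached_pages() are hypothetical stand-ins for get_page_from_freelist() and drain_all_pages():

        #include <stdbool.h>
        #include <stdio.h>

        static int cached_pages = 8;    /* pages "pinned" in a per-cpu style cache */
        static int free_pages;          /* directly allocatable pages */

        static bool alloc_attempt(void)
        {
                if (free_pages > 0) {
                        free_pages--;
                        return true;
                }
                return false;
        }

        static void drain_cached_pages(void)
        {
                free_pages += cached_pages;
                cached_pages = 0;
        }

        static bool alloc_after_reclaim(void)
        {
                bool drained = false;

        retry:
                if (alloc_attempt())
                        return true;

                /* Pages may be sitting in caches; drain them once and retry. */
                if (!drained) {
                        drain_cached_pages();
                        drained = true;
                        goto retry;
                }
                return false;
        }

        int main(void)
        {
                printf("allocation %s\n", alloc_after_reclaim() ? "succeeded" : "failed");
                return 0;
        }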
    4427             : 
    4428           0 : static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
    4429             :                              const struct alloc_context *ac)
    4430             : {
    4431           0 :         struct zoneref *z;
    4432           0 :         struct zone *zone;
    4433           0 :         pg_data_t *last_pgdat = NULL;
    4434           0 :         enum zone_type highest_zoneidx = ac->highest_zoneidx;
    4435             : 
    4436           0 :         for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
    4437             :                                         ac->nodemask) {
    4438           0 :                 if (last_pgdat != zone->zone_pgdat)
    4439           0 :                         wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
    4440           0 :                 last_pgdat = zone->zone_pgdat;
    4441             :         }
    4442           0 : }
    4443             : 
    4444             : static inline unsigned int
    4445           0 : gfp_to_alloc_flags(gfp_t gfp_mask)
    4446             : {
    4447           0 :         unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
    4448             : 
    4449             :         /*
    4450             :          * __GFP_HIGH is assumed to be the same as ALLOC_HIGH
    4451             :          * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
    4452             :          * to save two branches.
    4453             :          */
    4454           0 :         BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
    4455           0 :         BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);
    4456             : 
    4457             :         /*
    4458             :          * The caller may dip into page reserves a bit more if it cannot run
    4459             :          * direct reclaim, has a realtime scheduling policy, or is asking
    4460             :          * for __GFP_HIGH memory.  GFP_ATOMIC requests will
    4461             :          * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
    4462             :          */
    4463           0 :         alloc_flags |= (__force int)
    4464             :                 (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
    4465             : 
    4466           0 :         if (gfp_mask & __GFP_ATOMIC) {
    4467             :                 /*
    4468             :                  * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
    4469             :                  * if it can't schedule.
    4470             :                  */
    4471           0 :                 if (!(gfp_mask & __GFP_NOMEMALLOC))
    4472           0 :                         alloc_flags |= ALLOC_HARDER;
    4473             :                 /*
    4474             :                  * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
    4475             :                  * comment for __cpuset_node_allowed().
    4476             :                  */
    4477           0 :                 alloc_flags &= ~ALLOC_CPUSET;
    4478           0 :         } else if (unlikely(rt_task(current)) && !in_interrupt())
    4479           0 :                 alloc_flags |= ALLOC_HARDER;
    4480             : 
    4481           0 :         alloc_flags = current_alloc_flags(gfp_mask, alloc_flags);
    4482             : 
    4483           0 :         return alloc_flags;
    4484             : }
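
gfp_to_alloc_flags() relies on __GFP_HIGH/ALLOC_HIGH and __GFP_KSWAPD_RECLAIM/ALLOC_KSWAPD sharing the same bit values, so the translation becomes a single mask-and-or guarded by BUILD_BUG_ON. A userspace sketch of the same trick with assumed bit values (not the kernel's) and C11 static_assert in place of BUILD_BUG_ON:

        #include <assert.h>
        #include <stdio.h>

        #define REQ_HIGH        0x20u
        #define REQ_KSWAPD      0x400u
        #define ALLOC_HIGH      0x20u
        #define ALLOC_KSWAPD    0x400u
        #define ALLOC_WMARK_MIN 0x0u
        #define ALLOC_CPUSET    0x40u

        static unsigned int req_to_alloc_flags(unsigned int req)
        {
                unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;

                /* Guard the assumption that the bits line up. */
                static_assert(REQ_HIGH == ALLOC_HIGH, "bit layout");
                static_assert(REQ_KSWAPD == ALLOC_KSWAPD, "bit layout");

                /* One mask-and-or instead of two conditional branches. */
                alloc_flags |= req & (REQ_HIGH | REQ_KSWAPD);

                return alloc_flags;
        }

        int main(void)
        {
                printf("flags: %#x\n", req_to_alloc_flags(REQ_HIGH | REQ_KSWAPD));
                return 0;
        }

The compile-time assertion makes the coupling between the two flag namespaces explicit, so changing either set of values breaks the build instead of silently mistranslating flags.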
    4485             : 
    4486           0 : static bool oom_reserves_allowed(struct task_struct *tsk)
    4487             : {
    4488           0 :         if (!tsk_is_oom_victim(tsk))
    4489             :                 return false;
    4490             : 
    4491             :         /*
    4492             :          * !MMU doesn't have an oom reaper, so give access to memory reserves
    4493             :          * only to the thread with TIF_MEMDIE set
    4494             :          */
    4495             :         if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
    4496             :                 return false;
    4497             : 
    4498             :         return true;
    4499             : }
    4500             : 
    4501             : /*
    4502             :  * Distinguish requests which really need access to full memory
    4503             :  * reserves from oom victims which can live with a portion of it
    4504             :  */
    4505           0 : static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
    4506             : {
    4507           0 :         if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
    4508             :                 return 0;
    4509           0 :         if (gfp_mask & __GFP_MEMALLOC)
    4510             :                 return ALLOC_NO_WATERMARKS;
    4511           0 :         if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
    4512             :                 return ALLOC_NO_WATERMARKS;
    4513           0 :         if (!in_interrupt()) {
    4514           0 :                 if (current->flags & PF_MEMALLOC)
    4515             :                         return ALLOC_NO_WATERMARKS;
    4516           0 :                 else if (oom_reserves_allowed(current))
    4517           0 :                         return ALLOC_OOM;
    4518             :         }
    4519             : 
    4520             :         return 0;
    4521             : }
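
__gfp_pfmemalloc_flags() implements a tiered policy: an explicit opt-out always wins, reclaim contexts get full access to reserves so they cannot deadlock, and OOM victims get partial access. A userspace sketch of that decision ladder; the enum names and struct request fields are illustrative, not kernel API:

        #include <stdbool.h>
        #include <stdio.h>

        /* FULL_RESERVES ~ ALLOC_NO_WATERMARKS, PARTIAL_RESERVES ~ ALLOC_OOM. */
        enum reserve_access { NO_RESERVES, PARTIAL_RESERVES, FULL_RESERVES };

        struct request {
                bool nomemalloc;        /* caller explicitly opted out (__GFP_NOMEMALLOC) */
                bool memalloc;          /* caller explicitly opted in  (__GFP_MEMALLOC)  */
                bool task_memalloc;     /* task is already in reclaim  (PF_MEMALLOC)     */
                bool oom_victim;        /* task was selected by the OOM killer           */
        };

        static enum reserve_access reserve_access(const struct request *r)
        {
                if (r->nomemalloc)
                        return NO_RESERVES;             /* opt-out always wins */
                if (r->memalloc || r->task_memalloc)
                        return FULL_RESERVES;           /* reclaim context must not deadlock */
                if (r->oom_victim)
                        return PARTIAL_RESERVES;        /* enough to let the victim exit */
                return NO_RESERVES;
        }

        int main(void)
        {
                struct request r = { .oom_victim = true };

                printf("access level: %d\n", reserve_access(&r));
                return 0;
        }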
    4522             : 
    4523           0 : bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
    4524             : {
    4525           0 :         return !!__gfp_pfmemalloc_flags(gfp_mask);
    4526             : }
    4527             : 
    4528             : /*
    4529             :  * Checks whether it makes sense to retry the reclaim to make a forward progress
    4530             :  * for the given allocation request.
    4531             :  *
    4532             :  * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
    4533             :  * without success, or when we couldn't even meet the watermark if we
    4534             :  * reclaimed all remaining pages on the LRU lists.
    4535             :  *
    4536             :  * Returns true if a retry is viable or false to enter the oom path.
    4537             :  */
    4538             : static inline bool
    4539           0 : should_reclaim_retry(gfp_t gfp_mask, unsigned order,
    4540             :                      struct alloc_context *ac, int alloc_flags,
    4541             :                      bool did_some_progress, int *no_progress_loops)
    4542             : {
    4543           0 :         struct zone *zone;
    4544           0 :         struct zoneref *z;
    4545           0 :         bool ret = false;
    4546             : 
    4547             :         /*
    4548             :          * Costly allocations might have made progress, but this doesn't mean
    4549             :          * their order will become available due to high fragmentation, so
    4550             :          * always increment the no-progress counter for them
    4551             :          */
    4552           0 :         if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
    4553           0 :                 *no_progress_loops = 0;
    4554             :         else
    4555           0 :                 (*no_progress_loops)++;
    4556             : 
    4557             :         /*
    4558             :          * Make sure we converge to OOM if we cannot make any progress
    4559             :          * several times in a row.
    4560             :          */
    4561           0 :         if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
    4562             :                 /* Before OOM, exhaust highatomic_reserve */
    4563           0 :                 return unreserve_highatomic_pageblock(ac, true);
    4564             :         }
    4565             : 
    4566             :         /*
    4567             :          * Keep reclaiming pages while there is a chance this will lead
    4568             :          * somewhere.  If none of the target zones can satisfy our allocation
    4569             :          * request even if all reclaimable pages are considered, then we are
    4570             :          * screwed and have to go OOM.
    4571             :          */
    4572           0 :         for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
    4573             :                                 ac->highest_zoneidx, ac->nodemask) {
    4574           0 :                 unsigned long available;
    4575           0 :                 unsigned long reclaimable;
    4576           0 :                 unsigned long min_wmark = min_wmark_pages(zone);
    4577           0 :                 bool wmark;
    4578             : 
    4579           0 :                 available = reclaimable = zone_reclaimable_pages(zone);
    4580           0 :                 available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
    4581             : 
    4582             :                 /*
    4583             :                  * Would the allocation succeed if we reclaimed all
    4584             :                  * reclaimable pages?
    4585             :                  */
    4586           0 :                 wmark = __zone_watermark_ok(zone, order, min_wmark,
    4587           0 :                                 ac->highest_zoneidx, alloc_flags, available);
    4588           0 :                 trace_reclaim_retry_zone(z, order, reclaimable,
    4589             :                                 available, min_wmark, *no_progress_loops, wmark);
    4590           0 :                 if (wmark) {
    4591             :                         /*
    4592             :                          * If we didn't make any progress and have a lot of
    4593             :                          * dirty + writeback pages then we should wait for
    4594             :                          * an IO to complete to slow down the reclaim and
    4595             :                          * prevent a premature OOM
    4596             :                          */
    4597           0 :                         if (!did_some_progress) {
    4598           0 :                                 unsigned long write_pending;
    4599             : 
    4600           0 :                                 write_pending = zone_page_state_snapshot(zone,
    4601             :                                                         NR_ZONE_WRITE_PENDING);
    4602             : 
    4603           0 :                                 if (2 * write_pending > reclaimable) {
    4604           0 :                                         congestion_wait(BLK_RW_ASYNC, HZ/10);
    4605           0 :                                         return true;
    4606             :                                 }
    4607             :                         }
    4608             : 
    4609           0 :                         ret = true;
    4610           0 :                         goto out;
    4611             :                 }
    4612             :         }
    4613             : 
    4614           0 : out:
    4615             :         /*
    4616             :          * Memory allocation/reclaim might be called from a WQ context and the
    4617             :          * current implementation of the WQ concurrency control doesn't
    4618             :          * recognize that a particular WQ is congested if the worker thread is
    4619             :          * looping without ever sleeping. Therefore we have to do a short sleep
    4620             :          * here rather than calling cond_resched().
    4621             :          */
    4622           0 :         if (current->flags & PF_WQ_WORKER)
    4623           0 :                 schedule_timeout_uninterruptible(1);
    4624             :         else
    4625           0 :                 cond_resched();
    4626             :         return ret;
    4627             : }
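
The core feasibility test in should_reclaim_retry() asks whether the min watermark could be met if every reclaimable page were freed, and additionally throttles when most of what is reclaimable is still pending writeback. A worked userspace example with arbitrary numbers; it ignores the lowmem reserves and order-N contiguity that __zone_watermark_ok() also checks:

        #include <stdbool.h>
        #include <stdio.h>

        static bool worth_retrying(unsigned long free, unsigned long reclaimable,
                                   unsigned long min_wmark)
        {
                unsigned long available = free + reclaimable;

                return available > min_wmark;
        }

        int main(void)
        {
                unsigned long free = 1000, reclaimable = 5000, min_wmark = 4000;
                unsigned long write_pending = 3000;

                if (worth_retrying(free, reclaimable, min_wmark)) {
                        /* Throttle if most reclaimable pages are still under writeback. */
                        if (2 * write_pending > reclaimable)
                                printf("retry, but wait for writeback first\n");
                        else
                                printf("retry reclaim\n");
                } else {
                        printf("reclaim cannot help; head towards OOM\n");
                }
                return 0;
        }

With these numbers, 1000 + 5000 > 4000, so a retry is viable, but 2 * 3000 > 5000 means the retry should first wait for IO rather than spin on reclaim.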
    4628             : 
    4629             : static inline bool
    4630           0 : check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
    4631             : {
    4632             :         /*
    4633             :          * It's possible that cpuset's mems_allowed and the nodemask from
    4634             :          * mempolicy don't intersect. This should be normally dealt with by
    4635             :          * policy_nodemask(), but it's possible to race with a cpuset update in
    4636             :          * such a way that the check therein was true, and then it became false
    4637             :          * before we got our cpuset_mems_cookie here.
    4638             :          * This assumes that for all allocations, ac->nodemask can come only
    4639             :          * from MPOL_BIND mempolicy (whose documented semantics is to be ignored
    4640             :          * when it does not intersect with the cpuset restrictions) or the
    4641             :          * caller can deal with a violated nodemask.
    4642             :          */
    4643           0 :         if (cpusets_enabled() && ac->nodemask &&
    4644             :                         !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
    4645             :                 ac->nodemask = NULL;
    4646             :                 return true;
    4647             :         }
    4648             : 
    4649             :         /*
    4650             :          * When updating a task's mems_allowed or mempolicy nodemask, it is
    4651             :          * possible to race with parallel threads in such a way that our
    4652             :          * allocation can fail while the mask is being updated. If we are about
    4653             :          * to fail, check if the cpuset changed during allocation and if so,
    4654             :          * retry.
    4655             :          */
    4656           0 :         if (read_mems_allowed_retry(cpuset_mems_cookie))
    4657             :                 return true;
    4658             : 
    4659           0 :         return false;
    4660             : }
    4661             : 
    4662             : static inline struct page *
    4663           0 : __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
    4664             :                                                 struct alloc_context *ac)
    4665             : {
    4666           0 :         bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
    4667           0 :         const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
    4668           0 :         struct page *page = NULL;
    4669           0 :         unsigned int alloc_flags;
    4670           0 :         unsigned long did_some_progress;
    4671           0 :         enum compact_priority compact_priority;
    4672           0 :         enum compact_result compact_result;
    4673           0 :         int compaction_retries;
    4674           0 :         int no_progress_loops;
    4675           0 :         unsigned int cpuset_mems_cookie;
    4676           0 :         int reserve_flags;
    4677             : 
    4678             :         /*
    4679             :          * We also sanity-check to catch abuse of atomic reserves being used by
    4680             :          * callers that are not in atomic context.
    4681             :          */
    4682           0 :         if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
    4683             :                                 (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
    4684           0 :                 gfp_mask &= ~__GFP_ATOMIC;
    4685             : 
    4686           0 : retry_cpuset:
    4687           0 :         compaction_retries = 0;
    4688           0 :         no_progress_loops = 0;
    4689           0 :         compact_priority = DEF_COMPACT_PRIORITY;
    4690           0 :         cpuset_mems_cookie = read_mems_allowed_begin();
    4691             : 
    4692             :         /*
    4693             :          * The fast path uses conservative alloc_flags to succeed only until
    4694             :          * kswapd needs to be woken up, and to avoid the cost of setting up
    4695             :          * alloc_flags precisely. So we do that now.
    4696             :          */
    4697           0 :         alloc_flags = gfp_to_alloc_flags(gfp_mask);
    4698             : 
    4699             :         /*
    4700             :          * We need to recalculate the starting point for the zonelist iterator
    4701             :          * because we might have used different nodemask in the fast path, or
    4702             :          * there was a cpuset modification and we are retrying - otherwise we
    4703             :          * could end up iterating over non-eligible zones endlessly.
    4704             :          */
    4705           0 :         ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
    4706             :                                         ac->highest_zoneidx, ac->nodemask);
    4707           0 :         if (!ac->preferred_zoneref->zone)
    4708           0 :                 goto nopage;
    4709             : 
    4710           0 :         if (alloc_flags & ALLOC_KSWAPD)
    4711           0 :                 wake_all_kswapds(order, gfp_mask, ac);
    4712             : 
    4713             :         /*
    4714             :          * The adjusted alloc_flags might result in immediate success, so try
    4715             :          * that first
    4716             :          */
    4717           0 :         page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
    4718           0 :         if (page)
    4719           0 :                 goto got_pg;
    4720             : 
    4721             :         /*
    4722             :          * For costly allocations, try direct compaction first, as it's likely
    4723             :          * that we have enough base pages and don't need to reclaim. For non-
    4724             :          * movable high-order allocations, do that as well, as compaction will
    4725             :          * try to prevent permanent fragmentation by migrating from blocks of the
    4726             :          * same migratetype.
    4727             :          * Don't try this for allocations that are allowed to ignore
    4728             :          * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
    4729             :          */
    4730           0 :         if (can_direct_reclaim &&
    4731           0 :                         (costly_order ||
    4732           0 :                            (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
    4733           0 :                         && !gfp_pfmemalloc_allowed(gfp_mask)) {
    4734           0 :                 page = __alloc_pages_direct_compact(gfp_mask, order,
    4735             :                                                 alloc_flags, ac,
    4736             :                                                 INIT_COMPACT_PRIORITY,
    4737             :                                                 &compact_result);
    4738           0 :                 if (page)
    4739           0 :                         goto got_pg;
    4740             : 
    4741             :                 /*
    4742             :                  * Checks for costly allocations with __GFP_NORETRY, which
    4743             :                  * includes some THP page fault allocations
    4744             :                  */
    4745           0 :                 if (costly_order && (gfp_mask & __GFP_NORETRY)) {
    4746             :                         /*
    4747             :                          * If allocating entire pageblock(s) and compaction
    4748             :                          * failed because all zones are below low watermarks
    4749             :                          * or is prohibited because it recently failed at this
    4750             :                          * order, fail immediately unless the allocator has
    4751             :                          * requested compaction and reclaim retry.
    4752             :                          *
    4753             :                          * Reclaim is
    4754             :                          *  - potentially very expensive because zones are far
    4755             :                          *    below their low watermarks or this is part of very
    4756             :                          *    bursty high order allocations,
    4757             :                          *  - not guaranteed to help because isolate_freepages()
    4758             :                          *    may not iterate over freed pages as part of its
    4759             :                          *    linear scan, and
    4760             :                          *  - unlikely to make entire pageblocks free on its
    4761             :                          *    own.
    4762             :                          */
    4763           0 :                         if (compact_result == COMPACT_SKIPPED ||
    4764             :                             compact_result == COMPACT_DEFERRED)
    4765           0 :                                 goto nopage;
    4766             : 
    4767             :                         /*
    4768             :                          * Looks like reclaim/compaction is worth trying, but
    4769             :                          * sync compaction could be very expensive, so keep
    4770             :                          * using async compaction.
    4771             :                          */
    4772           0 :                         compact_priority = INIT_COMPACT_PRIORITY;
    4773             :                 }
    4774             :         }
    4775             : 
    4776           0 : retry:
    4777             :         /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
    4778           0 :         if (alloc_flags & ALLOC_KSWAPD)
    4779           0 :                 wake_all_kswapds(order, gfp_mask, ac);
    4780             : 
    4781           0 :         reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
    4782           0 :         if (reserve_flags)
    4783           0 :                 alloc_flags = current_alloc_flags(gfp_mask, reserve_flags);
    4784             : 
    4785             :         /*
    4786             :          * Reset the nodemask and zonelist iterators if memory policies can be
    4787             :          * ignored. These allocations are high priority and system-oriented
    4788             :          * rather than user-oriented.
    4789             :          */
    4790           0 :         if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
    4791           0 :                 ac->nodemask = NULL;
    4792           0 :                 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
    4793             :                                         ac->highest_zoneidx, ac->nodemask);
    4794             :         }
    4795             : 
    4796             :         /* Attempt with potentially adjusted zonelist and alloc_flags */
    4797           0 :         page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
    4798           0 :         if (page)
    4799           0 :                 goto got_pg;
    4800             : 
    4801             :         /* Caller is not willing to reclaim, we can't balance anything */
    4802           0 :         if (!can_direct_reclaim)
    4803           0 :                 goto nopage;
    4804             : 
    4805             :         /* Avoid recursion of direct reclaim */
    4806           0 :         if (current->flags & PF_MEMALLOC)
    4807           0 :                 goto nopage;
    4808             : 
    4809             :         /* Try direct reclaim and then allocating */
    4810           0 :         page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
    4811             :                                                         &did_some_progress);
    4812           0 :         if (page)
    4813           0 :                 goto got_pg;
    4814             : 
    4815             :         /* Try direct compaction and then allocating */
    4816           0 :         page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
    4817             :                                         compact_priority, &compact_result);
    4818           0 :         if (page)
    4819           0 :                 goto got_pg;
    4820             : 
    4821             :         /* Do not loop if specifically requested */
    4822           0 :         if (gfp_mask & __GFP_NORETRY)
    4823           0 :                 goto nopage;
    4824             : 
    4825             :         /*
    4826             :          * Do not retry costly high order allocations unless they are
    4827             :          * __GFP_RETRY_MAYFAIL
    4828             :          */
    4829           0 :         if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
    4830           0 :                 goto nopage;
    4831             : 
    4832           0 :         if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
    4833             :                                  did_some_progress > 0, &no_progress_loops))
    4834           0 :                 goto retry;
    4835             : 
    4836             :         /*
    4837             :          * It doesn't make any sense to retry the compaction if the order-0
    4838             :          * reclaim is not able to make any progress, because the current
    4839             :          * implementation of compaction depends on a sufficient amount
    4840             :          * of free memory (see __compaction_suitable)
    4841             :          */
    4842           0 :         if (did_some_progress > 0 &&
    4843           0 :                         should_compact_retry(ac, order, alloc_flags,
    4844             :                                 compact_result, &compact_priority,
    4845             :                                 &compaction_retries))
    4846           0 :                 goto retry;
    4847             : 
    4848             : 
    4849             :         /* Deal with possible cpuset update races before we start OOM killing */
    4850           0 :         if (check_retry_cpuset(cpuset_mems_cookie, ac))
    4851             :                 goto retry_cpuset;
    4852             : 
    4853             :         /* Reclaim has failed us, start killing things */
    4854           0 :         page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
    4855           0 :         if (page)
    4856           0 :                 goto got_pg;
    4857             : 
    4858             :         /* Keep allocations with no watermarks from looping endlessly */
    4859           0 :         if (tsk_is_oom_victim(current) &&
    4860           0 :             (alloc_flags & ALLOC_OOM ||
    4861           0 :              (gfp_mask & __GFP_NOMEMALLOC)))
    4862           0 :                 goto nopage;
    4863             : 
    4864             :         /* Retry as long as the OOM killer is making progress */
    4865           0 :         if (did_some_progress) {
    4866           0 :                 no_progress_loops = 0;
    4867           0 :                 goto retry;
    4868             :         }
    4869             : 
    4870           0 : nopage:
    4871             :         /* Deal with possible cpuset update races before we fail */
    4872           0 :         if (check_retry_cpuset(cpuset_mems_cookie, ac))
    4873             :                 goto retry_cpuset;
    4874             : 
    4875             :         /*
    4876             :          * Make sure that a __GFP_NOFAIL request doesn't leak out and that
    4877             :          * we always retry.
    4878             :          */
    4879           0 :         if (gfp_mask & __GFP_NOFAIL) {
    4880             :                 /*
    4881             :                  * All existing users of __GFP_NOFAIL are blockable, so warn
    4882             :                  * about any new users that actually require GFP_NOWAIT
    4883             :                  */
    4884           0 :                 if (WARN_ON_ONCE(!can_direct_reclaim))
    4885           0 :                         goto fail;
    4886             : 
    4887             :                 /*
    4888             :                  * A PF_MEMALLOC request from this context is rather bizarre
    4889             :                  * because we cannot reclaim anything and can only loop waiting
    4890             :                  * for somebody else to do the work for us.
    4891             :                  */
    4892           0 :                 WARN_ON_ONCE(current->flags & PF_MEMALLOC);
    4893             : 
    4894             :                 /*
    4895             :                  * Non-failing costly orders are a hard requirement which we
    4896             :                  * are not well prepared for, so warn about these users so
    4897             :                  * that we can identify them and convert them to something
    4898             :                  * else.
    4899             :                  */
    4900           0 :                 WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
    4901             : 
    4902             :                 /*
    4903             :                  * Help non-failing allocations by giving them access to memory
    4904             :                  * reserves but do not use ALLOC_NO_WATERMARKS because this
    4905             :                  * could deplete whole memory reserves which would just make
    4906             :                  * the situation worse
    4907             :                  */
    4908           0 :                 page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
    4909           0 :                 if (page)
    4910           0 :                         goto got_pg;
    4911             : 
    4912           0 :                 cond_resched();
    4913           0 :                 goto retry;
    4914             :         }
    4915           0 : fail:
    4916           0 :         warn_alloc(gfp_mask, ac->nodemask,
    4917             :                         "page allocation failure: order:%u", order);
    4918           0 : got_pg:
    4919           0 :         return page;
    4920             : }
    4921             : 
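The retry logic above is driven almost entirely by the caller's gfp flags. The following caller-side sketch is illustrative only (the function name try_alloc_examples is made up, not part of page_alloc.c): __GFP_NORETRY gives up after the single reclaim/compaction pass, __GFP_RETRY_MAYFAIL keeps retrying a costly order without ever invoking the OOM killer, and __GFP_NOFAIL loops, with limited access to reserves, until the allocation succeeds.

#include <linux/gfp.h>
#include <linux/mm.h>

/* Illustrative only: shows how gfp flags steer the slowpath above. */
static struct page *try_alloc_examples(void)
{
        struct page *page;

        /* Give up quickly; the caller has a fallback path. */
        page = alloc_pages(GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN, 2);
        if (page)
                return page;

        /* Retry a costly order, but fail rather than OOM-kill anything. */
        page = alloc_pages(GFP_KERNEL | __GFP_RETRY_MAYFAIL,
                           PAGE_ALLOC_COSTLY_ORDER + 1);
        if (page)
                return page;

        /* Must not fail; only sensible for small, blockable requests. */
        return alloc_pages(GFP_KERNEL | __GFP_NOFAIL, 0);
}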
    4922      192956 : static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
    4923             :                 int preferred_nid, nodemask_t *nodemask,
    4924             :                 struct alloc_context *ac, gfp_t *alloc_mask,
    4925             :                 unsigned int *alloc_flags)
    4926             : {
    4927      192956 :         ac->highest_zoneidx = gfp_zone(gfp_mask);
    4928      192960 :         ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
    4929      192960 :         ac->nodemask = nodemask;
    4930      192960 :         ac->migratetype = gfp_migratetype(gfp_mask);
    4931             : 
    4932      192961 :         if (cpusets_enabled()) {
    4933             :                 *alloc_mask |= __GFP_HARDWALL;
    4934             :                 /*
    4935             :                  * When we are in interrupt context, the cpuset of the current
    4936             :                  * task is irrelevant, which means that any node is ok.
    4937             :                  */
    4938             :                 if (!in_interrupt() && !ac->nodemask)
    4939             :                         ac->nodemask = &cpuset_current_mems_allowed;
    4940             :                 else
    4941             :                         *alloc_flags |= ALLOC_CPUSET;
    4942             :         }
    4943             : 
    4944      192961 :         fs_reclaim_acquire(gfp_mask);
    4945      192940 :         fs_reclaim_release(gfp_mask);
    4946             : 
    4947      192948 :         might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
    4948             : 
    4949      192960 :         if (should_fail_alloc_page(gfp_mask, order))
    4950             :                 return false;
    4951             : 
    4952      192945 :         *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags);
    4953             : 
    4954             :         /* Dirty zone balancing only done in the fast path */
    4955      192945 :         ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
    4956             : 
    4957             :         /*
    4958             :          * The preferred zone is used for statistics but crucially it is
    4959             :          * also used as the starting point for the zonelist iterator. It
    4960             :          * may get reset for allocations that ignore memory policies.
    4961             :          */
    4962      192945 :         ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
    4963             :                                         ac->highest_zoneidx, ac->nodemask);
    4964             : 
    4965      192957 :         return true;
    4966             : }
    4967             : 
    4968             : /*
    4969             :  * This is the 'heart' of the zoned buddy allocator.
    4970             :  */
    4971             : struct page *
    4972      192957 : __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
    4973             :                                                         nodemask_t *nodemask)
    4974             : {
    4975      192957 :         struct page *page;
    4976      192957 :         unsigned int alloc_flags = ALLOC_WMARK_LOW;
    4977      192957 :         gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
    4978      192957 :         struct alloc_context ac = { };
    4979             : 
    4980             :         /*
    4981             :          * There are several places where we assume that the order value is sane
    4982             :          * so bail out early if the request is out of bound.
    4983             :          * so bail out early if the request is out of bounds.
    4984      192957 :         if (unlikely(order >= MAX_ORDER)) {
    4985           0 :                 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
    4986             :                 return NULL;
    4987             :         }
    4988             : 
    4989      192957 :         gfp_mask &= gfp_allowed_mask;
    4990      192957 :         alloc_mask = gfp_mask;
    4991      192957 :         if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
    4992             :                 return NULL;
    4993             : 
    4994             :         /*
    4995             :          * Forbid the first pass from falling back to types that fragment
    4996             :          * memory until all local zones are considered.
    4997             :          */
    4998      192958 :         alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
    4999             : 
    5000             :         /* First allocation attempt */
    5001      192958 :         page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
    5002      192956 :         if (likely(page))
    5003      192956 :                 goto out;
    5004             : 
    5005             :         /*
    5006             :          * Apply scoped allocation constraints. This is mainly about GFP_NOFS
    5007             :          * and GFP_NOIO, which have to be inherited by all allocation requests
    5008             :          * from a particular context which has been marked by
    5009             :          * memalloc_no{fs,io}_{save,restore}.
    5010             :          */
    5011           0 :         alloc_mask = current_gfp_context(gfp_mask);
    5012           0 :         ac.spread_dirty_pages = false;
    5013             : 
    5014             :         /*
    5015             :          * Restore the original nodemask if it was potentially replaced with
    5016             :          * &cpuset_current_mems_allowed to optimize the fast-path attempt.
    5017             :          */
    5018           0 :         ac.nodemask = nodemask;
    5019             : 
    5020           0 :         page = __alloc_pages_slowpath(alloc_mask, order, &ac);
    5021             : 
    5022      192956 : out:
    5023      192956 :         if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
    5024             :             unlikely(__memcg_kmem_charge_page(page, gfp_mask, order) != 0)) {
    5025             :                 __free_pages(page, order);
    5026             :                 page = NULL;
    5027             :         }
    5028             : 
    5029      192956 :         trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
    5030             : 
    5031      192956 :         return page;
    5032             : }
    5033             : EXPORT_SYMBOL(__alloc_pages_nodemask);
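As a usage note (a minimal sketch, not part of this file; the helper name grab_node_local_buffer is made up): the common wrappers all funnel into __alloc_pages_nodemask(). alloc_pages_node() supplies the preferred node id and a NULL nodemask, so the fast path above tries get_page_from_freelist() against the low watermark and only drops into __alloc_pages_slowpath() if that fails.

#include <linux/gfp.h>
#include <linux/mm.h>

/* Illustrative helper; assumes @nid is a valid online node id. */
static void *grab_node_local_buffer(int nid)
{
        /* Order-1 request (two contiguous pages), preferring node @nid. */
        struct page *page = alloc_pages_node(nid, GFP_KERNEL, 1);

        return page ? page_address(page) : NULL;
}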
    5034             : 
    5035             : /*
    5036             :  * Common helper functions. Never use with __GFP_HIGHMEM because the returned
    5037             :  * address cannot represent highmem pages. Use alloc_pages and then kmap if
    5038             :  * you need to access high mem.
    5039             :  */
    5040       23127 : unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
    5041             : {
    5042       23127 :         struct page *page;
    5043             : 
    5044       23127 :         page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order);
    5045       23126 :         if (!page)
    5046             :                 return 0;
    5047       23126 :         return (unsigned long) page_address(page);
    5048             : }
    5049             : EXPORT_SYMBOL(__get_free_pages);
    5050             : 
    5051        7356 : unsigned long get_zeroed_page(gfp_t gfp_mask)
    5052             : {
    5053        7356 :         return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
    5054             : }
    5055             : EXPORT_SYMBOL(get_zeroed_page);
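A short illustrative pairing for the helpers above (sketch only, not part of page_alloc.c): __get_free_pages() and get_zeroed_page() return a kernel virtual address rather than a struct page, so they are released with free_pages() or free_page() on that address.

#include <linux/errno.h>
#include <linux/gfp.h>

static int scratch_page_demo(void)
{
        unsigned long addr = get_zeroed_page(GFP_KERNEL);

        if (!addr)
                return -ENOMEM;

        /* ... use the zeroed PAGE_SIZE buffer at (void *)addr ... */

        free_page(addr);
        return 0;
}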
    5056             : 
    5057       46009 : static inline void free_the_page(struct page *page, unsigned int order)
    5058             : {
    5059       46009 :         if (order == 0)         /* Via pcp? */
    5060       31346 :                 free_unref_page(page);
    5061             :         else
    5062       14663 :                 __free_pages_ok(page, order, FPI_NONE);
    5063       46009 : }
    5064             : 
    5065             : /**
    5066             :  * __free_pages - Free pages allocated with alloc_pages().
    5067             :  * @page: The page pointer returned from alloc_pages().
    5068             :  * @order: The order of the allocation.
    5069             :  *
    5070             :  * This function can free multi-page allocations that are not compound
    5071             :  * pages.  It does not check that the @order passed in matches that of
    5072             :  * the allocation, so it is easy to leak memory.  Freeing more memory
    5073             :  * than was allocated will probably emit a warning.
    5074             :  *
    5075             :  * If the last reference to this page is speculative, it will be released
    5076             :  * by put_page() which only frees the first page of a non-compound
    5077             :  * allocation.  To prevent the remaining pages from being leaked, we free
    5078             :  * the subsequent pages here.  If you want to use the page's reference
    5079             :  * count to decide when to free the allocation, you should allocate a
    5080             :  * compound page, and use put_page() instead of __free_pages().
    5081             :  *
    5082             :  * Context: May be called in interrupt context or while holding a normal
    5083             :  * spinlock, but not in NMI context or while holding a raw spinlock.
    5084             :  */
    5085       46009 : void __free_pages(struct page *page, unsigned int order)
    5086             : {
    5087       46009 :         if (put_page_testzero(page))
    5088       46009 :                 free_the_page(page, order);
    5089           0 :         else if (!PageHead(page))
    5090           0 :                 while (order-- > 0)
    5091           0 :                         free_the_page(page + (1 << order), order);
    5092       46009 : }
    5093             : EXPORT_SYMBOL(__free_pages);
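To illustrate the kerneldoc above (a sketch, not part of this file): when the lifetime of the memory is tied to the page reference count, allocate a compound page and drop it with put_page(); for a plain multi-page allocation, free it explicitly with __free_pages() using the same order it was allocated with.

#include <linux/gfp.h>
#include <linux/mm.h>

static void refcounted_vs_explicit_free(void)
{
        struct page *compound, *plain;

        compound = alloc_pages(GFP_KERNEL | __GFP_COMP, 2);
        if (compound)
                put_page(compound);     /* drops the last ref, frees all 4 pages */

        plain = alloc_pages(GFP_KERNEL, 2);
        if (plain)
                __free_pages(plain, 2); /* order must match the allocation */
}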
    5094             : 
    5095       13040 : void free_pages(unsigned long addr, unsigned int order)
    5096             : {
    5097       13040 :         if (addr != 0) {
    5098       13040 :                 VM_BUG_ON(!virt_addr_valid((void *)addr));
    5099       13040 :                 __free_pages(virt_to_page((void *)addr), order);
    5100             :         }
    5101       13040 : }
    5102             : 
    5103             : EXPORT_SYMBOL(free_pages);
    5104             : 
    5105             : /*
    5106             :  * Page Fragment:
    5107             :  *  An arbitrary-length arbitrary-offset area of memory which resides
    5108             :  *  within a 0 or higher order page.  Multiple fragments within that page
    5109             :  *  are individually refcounted in the page's reference counter.
    5110             :  *
    5111             :  * The page_frag functions below provide a simple allocation framework for
    5112             :  * page fragments.  This is used by the network stack and network device
    5113             :  * drivers to provide a backing region of memory for use as either an
    5114             :  * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
    5115             :  */
    5116           0 : static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
    5117             :                                              gfp_t gfp_mask)
    5118             : {
    5119           0 :         struct page *page = NULL;
    5120           0 :         gfp_t gfp = gfp_mask;
    5121             : 
    5122             : #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
    5123           0 :         gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
    5124             :                     __GFP_NOMEMALLOC;
    5125           0 :         page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
    5126           0 :                                 PAGE_FRAG_CACHE_MAX_ORDER);
    5127           0 :         nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
    5128             : #endif
    5129           0 :         if (unlikely(!page))
    5130           0 :                 page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
    5131             : 
    5132           0 :         nc->va = page ? page_address(page) : NULL;
    5133             : 
    5134           0 :         return page;
    5135             : }
    5136             : 
    5137           0 : void __page_frag_cache_drain(struct page *page, unsigned int count)
    5138             : {
    5139           0 :         VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
    5140             : 
    5141           0 :         if (page_ref_sub_and_test(page, count))
    5142           0 :                 free_the_page(page, compound_order(page));
    5143           0 : }
    5144             : EXPORT_SYMBOL(__page_frag_cache_drain);
    5145             : 
    5146           0 : void *page_frag_alloc_align(struct page_frag_cache *nc,
    5147             :                       unsigned int fragsz, gfp_t gfp_mask,
    5148             :                       unsigned int align_mask)
    5149             : {
    5150           0 :         unsigned int size = PAGE_SIZE;
    5151           0 :         struct page *page;
    5152           0 :         int offset;
    5153             : 
    5154           0 :         if (unlikely(!nc->va)) {
    5155           0 : refill:
    5156           0 :                 page = __page_frag_cache_refill(nc, gfp_mask);
    5157           0 :                 if (!page)
    5158             :                         return NULL;
    5159             : 
    5160             : #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
    5161             :                 /* if size can vary use size else just use PAGE_SIZE */
    5162           0 :                 size = nc->size;
    5163             : #endif
    5164             :                 /* Even if we own the page, we do not use atomic_set().
    5165             :                  * This would break get_page_unless_zero() users.
    5166             :                  */
    5167           0 :                 page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
    5168             : 
    5169             :                 /* reset page count bias and offset to start of new frag */
    5170           0 :                 nc->pfmemalloc = page_is_pfmemalloc(page);
    5171           0 :                 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
    5172           0 :                 nc->offset = size;
    5173             :         }
    5174             : 
    5175           0 :         offset = nc->offset - fragsz;
    5176           0 :         if (unlikely(offset < 0)) {
    5177           0 :                 page = virt_to_page(nc->va);
    5178             : 
    5179           0 :                 if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
    5180           0 :                         goto refill;
    5181             : 
    5182           0 :                 if (unlikely(nc->pfmemalloc)) {
    5183           0 :                         free_the_page(page, compound_order(page));
    5184           0 :                         goto refill;
    5185             :                 }
    5186             : 
    5187             : #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
    5188             :                 /* if size can vary use size else just use PAGE_SIZE */
    5189           0 :                 size = nc->size;
    5190             : #endif
    5191             :                 /* OK, page count is 0, we can safely set it */
    5192           0 :                 set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
    5193             : 
    5194             :                 /* reset page count bias and offset to start of new frag */
    5195           0 :                 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
    5196           0 :                 offset = size - fragsz;
    5197             :         }
    5198             : 
    5199           0 :         nc->pagecnt_bias--;
    5200           0 :         offset &= align_mask;
    5201           0 :         nc->offset = offset;
    5202             : 
    5203           0 :         return nc->va + offset;
    5204             : }
    5205             : EXPORT_SYMBOL(page_frag_alloc_align);
    5206             : 
    5207             : /*
    5208             :  * Frees a page fragment allocated out of either a compound or order 0 page.
    5209             :  */
    5210           0 : void page_frag_free(void *addr)
    5211             : {
    5212           0 :         struct page *page = virt_to_head_page(addr);
    5213             : 
    5214           0 :         if (unlikely(put_page_testzero(page)))
    5215           0 :                 free_the_page(page, compound_order(page));
    5216           0 : }
    5217             : EXPORT_SYMBOL(page_frag_free);
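A minimal consumer sketch for the page_frag API above (illustrative; the cache and helper names are made up): page_frag_alloc() hands out small refcounted slices of a cached high-order page and page_frag_free() drops one slice's reference. The cache itself is not locked, so real users such as the network stack keep one per CPU or provide their own serialization.

#include <linux/gfp.h>
#include <linux/mm_types.h>

static struct page_frag_cache demo_frag_cache;  /* must start zeroed */

static void *alloc_small_frag(unsigned int len)
{
        return page_frag_alloc(&demo_frag_cache, len, GFP_ATOMIC);
}

static void release_small_frag(void *buf)
{
        page_frag_free(buf);
}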
    5218             : 
    5219          12 : static void *make_alloc_exact(unsigned long addr, unsigned int order,
    5220             :                 size_t size)
    5221             : {
    5222          12 :         if (addr) {
    5223          12 :                 unsigned long alloc_end = addr + (PAGE_SIZE << order);
    5224          12 :                 unsigned long used = addr + PAGE_ALIGN(size);
    5225             : 
    5226          12 :                 split_page(virt_to_page((void *)addr), order);
    5227         154 :                 while (used < alloc_end) {
    5228         142 :                         free_page(used);
    5229         142 :                         used += PAGE_SIZE;
    5230             :                 }
    5231             :         }
    5232          12 :         return (void *)addr;
    5233             : }
    5234             : 
    5235             : /**
    5236             :  * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
    5237             :  * @size: the number of bytes to allocate
    5238             :  * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
    5239             :  *
    5240             :  * This function is similar to alloc_pages(), except that it allocates the
    5241             :  * minimum number of pages to satisfy the request.  alloc_pages() can only
    5242             :  * allocate memory in power-of-two pages.
    5243             :  *
    5244             :  * This function is also limited by MAX_ORDER.
    5245             :  *
    5246             :  * Memory allocated by this function must be released by free_pages_exact().
    5247             :  *
    5248             :  * Return: pointer to the allocated area or %NULL in case of error.
    5249             :  */
    5250          12 : void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
    5251             : {
    5252          12 :         unsigned int order = get_order(size);
    5253          12 :         unsigned long addr;
    5254             : 
    5255          12 :         if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
    5256           0 :                 gfp_mask &= ~__GFP_COMP;
    5257             : 
    5258          12 :         addr = __get_free_pages(gfp_mask, order);
    5259          12 :         return make_alloc_exact(addr, order, size);
    5260             : }
    5261             : EXPORT_SYMBOL(alloc_pages_exact);
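For example (sketch only, not part of this file): a request for three pages' worth of memory via alloc_pages() would round up to order 2 and pin four pages, whereas alloc_pages_exact() allocates order 2, splits it, and immediately returns the unused fourth page, so only 3 * PAGE_SIZE stays allocated. The buffer is released with free_pages_exact() using the same size.

#include <linux/gfp.h>
#include <linux/mm.h>

static void *alloc_three_pages(void)
{
        return alloc_pages_exact(3 * PAGE_SIZE, GFP_KERNEL | __GFP_ZERO);
}

static void free_three_pages(void *buf)
{
        free_pages_exact(buf, 3 * PAGE_SIZE);
}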
    5262             : 
    5263             : /**
    5264             :  * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
    5265             :  *                         pages on a node.
    5266             :  * @nid: the preferred node ID where memory should be allocated
    5267             :  * @size: the number of bytes to allocate
    5268             :  * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
    5269             :  *
    5270             :  * Like alloc_pages_exact(), but try to allocate on node nid first before falling
    5271             :  * back.
    5272             :  *
    5273             :  * Return: pointer to the allocated area or %NULL in case of error.
    5274             :  */
    5275           0 : void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
    5276             : {
    5277           0 :         unsigned int order = get_order(size);
    5278           0 :         struct page *p;
    5279             : 
    5280           0 :         if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
    5281           0 :                 gfp_mask &= ~__GFP_COMP;
    5282             : 
    5283           0 :         p = alloc_pages_node(nid, gfp_mask, order);
    5284           0 :         if (!p)
    5285             :                 return NULL;
    5286           0 :         return make_alloc_exact((unsigned long)page_address(p), order, size);
    5287             : }
    5288             : 
    5289             : /**
    5290             :  * free_pages_exact - release memory allocated via alloc_pages_exact()
    5291             :  * @virt: the value returned by alloc_pages_exact.
    5292             :  * @size: size of allocation, same value as passed to alloc_pages_exact().
    5293             :  *
    5294             :  * Release the memory allocated by a previous call to alloc_pages_exact.
    5295             :  */
    5296           0 : void free_pages_exact(void *virt, size_t size)
    5297             : {
    5298           0 :         unsigned long addr = (unsigned long)virt;
    5299           0 :         unsigned long end = addr + PAGE_ALIGN(size);
    5300             : 
    5301           0 :         while (addr < end) {
    5302           0 :                 free_page(addr);
    5303           0 :                 addr += PAGE_SIZE;
    5304             :         }
    5305           0 : }
    5306             : EXPORT_SYMBOL(free_pages_exact);
    5307             : 
    5308             : /**
    5309             :  * nr_free_zone_pages - count number of pages beyond high watermark
    5310             :  * @offset: The zone index of the highest zone
    5311             :  *
    5312             :  * nr_free_zone_pages() counts the number of pages which are beyond the
    5313             :  * high watermark within all zones at or below a given zone index.  For each
    5314             :  * zone, the number of pages is calculated as:
    5315             :  *
    5316             :  *     nr_free_zone_pages = managed_pages - high_pages
    5317             :  *
    5318             :  * Return: number of pages beyond high watermark.
    5319             :  */
    5320           7 : static unsigned long nr_free_zone_pages(int offset)
    5321             : {
    5322           7 :         struct zoneref *z;
    5323           7 :         struct zone *zone;
    5324             : 
    5325             :         /* Just pick one node, since fallback list is circular */
    5326           7 :         unsigned long sum = 0;
    5327             : 
    5328           7 :         struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
    5329             : 
    5330          14 :         for_each_zone_zonelist(zone, z, zonelist, offset) {
    5331           7 :                 unsigned long size = zone_managed_pages(zone);
    5332           7 :                 unsigned long high = high_wmark_pages(zone);
    5333           7 :                 if (size > high)
    5334           7 :                         sum += size - high;
    5335             :         }
    5336             : 
    5337           7 :         return sum;
    5338             : }
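As a worked example with made-up numbers: a single zone managing 1,000,000 pages with a high watermark of 12,000 pages contributes 988,000 pages to the sum, while a zone whose high watermark meets or exceeds its managed pages contributes nothing, since only the surplus above the high watermark is counted.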
    5339             : 
    5340             : /**
    5341             :  * nr_free_buffer_pages - count number of pages beyond high watermark
    5342             :  *
    5343             :  * nr_free_buffer_pages() counts the number of pages which are beyond the high
    5344             :  * watermark within ZONE_DMA and ZONE_NORMAL.
    5345             :  *
    5346             :  * Return: number of pages beyond high watermark within ZONE_DMA and
    5347             :  * ZONE_NORMAL.
    5348             :  */
    5349           6 : unsigned long nr_free_buffer_pages(void)
    5350             : {
    5351           5 :         return nr_free_zone_pages(gfp_zone(GFP_USER));
    5352             : }
    5353             : EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
    5354             : 
    5355           0 : static inline void show_node(struct zone *zone)
    5356             : {
    5357           0 :         if (IS_ENABLED(CONFIG_NUMA))
    5358           0 :                 printk("Node %d ", zone_to_nid(zone));
    5359           0 : }
    5360             : 
    5361           9 : long si_mem_available(void)
    5362             : {
    5363           9 :         long available;
    5364           9 :         unsigned long pagecache;
    5365           9 :         unsigned long wmark_low = 0;
    5366           9 :         unsigned long pages[NR_LRU_LISTS];
    5367           9 :         unsigned long reclaimable;
    5368           9 :         struct zone *zone;
    5369           9 :         int lru;
    5370             : 
    5371          54 :         for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
    5372          45 :                 pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
    5373             : 
    5374          36 :         for_each_zone(zone)
    5375          27 :                 wmark_low += low_wmark_pages(zone);
    5376             : 
    5377             :         /*
    5378             :          * Estimate the amount of memory available for userspace allocations,
    5379             :          * without causing swapping.
    5380             :          */
    5381           9 :         available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
    5382             : 
    5383             :         /*
    5384             :          * Not all the page cache can be freed, otherwise the system will
    5385             :          * start swapping. Assume at least half of the page cache, or the
    5386             :          * low watermark worth of cache, needs to stay.
    5387             :          */
    5388           9 :         pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
    5389           9 :         pagecache -= min(pagecache / 2, wmark_low);
    5390           9 :         available += pagecache;
    5391             : 
    5392             :         /*
    5393             :          * Part of the reclaimable slab and other kernel memory consists of
    5394             :          * items that are in use, and cannot be freed. Cap this estimate at the
    5395             :          * low watermark.
    5396             :          */
    5397           9 :         reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
    5398           9 :                 global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
    5399           9 :         available += reclaimable - min(reclaimable / 2, wmark_low);
    5400             : 
    5401           9 :         if (available < 0)
    5402             :                 available = 0;
    5403           9 :         return available;
    5404             : }
    5405             : EXPORT_SYMBOL_GPL(si_mem_available);
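si_mem_available() is the estimate exported as the MemAvailable field of /proc/meminfo, so it can be observed from userspace without any new interface. A small userspace sketch (assuming the usual procfs mount at /proc):

#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/proc/meminfo", "r");

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f)) {
                if (!strncmp(line, "MemAvailable:", 13)) {
                        fputs(line, stdout);    /* e.g. "MemAvailable:  123456 kB" */
                        break;
                }
        }
        fclose(f);
        return 0;
}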
    5406             : 
    5407          14 : void si_meminfo(struct sysinfo *val)
    5408             : {
    5409          14 :         val->totalram = totalram_pages();
    5410          14 :         val->sharedram = global_node_page_state(NR_SHMEM);
    5411          14 :         val->freeram = global_zone_page_state(NR_FREE_PAGES);
    5412          14 :         val->bufferram = nr_blockdev_pages();
    5413          14 :         val->totalhigh = totalhigh_pages();
    5414          14 :         val->freehigh = nr_free_highpages();
    5415          14 :         val->mem_unit = PAGE_SIZE;
    5416          14 : }
    5417             : 
    5418             : EXPORT_SYMBOL(si_meminfo);
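si_meminfo() fills the memory fields reported by the sysinfo(2) system call, so its numbers are also reachable from userspace. A minimal sketch (note that the reported values are expressed in units of mem_unit bytes):

#include <stdio.h>
#include <sys/sysinfo.h>

int main(void)
{
        struct sysinfo si;

        if (sysinfo(&si))
                return 1;
        printf("totalram %lu freeram %lu sharedram %lu (unit %u bytes)\n",
               si.totalram, si.freeram, si.sharedram, si.mem_unit);
        return 0;
}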
    5419             : 
    5420             : #ifdef CONFIG_NUMA
    5421           0 : void si_meminfo_node(struct sysinfo *val, int nid)
    5422             : {
    5423           0 :         int zone_type;          /* needs to be signed */
    5424           0 :         unsigned long managed_pages = 0;
    5425           0 :         unsigned long managed_highpages = 0;
    5426           0 :         unsigned long free_highpages = 0;
    5427           0 :         pg_data_t *pgdat = NODE_DATA(nid);
    5428             : 
    5429           0 :         for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
    5430           0 :                 managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
    5431           0 :         val->totalram = managed_pages;
    5432           0 :         val->sharedram = node_page_state(pgdat, NR_SHMEM);
    5433           0 :         val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
    5434             : #ifdef CONFIG_HIGHMEM
    5435             :         for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
    5436             :                 struct zone *zone = &pgdat->node_zones[zone_type];
    5437             : 
    5438             :                 if (is_highmem(zone)) {
    5439             :                         managed_highpages += zone_managed_pages(zone);
    5440             :                         free_highpages += zone_page_state(zone, NR_FREE_PAGES);
    5441             :                 }
    5442             :         }
    5443             :         val->totalhigh = managed_highpages;
    5444             :         val->freehigh = free_highpages;
    5445             : #else
    5446           0 :         val->totalhigh = managed_highpages;
    5447           0 :         val->freehigh = free_highpages;
    5448             : #endif
    5449           0 :         val->mem_unit = PAGE_SIZE;
    5450           0 : }
    5451             : #endif
    5452             : 
    5453             : /*
    5454             :  * Determine whether the node should be displayed or not, depending on whether
    5455             :  * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
    5456             :  */
    5457           0 : static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
    5458             : {
    5459           0 :         if (!(flags & SHOW_MEM_FILTER_NODES))
    5460             :                 return false;
    5461             : 
    5462             :         /*
    5463             :          * No node mask - i.e. an implicit memory NUMA policy. Do not bother
    5464             :          * with the synchronization (read_mems_allowed_begin) because we do
    5465             :          * not have to be precise here.
    5466             :          */
    5467           0 :         if (!nodemask)
    5468           0 :                 nodemask = &cpuset_current_mems_allowed;
    5469             : 
    5470           0 :         return !node_isset(nid, *nodemask);
    5471             : }
    5472             : 
    5473             : #define K(x) ((x) << (PAGE_SHIFT-10))
    5474             : 
    5475           0 : static void show_migration_types(unsigned char type)
    5476             : {
    5477           0 :         static const char types[MIGRATE_TYPES] = {
    5478             :                 [MIGRATE_UNMOVABLE]     = 'U',
    5479             :                 [MIGRATE_MOVABLE]       = 'M',
    5480             :                 [MIGRATE_RECLAIMABLE]   = 'E',
    5481             :                 [MIGRATE_HIGHATOMIC]    = 'H',
    5482             : #ifdef CONFIG_CMA
    5483             :                 [MIGRATE_CMA]           = 'C',
    5484             : #endif
    5485             : #ifdef CONFIG_MEMORY_ISOLATION
    5486             :                 [MIGRATE_ISOLATE]       = 'I',
    5487             : #endif
    5488             :         };
    5489           0 :         char tmp[MIGRATE_TYPES + 1];
    5490           0 :         char *p = tmp;
    5491           0 :         int i;
    5492             : 
    5493           0 :         for (i = 0; i < MIGRATE_TYPES; i++) {
    5494           0 :                 if (type & (1 << i))
    5495           0 :                         *p++ = types[i];
    5496             :         }
    5497             : 
    5498           0 :         *p = '\0';
    5499           0 :         printk(KERN_CONT "(%s) ", tmp);
    5500           0 : }
    5501             : 
    5502             : /*
    5503             :  * Show free area list (used inside shift_scroll-lock stuff)
    5504             :  * We also calculate the percentage fragmentation. We do this by counting the
    5505             :  * memory on each free list with the exception of the first item on the list.
    5506             :  *
    5507             :  * Bits in @filter:
    5508             :  * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
    5509             :  *   cpuset.
    5510             :  */
    5511           0 : void show_free_areas(unsigned int filter, nodemask_t *nodemask)
    5512             : {
    5513           0 :         unsigned long free_pcp = 0;
    5514           0 :         int cpu;
    5515           0 :         struct zone *zone;
    5516           0 :         pg_data_t *pgdat;
    5517             : 
    5518           0 :         for_each_populated_zone(zone) {
    5519           0 :                 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
    5520           0 :                         continue;
    5521             : 
    5522           0 :                 for_each_online_cpu(cpu)
    5523           0 :                         free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
    5524             :         }
    5525             : 
    5526           0 :         printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
    5527             :                 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
    5528             :                 " unevictable:%lu dirty:%lu writeback:%lu\n"
    5529             :                 " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
    5530             :                 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
    5531             :                 " free:%lu free_pcp:%lu free_cma:%lu\n",
    5532             :                 global_node_page_state(NR_ACTIVE_ANON),
    5533             :                 global_node_page_state(NR_INACTIVE_ANON),
    5534             :                 global_node_page_state(NR_ISOLATED_ANON),
    5535             :                 global_node_page_state(NR_ACTIVE_FILE),
    5536             :                 global_node_page_state(NR_INACTIVE_FILE),
    5537             :                 global_node_page_state(NR_ISOLATED_FILE),
    5538             :                 global_node_page_state(NR_UNEVICTABLE),
    5539             :                 global_node_page_state(NR_FILE_DIRTY),
    5540             :                 global_node_page_state(NR_WRITEBACK),
    5541             :                 global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
    5542             :                 global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
    5543             :                 global_node_page_state(NR_FILE_MAPPED),
    5544             :                 global_node_page_state(NR_SHMEM),
    5545             :                 global_node_page_state(NR_PAGETABLE),
    5546             :                 global_zone_page_state(NR_BOUNCE),
    5547             :                 global_zone_page_state(NR_FREE_PAGES),
    5548             :                 free_pcp,
    5549             :                 global_zone_page_state(NR_FREE_CMA_PAGES));
    5550             : 
    5551           0 :         for_each_online_pgdat(pgdat) {
    5552           0 :                 if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
    5553           0 :                         continue;
    5554             : 
    5555           0 :                 printk("Node %d"
    5556             :                         " active_anon:%lukB"
    5557             :                         " inactive_anon:%lukB"
    5558             :                         " active_file:%lukB"
    5559             :                         " inactive_file:%lukB"
    5560             :                         " unevictable:%lukB"
    5561             :                         " isolated(anon):%lukB"
    5562             :                         " isolated(file):%lukB"
    5563             :                         " mapped:%lukB"
    5564             :                         " dirty:%lukB"
    5565             :                         " writeback:%lukB"
    5566             :                         " shmem:%lukB"
    5567             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
    5568             :                         " shmem_thp: %lukB"
    5569             :                         " shmem_pmdmapped: %lukB"
    5570             :                         " anon_thp: %lukB"
    5571             : #endif
    5572             :                         " writeback_tmp:%lukB"
    5573             :                         " kernel_stack:%lukB"
    5574             : #ifdef CONFIG_SHADOW_CALL_STACK
    5575             :                         " shadow_call_stack:%lukB"
    5576             : #endif
    5577             :                         " pagetables:%lukB"
    5578             :                         " all_unreclaimable? %s"
    5579             :                         "\n",
    5580             :                         pgdat->node_id,
    5581           0 :                         K(node_page_state(pgdat, NR_ACTIVE_ANON)),
    5582           0 :                         K(node_page_state(pgdat, NR_INACTIVE_ANON)),
    5583           0 :                         K(node_page_state(pgdat, NR_ACTIVE_FILE)),
    5584           0 :                         K(node_page_state(pgdat, NR_INACTIVE_FILE)),
    5585           0 :                         K(node_page_state(pgdat, NR_UNEVICTABLE)),
    5586           0 :                         K(node_page_state(pgdat, NR_ISOLATED_ANON)),
    5587           0 :                         K(node_page_state(pgdat, NR_ISOLATED_FILE)),
    5588           0 :                         K(node_page_state(pgdat, NR_FILE_MAPPED)),
    5589           0 :                         K(node_page_state(pgdat, NR_FILE_DIRTY)),
    5590           0 :                         K(node_page_state(pgdat, NR_WRITEBACK)),
    5591           0 :                         K(node_page_state(pgdat, NR_SHMEM)),
    5592             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
    5593           0 :                         K(node_page_state(pgdat, NR_SHMEM_THPS)),
    5594           0 :                         K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
    5595           0 :                         K(node_page_state(pgdat, NR_ANON_THPS)),
    5596             : #endif
    5597           0 :                         K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
    5598             :                         node_page_state(pgdat, NR_KERNEL_STACK_KB),
    5599             : #ifdef CONFIG_SHADOW_CALL_STACK
    5600             :                         node_page_state(pgdat, NR_KERNEL_SCS_KB),
    5601             : #endif
    5602           0 :                         K(node_page_state(pgdat, NR_PAGETABLE)),
    5603           0 :                         pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
    5604             :                                 "yes" : "no");
    5605             :         }
    5606             : 
    5607           0 :         for_each_populated_zone(zone) {
    5608           0 :                 int i;
    5609             : 
    5610           0 :                 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
    5611           0 :                         continue;
    5612             : 
    5613             :                 free_pcp = 0;
    5614           0 :                 for_each_online_cpu(cpu)
    5615           0 :                         free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
    5616             : 
    5617           0 :                 show_node(zone);
    5618           0 :                 printk(KERN_CONT
    5619             :                         "%s"
    5620             :                         " free:%lukB"
    5621             :                         " min:%lukB"
    5622             :                         " low:%lukB"
    5623             :                         " high:%lukB"
    5624             :                         " reserved_highatomic:%luKB"
    5625             :                         " active_anon:%lukB"
    5626             :                         " inactive_anon:%lukB"
    5627             :                         " active_file:%lukB"
    5628             :                         " inactive_file:%lukB"
    5629             :                         " unevictable:%lukB"
    5630             :                         " writepending:%lukB"
    5631             :                         " present:%lukB"
    5632             :                         " managed:%lukB"
    5633             :                         " mlocked:%lukB"
    5634             :                         " bounce:%lukB"
    5635             :                         " free_pcp:%lukB"
    5636             :                         " local_pcp:%ukB"
    5637             :                         " free_cma:%lukB"
    5638             :                         "\n",
    5639             :                         zone->name,
    5640           0 :                         K(zone_page_state(zone, NR_FREE_PAGES)),
    5641           0 :                         K(min_wmark_pages(zone)),
    5642           0 :                         K(low_wmark_pages(zone)),
    5643           0 :                         K(high_wmark_pages(zone)),
    5644           0 :                         K(zone->nr_reserved_highatomic),
    5645           0 :                         K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
    5646           0 :                         K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
    5647           0 :                         K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
    5648           0 :                         K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
    5649           0 :                         K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
    5650           0 :                         K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
    5651           0 :                         K(zone->present_pages),
    5652           0 :                         K(zone_managed_pages(zone)),
    5653           0 :                         K(zone_page_state(zone, NR_MLOCK)),
    5654           0 :                         K(zone_page_state(zone, NR_BOUNCE)),
    5655             :                         K(free_pcp),
    5656           0 :                         K(this_cpu_read(zone->pageset->pcp.count)),
    5657           0 :                         K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
    5658           0 :                 printk("lowmem_reserve[]:");
    5659           0 :                 for (i = 0; i < MAX_NR_ZONES; i++)
    5660           0 :                         printk(KERN_CONT " %ld", zone->lowmem_reserve[i]);
    5661           0 :                 printk(KERN_CONT "\n");
    5662             :         }
    5663             : 
    5664           0 :         for_each_populated_zone(zone) {
    5665           0 :                 unsigned int order;
    5666           0 :                 unsigned long nr[MAX_ORDER], flags, total = 0;
    5667           0 :                 unsigned char types[MAX_ORDER];
    5668             : 
    5669           0 :                 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
    5670           0 :                         continue;
    5671           0 :                 show_node(zone);
    5672           0 :                 printk(KERN_CONT "%s: ", zone->name);
    5673             : 
    5674           0 :                 spin_lock_irqsave(&zone->lock, flags);
    5675           0 :                 for (order = 0; order < MAX_ORDER; order++) {
    5676           0 :                         struct free_area *area = &zone->free_area[order];
    5677           0 :                         int type;
    5678             : 
    5679           0 :                         nr[order] = area->nr_free;
    5680           0 :                         total += nr[order] << order;
    5681             : 
    5682           0 :                         types[order] = 0;
    5683           0 :                         for (type = 0; type < MIGRATE_TYPES; type++) {
    5684           0 :                                 if (!free_area_empty(area, type))
    5685           0 :                                         types[order] |= 1 << type;
    5686             :                         }
    5687             :                 }
    5688           0 :                 spin_unlock_irqrestore(&zone->lock, flags);
    5689           0 :                 for (order = 0; order < MAX_ORDER; order++) {
    5690           0 :                         printk(KERN_CONT "%lu*%lukB ",
    5691             :                                nr[order], K(1UL) << order);
    5692           0 :                         if (nr[order])
    5693           0 :                                 show_migration_types(types[order]);
    5694             :                 }
    5695           0 :                 printk(KERN_CONT "= %lukB\n", K(total));
    5696             :         }
    5697             : 
    5698           0 :         hugetlb_show_meminfo();
    5699             : 
    5700           0 :         printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
    5701             : 
    5702           0 :         show_swap_cache_info();
    5703           0 : }
    5704             : 
    5705           2 : static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
    5706             : {
    5707           2 :         zoneref->zone = zone;
    5708           2 :         zoneref->zone_idx = zone_idx(zone);
    5709             : }
    5710             : 
    5711             : /*
    5712             :  * Builds allocation fallback zone lists.
    5713             :  *
    5714             :  * Add all populated zones of a node to the zonelist.
    5715             :  */
    5716           2 : static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
    5717             : {
    5718           2 :         struct zone *zone;
    5719           2 :         enum zone_type zone_type = MAX_NR_ZONES;
    5720           2 :         int nr_zones = 0;
    5721             : 
    5722           6 :         do {
    5723           6 :                 zone_type--;
    5724           6 :                 zone = pgdat->node_zones + zone_type;
    5725           6 :                 if (managed_zone(zone)) {
    5726           2 :                         zoneref_set_zone(zone, &zonerefs[nr_zones++]);
    5727           2 :                         check_highest_zone(zone_type);
    5728             :                 }
    5729           6 :         } while (zone_type);
    5730             : 
    5731           2 :         return nr_zones;
    5732             : }
    5733             : 
    5734             : #ifdef CONFIG_NUMA
    5735             : 
    5736           0 : static int __parse_numa_zonelist_order(char *s)
    5737             : {
    5738             :         /*
    5739             :          * We used to support different zonelist modes but they turned
    5740             :          * out to be just not useful. Let's keep the warning in place
    5741             :          * if somebody still uses the cmd line parameter so that we do
    5742             :          * not fail silently.
    5743             :          */
    5744           0 :         if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
    5745           0 :                 pr_warn("Ignoring unsupported numa_zonelist_order value:  %s\n", s);
    5746           0 :                 return -EINVAL;
    5747             :         }
    5748             :         return 0;
    5749             : }
    5750             : 
    5751             : char numa_zonelist_order[] = "Node";
    5752             : 
    5753             : /*
    5754             :  * sysctl handler for numa_zonelist_order
    5755             :  */
    5756           0 : int numa_zonelist_order_handler(struct ctl_table *table, int write,
    5757             :                 void *buffer, size_t *length, loff_t *ppos)
    5758             : {
    5759           0 :         if (write)
    5760           0 :                 return __parse_numa_zonelist_order(buffer);
    5761           0 :         return proc_dostring(table, write, buffer, length, ppos);
    5762             : }
    5763             : 
    5764             : 
    5765             : #define MAX_NODE_LOAD (nr_online_nodes)
    5766             : static int node_load[MAX_NUMNODES];
    5767             : 
    5768             : /**
    5769             :  * find_next_best_node - find the next node that should appear in a given node's fallback list
    5770             :  * @node: node whose fallback list we're appending
    5771             :  * @used_node_mask: nodemask_t of already used nodes
    5772             :  *
    5773             :  * We use a number of factors to determine which is the next node that should
    5774             :  * appear on a given node's fallback list.  The node should not have appeared
    5775             :  * already in @node's fallback list, and it should be the next closest node
    5776             :  * according to the distance array (which contains arbitrary distance values
    5777             :  * from each node to each node in the system), and we should also prefer nodes
    5778             :  * with no CPUs, since presumably they'll have very little allocation pressure
    5779             :  * on them otherwise.
    5780             :  *
    5781             :  * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
    5782             :  */
    5783           2 : static int find_next_best_node(int node, nodemask_t *used_node_mask)
    5784             : {
    5785           2 :         int n, val;
    5786           2 :         int min_val = INT_MAX;
    5787           2 :         int best_node = NUMA_NO_NODE;
    5788             : 
    5789             :         /* Use the local node if we haven't already */
    5790           2 :         if (!node_isset(node, *used_node_mask)) {
    5791           1 :                 node_set(node, *used_node_mask);
    5792           1 :                 return node;
    5793             :         }
    5794             : 
    5795           2 :         for_each_node_state(n, N_MEMORY) {
    5796             : 
    5797             :                 /* Don't want a node to appear more than once */
    5798           1 :                 if (node_isset(n, *used_node_mask))
    5799           1 :                         continue;
    5800             : 
    5801             :                 /* Use the distance array to find the distance */
    5802           0 :                 val = node_distance(node, n);
    5803             : 
    5804             :                 /* Penalize nodes under us ("prefer the next node") */
    5805           0 :                 val += (n < node);
    5806             : 
    5807             :                 /* Give preference to headless and unused nodes */
    5808           0 :                 if (!cpumask_empty(cpumask_of_node(n)))
    5809           0 :                         val += PENALTY_FOR_NODE_WITH_CPUS;
    5810             : 
    5811             :                 /* Slight preference for less loaded node */
    5812           0 :                 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
    5813           0 :                 val += node_load[n];
    5814             : 
    5815           0 :                 if (val < min_val) {
    5816           0 :                         min_val = val;
    5817           0 :                         best_node = n;
    5818             :                 }
    5819             :         }
    5820             : 
    5821           1 :         if (best_node >= 0)
    5822           0 :                 node_set(best_node, *used_node_mask);
    5823             : 
    5824             :         return best_node;
    5825             : }
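For illustration, with made-up distances as seen from node 0 on a three-node system (node 1 at distance 16 with CPUs, node 2 at distance 16 but memory-only, and equal node_load everywhere): node 0 is picked first as the local node, node 2 is picked next because node 1's score is inflated by PENALTY_FOR_NODE_WITH_CPUS, and node 1 comes last.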
    5826             : 
    5827             : 
    5828             : /*
    5829             :  * Build zonelists ordered by node and zones within node.
    5830             :  * This results in maximum locality--normal zone overflows into local
    5831             :  * DMA zone, if any--but risks exhausting DMA zone.
    5832             :  */
    5833           1 : static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
    5834             :                 unsigned nr_nodes)
    5835             : {
    5836           1 :         struct zoneref *zonerefs;
    5837           1 :         int i;
    5838             : 
    5839           1 :         zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
    5840             : 
    5841           2 :         for (i = 0; i < nr_nodes; i++) {
    5842           1 :                 int nr_zones;
    5843             : 
    5844           1 :                 pg_data_t *node = NODE_DATA(node_order[i]);
    5845             : 
    5846           1 :                 nr_zones = build_zonerefs_node(node, zonerefs);
    5847           1 :                 zonerefs += nr_zones;
    5848             :         }
    5849           1 :         zonerefs->zone = NULL;
    5850           1 :         zonerefs->zone_idx = 0;
    5851           1 : }
    5852             : 
    5853             : /*
    5854             :  * Build gfp_thisnode zonelists
    5855             :  */
    5856           1 : static void build_thisnode_zonelists(pg_data_t *pgdat)
    5857             : {
    5858           1 :         struct zoneref *zonerefs;
    5859           1 :         int nr_zones;
    5860             : 
    5861           1 :         zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
    5862           1 :         nr_zones = build_zonerefs_node(pgdat, zonerefs);
    5863           1 :         zonerefs += nr_zones;
    5864           1 :         zonerefs->zone = NULL;
    5865           1 :         zonerefs->zone_idx = 0;
    5866           1 : }
    5867             : 
    5868             : /*
    5869             :  * Build zonelists ordered by zone and nodes within zones.
    5870             :  * This conserves the DMA zone[s] until all Normal memory is
    5871             :  * exhausted, but may overflow to a remote node while memory
    5872             :  * still exists in the local DMA zone.
    5873             :  */
    5874             : 
    5875           1 : static void build_zonelists(pg_data_t *pgdat)
    5876             : {
    5877           1 :         static int node_order[MAX_NUMNODES];
    5878           1 :         int node, load, nr_nodes = 0;
    5879           1 :         nodemask_t used_mask = NODE_MASK_NONE;
    5880           1 :         int local_node, prev_node;
    5881             : 
    5882             :         /* NUMA-aware ordering of nodes */
    5883           1 :         local_node = pgdat->node_id;
    5884           1 :         load = nr_online_nodes;
    5885           1 :         prev_node = local_node;
    5886             : 
    5887           1 :         memset(node_order, 0, sizeof(node_order));
    5888           2 :         while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
    5889             :                 /*
    5890             :                  * We don't want to pressure a particular node.
    5891             :                  * So we add a penalty to the first node in the same
    5892             :                  * distance group to make the selection round-robin.
    5893             :                  */
    5894           2 :                 if (node_distance(local_node, node) !=
    5895           1 :                     node_distance(local_node, prev_node))
    5896           0 :                         node_load[node] = load;
    5897             : 
    5898           1 :                 node_order[nr_nodes++] = node;
    5899           1 :                 prev_node = node;
    5900           1 :                 load--;
    5901             :         }
    5902             : 
    5903           1 :         build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
    5904           1 :         build_thisnode_zonelists(pgdat);
    5905           1 : }
    5906             : 
    5907             : #ifdef CONFIG_HAVE_MEMORYLESS_NODES
    5908             : /*
    5909             :  * Return node id of node used for "local" allocations.
    5910             :  * I.e., first node id of first zone in arg node's generic zonelist.
    5911             :  * Used for initializing percpu 'numa_mem', which is used primarily
    5912             :  * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
    5913             :  */
    5914             : int local_memory_node(int node)
    5915             : {
    5916             :         struct zoneref *z;
    5917             : 
    5918             :         z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
    5919             :                                    gfp_zone(GFP_KERNEL),
    5920             :                                    NULL);
    5921             :         return zone_to_nid(z->zone);
    5922             : }
    5923             : #endif
    5924             : 
    5925             : static void setup_min_unmapped_ratio(void);
    5926             : static void setup_min_slab_ratio(void);
    5927             : #else   /* CONFIG_NUMA */
    5928             : 
    5929             : static void build_zonelists(pg_data_t *pgdat)
    5930             : {
    5931             :         int node, local_node;
    5932             :         struct zoneref *zonerefs;
    5933             :         int nr_zones;
    5934             : 
    5935             :         local_node = pgdat->node_id;
    5936             : 
    5937             :         zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
    5938             :         nr_zones = build_zonerefs_node(pgdat, zonerefs);
    5939             :         zonerefs += nr_zones;
    5940             : 
    5941             :         /*
    5942             :          * Now we build the zonelist so that it contains the zones
    5943             :          * of all the other nodes.
    5944             :          * We don't want to pressure a particular node, so when
    5945             :          * building the zones for node N, we make sure that the
    5946             :          * zones coming right after the local ones are those from
    5947             :          * node N+1 (modulo N)
    5948             :          */
    5949             :         for (node = local_node + 1; node < MAX_NUMNODES; node++) {
    5950             :                 if (!node_online(node))
    5951             :                         continue;
    5952             :                 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
    5953             :                 zonerefs += nr_zones;
    5954             :         }
    5955             :         for (node = 0; node < local_node; node++) {
    5956             :                 if (!node_online(node))
    5957             :                         continue;
    5958             :                 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
    5959             :                 zonerefs += nr_zones;
    5960             :         }
    5961             : 
    5962             :         zonerefs->zone = NULL;
    5963             :         zonerefs->zone_idx = 0;
    5964             : }
    5965             : 
    5966             : #endif  /* CONFIG_NUMA */
    5967             : 
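The fallback ordering produced by the !CONFIG_NUMA build_zonelists() above can be reproduced with a small stand-alone sketch (node count and online map are made up; zones are collapsed into whole nodes):

#include <stdio.h>

#define MAX_NUMNODES 4

static const int node_online[MAX_NUMNODES] = { 1, 1, 0, 1 };

/* Print the fallback order used for @local_node: the local node first, then
 * local_node + 1, local_node + 2, ... wrapping around past the last node. */
static void print_fallback_order(int local_node)
{
        printf("node %d:", local_node);
        printf(" %d", local_node);
        for (int node = local_node + 1; node < MAX_NUMNODES; node++)
                if (node_online[node])
                        printf(" %d", node);
        for (int node = 0; node < local_node; node++)
                if (node_online[node])
                        printf(" %d", node);
        printf("\n");
}

int main(void)
{
        for (int n = 0; n < MAX_NUMNODES; n++)
                if (node_online[n])
                        print_fallback_order(n);
        return 0;
}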
    5968             : /*
    5969             :  * Boot pageset table. One per cpu which is going to be used for all
    5970             :  * zones and all nodes. The parameters will be set in such a way
    5971             :  * that an item put on a list will immediately be handed over to
    5972             :  * the buddy list. This is safe since pageset manipulation is done
    5973             :  * with interrupts disabled.
    5974             :  *
    5975             :  * The boot_pagesets must be kept even after bootup is complete for
    5976             :  * unused processors and/or zones. They do play a role for bootstrapping
    5977             :  * hotplugged processors.
    5978             :  *
    5979             :  * zoneinfo_show() and maybe other functions do
    5980             :  * not check if the processor is online before following the pageset pointer.
    5981             :  * Other parts of the kernel may not check if the zone is available.
    5982             :  */
    5983             : static void pageset_init(struct per_cpu_pageset *p);
    5984             : /* These effectively disable the pcplists in the boot pageset completely */
    5985             : #define BOOT_PAGESET_HIGH       0
    5986             : #define BOOT_PAGESET_BATCH      1
    5987             : static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
    5988             : static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
    5989             : 
    5990           1 : static void __build_all_zonelists(void *data)
    5991             : {
    5992           1 :         int nid;
    5993           1 :         int __maybe_unused cpu;
    5994           1 :         pg_data_t *self = data;
    5995           1 :         static DEFINE_SPINLOCK(lock);
    5996             : 
    5997           1 :         spin_lock(&lock);
    5998             : 
    5999             : #ifdef CONFIG_NUMA
    6000           1 :         memset(node_load, 0, sizeof(node_load));
    6001             : #endif
    6002             : 
    6003             :         /*
    6004             :          * This node was hot-added and has no memory present yet, so just
    6005             :          * building its zonelists is fine - no need to touch other nodes.
    6006             :          */
    6007           1 :         if (self && !node_online(self->node_id)) {
    6008           0 :                 build_zonelists(self);
    6009             :         } else {
    6010           2 :                 for_each_online_node(nid) {
    6011           1 :                         pg_data_t *pgdat = NODE_DATA(nid);
    6012             : 
    6013           1 :                         build_zonelists(pgdat);
    6014             :                 }
    6015             : 
    6016             : #ifdef CONFIG_HAVE_MEMORYLESS_NODES
    6017             :                 /*
    6018             :                  * We now know the "local memory node" for each node--
    6019             :                  * i.e., the node of the first zone in the generic zonelist.
    6020             :                  * Set up numa_mem percpu variable for on-line cpus.  During
    6021             :                  * boot, only the boot cpu should be on-line;  we'll init the
    6022             :                  * secondary cpus' numa_mem as they come on-line.  During
    6023             :                  * node/memory hotplug, we'll fixup all on-line cpus.
    6024             :                  */
    6025             :                 for_each_online_cpu(cpu)
    6026             :                         set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
    6027             : #endif
    6028             :         }
    6029             : 
    6030           1 :         spin_unlock(&lock);
    6031           1 : }
    6032             : 
    6033             : static noinline void __init
    6034           1 : build_all_zonelists_init(void)
    6035             : {
    6036           1 :         int cpu;
    6037             : 
    6038           1 :         __build_all_zonelists(NULL);
    6039             : 
    6040             :         /*
    6041             :          * Initialize the boot_pagesets that are going to be used
    6042             :          * for bootstrapping processors. The real pagesets for
    6043             :          * each zone will be allocated later when the per cpu
    6044             :          * allocator is available.
    6045             :          *
    6046             :          * The boot_pagesets are also used for bootstrapping offline
    6047             :          * cpus once the system is booted, because the pagesets
    6048             :          * are needed to initialize allocators on a specific cpu too.
    6049             :          * E.g. the percpu allocator needs the page allocator, which
    6050             :          * needs the percpu allocator in order to allocate its pagesets
    6051             :          * (a chicken-and-egg dilemma).
    6052             :          */
    6053           6 :         for_each_possible_cpu(cpu)
    6054           4 :                 pageset_init(&per_cpu(boot_pageset, cpu));
    6055             : 
    6056           1 :         mminit_verify_zonelist();
    6057           1 :         cpuset_init_current_mems_allowed();
    6058           1 : }
    6059             : 
    6060             : /*
    6061             :  * The non-__init path below is taken unless system_state == SYSTEM_BOOTING.
    6062             :  *
    6063             :  * __ref due to call of __init annotated helper build_all_zonelists_init
    6064             :  * [protected by SYSTEM_BOOTING].
    6065             :  */
    6066           1 : void __ref build_all_zonelists(pg_data_t *pgdat)
    6067             : {
    6068           1 :         unsigned long vm_total_pages;
    6069             : 
    6070           1 :         if (system_state == SYSTEM_BOOTING) {
    6071           1 :                 build_all_zonelists_init();
    6072             :         } else {
    6073           0 :                 __build_all_zonelists(pgdat);
    6074             :                 /* cpuset refresh routine should be here */
    6075             :         }
    6076             :         /* Get the number of free pages beyond high watermark in all zones. */
    6077           1 :         vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
    6078             :         /*
    6079             :          * Disable grouping by mobility if the number of pages in the
    6080             :          * system is too low to allow the mechanism to work. It would be
    6081             :          * more accurate, but also more expensive, to check per-zone. This check
    6082             :          * is made on memory hot-add so a system can start with mobility grouping
    6083             :          * disabled and enable it later.
    6084             :          */
    6085           1 :         if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
    6086           0 :                 page_group_by_mobility_disabled = 1;
    6087             :         else
    6088           1 :                 page_group_by_mobility_disabled = 0;
    6089             : 
    6090           2 :         pr_info("Built %u zonelists, mobility grouping %s.  Total pages: %ld\n",
    6091             :                 nr_online_nodes,
    6092             :                 page_group_by_mobility_disabled ? "off" : "on",
    6093             :                 vm_total_pages);
    6094             : #ifdef CONFIG_NUMA
    6095           1 :         pr_info("Policy zone: %s\n", zone_names[policy_zone]);
    6096             : #endif
    6097           1 : }
    6098             : 
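As a rough worked example of the threshold above: with 4 KiB pages, pageblock_nr_pages is typically 512 (one 2 MiB pageblock) and MIGRATE_TYPES is commonly 6, both of which depend on the kernel configuration, so mobility grouping would be disabled below roughly 3072 free pages (about 12 MiB):

#include <stdio.h>

int main(void)
{
        /* Illustrative, configuration-dependent values. */
        unsigned long page_size = 4096;
        unsigned long pageblock_nr_pages = 512;         /* 2 MiB pageblocks */
        unsigned long migrate_types = 6;
        unsigned long threshold = pageblock_nr_pages * migrate_types;

        printf("threshold: %lu pages (%lu MiB)\n",
               threshold, threshold * page_size / (1024 * 1024));
        return 0;
}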
    6099             : /* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
    6100             : static bool __meminit
    6101      262046 : overlap_memmap_init(unsigned long zone, unsigned long *pfn)
    6102             : {
    6103      262046 :         static struct memblock_region *r;
    6104             : 
    6105      262046 :         if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
    6106           0 :                 if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
    6107           0 :                         for_each_mem_region(r) {
    6108           0 :                                 if (*pfn < memblock_region_memory_end_pfn(r))
    6109             :                                         break;
    6110             :                         }
    6111             :                 }
    6112           0 :                 if (*pfn >= memblock_region_memory_base_pfn(r) &&
    6113           0 :                     memblock_is_mirror(r)) {
    6114           0 :                         *pfn = memblock_region_memory_end_pfn(r);
    6115           0 :                         return true;
    6116             :                 }
    6117             :         }
    6118             :         return false;
    6119             : }
    6120             : 
    6121             : /*
    6122             :  * Initially all pages are reserved - free ones are freed
    6123             :  * up by memblock_free_all() once the early boot process is
    6124             :  * done. Non-atomic initialization, single-pass.
    6125             :  *
    6126             :  * All aligned pageblocks are initialized to the specified migratetype
    6127             :  * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
    6128             :  * zone stats (e.g., nr_isolate_pageblock) are touched.
    6129             :  */
    6130           2 : void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,
    6131             :                 unsigned long start_pfn, unsigned long zone_end_pfn,
    6132             :                 enum meminit_context context,
    6133             :                 struct vmem_altmap *altmap, int migratetype)
    6134             : {
    6135           2 :         unsigned long pfn, end_pfn = start_pfn + size;
    6136           2 :         struct page *page;
    6137             : 
    6138           2 :         if (highest_memmap_pfn < end_pfn - 1)
    6139           2 :                 highest_memmap_pfn = end_pfn - 1;
    6140             : 
    6141             : #ifdef CONFIG_ZONE_DEVICE
    6142             :         /*
    6143             :          * Honor reservation requested by the driver for this ZONE_DEVICE
    6144             :          * memory. We limit the total number of pages to initialize to just
    6145             :          * those that might contain the memory mapping. We will defer the
    6146             :          * ZONE_DEVICE page initialization until after we have released
    6147             :          * the hotplug lock.
    6148             :          */
    6149             :         if (zone == ZONE_DEVICE) {
    6150             :                 if (!altmap)
    6151             :                         return;
    6152             : 
    6153             :                 if (start_pfn == altmap->base_pfn)
    6154             :                         start_pfn += altmap->reserve;
    6155             :                 end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
    6156             :         }
    6157             : #endif
    6158             : 
    6159      262048 :         for (pfn = start_pfn; pfn < end_pfn; ) {
    6160             :                 /*
    6161             :                  * There can be holes in boot-time mem_map[]s handed to this
    6162             :                  * function.  They do not exist on hotplugged memory.
    6163             :                  */
    6164      262046 :                 if (context == MEMINIT_EARLY) {
    6165      262046 :                         if (overlap_memmap_init(zone, &pfn))
    6166           0 :                                 continue;
    6167      262046 :                         if (defer_init(nid, pfn, zone_end_pfn))
    6168             :                                 break;
    6169             :                 }
    6170             : 
    6171      262046 :                 page = pfn_to_page(pfn);
    6172      262046 :                 __init_single_page(page, pfn, zone, nid);
    6173      262046 :                 if (context == MEMINIT_HOTPLUG)
    6174           0 :                         __SetPageReserved(page);
    6175             : 
    6176             :                 /*
    6177             :                  * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
    6178             :                  * such that unmovable allocations won't be scattered all
    6179             :                  * over the place during system boot.
    6180             :                  */
    6181      262046 :                 if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
    6182         255 :                         set_pageblock_migratetype(page, migratetype);
    6183         255 :                         cond_resched();
    6184             :                 }
    6185      262046 :                 pfn++;
    6186             :         }
    6187           2 : }
    6188             : 
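A toy version of the pageblock-boundary test in the loop above; pageblock_nr_pages is shrunk to 8 here purely to keep the output short:

#include <stdio.h>

#define IS_ALIGNED(x, a)        (((x) & ((a) - 1)) == 0)

int main(void)
{
        unsigned long pageblock_nr_pages = 8;   /* tiny, illustration only */
        unsigned long start_pfn = 5, end_pfn = 30;

        /* Only the pfn that starts a pageblock has its migratetype set. */
        for (unsigned long pfn = start_pfn; pfn < end_pfn; pfn++)
                if (IS_ALIGNED(pfn, pageblock_nr_pages))
                        printf("pfn %lu starts a new pageblock\n", pfn);
        return 0;
}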
    6189             : #ifdef CONFIG_ZONE_DEVICE
    6190             : void __ref memmap_init_zone_device(struct zone *zone,
    6191             :                                    unsigned long start_pfn,
    6192             :                                    unsigned long nr_pages,
    6193             :                                    struct dev_pagemap *pgmap)
    6194             : {
    6195             :         unsigned long pfn, end_pfn = start_pfn + nr_pages;
    6196             :         struct pglist_data *pgdat = zone->zone_pgdat;
    6197             :         struct vmem_altmap *altmap = pgmap_altmap(pgmap);
    6198             :         unsigned long zone_idx = zone_idx(zone);
    6199             :         unsigned long start = jiffies;
    6200             :         int nid = pgdat->node_id;
    6201             : 
    6202             :         if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE))
    6203             :                 return;
    6204             : 
    6205             :         /*
    6206             :          * The call to memmap_init_zone should have already taken care
    6207             :          * of the pages reserved for the memmap, so we can just jump to
    6208             :          * the end of that region and start processing the device pages.
    6209             :          */
    6210             :         if (altmap) {
    6211             :                 start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
    6212             :                 nr_pages = end_pfn - start_pfn;
    6213             :         }
    6214             : 
    6215             :         for (pfn = start_pfn; pfn < end_pfn; pfn++) {
    6216             :                 struct page *page = pfn_to_page(pfn);
    6217             : 
    6218             :                 __init_single_page(page, pfn, zone_idx, nid);
    6219             : 
    6220             :                 /*
    6221             :                  * Mark the page reserved, as it needs to wait for the onlining
    6222             :                  * phase before it is fully associated with a zone.
    6223             :                  *
    6224             :                  * We can use the non-atomic __set_bit operation for setting
    6225             :                  * the flag as we are still initializing the pages.
    6226             :                  */
    6227             :                 __SetPageReserved(page);
    6228             : 
    6229             :                 /*
    6230             :                  * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
    6231             :                  * and zone_device_data.  It is a bug if a ZONE_DEVICE page is
    6232             :                  * ever freed or placed on a driver-private list.
    6233             :                  */
    6234             :                 page->pgmap = pgmap;
    6235             :                 page->zone_device_data = NULL;
    6236             : 
    6237             :                 /*
    6238             :                  * Mark the block movable so that blocks are reserved for
    6239             :                  * movable at startup. This will force kernel allocations
    6240             :                  * to reserve their blocks rather than leaking throughout
    6241             :                  * the address space during boot when many long-lived
    6242             :                  * kernel allocations are made.
    6243             :                  *
    6244             :                  * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
    6245             :                  * because this is done early in section_activate()
    6246             :                  */
    6247             :                 if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
    6248             :                         set_pageblock_migratetype(page, MIGRATE_MOVABLE);
    6249             :                         cond_resched();
    6250             :                 }
    6251             :         }
    6252             : 
    6253             :         pr_info("%s initialised %lu pages in %ums\n", __func__,
    6254             :                 nr_pages, jiffies_to_msecs(jiffies - start));
    6255             : }
    6256             : 
    6257             : #endif
    6258           1 : static void __meminit zone_init_free_lists(struct zone *zone)
    6259             : {
    6260           1 :         unsigned int order, t;
    6261          56 :         for_each_migratetype_order(order, t) {
    6262          44 :                 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
    6263          44 :                 zone->free_area[order].nr_free = 0;
    6264             :         }
    6265           1 : }
    6266             : 
    6267             : #if !defined(CONFIG_FLAT_NODE_MEM_MAP)
    6268             : /*
    6269             :  * Only struct pages that correspond to ranges defined by memblock.memory
    6270             :  * are zeroed and initialized by going through __init_single_page() during
    6271             :  * memmap_init_zone().
    6272             :  *
    6273             :  * But, there could be struct pages that correspond to holes in
    6274             :  * memblock.memory. This can happen because of the following reasons:
    6275             :  * - physical memory bank size is not necessarily the exact multiple of the
    6276             :  *   arbitrary section size
    6277             :  * - early reserved memory may not be listed in memblock.memory
    6278             :  * - memory layouts defined with memmap= kernel parameter may not align
    6279             :  *   nicely with memmap sections
    6280             :  *
    6281             :  * Explicitly initialize those struct pages so that:
    6282             :  * - PG_reserved is set
    6283             :  * - zone and node links point to zone and node that span the page if the
    6284             :  *   hole is in the middle of a zone
    6285             :  * - zone and node links point to adjacent zone/node if the hole falls on
    6286             :  *   the zone boundary; the pages in such holes will be prepended to the
    6287             :  *   zone/node above the hole except for the trailing pages in the last
    6288             :  *   section that will be appended to the zone/node below.
    6289             :  */
    6290           2 : static u64 __meminit init_unavailable_range(unsigned long spfn,
    6291             :                                             unsigned long epfn,
    6292             :                                             int zone, int node)
    6293             : {
    6294           2 :         unsigned long pfn;
    6295           2 :         u64 pgcnt = 0;
    6296             : 
    6297         100 :         for (pfn = spfn; pfn < epfn; pfn++) {
    6298          98 :                 if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
    6299           0 :                         pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
    6300             :                                 + pageblock_nr_pages - 1;
    6301           0 :                         continue;
    6302             :                 }
    6303          98 :                 __init_single_page(pfn_to_page(pfn), pfn, zone, node);
    6304          98 :                 __SetPageReserved(pfn_to_page(pfn));
    6305          98 :                 pgcnt++;
    6306             :         }
    6307             : 
    6308           2 :         return pgcnt;
    6309             : }
    6310             : #else
    6311             : static inline u64 init_unavailable_range(unsigned long spfn, unsigned long epfn,
    6312             :                                          int zone, int node)
    6313             : {
    6314             :         return 0;
    6315             : }
    6316             : #endif
    6317             : 
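The skip in the loop above jumps over whole pageblocks whose memmap does not exist; a toy sketch of that arithmetic, with the pageblock size shrunk to 8 and pfn_valid() mocked by a per-block flag array:

#include <stdio.h>

#define PAGEBLOCK       8UL
#define ALIGN_DOWN(x, a)        ((x) & ~((a) - 1))

/* One flag per pageblock: does its memmap exist at all? */
static const int block_valid[] = { 1, 0, 0, 1 };

int main(void)
{
        unsigned long spfn = 3, epfn = 30;

        for (unsigned long pfn = spfn; pfn < epfn; pfn++) {
                if (!block_valid[ALIGN_DOWN(pfn, PAGEBLOCK) / PAGEBLOCK]) {
                        /* jump to the last pfn of this block; pfn++ moves on */
                        pfn = ALIGN_DOWN(pfn, PAGEBLOCK) + PAGEBLOCK - 1;
                        continue;
                }
                printf("init pfn %lu\n", pfn);
        }
        return 0;
}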
    6318           1 : void __meminit __weak memmap_init_zone(struct zone *zone)
    6319             : {
    6320           1 :         unsigned long zone_start_pfn = zone->zone_start_pfn;
    6321           1 :         unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
    6322           1 :         int i, nid = zone_to_nid(zone), zone_id = zone_idx(zone);
    6323           1 :         static unsigned long hole_pfn;
    6324           1 :         unsigned long start_pfn, end_pfn;
    6325           1 :         u64 pgcnt = 0;
    6326             : 
    6327           3 :         for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
    6328           2 :                 start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
    6329           2 :                 end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
    6330             : 
    6331           2 :                 if (end_pfn > start_pfn)
    6332           2 :                         memmap_init_range(end_pfn - start_pfn, nid,
    6333             :                                         zone_id, start_pfn, zone_end_pfn,
    6334             :                                         MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
    6335             : 
    6336           2 :                 if (hole_pfn < start_pfn)
    6337           2 :                         pgcnt += init_unavailable_range(hole_pfn, start_pfn,
    6338             :                                                         zone_id, nid);
    6339           2 :                 hole_pfn = end_pfn;
    6340             :         }
    6341             : 
    6342             : #ifdef CONFIG_SPARSEMEM
    6343             :         /*
    6344             :          * Initialize the hole in the range [zone_end_pfn, section_end].
    6345             :          * If zone boundary falls in the middle of a section, this hole
    6346             :          * will be re-initialized during the call to this function for the
    6347             :          * higher zone.
    6348             :          */
    6349           1 :         end_pfn = round_up(zone_end_pfn, PAGES_PER_SECTION);
    6350           1 :         if (hole_pfn < end_pfn)
    6351           0 :                 pgcnt += init_unavailable_range(hole_pfn, end_pfn,
    6352             :                                                 zone_id, nid);
    6353             : #endif
    6354             : 
    6355           1 :         if (pgcnt)
    6356           1 :                 pr_info("  %s zone: %llu pages in unavailable ranges\n",
    6357             :                         zone->name, pgcnt);
    6358           1 : }
    6359             : 
    6360           2 : static int zone_batchsize(struct zone *zone)
    6361             : {
    6362             : #ifdef CONFIG_MMU
    6363           2 :         int batch;
    6364             : 
    6365             :         /*
    6366             :          * The per-cpu-pages pools are set to around 1/1000th of the
    6367             :          * size of the zone.
    6368             :          */
    6369           2 :         batch = zone_managed_pages(zone) / 1024;
    6370             :         /* But no more than a meg. */
    6371           2 :         if (batch * PAGE_SIZE > 1024 * 1024)
    6372             :                 batch = (1024 * 1024) / PAGE_SIZE;
    6373           2 :         batch /= 4;             /* We effectively *= 4 below */
    6374           2 :         if (batch < 1)
    6375           0 :                 batch = 1;
    6376             : 
    6377             :         /*
    6378             :          * Clamp the batch to a 2^n - 1 value. Having a power
    6379             :          * of 2 value was found to be more likely to have
    6380             :          * suboptimal cache aliasing properties in some cases.
    6381             :          *
    6382             :          * For example if 2 tasks are alternately allocating
    6383             :          * batches of pages, one task can end up with a lot
    6384             :          * of pages of one half of the possible page colors
    6385             :          * and the other with pages of the other colors.
    6386             :          */
    6387           2 :         batch = rounddown_pow_of_two(batch + batch/2) - 1;
    6388             : 
    6389           2 :         return batch;
    6390             : 
    6391             : #else
    6392             :         /* The deferral and batching of frees should be suppressed under NOMMU
    6393             :          * conditions.
    6394             :          *
    6395             :          * The problem is that NOMMU needs to be able to allocate large chunks
    6396             :          * of contiguous memory as there's no hardware page translation to
    6397             :          * assemble apparent contiguous memory from discontiguous pages.
    6398             :          *
    6399             :          * Queueing large contiguous runs of pages for batching, however,
    6400             :          * causes the pages to actually be freed in smaller chunks.  As there
    6401             :          * can be a significant delay between the individual batches being
    6402             :          * recycled, this leads to the once large chunks of space being
    6403             :          * fragmented and becoming unavailable for high-order allocations.
    6404             :          */
    6405             :         return 0;
    6406             : #endif
    6407             : }
    6408             : 
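A user-space rendering of the CONFIG_MMU batch calculation above, evaluated for a few example zone sizes (4 KiB pages assumed; this mirrors the arithmetic, not the kernel helpers):

#include <stdio.h>

/* Round down to the nearest power of two (for positive n). */
static unsigned long rounddown_pow_of_two(unsigned long n)
{
        unsigned long p = 1;

        while (p * 2 <= n)
                p *= 2;
        return p;
}

static int zone_batchsize(unsigned long managed_pages, unsigned long page_size)
{
        int batch = managed_pages / 1024;       /* ~1/1000th of the zone */

        if ((unsigned long)batch * page_size > 1024 * 1024)
                batch = (1024 * 1024) / page_size;      /* but no more than a meg */
        batch /= 4;                             /* effectively *= 4 later via high */
        if (batch < 1)
                batch = 1;
        return rounddown_pow_of_two(batch + batch / 2) - 1;    /* clamp to 2^n - 1 */
}

int main(void)
{
        unsigned long sizes_mib[] = { 64, 512, 4096 };

        for (int i = 0; i < 3; i++) {
                unsigned long pages = sizes_mib[i] * 1024 / 4;  /* 4 KiB pages */

                printf("%5lu MiB zone -> batch %d\n",
                       sizes_mib[i], zone_batchsize(pages, 4096));
        }
        return 0;
}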
    6409             : /*
    6410             :  * pcp->high and pcp->batch values are related and generally batch is lower
    6411             :  * than high. They are also related to pcp->count such that count is lower
    6412             :  * than high, and as soon as it reaches high, the pcplist is flushed.
    6413             :  *
    6414             :  * However, guaranteeing these relations at all times would require e.g. write
    6415             :  * barriers here but also careful usage of read barriers at the read side, and
    6416             :  * thus be error prone and bad for performance. So the update only prevents
    6417             :  * store tearing. Any new users of pcp->batch and pcp->high should ensure they
    6418             :  * can cope with those fields changing asynchronously, and fully trust only the
    6419             :  * pcp->count field on the local CPU with interrupts disabled.
    6420             :  *
    6421             :  * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
    6422             :  * outside of boot time (or some other assurance that no concurrent updaters
    6423             :  * exist).
    6424             :  */
    6425           4 : static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
    6426             :                 unsigned long batch)
    6427             : {
    6428           4 :         WRITE_ONCE(pcp->batch, batch);
    6429           4 :         WRITE_ONCE(pcp->high, high);
    6430           4 : }
    6431             : 
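The WRITE_ONCE() pair above only prevents store tearing; a reader can still see a new ->batch with an old ->high or vice versa. In user space, the closest portable analogue is a relaxed C11 atomic store, sketched below (the struct and field names are made up, and this is not the kernel macro):

#include <stdatomic.h>
#include <stdio.h>

struct pcp_like {
        atomic_ulong batch;
        atomic_ulong high;
};

/* Update both fields with single, untorn stores; no ordering between them. */
static void pageset_update(struct pcp_like *pcp, unsigned long high,
                           unsigned long batch)
{
        atomic_store_explicit(&pcp->batch, batch, memory_order_relaxed);
        atomic_store_explicit(&pcp->high, high, memory_order_relaxed);
}

int main(void)
{
        struct pcp_like pcp = { 0, 0 };

        pageset_update(&pcp, 378, 63);
        printf("high=%lu batch=%lu\n",
               atomic_load_explicit(&pcp.high, memory_order_relaxed),
               atomic_load_explicit(&pcp.batch, memory_order_relaxed));
        return 0;
}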
    6432           8 : static void pageset_init(struct per_cpu_pageset *p)
    6433             : {
    6434           8 :         struct per_cpu_pages *pcp;
    6435           8 :         int migratetype;
    6436             : 
    6437           8 :         memset(p, 0, sizeof(*p));
    6438             : 
    6439           8 :         pcp = &p->pcp;
    6440          32 :         for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
    6441          24 :                 INIT_LIST_HEAD(&pcp->lists[migratetype]);
    6442             : 
    6443             :         /*
    6444             :          * Set batch and high values safe for a boot pageset. A true percpu
    6445             :          * pageset's initialization will update them subsequently. Here we don't
    6446             :          * need to be as careful as pageset_update() as nobody can access the
    6447             :          * pageset yet.
    6448             :          */
    6449           8 :         pcp->high = BOOT_PAGESET_HIGH;
    6450           8 :         pcp->batch = BOOT_PAGESET_BATCH;
    6451           8 : }
    6452             : 
    6453           1 : static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high,
    6454             :                 unsigned long batch)
    6455             : {
    6456           1 :         struct per_cpu_pageset *p;
    6457           1 :         int cpu;
    6458             : 
    6459           6 :         for_each_possible_cpu(cpu) {
    6460           4 :                 p = per_cpu_ptr(zone->pageset, cpu);
    6461           5 :                 pageset_update(&p->pcp, high, batch);
    6462             :         }
    6463           1 : }
    6464             : 
    6465             : /*
    6466             :  * Calculate and set new high and batch values for all per-cpu pagesets of a
    6467             :  * zone, based on the zone's size and the percpu_pagelist_fraction sysctl.
    6468             :  */
    6469           1 : static void zone_set_pageset_high_and_batch(struct zone *zone)
    6470             : {
    6471           1 :         unsigned long new_high, new_batch;
    6472             : 
    6473           1 :         if (percpu_pagelist_fraction) {
    6474           0 :                 new_high = zone_managed_pages(zone) / percpu_pagelist_fraction;
    6475           0 :                 new_batch = max(1UL, new_high / 4);
    6476           0 :                 if ((new_high / 4) > (PAGE_SHIFT * 8))
    6477           0 :                         new_batch = PAGE_SHIFT * 8;
    6478             :         } else {
    6479           1 :                 new_batch = zone_batchsize(zone);
    6480           1 :                 new_high = 6 * new_batch;
    6481           1 :                 new_batch = max(1UL, 1 * new_batch);
    6482             :         }
    6483             : 
    6484           1 :         if (zone->pageset_high == new_high &&
    6485           0 :             zone->pageset_batch == new_batch)
    6486             :                 return;
    6487             : 
    6488           1 :         zone->pageset_high = new_high;
    6489           1 :         zone->pageset_batch = new_batch;
    6490             : 
    6491           1 :         __zone_set_pageset_high_and_batch(zone, new_high, new_batch);
    6492             : }
    6493             : 
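A sketch of the two sizing policies above, with zone_batchsize() reduced to a pre-computed argument; PAGE_SHIFT and the sample numbers are assumptions, not taken from a real zone:

#include <stdio.h>

#define PAGE_SHIFT 12   /* assumed: 4 KiB pages */

static void pick_high_and_batch(unsigned long managed_pages,
                                unsigned long percpu_pagelist_fraction,
                                unsigned long zone_batch,
                                unsigned long *high, unsigned long *batch)
{
        if (percpu_pagelist_fraction) {
                /* sysctl override: high is a fraction of the zone ... */
                *high = managed_pages / percpu_pagelist_fraction;
                *batch = *high / 4 > 1 ? *high / 4 : 1;
                if ((*high / 4) > (PAGE_SHIFT * 8))
                        *batch = PAGE_SHIFT * 8;        /* ... with batch capped */
        } else {
                /* default: high is six batches' worth of pages */
                *batch = zone_batch;
                *high = 6 * *batch;
        }
}

int main(void)
{
        unsigned long high, batch;

        pick_high_and_batch(1UL << 20, 0, 63, &high, &batch);
        printf("default:  high=%lu batch=%lu\n", high, batch);

        pick_high_and_batch(1UL << 20, 8, 63, &high, &batch);
        printf("fraction: high=%lu batch=%lu\n", high, batch);
        return 0;
}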
    6494           1 : void __meminit setup_zone_pageset(struct zone *zone)
    6495             : {
    6496           1 :         struct per_cpu_pageset *p;
    6497           1 :         int cpu;
    6498             : 
    6499           1 :         zone->pageset = alloc_percpu(struct per_cpu_pageset);
    6500           5 :         for_each_possible_cpu(cpu) {
    6501           4 :                 p = per_cpu_ptr(zone->pageset, cpu);
    6502           4 :                 pageset_init(p);
    6503             :         }
    6504             : 
    6505           1 :         zone_set_pageset_high_and_batch(zone);
    6506           1 : }
    6507             : 
    6508             : /*
    6509             :  * Allocate per cpu pagesets and initialize them.
    6510             :  * Before this call only boot pagesets were available.
    6511             :  */
    6512           1 : void __init setup_per_cpu_pageset(void)
    6513             : {
    6514           1 :         struct pglist_data *pgdat;
    6515           1 :         struct zone *zone;
    6516           1 :         int __maybe_unused cpu;
    6517             : 
    6518           4 :         for_each_populated_zone(zone)
    6519           1 :                 setup_zone_pageset(zone);
    6520             : 
    6521             : #ifdef CONFIG_NUMA
    6522             :         /*
    6523             :          * Unpopulated zones continue using the boot pagesets.
    6524             :          * The numa stats for these pagesets need to be reset.
    6525             :          * Otherwise, they will end up skewing the stats of
    6526             :          * the nodes these zones are associated with.
    6527             :          */
    6528           5 :         for_each_possible_cpu(cpu) {
    6529           4 :                 struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu);
    6530           4 :                 memset(pcp->vm_numa_stat_diff, 0,
    6531             :                        sizeof(pcp->vm_numa_stat_diff));
    6532             :         }
    6533             : #endif
    6534             : 
    6535           2 :         for_each_online_pgdat(pgdat)
    6536           1 :                 pgdat->per_cpu_nodestats =
    6537           1 :                         alloc_percpu(struct per_cpu_nodestat);
    6538           1 : }
    6539             : 
    6540           3 : static __meminit void zone_pcp_init(struct zone *zone)
    6541             : {
    6542             :         /*
    6543             :          * per cpu subsystem is not up at this point. The following code
    6544             :          * relies on the ability of the linker to provide the
    6545             :          * offset of a (static) per cpu variable into the per cpu area.
    6546             :          */
    6547           3 :         zone->pageset = &boot_pageset;
    6548           3 :         zone->pageset_high = BOOT_PAGESET_HIGH;
    6549           3 :         zone->pageset_batch = BOOT_PAGESET_BATCH;
    6550             : 
    6551           3 :         if (populated_zone(zone))
    6552           1 :                 printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%u\n",
    6553             :                         zone->name, zone->present_pages,
    6554             :                                          zone_batchsize(zone));
    6555           3 : }
    6556             : 
    6557           1 : void __meminit init_currently_empty_zone(struct zone *zone,
    6558             :                                         unsigned long zone_start_pfn,
    6559             :                                         unsigned long size)
    6560             : {
    6561           1 :         struct pglist_data *pgdat = zone->zone_pgdat;
    6562           1 :         int zone_idx = zone_idx(zone) + 1;
    6563             : 
    6564           1 :         if (zone_idx > pgdat->nr_zones)
    6565           1 :                 pgdat->nr_zones = zone_idx;
    6566             : 
    6567           1 :         zone->zone_start_pfn = zone_start_pfn;
    6568             : 
    6569           1 :         mminit_dprintk(MMINIT_TRACE, "memmap_init",
    6570             :                         "Initialising map node %d zone %lu pfns %lu -> %lu\n",
    6571             :                         pgdat->node_id,
    6572             :                         (unsigned long)zone_idx(zone),
    6573             :                         zone_start_pfn, (zone_start_pfn + size));
    6574             : 
    6575           1 :         zone_init_free_lists(zone);
    6576           1 :         zone->initialized = 1;
    6577           1 : }
    6578             : 
    6579             : /**
    6580             :  * get_pfn_range_for_nid - Return the start and end page frames for a node
    6581             :  * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
    6582             :  * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
    6583             :  * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
    6584             :  *
    6585             :  * It returns the start and end page frame of a node based on information
    6586             :  * provided by memblock_set_node(). If called for a node
    6587             :  * with no available memory, the start and end
    6588             :  * PFNs will both be 0.
    6589             :  */
    6590           1 : void __init get_pfn_range_for_nid(unsigned int nid,
    6591             :                         unsigned long *start_pfn, unsigned long *end_pfn)
    6592             : {
    6593           1 :         unsigned long this_start_pfn, this_end_pfn;
    6594           1 :         int i;
    6595             : 
    6596           1 :         *start_pfn = -1UL;
    6597           1 :         *end_pfn = 0;
    6598             : 
    6599           3 :         for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
    6600           2 :                 *start_pfn = min(*start_pfn, this_start_pfn);
    6601           2 :                 *end_pfn = max(*end_pfn, this_end_pfn);
    6602             :         }
    6603             : 
    6604           1 :         if (*start_pfn == -1UL)
    6605           0 :                 *start_pfn = 0;
    6606           1 : }
    6607             : 
    6608             : /*
    6609             :  * This finds a zone that can be used for ZONE_MOVABLE pages. The
    6610             :  * assumption is made that zones within a node are ordered by monotonically
    6611             :  * increasing memory addresses, so that the "highest" populated zone is used.
    6612             :  */
    6613           1 : static void __init find_usable_zone_for_movable(void)
    6614             : {
    6615           1 :         int zone_index;
    6616           3 :         for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
    6617           3 :                 if (zone_index == ZONE_MOVABLE)
    6618           1 :                         continue;
    6619             : 
    6620           2 :                 if (arch_zone_highest_possible_pfn[zone_index] >
    6621           2 :                                 arch_zone_lowest_possible_pfn[zone_index])
    6622             :                         break;
    6623             :         }
    6624             : 
    6625           1 :         VM_BUG_ON(zone_index == -1);
    6626           1 :         movable_zone = zone_index;
    6627           1 : }
    6628             : 
    6629             : /*
    6630             :  * The zone ranges provided by the architecture do not include ZONE_MOVABLE
    6631             :  * because it is sized independent of architecture. Unlike the other zones,
    6632             :  * the starting point for ZONE_MOVABLE is not fixed. It may be different
    6633             :  * in each node depending on the size of each node and how evenly kernelcore
    6634             :  * is distributed. This helper function adjusts the zone ranges
    6635             :  * provided by the architecture for a given node by using the end of the
    6636             :  * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
    6637             :  * zones within a node are ordered by monotonically increasing memory addresses.
    6638             :  */
    6639           6 : static void __init adjust_zone_range_for_zone_movable(int nid,
    6640             :                                         unsigned long zone_type,
    6641             :                                         unsigned long node_start_pfn,
    6642             :                                         unsigned long node_end_pfn,
    6643             :                                         unsigned long *zone_start_pfn,
    6644             :                                         unsigned long *zone_end_pfn)
    6645             : {
    6646             :         /* Only adjust if ZONE_MOVABLE is on this node */
    6647           6 :         if (zone_movable_pfn[nid]) {
    6648             :                 /* Size ZONE_MOVABLE */
    6649           0 :                 if (zone_type == ZONE_MOVABLE) {
    6650           0 :                         *zone_start_pfn = zone_movable_pfn[nid];
    6651           0 :                         *zone_end_pfn = min(node_end_pfn,
    6652             :                                 arch_zone_highest_possible_pfn[movable_zone]);
    6653             : 
    6654             :                 /* Adjust for ZONE_MOVABLE starting within this range */
    6655           0 :                 } else if (!mirrored_kernelcore &&
    6656           0 :                         *zone_start_pfn < zone_movable_pfn[nid] &&
    6657           0 :                         *zone_end_pfn > zone_movable_pfn[nid]) {
    6658           0 :                         *zone_end_pfn = zone_movable_pfn[nid];
    6659             : 
    6660             :                 /* Check if this whole range is within ZONE_MOVABLE */
    6661           0 :                 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
    6662           0 :                         *zone_start_pfn = *zone_end_pfn;
    6663             :         }
    6664           6 : }
    6665             : 
    6666             : /*
    6667             :  * Return the number of pages a zone spans in a node, including holes
    6668             :  * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
    6669             :  */
    6670           3 : static unsigned long __init zone_spanned_pages_in_node(int nid,
    6671             :                                         unsigned long zone_type,
    6672             :                                         unsigned long node_start_pfn,
    6673             :                                         unsigned long node_end_pfn,
    6674             :                                         unsigned long *zone_start_pfn,
    6675             :                                         unsigned long *zone_end_pfn)
    6676             : {
    6677           3 :         unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
    6678           3 :         unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
    6679             :         /* When hot-adding a new node from cpu_up(), the node should be empty */
    6680           3 :         if (!node_start_pfn && !node_end_pfn)
    6681             :                 return 0;
    6682             : 
    6683             :         /* Get the start and end of the zone */
    6684           3 :         *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
    6685           3 :         *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
    6686           3 :         adjust_zone_range_for_zone_movable(nid, zone_type,
    6687             :                                 node_start_pfn, node_end_pfn,
    6688             :                                 zone_start_pfn, zone_end_pfn);
    6689             : 
    6690             :         /* Check that this node has pages within the zone's required range */
    6691           3 :         if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
    6692             :                 return 0;
    6693             : 
    6694             :         /* Move the zone boundaries inside the node if necessary */
    6695           2 :         *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
    6696           2 :         *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
    6697             : 
    6698             :         /* Return the spanned pages */
    6699           2 :         return *zone_end_pfn - *zone_start_pfn;
    6700             : }
    6701             : 
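Ignoring the ZONE_MOVABLE adjustment, the span calculation above is two clamps followed by an intersection with the node; a sketch with made-up PFN ranges:

#include <stdio.h>

static unsigned long clamp(unsigned long v, unsigned long lo, unsigned long hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

/* Intersect [node_start, node_end) with the architectural zone limits. */
static unsigned long spanned_pages(unsigned long node_start, unsigned long node_end,
                                   unsigned long zone_low, unsigned long zone_high)
{
        unsigned long start = clamp(node_start, zone_low, zone_high);
        unsigned long end = clamp(node_end, zone_low, zone_high);

        if (end < node_start || start > node_end)
                return 0;               /* zone has no pages on this node */
        if (end > node_end)
                end = node_end;
        if (start < node_start)
                start = node_start;
        return end - start;
}

int main(void)
{
        /* Node spans PFNs [0x1000, 0x40000); a DMA32-like zone ends at 0x100000. */
        printf("%lu\n", spanned_pages(0x1000, 0x40000, 0x0, 0x100000));
        /* The same node against a zone that lies entirely above it. */
        printf("%lu\n", spanned_pages(0x1000, 0x40000, 0x100000, 0x800000));
        return 0;
}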
    6702             : /*
    6703             :  * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
    6704             :  * then all holes in the requested range will be accounted for.
    6705             :  */
    6706           5 : unsigned long __init __absent_pages_in_range(int nid,
    6707             :                                 unsigned long range_start_pfn,
    6708             :                                 unsigned long range_end_pfn)
    6709             : {
    6710           5 :         unsigned long nr_absent = range_end_pfn - range_start_pfn;
    6711           5 :         unsigned long start_pfn, end_pfn;
    6712           5 :         int i;
    6713             : 
    6714          15 :         for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
    6715          10 :                 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
    6716          10 :                 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
    6717          10 :                 nr_absent -= end_pfn - start_pfn;
    6718             :         }
    6719           5 :         return nr_absent;
    6720             : }
    6721             : 
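A sketch of the hole accounting above, with memblock.memory reduced to a hard-coded array of [start_pfn, end_pfn) pairs (values are made up to resemble a small x86 guest):

#include <stdio.h>

static unsigned long clamp(unsigned long v, unsigned long lo, unsigned long hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

/* Memory actually present, as [start_pfn, end_pfn) pairs. */
static const unsigned long mem[][2] = {
        { 0x001, 0x09f },       /* low memory below the legacy hole */
        { 0x100, 0x40000 },     /* main RAM bank */
};

int main(void)
{
        unsigned long range_start = 0x0, range_end = 0x40000;
        unsigned long nr_absent = range_end - range_start;

        /* Start from "everything is a hole" and subtract what is present. */
        for (unsigned long i = 0; i < sizeof(mem) / sizeof(mem[0]); i++) {
                unsigned long s = clamp(mem[i][0], range_start, range_end);
                unsigned long e = clamp(mem[i][1], range_start, range_end);

                nr_absent -= e - s;
        }
        printf("absent pages: %lu\n", nr_absent);
        return 0;
}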
    6722             : /**
    6723             :  * absent_pages_in_range - Return number of page frames in holes within a range
    6724             :  * @start_pfn: The start PFN to start searching for holes
    6725             :  * @end_pfn: The end PFN to stop searching for holes
    6726             :  *
    6727             :  * Return: the number of page frames in memory holes within a range.
    6728             :  */
    6729           1 : unsigned long __init absent_pages_in_range(unsigned long start_pfn,
    6730             :                                                         unsigned long end_pfn)
    6731             : {
    6732           1 :         return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
    6733             : }
    6734             : 
    6735             : /* Return the number of page frames in holes in a zone on a node */
    6736           3 : static unsigned long __init zone_absent_pages_in_node(int nid,
    6737             :                                         unsigned long zone_type,
    6738             :                                         unsigned long node_start_pfn,
    6739             :                                         unsigned long node_end_pfn)
    6740             : {
    6741           3 :         unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
    6742           3 :         unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
    6743           3 :         unsigned long zone_start_pfn, zone_end_pfn;
    6744           3 :         unsigned long nr_absent;
    6745             : 
    6746             :         /* When hot-adding a new node from cpu_up(), the node should be empty */
    6747           3 :         if (!node_start_pfn && !node_end_pfn)
    6748             :                 return 0;
    6749             : 
    6750           3 :         zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
    6751           3 :         zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
    6752             : 
    6753           3 :         adjust_zone_range_for_zone_movable(nid, zone_type,
    6754             :                         node_start_pfn, node_end_pfn,
    6755             :                         &zone_start_pfn, &zone_end_pfn);
    6756           3 :         nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
    6757             : 
    6758             :         /*
    6759             :          * ZONE_MOVABLE handling.
    6760             :          * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
    6761             :          * and vice versa.
    6762             :          */
    6763           3 :         if (mirrored_kernelcore && zone_movable_pfn[nid]) {
    6764           0 :                 unsigned long start_pfn, end_pfn;
    6765           0 :                 struct memblock_region *r;
    6766             : 
    6767           0 :                 for_each_mem_region(r) {
    6768           0 :                         start_pfn = clamp(memblock_region_memory_base_pfn(r),
    6769             :                                           zone_start_pfn, zone_end_pfn);
    6770           0 :                         end_pfn = clamp(memblock_region_memory_end_pfn(r),
    6771             :                                         zone_start_pfn, zone_end_pfn);
    6772             : 
    6773           0 :                         if (zone_type == ZONE_MOVABLE &&
    6774           0 :                             memblock_is_mirror(r))
    6775           0 :                                 nr_absent += end_pfn - start_pfn;
    6776             : 
    6777           0 :                         if (zone_type == ZONE_NORMAL &&
    6778           0 :                             !memblock_is_mirror(r))
    6779           0 :                                 nr_absent += end_pfn - start_pfn;
    6780             :                 }
    6781             :         }
    6782             : 
    6783             :         return nr_absent;
    6784             : }
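
A minimal userspace sketch of the clamp step used above, with made-up PFN values (the zone bounds and node span are illustrative assumptions, not figures from this report). Only the resulting window is then inspected for holes by __absent_pages_in_range():

#include <stdio.h>

/* Same semantics as the kernel's clamp(): min(max(v, lo), hi). */
static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
        /* Hypothetical zone bounds and node span (illustration only). */
        unsigned long zone_low = 0x1000, zone_high = 0x100000;
        unsigned long node_start_pfn = 0x0, node_end_pfn = 0x180000;

        unsigned long zone_start_pfn = clamp_ul(node_start_pfn, zone_low, zone_high);
        unsigned long zone_end_pfn   = clamp_ul(node_end_pfn, zone_low, zone_high);

        /* Prints [0x1000, 0x100000): the intersection of node and zone. */
        printf("zone span on this node: [%#lx, %#lx)\n",
               zone_start_pfn, zone_end_pfn);
        return 0;
}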
    6785             : 
    6786           1 : static void __init calculate_node_totalpages(struct pglist_data *pgdat,
    6787             :                                                 unsigned long node_start_pfn,
    6788             :                                                 unsigned long node_end_pfn)
    6789             : {
    6790           1 :         unsigned long realtotalpages = 0, totalpages = 0;
    6791           1 :         enum zone_type i;
    6792             : 
    6793           4 :         for (i = 0; i < MAX_NR_ZONES; i++) {
    6794           3 :                 struct zone *zone = pgdat->node_zones + i;
    6795           3 :                 unsigned long zone_start_pfn, zone_end_pfn;
    6796           3 :                 unsigned long spanned, absent;
    6797           3 :                 unsigned long size, real_size;
    6798             : 
    6799           3 :                 spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
    6800             :                                                      node_start_pfn,
    6801             :                                                      node_end_pfn,
    6802             :                                                      &zone_start_pfn,
    6803             :                                                      &zone_end_pfn);
    6804           3 :                 absent = zone_absent_pages_in_node(pgdat->node_id, i,
    6805             :                                                    node_start_pfn,
    6806             :                                                    node_end_pfn);
    6807             : 
    6808           3 :                 size = spanned;
    6809           3 :                 real_size = size - absent;
    6810             : 
    6811           3 :                 if (size)
    6812           1 :                         zone->zone_start_pfn = zone_start_pfn;
    6813             :                 else
    6814           2 :                         zone->zone_start_pfn = 0;
    6815           3 :                 zone->spanned_pages = size;
    6816           3 :                 zone->present_pages = real_size;
    6817             : 
    6818           3 :                 totalpages += size;
    6819           3 :                 realtotalpages += real_size;
    6820             :         }
    6821             : 
    6822           1 :         pgdat->node_spanned_pages = totalpages;
    6823           1 :         pgdat->node_present_pages = realtotalpages;
    6824           1 :         printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
    6825             :                                                         realtotalpages);
    6826           1 : }
    6827             : 
    6828             : #ifndef CONFIG_SPARSEMEM
    6829             : /*
    6830             :  * Calculate the size of zone->pageblock_flags, rounded up to an unsigned long.
    6831             :  * Start by making sure zonesize is a multiple of pageblock_nr_pages by
    6832             :  * rounding up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock,
    6833             :  * round what is now in bits up to the nearest long in bits, and finally
    6834             :  * return it in bytes.
    6835             :  */
    6836             : static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
    6837             : {
    6838             :         unsigned long usemapsize;
    6839             : 
    6840             :         zonesize += zone_start_pfn & (pageblock_nr_pages-1);
    6841             :         usemapsize = roundup(zonesize, pageblock_nr_pages);
    6842             :         usemapsize = usemapsize >> pageblock_order;
    6843             :         usemapsize *= NR_PAGEBLOCK_BITS;
    6844             :         usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
    6845             : 
    6846             :         return usemapsize / 8;
    6847             : }
    6848             : 
    6849             : static void __ref setup_usemap(struct zone *zone)
    6850             : {
    6851             :         unsigned long usemapsize = usemap_size(zone->zone_start_pfn,
    6852             :                                                zone->spanned_pages);
    6853             :         zone->pageblock_flags = NULL;
    6854             :         if (usemapsize) {
    6855             :                 zone->pageblock_flags =
    6856             :                         memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
    6857             :                                             zone_to_nid(zone));
    6858             :                 if (!zone->pageblock_flags)
    6859             :                         panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
    6860             :                               usemapsize, zone->name, zone_to_nid(zone));
    6861             :         }
    6862             : }
    6863             : #else
    6864           1 : static inline void setup_usemap(struct zone *zone) {}
    6865             : #endif /* CONFIG_SPARSEMEM */
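
As a rough illustration of the usemap_size() arithmetic above, the sketch below walks a hypothetical 1 GiB zone through the same rounding steps. The pageblock parameters (order 9, i.e. 2 MiB pageblocks, and 4 flag bits per pageblock) are typical x86-64 assumptions, not values read from this build:

#include <stdio.h>

#define ROUNDUP(x, y) ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
        /* Assumed, illustrative values. */
        unsigned long pageblock_order = 9;
        unsigned long pageblock_nr_pages = 1UL << pageblock_order;  /* 512 */
        unsigned long nr_pageblock_bits = 4;

        unsigned long zone_start_pfn = 0;
        unsigned long zonesize = 262144;        /* 1 GiB of 4 KiB pages */
        unsigned long usemapsize;

        zonesize += zone_start_pfn & (pageblock_nr_pages - 1);
        usemapsize = ROUNDUP(zonesize, pageblock_nr_pages);
        usemapsize >>= pageblock_order;         /* 512 pageblocks */
        usemapsize *= nr_pageblock_bits;        /* 2048 bits      */
        usemapsize = ROUNDUP(usemapsize, 8 * sizeof(unsigned long));

        printf("pageblock flags: %lu bytes\n", usemapsize / 8);  /* 256 */
        return 0;
}

For this hypothetical zone, setup_usemap() would therefore ask memblock for a 256-byte bitmap.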
    6866             : 
    6867             : #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
    6868             : 
    6869             : /* Initialise pageblock_order: the number of pages represented by one set of NR_PAGEBLOCK_BITS */
    6870             : void __init set_pageblock_order(void)
    6871             : {
    6872             :         unsigned int order;
    6873             : 
    6874             :         /* Check that pageblock_nr_pages has not already been setup */
    6875             :         if (pageblock_order)
    6876             :                 return;
    6877             : 
    6878             :         if (HPAGE_SHIFT > PAGE_SHIFT)
    6879             :                 order = HUGETLB_PAGE_ORDER;
    6880             :         else
    6881             :                 order = MAX_ORDER - 1;
    6882             : 
    6883             :         /*
    6884             :          * Assume the largest contiguous order of interest is a huge page.
    6885             :          * This value may be variable depending on boot parameters on IA64 and
    6886             :          * powerpc.
    6887             :          */
    6888             :         pageblock_order = order;
    6889             : }
    6890             : #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
    6891             : 
    6892             : /*
    6893             :  * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
    6894             :  * is unused as pageblock_order is set at compile-time. See
    6895             :  * include/linux/pageblock-flags.h for the values of pageblock_order based on
    6896             :  * the kernel config.
    6897             :  */
    6898           2 : void __init set_pageblock_order(void)
    6899             : {
    6900           2 : }
    6901             : 
    6902             : #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
    6903             : 
    6904           3 : static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
    6905             :                                                 unsigned long present_pages)
    6906             : {
    6907           3 :         unsigned long pages = spanned_pages;
    6908             : 
    6909             :         /*
    6910             :          * Provide a more accurate estimation if there are holes within
    6911             :          * the zone and SPARSEMEM is in use. If there are holes within the
    6912             :          * zone, each populated memory region may cost us one or two extra
    6913             :          * memmap pages due to alignment, because the memmap pages for each
    6914             :          * populated region may not be naturally aligned on a page boundary.
    6915             :          * So the (present_pages >> 4) heuristic is a tradeoff for that.
    6916             :          */
    6917           3 :         if (spanned_pages > present_pages + (present_pages >> 4) &&
    6918             :             IS_ENABLED(CONFIG_SPARSEMEM))
    6919           0 :                 pages = present_pages;
    6920             : 
    6921           3 :         return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
    6922             : }
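
For a feel of the overhead calc_memmap_size() reports, here is a standalone sketch with assumed numbers: a hole-free 1 GiB zone and a 64-byte struct page, both of which are illustrative assumptions rather than values from this report:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE (1UL << PAGE_SHIFT)
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
        /* Illustrative: 1 GiB of 4 KiB pages, 64 bytes per struct page. */
        unsigned long spanned_pages = 262144;
        unsigned long sizeof_struct_page = 64;

        unsigned long memmap_bytes = spanned_pages * sizeof_struct_page;
        unsigned long memmap_pages = PAGE_ALIGN(memmap_bytes) >> PAGE_SHIFT;

        /* 262144 * 64 = 16 MiB of memmap, i.e. 4096 pages of overhead. */
        printf("memmap consumes %lu pages\n", memmap_pages);
        return 0;
}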
    6923             : 
    6924             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
    6925           1 : static void pgdat_init_split_queue(struct pglist_data *pgdat)
    6926             : {
    6927           1 :         struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
    6928             : 
    6929           1 :         spin_lock_init(&ds_queue->split_queue_lock);
    6930           1 :         INIT_LIST_HEAD(&ds_queue->split_queue);
    6931           1 :         ds_queue->split_queue_len = 0;
    6932           1 : }
    6933             : #else
    6934             : static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
    6935             : #endif
    6936             : 
    6937             : #ifdef CONFIG_COMPACTION
    6938           1 : static void pgdat_init_kcompactd(struct pglist_data *pgdat)
    6939             : {
    6940           2 :         init_waitqueue_head(&pgdat->kcompactd_wait);
    6941             : }
    6942             : #else
    6943             : static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
    6944             : #endif
    6945             : 
    6946           1 : static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
    6947             : {
    6948           1 :         pgdat_resize_init(pgdat);
    6949             : 
    6950           1 :         pgdat_init_split_queue(pgdat);
    6951           1 :         pgdat_init_kcompactd(pgdat);
    6952             : 
    6953           1 :         init_waitqueue_head(&pgdat->kswapd_wait);
    6954           1 :         init_waitqueue_head(&pgdat->pfmemalloc_wait);
    6955             : 
    6956           1 :         pgdat_page_ext_init(pgdat);
    6957           1 :         lruvec_init(&pgdat->__lruvec);
    6958           1 : }
    6959             : 
    6960           3 : static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
    6961             :                                                         unsigned long remaining_pages)
    6962             : {
    6963           3 :         atomic_long_set(&zone->managed_pages, remaining_pages);
    6964           3 :         zone_set_nid(zone, nid);
    6965           3 :         zone->name = zone_names[idx];
    6966           3 :         zone->zone_pgdat = NODE_DATA(nid);
    6967           3 :         spin_lock_init(&zone->lock);
    6968           3 :         zone_seqlock_init(zone);
    6969           3 :         zone_pcp_init(zone);
    6970           3 : }
    6971             : 
    6972             : /*
    6973             :  * Set up the zone data structures
    6974             :  * - init pgdat internals
    6975             :  * - init all zones belonging to this node
    6976             :  *
    6977             :  * NOTE: this function is only called during memory hotplug
    6978             :  */
    6979             : #ifdef CONFIG_MEMORY_HOTPLUG
    6980             : void __ref free_area_init_core_hotplug(int nid)
    6981             : {
    6982             :         enum zone_type z;
    6983             :         pg_data_t *pgdat = NODE_DATA(nid);
    6984             : 
    6985             :         pgdat_init_internals(pgdat);
    6986             :         for (z = 0; z < MAX_NR_ZONES; z++)
    6987             :                 zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
    6988             : }
    6989             : #endif
    6990             : 
    6991             : /*
    6992             :  * Set up the zone data structures:
    6993             :  *   - mark all pages reserved
    6994             :  *   - mark all memory queues empty
    6995             :  *   - clear the memory bitmaps
    6996             :  *
    6997             :  * NOTE: pgdat should get zeroed by caller.
    6998             :  * NOTE: this function is only called during early init.
    6999             :  */
    7000           1 : static void __init free_area_init_core(struct pglist_data *pgdat)
    7001             : {
    7002           1 :         enum zone_type j;
    7003           1 :         int nid = pgdat->node_id;
    7004             : 
    7005           1 :         pgdat_init_internals(pgdat);
    7006           1 :         pgdat->per_cpu_nodestats = &boot_nodestats;
    7007             : 
    7008           4 :         for (j = 0; j < MAX_NR_ZONES; j++) {
    7009           3 :                 struct zone *zone = pgdat->node_zones + j;
    7010           3 :                 unsigned long size, freesize, memmap_pages;
    7011             : 
    7012           3 :                 size = zone->spanned_pages;
    7013           3 :                 freesize = zone->present_pages;
    7014             : 
    7015             :                 /*
    7016             :                  * Adjust freesize so that it accounts for how much memory
    7017             :                  * is used by this zone for memmap. This affects the watermark
    7018             :                  * and per-cpu initialisations
    7019             :                  */
    7020           3 :                 memmap_pages = calc_memmap_size(size, freesize);
    7021           3 :                 if (!is_highmem_idx(j)) {
    7022           3 :                         if (freesize >= memmap_pages) {
    7023           3 :                                 freesize -= memmap_pages;
    7024           3 :                                 if (memmap_pages)
    7025           1 :                                         printk(KERN_DEBUG
    7026             :                                                "  %s zone: %lu pages used for memmap\n",
    7027             :                                                zone_names[j], memmap_pages);
    7028             :                         } else
    7029           0 :                                 pr_warn("  %s zone: %lu pages exceeds freesize %lu\n",
    7030             :                                         zone_names[j], memmap_pages, freesize);
    7031             :                 }
    7032             : 
    7033             :                 /* Account for reserved pages */
    7034           3 :                 if (j == 0 && freesize > dma_reserve) {
    7035           1 :                         freesize -= dma_reserve;
    7036           1 :                         printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
    7037             :                                         zone_names[0], dma_reserve);
    7038             :                 }
    7039             : 
    7040           3 :                 if (!is_highmem_idx(j))
    7041           3 :                         nr_kernel_pages += freesize;
    7042             :                 /* Charge for highmem memmap if there are enough kernel pages */
    7043             :                 else if (nr_kernel_pages > memmap_pages * 2)
    7044             :                         nr_kernel_pages -= memmap_pages;
    7045           3 :                 nr_all_pages += freesize;
    7046             : 
    7047             :                 /*
    7048             :                  * Set an approximate value for lowmem here; it will be adjusted
    7049             :                  * when the bootmem allocator frees pages into the buddy system,
    7050             :                  * and all highmem pages will be managed by the buddy system.
    7051             :                  */
    7052           3 :                 zone_init_internals(zone, j, nid, freesize);
    7053             : 
    7054           3 :                 if (!size)
    7055           2 :                         continue;
    7056             : 
    7057           1 :                 set_pageblock_order();
    7058           1 :                 setup_usemap(zone);
    7059           1 :                 init_currently_empty_zone(zone, zone->zone_start_pfn, size);
    7060           1 :                 memmap_init_zone(zone);
    7061             :         }
    7062           1 : }
    7063             : 
    7064             : #ifdef CONFIG_FLAT_NODE_MEM_MAP
    7065             : static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
    7066             : {
    7067             :         unsigned long __maybe_unused start = 0;
    7068             :         unsigned long __maybe_unused offset = 0;
    7069             : 
    7070             :         /* Skip empty nodes */
    7071             :         if (!pgdat->node_spanned_pages)
    7072             :                 return;
    7073             : 
    7074             :         start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
    7075             :         offset = pgdat->node_start_pfn - start;
    7076             :         /* ia64 gets its own node_mem_map, before this, without bootmem */
    7077             :         if (!pgdat->node_mem_map) {
    7078             :                 unsigned long size, end;
    7079             :                 struct page *map;
    7080             : 
    7081             :                 /*
    7082             :                  * The zone's endpoints aren't required to be MAX_ORDER
    7083             :                  * aligned, but the node_mem_map endpoints must be MAX_ORDER
    7084             :                  * aligned for the buddy allocator to function correctly.
    7085             :                  */
    7086             :                 end = pgdat_end_pfn(pgdat);
    7087             :                 end = ALIGN(end, MAX_ORDER_NR_PAGES);
    7088             :                 size =  (end - start) * sizeof(struct page);
    7089             :                 map = memblock_alloc_node(size, SMP_CACHE_BYTES,
    7090             :                                           pgdat->node_id);
    7091             :                 if (!map)
    7092             :                         panic("Failed to allocate %ld bytes for node %d memory map\n",
    7093             :                               size, pgdat->node_id);
    7094             :                 pgdat->node_mem_map = map + offset;
    7095             :         }
    7096             :         pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
    7097             :                                 __func__, pgdat->node_id, (unsigned long)pgdat,
    7098             :                                 (unsigned long)pgdat->node_mem_map);
    7099             : #ifndef CONFIG_NEED_MULTIPLE_NODES
    7100             :         /*
    7101             :          * With no DISCONTIG, the global mem_map is just set as node 0's
    7102             :          */
    7103             :         if (pgdat == NODE_DATA(0)) {
    7104             :                 mem_map = NODE_DATA(0)->node_mem_map;
    7105             :                 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
    7106             :                         mem_map -= offset;
    7107             :         }
    7108             : #endif
    7109             : }
    7110             : #else
    7111             : static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
    7112             : #endif /* CONFIG_FLAT_NODE_MEM_MAP */
    7113             : 
    7114             : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
    7115             : static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
    7116             : {
    7117             :         pgdat->first_deferred_pfn = ULONG_MAX;
    7118             : }
    7119             : #else
    7120           1 : static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
    7121             : #endif
    7122             : 
    7123           1 : static void __init free_area_init_node(int nid)
    7124             : {
    7125           1 :         pg_data_t *pgdat = NODE_DATA(nid);
    7126           1 :         unsigned long start_pfn = 0;
    7127           1 :         unsigned long end_pfn = 0;
    7128             : 
    7129             :         /* pg_data_t should be reset to zero when it's allocated */
    7130           2 :         WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
    7131             : 
    7132           1 :         get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
    7133             : 
    7134           1 :         pgdat->node_id = nid;
    7135           1 :         pgdat->node_start_pfn = start_pfn;
    7136           1 :         pgdat->per_cpu_nodestats = NULL;
    7137             : 
    7138           1 :         pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
    7139             :                 (u64)start_pfn << PAGE_SHIFT,
    7140             :                 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
    7141           1 :         calculate_node_totalpages(pgdat, start_pfn, end_pfn);
    7142             : 
    7143           1 :         alloc_node_mem_map(pgdat);
    7144           1 :         pgdat_set_deferred_range(pgdat);
    7145             : 
    7146           1 :         free_area_init_core(pgdat);
    7147           1 : }
    7148             : 
    7149           0 : void __init free_area_init_memoryless_node(int nid)
    7150             : {
    7151           0 :         free_area_init_node(nid);
    7152           0 : }
    7153             : 
    7154             : #if MAX_NUMNODES > 1
    7155             : /*
    7156             :  * Figure out the number of possible node ids.
    7157             :  */
    7158           1 : void __init setup_nr_node_ids(void)
    7159             : {
    7160           1 :         unsigned int highest;
    7161             : 
    7162           1 :         highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
    7163           1 :         nr_node_ids = highest + 1;
    7164           1 : }
    7165             : #endif
    7166             : 
    7167             : /**
    7168             :  * node_map_pfn_alignment - determine the maximum internode alignment
    7169             :  *
    7170             :  * This function should be called after node map is populated and sorted.
    7171             :  * It calculates the maximum power of two alignment which can distinguish
    7172             :  * all the nodes.
    7173             :  *
    7174             :  * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
    7175             :  * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)).  If the
    7176             :  * nodes are shifted by 256MiB, the result is 256MiB.  Note that if only the
    7177             :  * last node is shifted, 1GiB is enough and this function will indicate so.
    7178             :  *
    7179             :  * This is used to test whether the pfn -> nid mapping of the chosen memory
    7180             :  * model has fine enough granularity to avoid incorrect mapping for the
    7181             :  * populated node map.
    7182             :  *
    7183             :  * Return: the determined alignment in PFNs.  0 if there is no alignment
    7184             :  * requirement (single node).
    7185             :  */
    7186           0 : unsigned long __init node_map_pfn_alignment(void)
    7187             : {
    7188           0 :         unsigned long accl_mask = 0, last_end = 0;
    7189           0 :         unsigned long start, end, mask;
    7190           0 :         int last_nid = NUMA_NO_NODE;
    7191           0 :         int i, nid;
    7192             : 
    7193           0 :         for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
    7194           0 :                 if (!start || last_nid < 0 || last_nid == nid) {
    7195           0 :                         last_nid = nid;
    7196           0 :                         last_end = end;
    7197           0 :                         continue;
    7198             :                 }
    7199             : 
    7200             :                 /*
    7201             :                  * Start with a mask granular enough to pin-point to the
    7202             :                  * start pfn and tick off bits one-by-one until it becomes
    7203             :                  * too coarse to separate the current node from the last.
    7204             :                  */
    7205           0 :                 mask = ~((1 << __ffs(start)) - 1);
    7206           0 :                 while (mask && last_end <= (start & (mask << 1)))
    7207             :                         mask <<= 1;
    7208             : 
    7209             :                 /* accumulate all internode masks */
    7210           0 :                 accl_mask |= mask;
    7211             :         }
    7212             : 
    7213             :         /* convert mask to number of pages */
    7214           0 :         return ~accl_mask + 1;
    7215             : }
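
The mask walk above is easiest to see with two nodes meeting at a 1 GiB boundary. The sketch below repeats the computation for that single boundary; the PFN values are hypothetical, and __builtin_ctzl stands in for the kernel's __ffs():

#include <stdio.h>

int main(void)
{
        /* Hypothetical: node 0 ends at PFN 0x40000 (1 GiB of 4 KiB pages)
         * and node 1 starts exactly there. */
        unsigned long last_end = 0x40000;
        unsigned long start = 0x40000;
        unsigned long accl_mask = 0, mask;

        /* Start from the finest mask that pin-points `start`, then coarsen
         * it while it still separates the two nodes. */
        mask = ~((1UL << __builtin_ctzl(start)) - 1);
        while (mask && last_end <= (start & (mask << 1)))
                mask <<= 1;
        accl_mask |= mask;

        /* Prints 0x40000, i.e. 1 GiB alignment, matching the comment above. */
        printf("required alignment: %#lx pfns\n", ~accl_mask + 1);
        return 0;
}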
    7216             : 
    7217             : /**
    7218             :  * find_min_pfn_with_active_regions - Find the minimum PFN registered
    7219             :  *
    7220             :  * Return: the minimum PFN based on information provided via
    7221             :  * memblock_set_node().
    7222             :  */
    7223           1 : unsigned long __init find_min_pfn_with_active_regions(void)
    7224             : {
    7225           1 :         return PHYS_PFN(memblock_start_of_DRAM());
    7226             : }
    7227             : 
    7228             : /*
    7229             :  * early_calculate_totalpages()
    7230             :  * Sum the pages in all active regions, for use when sizing the movable zone.
    7231             :  * Populate N_MEMORY so that usable_nodes can be calculated.
    7232             :  */
    7233           1 : static unsigned long __init early_calculate_totalpages(void)
    7234             : {
    7235           1 :         unsigned long totalpages = 0;
    7236           1 :         unsigned long start_pfn, end_pfn;
    7237           1 :         int i, nid;
    7238             : 
    7239           3 :         for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
    7240           2 :                 unsigned long pages = end_pfn - start_pfn;
    7241             : 
    7242           2 :                 totalpages += pages;
    7243           2 :                 if (pages)
    7244           4 :                         node_set_state(nid, N_MEMORY);
    7245             :         }
    7246           1 :         return totalpages;
    7247             : }
    7248             : 
    7249             : /*
    7250             :  * Find the PFN at which the Movable zone begins in each node. Kernel memory
    7251             :  * is spread evenly between nodes as long as the nodes have enough
    7252             :  * memory. When they don't, some nodes will have more kernelcore than
    7253             :  * others.
    7254             :  */
    7255           1 : static void __init find_zone_movable_pfns_for_nodes(void)
    7256             : {
    7257           1 :         int i, nid;
    7258           1 :         unsigned long usable_startpfn;
    7259           1 :         unsigned long kernelcore_node, kernelcore_remaining;
    7260             :         /* save the state before borrowing the nodemask */
    7261           1 :         nodemask_t saved_node_state = node_states[N_MEMORY];
    7262           1 :         unsigned long totalpages = early_calculate_totalpages();
    7263           1 :         int usable_nodes = nodes_weight(node_states[N_MEMORY]);
    7264           1 :         struct memblock_region *r;
    7265             : 
    7266             :         /* Need to find movable_zone earlier when movable_node is specified. */
    7267           1 :         find_usable_zone_for_movable();
    7268             : 
    7269             :         /*
    7270             :          * If movable_node is specified, ignore kernelcore and movablecore
    7271             :          * options.
    7272             :          */
    7273           1 :         if (movable_node_is_enabled()) {
    7274             :                 for_each_mem_region(r) {
    7275             :                         if (!memblock_is_hotpluggable(r))
    7276             :                                 continue;
    7277             : 
    7278             :                         nid = memblock_get_region_node(r);
    7279             : 
    7280             :                         usable_startpfn = PFN_DOWN(r->base);
    7281             :                         zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
    7282             :                                 min(usable_startpfn, zone_movable_pfn[nid]) :
    7283             :                                 usable_startpfn;
    7284             :                 }
    7285             : 
    7286             :                 goto out2;
    7287             :         }
    7288             : 
    7289             :         /*
    7290             :          * If kernelcore=mirror is specified, ignore movablecore option
    7291             :          */
    7292           1 :         if (mirrored_kernelcore) {
    7293           0 :                 bool mem_below_4gb_not_mirrored = false;
    7294             : 
    7295           0 :                 for_each_mem_region(r) {
    7296           0 :                         if (memblock_is_mirror(r))
    7297           0 :                                 continue;
    7298             : 
    7299           0 :                         nid = memblock_get_region_node(r);
    7300             : 
    7301           0 :                         usable_startpfn = memblock_region_memory_base_pfn(r);
    7302             : 
    7303           0 :                         if (usable_startpfn < 0x100000) {
    7304           0 :                                 mem_below_4gb_not_mirrored = true;
    7305           0 :                                 continue;
    7306             :                         }
    7307             : 
    7308           0 :                         zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
    7309           0 :                                 min(usable_startpfn, zone_movable_pfn[nid]) :
    7310             :                                 usable_startpfn;
    7311             :                 }
    7312             : 
    7313           0 :                 if (mem_below_4gb_not_mirrored)
    7314           0 :                         pr_warn("This configuration results in unmirrored kernel memory.\n");
    7315             : 
    7316           0 :                 goto out2;
    7317             :         }
    7318             : 
    7319             :         /*
    7320             :          * If kernelcore=nn% or movablecore=nn% was specified, calculate the
    7321             :          * amount of necessary memory.
    7322             :          */
    7323           1 :         if (required_kernelcore_percent)
    7324           0 :                 required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
    7325             :                                        10000UL;
    7326           1 :         if (required_movablecore_percent)
    7327           0 :                 required_movablecore = (totalpages * 100 * required_movablecore_percent) /
    7328             :                                         10000UL;
    7329             : 
    7330             :         /*
    7331             :          * If movablecore= was specified, calculate the corresponding size
    7332             :          * of kernelcore so that memory usable for any allocation type is
    7333             :          * evenly spread. If both kernelcore and movablecore are
    7334             :          * specified, then the value of kernelcore will be used for
    7335             :          * required_kernelcore if it's greater than what movablecore
    7336             :          * would have allowed.
    7337             :          */
    7338           1 :         if (required_movablecore) {
    7339           0 :                 unsigned long corepages;
    7340             : 
    7341             :                 /*
    7342             :                  * Round-up so that ZONE_MOVABLE is at least as large as what
    7343             :                  * was requested by the user
    7344             :                  */
    7345           0 :                 required_movablecore =
    7346           0 :                         roundup(required_movablecore, MAX_ORDER_NR_PAGES);
    7347           0 :                 required_movablecore = min(totalpages, required_movablecore);
    7348           0 :                 corepages = totalpages - required_movablecore;
    7349             : 
    7350           0 :                 required_kernelcore = max(required_kernelcore, corepages);
    7351             :         }
    7352             : 
    7353             :         /*
    7354             :          * If kernelcore was not specified or kernelcore size is larger
    7355             :          * than totalpages, there is no ZONE_MOVABLE.
    7356             :          */
    7357           1 :         if (!required_kernelcore || required_kernelcore >= totalpages)
    7358           1 :                 goto out;
    7359             : 
    7360             :         /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
    7361           0 :         usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
    7362             : 
    7363           0 : restart:
    7364             :         /* Spread kernelcore memory as evenly as possible throughout nodes */
    7365           0 :         kernelcore_node = required_kernelcore / usable_nodes;
    7366           0 :         for_each_node_state(nid, N_MEMORY) {
    7367           0 :                 unsigned long start_pfn, end_pfn;
    7368             : 
    7369             :                 /*
    7370             :                  * Recalculate kernelcore_node if the division per node
    7371             :                  * now exceeds what is necessary to satisfy the requested
    7372             :                  * amount of memory for the kernel
    7373             :                  */
    7374           0 :                 if (required_kernelcore < kernelcore_node)
    7375           0 :                         kernelcore_node = required_kernelcore / usable_nodes;
    7376             : 
    7377             :                 /*
    7378             :                  * As the map is walked, we track how much memory is usable
    7379             :                  * by the kernel using kernelcore_remaining. When it is
    7380             :                  * 0, the rest of the node is usable by ZONE_MOVABLE
    7381             :                  */
    7382           0 :                 kernelcore_remaining = kernelcore_node;
    7383             : 
    7384             :                 /* Go through each range of PFNs within this node */
    7385           0 :                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
    7386           0 :                         unsigned long size_pages;
    7387             : 
    7388           0 :                         start_pfn = max(start_pfn, zone_movable_pfn[nid]);
    7389           0 :                         if (start_pfn >= end_pfn)
    7390           0 :                                 continue;
    7391             : 
    7392             :                         /* Account for what is only usable for kernelcore */
    7393           0 :                         if (start_pfn < usable_startpfn) {
    7394           0 :                                 unsigned long kernel_pages;
    7395           0 :                                 kernel_pages = min(end_pfn, usable_startpfn)
    7396             :                                                                 - start_pfn;
    7397             : 
    7398           0 :                                 kernelcore_remaining -= min(kernel_pages,
    7399             :                                                         kernelcore_remaining);
    7400           0 :                                 required_kernelcore -= min(kernel_pages,
    7401             :                                                         required_kernelcore);
    7402             : 
    7403             :                                 /* Continue if range is now fully accounted */
    7404           0 :                                 if (end_pfn <= usable_startpfn) {
    7405             : 
    7406             :                                         /*
    7407             :                                          * Push zone_movable_pfn to the end so
    7408             :                                          * that if we have to rebalance
    7409             :                                          * kernelcore across nodes, we will
    7410             :                                          * not double account here
    7411             :                                          */
    7412           0 :                                         zone_movable_pfn[nid] = end_pfn;
    7413           0 :                                         continue;
    7414             :                                 }
    7415           0 :                                 start_pfn = usable_startpfn;
    7416             :                         }
    7417             : 
    7418             :                         /*
    7419             :                          * The usable PFN range for ZONE_MOVABLE is from
    7420             :                          * start_pfn->end_pfn. Calculate size_pages as the
    7421             :                          * number of pages used as kernelcore
    7422             :                          */
    7423           0 :                         size_pages = end_pfn - start_pfn;
    7424           0 :                         if (size_pages > kernelcore_remaining)
    7425             :                                 size_pages = kernelcore_remaining;
    7426           0 :                         zone_movable_pfn[nid] = start_pfn + size_pages;
    7427             : 
    7428             :                         /*
    7429             :                          * Some kernelcore has been met, update counts and
    7430             :                          * break if the kernelcore for this node has been
    7431             :                          * satisfied
    7432             :                          */
    7433           0 :                         required_kernelcore -= min(required_kernelcore,
    7434             :                                                                 size_pages);
    7435           0 :                         kernelcore_remaining -= size_pages;
    7436           0 :                         if (!kernelcore_remaining)
    7437             :                                 break;
    7438             :                 }
    7439             :         }
    7440             : 
    7441             :         /*
    7442             :          * If there is still required_kernelcore, we do another pass with one
    7443             :          * less node in the count. This will push zone_movable_pfn[nid] further
    7444             :          * along on the nodes that still have memory until kernelcore is
    7445             :          * satisfied
    7446             :          */
    7447           0 :         usable_nodes--;
    7448           0 :         if (usable_nodes && required_kernelcore > usable_nodes)
    7449           0 :                 goto restart;
    7450             : 
    7451           0 : out2:
    7452             :         /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
    7453           0 :         for (nid = 0; nid < MAX_NUMNODES; nid++)
    7454           0 :                 zone_movable_pfn[nid] =
    7455           0 :                         roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
    7456             : 
    7457           0 : out:
    7458             :         /* restore the node_state */
    7459           1 :         node_states[N_MEMORY] = saved_node_state;
    7460           1 : }
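
For the common single-pass case (contiguous nodes, no kernelcore=mirror, no movable_node, no restarts), the spreading above reduces to simple arithmetic. This simplified sketch assumes two hypothetical 1 GiB nodes, kernelcore= asking for 1 GiB in total, and MAX_ORDER_NR_PAGES of 1024 as on a default x86-64 build; none of these values come from this report:

#include <stdio.h>

#define MAX_ORDER_NR_PAGES (1UL << 10)          /* assumed: 4 MiB blocks */
#define ROUNDUP(x, y) ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
        /* Hypothetical layout: two nodes of 1 GiB each (4 KiB pages). */
        unsigned long node_start[2] = { 0x00000, 0x40000 };
        unsigned long node_end[2]   = { 0x40000, 0x80000 };
        unsigned long required_kernelcore = 0x40000;    /* 1 GiB total */
        unsigned long zone_movable_pfn[2];
        int usable_nodes = 2, nid;

        /* Each node keeps required_kernelcore / usable_nodes pages for the
         * kernel; everything above that point becomes ZONE_MOVABLE. */
        unsigned long kernelcore_node = required_kernelcore / usable_nodes;

        for (nid = 0; nid < 2; nid++) {
                zone_movable_pfn[nid] = node_start[nid] + kernelcore_node;
                zone_movable_pfn[nid] = ROUNDUP(zone_movable_pfn[nid],
                                                MAX_ORDER_NR_PAGES);
                printf("node %d: ZONE_MOVABLE starts at pfn %#lx (node ends at %#lx)\n",
                       nid, zone_movable_pfn[nid], node_end[nid]);
        }
        return 0;
}

Each node ends up with 512 MiB of kernelcore and 512 MiB of ZONE_MOVABLE in this hypothetical case.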
    7461             : 
    7462             : /* Any regular or high memory on that node ? */
    7463           1 : static void check_for_memory(pg_data_t *pgdat, int nid)
    7464             : {
    7465           1 :         enum zone_type zone_type;
    7466             : 
    7467           1 :         for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
    7468           1 :                 struct zone *zone = &pgdat->node_zones[zone_type];
    7469           1 :                 if (populated_zone(zone)) {
    7470           1 :                         if (IS_ENABLED(CONFIG_HIGHMEM))
    7471             :                                 node_set_state(nid, N_HIGH_MEMORY);
    7472           1 :                         if (zone_type <= ZONE_NORMAL)
    7473           1 :                                 node_set_state(nid, N_NORMAL_MEMORY);
    7474             :                         break;
    7475             :                 }
    7476             :         }
    7477           1 : }
    7478             : 
    7479             : /*
    7480             :  * Some architectures, e.g. ARC, may have ZONE_HIGHMEM below ZONE_NORMAL. For
    7481             :  * such cases we allow max_zone_pfn to be sorted in descending order.
    7482             :  */
    7483           1 : bool __weak arch_has_descending_max_zone_pfns(void)
    7484             : {
    7485           1 :         return false;
    7486             : }
    7487             : 
    7488             : /**
    7489             :  * free_area_init - Initialise all pg_data_t and zone data
    7490             :  * @max_zone_pfn: an array of max PFNs for each zone
    7491             :  *
    7492             :  * This will call free_area_init_node() for each active node in the system.
    7493             :  * Using the page ranges provided by memblock_set_node(), the size of each
    7494             :  * zone in each node, and of its holes, is calculated. If the maximum PFNs
    7495             :  * of two adjacent zones match, the higher zone is assumed to be empty.
    7496             :  * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
    7497             :  * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
    7498             :  * starts where the previous one ended. For example, ZONE_DMA32 starts
    7499             :  * at arch_max_dma_pfn.
    7500             :  */
    7501           1 : void __init free_area_init(unsigned long *max_zone_pfn)
    7502             : {
    7503           1 :         unsigned long start_pfn, end_pfn;
    7504           1 :         int i, nid, zone;
    7505           1 :         bool descending;
    7506             : 
    7507             :         /* Record where the zone boundaries are */
    7508           1 :         memset(arch_zone_lowest_possible_pfn, 0,
    7509             :                                 sizeof(arch_zone_lowest_possible_pfn));
    7510           1 :         memset(arch_zone_highest_possible_pfn, 0,
    7511             :                                 sizeof(arch_zone_highest_possible_pfn));
    7512             : 
    7513           1 :         start_pfn = find_min_pfn_with_active_regions();
    7514           1 :         descending = arch_has_descending_max_zone_pfns();
    7515             : 
    7516           4 :         for (i = 0; i < MAX_NR_ZONES; i++) {
    7517           3 :                 if (descending)
    7518           0 :                         zone = MAX_NR_ZONES - i - 1;
    7519             :                 else
    7520             :                         zone = i;
    7521             : 
    7522           3 :                 if (zone == ZONE_MOVABLE)
    7523           1 :                         continue;
    7524             : 
    7525           2 :                 end_pfn = max(max_zone_pfn[zone], start_pfn);
    7526           2 :                 arch_zone_lowest_possible_pfn[zone] = start_pfn;
    7527           2 :                 arch_zone_highest_possible_pfn[zone] = end_pfn;
    7528             : 
    7529           2 :                 start_pfn = end_pfn;
    7530             :         }
    7531             : 
    7532             :         /* Find the PFNs that ZONE_MOVABLE begins at in each node */
    7533           1 :         memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
    7534           1 :         find_zone_movable_pfns_for_nodes();
    7535             : 
    7536             :         /* Print out the zone ranges */
    7537           1 :         pr_info("Zone ranges:\n");
    7538           4 :         for (i = 0; i < MAX_NR_ZONES; i++) {
    7539           3 :                 if (i == ZONE_MOVABLE)
    7540           1 :                         continue;
    7541           2 :                 pr_info("  %-8s ", zone_names[i]);
    7542           2 :                 if (arch_zone_lowest_possible_pfn[i] ==
    7543           2 :                                 arch_zone_highest_possible_pfn[i])
    7544           1 :                         pr_cont("empty\n");
    7545             :                 else
    7546           1 :                         pr_cont("[mem %#018Lx-%#018Lx]\n",
    7547             :                                 (u64)arch_zone_lowest_possible_pfn[i]
    7548             :                                         << PAGE_SHIFT,
    7549             :                                 ((u64)arch_zone_highest_possible_pfn[i]
    7550             :                                         << PAGE_SHIFT) - 1);
    7551             :         }
    7552             : 
    7553             :         /* Print out the PFNs ZONE_MOVABLE begins at in each node */
    7554           1 :         pr_info("Movable zone start for each node\n");
    7555          65 :         for (i = 0; i < MAX_NUMNODES; i++) {
    7556          64 :                 if (zone_movable_pfn[i])
    7557           0 :                         pr_info("  Node %d: %#018Lx\n", i,
    7558             :                                (u64)zone_movable_pfn[i] << PAGE_SHIFT);
    7559             :         }
    7560             : 
    7561             :         /*
    7562             :          * Print out the early node map, and initialize the
    7563             :          * subsection-map relative to active online memory ranges to
    7564             :          * enable future "sub-section" extensions of the memory map.
    7565             :          */
    7566           1 :         pr_info("Early memory node ranges\n");
    7567           3 :         for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
    7568           2 :                 pr_info("  node %3d: [mem %#018Lx-%#018Lx]\n", nid,
    7569             :                         (u64)start_pfn << PAGE_SHIFT,
    7570             :                         ((u64)end_pfn << PAGE_SHIFT) - 1);
    7571           2 :                 subsection_map_init(start_pfn, end_pfn - start_pfn);
    7572             :         }
    7573             : 
    7574             :         /* Initialise every node */
    7575           1 :         mminit_verify_pageflags_layout();
    7576           1 :         setup_nr_node_ids();
    7577           2 :         for_each_online_node(nid) {
    7578           1 :                 pg_data_t *pgdat = NODE_DATA(nid);
    7579           1 :                 free_area_init_node(nid);
    7580             : 
    7581             :                 /* Any memory on that node */
    7582           1 :                 if (pgdat->node_present_pages)
    7583           1 :                         node_set_state(nid, N_MEMORY);
    7584           1 :                 check_for_memory(pgdat, nid);
    7585             :         }
    7586           1 : }
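
The boundary-recording loop at the top of free_area_init() simply strings the max_zone_pfn[] entries together so that each zone starts where the previous one ended. A sketch with hypothetical max PFNs for a three-zone layout (the array contents and zone set are assumptions, not this machine's values):

#include <stdio.h>

int main(void)
{
        /* Hypothetical max_zone_pfn[] for a 16 GiB box with three sized
         * zones; ZONE_MOVABLE is carved out separately by the code above. */
        const char *zone_names[] = { "DMA", "DMA32", "Normal" };
        unsigned long max_zone_pfn[] = { 0x1000, 0x100000, 0x400000 };
        unsigned long start_pfn = 0, end_pfn;
        int zone;

        /* Each zone starts where the previous one ended, exactly as in the
         * boundary-recording loop above. */
        for (zone = 0; zone < 3; zone++) {
                end_pfn = max_zone_pfn[zone] > start_pfn ?
                                max_zone_pfn[zone] : start_pfn;
                printf("%-8s [%#lx, %#lx)\n", zone_names[zone],
                       start_pfn, end_pfn);
                start_pfn = end_pfn;
        }
        return 0;
}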
    7587             : 
    7588           0 : static int __init cmdline_parse_core(char *p, unsigned long *core,
    7589             :                                      unsigned long *percent)
    7590             : {
    7591           0 :         unsigned long long coremem;
    7592           0 :         char *endptr;
    7593             : 
    7594           0 :         if (!p)
    7595             :                 return -EINVAL;
    7596             : 
    7597             :         /* Value may be a percentage of total memory, otherwise bytes */
    7598           0 :         coremem = simple_strtoull(p, &endptr, 0);
    7599           0 :         if (*endptr == '%') {
    7600             :                 /* Paranoid check for percent values greater than 100 */
    7601           0 :                 WARN_ON(coremem > 100);
    7602             : 
    7603           0 :                 *percent = coremem;
    7604             :         } else {
    7605           0 :                 coremem = memparse(p, &p);
    7606             :                 /* Paranoid check that UL is enough for the coremem value */
    7607           0 :                 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
    7608             : 
    7609           0 :                 *core = coremem >> PAGE_SHIFT;
    7610           0 :                 *percent = 0UL;
    7611             :         }
    7612             :         return 0;
    7613             : }
    7614             : 
    7615             : /*
    7616             :  * kernelcore=size sets the amount of memory for use for allocations that
    7617             :  * cannot be reclaimed or migrated.
    7618             :  */
    7619           0 : static int __init cmdline_parse_kernelcore(char *p)
    7620             : {
    7621             :         /* parse kernelcore=mirror */
    7622           0 :         if (parse_option_str(p, "mirror")) {
    7623           0 :                 mirrored_kernelcore = true;
    7624           0 :                 return 0;
    7625             :         }
    7626             : 
    7627           0 :         return cmdline_parse_core(p, &required_kernelcore,
    7628             :                                   &required_kernelcore_percent);
    7629             : }
    7630             : 
    7631             : /*
    7632             :  * movablecore=size sets the amount of memory for use for allocations that
    7633             :  * can be reclaimed or migrated.
    7634             :  */
    7635           0 : static int __init cmdline_parse_movablecore(char *p)
    7636             : {
    7637           0 :         return cmdline_parse_core(p, &required_movablecore,
    7638             :                                   &required_movablecore_percent);
    7639             : }
    7640             : 
    7641             : early_param("kernelcore", cmdline_parse_kernelcore);
    7642             : early_param("movablecore", cmdline_parse_movablecore);
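
To make the two kernelcore= forms concrete, the sketch below converts a percentage (as later consumed by find_zone_movable_pfns_for_nodes()) and a byte count into page counts. It assumes a hypothetical 16 GiB machine with 4 KiB pages; the numbers are illustrations, not output of this parser:

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
        /* Illustrative total: 16 GiB of RAM as 4 KiB pages. */
        unsigned long totalpages = 0x400000;

        /* kernelcore=25%: stored as a percentage, converted later using the
         * same formula as find_zone_movable_pfns_for_nodes(). */
        unsigned long required_kernelcore_percent = 25;
        unsigned long from_percent =
                (totalpages * 100 * required_kernelcore_percent) / 10000UL;

        /* kernelcore=512M: a memparse()-style byte count shifted to pages. */
        unsigned long long coremem = 512ULL << 20;
        unsigned long from_bytes = coremem >> PAGE_SHIFT;

        printf("25%% of 16 GiB -> %lu pages\n", from_percent);   /* 1048576 */
        printf("512M          -> %lu pages\n", from_bytes);      /* 131072  */
        return 0;
}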
    7643             : 
    7644         964 : void adjust_managed_page_count(struct page *page, long count)
    7645             : {
    7646         964 :         atomic_long_add(count, &page_zone(page)->managed_pages);
    7647         964 :         totalram_pages_add(count);
    7648             : #ifdef CONFIG_HIGHMEM
    7649             :         if (PageHighMem(page))
    7650             :                 totalhigh_pages_add(count);
    7651             : #endif
    7652         964 : }
    7653             : EXPORT_SYMBOL(adjust_managed_page_count);
    7654             : 
    7655           4 : unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
    7656             : {
    7657           4 :         void *pos;
    7658           4 :         unsigned long pages = 0;
    7659             : 
    7660           4 :         start = (void *)PAGE_ALIGN((unsigned long)start);
    7661           4 :         end = (void *)((unsigned long)end & PAGE_MASK);
    7662         968 :         for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
    7663        1928 :                 struct page *page = virt_to_page(pos);
    7664         964 :                 void *direct_map_addr;
    7665             : 
    7666             :                 /*
    7667             :                  * 'direct_map_addr' might be different from 'pos'
    7668             :                  * because some architectures' virt_to_page()
    7669             :                  * work with aliases.  Getting the direct map
    7670             :                  * address ensures that we get a _writeable_
    7671             :                  * alias for the memset().
    7672             :                  */
    7673         964 :                 direct_map_addr = page_address(page);
    7674             :                 /*
    7675             :                  * Perform a kasan-unchecked memset() since this memory
    7676             :                  * has not been initialized.
    7677             :                  */
    7678         964 :                 direct_map_addr = kasan_reset_tag(direct_map_addr);
    7679         964 :                 if ((unsigned int)poison <= 0xFF)
    7680         964 :                         memset(direct_map_addr, poison, PAGE_SIZE);
    7681             : 
    7682         964 :                 free_reserved_page(page);
    7683             :         }
    7684             : 
    7685           4 :         if (pages && s)
    7686           4 :                 pr_info("Freeing %s memory: %ldK\n",
    7687             :                         s, pages << (PAGE_SHIFT - 10));
    7688             : 
    7689           4 :         return pages;
    7690             : }
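
free_reserved_area() only releases whole pages strictly inside [start, end), which is why the start is rounded up and the end rounded down before the loop. A userspace sketch of that bookkeeping with a made-up virtual range (the addresses and the resulting count are purely illustrative):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
        /* Hypothetical virtual range of a freed init section. */
        unsigned long start = 0xffffffff82e00123UL;
        unsigned long end   = 0xffffffff83000000UL;

        unsigned long first = PAGE_ALIGN(start);  /* round start up  */
        unsigned long last  = end & PAGE_MASK;    /* round end down  */
        unsigned long pages = (last - first) >> PAGE_SHIFT;

        /* 511 whole pages fit in this range, i.e. 2044K. */
        printf("Freeing unused memory: %luK\n", pages << (PAGE_SHIFT - 10));
        return 0;
}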
    7691             : 
    7692           1 : void __init mem_init_print_info(const char *str)
    7693             : {
    7694           1 :         unsigned long physpages, codesize, datasize, rosize, bss_size;
    7695           1 :         unsigned long init_code_size, init_data_size;
    7696             : 
    7697           1 :         physpages = get_num_physpages();
    7698           1 :         codesize = _etext - _stext;
    7699           1 :         datasize = _edata - _sdata;
    7700           1 :         rosize = __end_rodata - __start_rodata;
    7701           1 :         bss_size = __bss_stop - __bss_start;
    7702           1 :         init_data_size = __init_end - __init_begin;
    7703           1 :         init_code_size = _einittext - _sinittext;
    7704             : 
    7705             :         /*
    7706             :          * Detect special cases and adjust section sizes accordingly:
    7707             :          * 1) .init.* may be embedded into .data sections
    7708             :          * 2) .init.text.* may be out of [__init_begin, __init_end],
    7709             :          *    please refer to arch/tile/kernel/vmlinux.lds.S.
    7710             :          * 3) .rodata.* may be embedded into .text or .data sections.
    7711             :          */
    7712             : #define adj_init_size(start, end, size, pos, adj) \
    7713             :         do { \
    7714             :                 if (start <= pos && pos < end && size > adj) \
    7715             :                         size -= adj; \
    7716             :         } while (0)
    7717             : 
    7718           1 :         adj_init_size(__init_begin, __init_end, init_data_size,
    7719             :                      _sinittext, init_code_size);
    7720           1 :         adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
    7721           1 :         adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
    7722           1 :         adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
    7723           1 :         adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
    7724             : 
    7725             : #undef  adj_init_size
    7726             : 
    7727           3 :         pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
    7728             : #ifdef  CONFIG_HIGHMEM
    7729             :                 ", %luK highmem"
    7730             : #endif
    7731             :                 "%s%s)\n",
    7732             :                 nr_free_pages() << (PAGE_SHIFT - 10),
    7733             :                 physpages << (PAGE_SHIFT - 10),
    7734             :                 codesize >> 10, datasize >> 10, rosize >> 10,
    7735             :                 (init_data_size + init_code_size) >> 10, bss_size >> 10,
    7736             :                 (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
    7737             :                 totalcma_pages << (PAGE_SHIFT - 10),
    7738             : #ifdef  CONFIG_HIGHMEM
    7739             :                 totalhigh_pages() << (PAGE_SHIFT - 10),
    7740             : #endif
    7741             :                 str ? ", " : "", str ? str : "");
    7742           1 : }
    7743             : 
    7744             : /**
    7745             :  * set_dma_reserve - set the specified number of pages reserved in the first zone
    7746             :  * @new_dma_reserve: The number of pages to mark reserved
    7747             :  *
    7748             :  * The per-cpu batchsize and zone watermarks are determined by managed_pages.
    7749             :  * In the DMA zone, a significant percentage may be consumed by the kernel image
    7750             :  * and other unfreeable allocations which can skew the watermarks badly. This
    7751             :  * function may optionally be used to account for unfreeable pages in the
    7752             :  * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
    7753             :  * smaller per-cpu batchsize.
    7754             :  */
    7755           1 : void __init set_dma_reserve(unsigned long new_dma_reserve)
    7756             : {
    7757           1 :         dma_reserve = new_dma_reserve;
    7758           1 : }
    7759             : 
    7760           0 : static int page_alloc_cpu_dead(unsigned int cpu)
    7761             : {
    7762             : 
    7763           0 :         lru_add_drain_cpu(cpu);
    7764           0 :         drain_pages(cpu);
    7765             : 
    7766             :         /*
    7767             :          * Spill the event counters of the dead processor
    7768             :          * into the current processor's event counters.
    7769             :          * This artificially elevates the count of the current
    7770             :          * processor.
    7771             :          */
    7772           0 :         vm_events_fold_cpu(cpu);
    7773             : 
    7774             :         /*
    7775             :          * Zero the differential counters of the dead processor
    7776             :          * so that the vm statistics are consistent.
    7777             :          *
    7778             :          * This is only okay since the processor is dead and cannot
    7779             :          * race with what we are doing.
    7780             :          */
    7781           0 :         cpu_vm_stats_fold(cpu);
    7782           0 :         return 0;
    7783             : }
    7784             : 
    7785             : #ifdef CONFIG_NUMA
    7786             : int hashdist = HASHDIST_DEFAULT;
    7787             : 
    7788           0 : static int __init set_hashdist(char *str)
    7789             : {
    7790           0 :         if (!str)
    7791             :                 return 0;
    7792           0 :         hashdist = simple_strtoul(str, &str, 0);
    7793           0 :         return 1;
    7794             : }
    7795             : __setup("hashdist=", set_hashdist);
    7796             : #endif
    7797             : 
    7798           1 : void __init page_alloc_init(void)
    7799             : {
    7800           1 :         int ret;
    7801             : 
    7802             : #ifdef CONFIG_NUMA
    7803           1 :         if (num_node_state(N_MEMORY) == 1)
    7804           1 :                 hashdist = 0;
    7805             : #endif
    7806             : 
    7807           1 :         ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
    7808             :                                         "mm/page_alloc:dead", NULL,
    7809             :                                         page_alloc_cpu_dead);
    7810           1 :         WARN_ON(ret < 0);
    7811           1 : }
    7812             : 
    7813             : /*
    7814             :  * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
    7815             :  *      or min_free_kbytes changes.
    7816             :  */
    7817           3 : static void calculate_totalreserve_pages(void)
    7818             : {
    7819           3 :         struct pglist_data *pgdat;
    7820           3 :         unsigned long reserve_pages = 0;
    7821           3 :         enum zone_type i, j;
    7822             : 
    7823           6 :         for_each_online_pgdat(pgdat) {
    7824             : 
    7825           3 :                 pgdat->totalreserve_pages = 0;
    7826             : 
    7827          12 :                 for (i = 0; i < MAX_NR_ZONES; i++) {
    7828           9 :                         struct zone *zone = pgdat->node_zones + i;
    7829           9 :                         long max = 0;
    7830           9 :                         unsigned long managed_pages = zone_managed_pages(zone);
    7831             : 
    7832             :                         /* Find valid and maximum lowmem_reserve in the zone */
    7833          27 :                         for (j = i; j < MAX_NR_ZONES; j++) {
    7834          18 :                                 if (zone->lowmem_reserve[j] > max)
    7835             :                                         max = zone->lowmem_reserve[j];
    7836             :                         }
    7837             : 
    7838             :                         /* we treat the high watermark as reserved pages. */
    7839           9 :                         max += high_wmark_pages(zone);
    7840             : 
    7841           9 :                         if (max > managed_pages)
    7842           0 :                                 max = managed_pages;
    7843             : 
    7844           9 :                         pgdat->totalreserve_pages += max;
    7845             : 
    7846           9 :                         reserve_pages += max;
    7847             :                 }
    7848             :         }
    7849           3 :         totalreserve_pages = reserve_pages;
    7850           3 : }
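
A rough standalone illustration of the accounting above (the zone sizes, watermarks and lowmem_reserve maxima are invented for the example): each zone contributes its largest lowmem_reserve[] entry plus its high watermark, capped at the zone's managed pages, and totalreserve_pages is the sum over all zones.

        #include <stdio.h>

        int main(void)
        {
                /* Per-zone, invented: { managed pages, high watermark, max lowmem_reserve[] } */
                unsigned long zones[3][3] = {
                        {  262144, 1548, 4096 },        /* e.g. a DMA32-like zone     */
                        { 1048576, 6192,    0 },        /* e.g. a Normal-like zone    */
                        {       0,    0,    0 },        /* e.g. an empty Movable zone */
                };
                unsigned long totalreserve = 0;

                for (int i = 0; i < 3; i++) {
                        unsigned long max = zones[i][2] + zones[i][1];

                        if (max > zones[i][0])          /* cap at managed pages */
                                max = zones[i][0];
                        totalreserve += max;
                }
                printf("totalreserve_pages = %lu\n", totalreserve);     /* 11836 here */
                return 0;
        }
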
    7851             : 
    7852             : /*
    7853             :  * setup_per_zone_lowmem_reserve - called whenever
    7854             :  *      sysctl_lowmem_reserve_ratio changes.  Ensures that each zone
    7855             :  *      has a correct reserved pages value, so that an adequate number of
    7856             :  *      pages are left in the zone after a successful __alloc_pages().
    7857             :  */
    7858           1 : static void setup_per_zone_lowmem_reserve(void)
    7859             : {
    7860           1 :         struct pglist_data *pgdat;
    7861           1 :         enum zone_type i, j;
    7862             : 
    7863           2 :         for_each_online_pgdat(pgdat) {
    7864           3 :                 for (i = 0; i < MAX_NR_ZONES - 1; i++) {
    7865           2 :                         struct zone *zone = &pgdat->node_zones[i];
    7866           2 :                         int ratio = sysctl_lowmem_reserve_ratio[i];
    7867           4 :                         bool clear = !ratio || !zone_managed_pages(zone);
    7868           2 :                         unsigned long managed_pages = 0;
    7869             : 
    7870           5 :                         for (j = i + 1; j < MAX_NR_ZONES; j++) {
    7871           3 :                                 if (clear) {
    7872           1 :                                         zone->lowmem_reserve[j] = 0;
    7873             :                                 } else {
    7874           2 :                                         struct zone *upper_zone = &pgdat->node_zones[j];
    7875             : 
    7876           2 :                                         managed_pages += zone_managed_pages(upper_zone);
    7877           2 :                                         zone->lowmem_reserve[j] = managed_pages / ratio;
    7878             :                                 }
    7879             :                         }
    7880             :                 }
    7881             :         }
    7882             : 
    7883             :         /* update totalreserve_pages */
    7884           1 :         calculate_totalreserve_pages();
    7885           1 : }
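
To make the lowmem_reserve arithmetic concrete, here is a small standalone sketch (illustrative only; the zone names, managed-page counts and ratios are assumptions, not values taken from this kernel): zone i protects itself by reserving, against each higher zone j, the cumulative managed pages of zones i+1..j divided by sysctl_lowmem_reserve_ratio[i].

        #include <stdio.h>

        int main(void)
        {
                const char *zone_name[] = { "DMA32", "Normal", "Movable" };
                unsigned long managed[] = { 262144, 1048576, 0 };       /* invented sizes */
                int ratio[]             = { 256, 32, 0 };               /* sysctl-style ratios */

                for (int i = 0; i < 2; i++) {                           /* MAX_NR_ZONES - 1 analogue */
                        unsigned long cumulative = 0;

                        for (int j = i + 1; j < 3; j++) {
                                unsigned long reserve = 0;

                                cumulative += managed[j];
                                if (ratio[i] && managed[i])             /* mirrors the 'clear' case */
                                        reserve = cumulative / ratio[i];
                                printf("%s->lowmem_reserve[%s] = %lu pages\n",
                                       zone_name[i], zone_name[j], reserve);
                        }
                }
                return 0;
        }

With these invented numbers the DMA32-like zone reserves 4096 pages (16 MiB with 4 KiB pages) against allocations that could have been satisfied from the Normal-like zone.
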
    7886             : 
    7887           2 : static void __setup_per_zone_wmarks(void)
    7888             : {
    7889           2 :         unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
    7890           2 :         unsigned long lowmem_pages = 0;
    7891           2 :         struct zone *zone;
    7892           2 :         unsigned long flags;
    7893             : 
    7894             :         /* Calculate total number of !ZONE_HIGHMEM pages */
    7895           8 :         for_each_zone(zone) {
    7896           6 :                 if (!is_highmem(zone))
    7897           6 :                         lowmem_pages += zone_managed_pages(zone);
    7898             :         }
    7899             : 
    7900           8 :         for_each_zone(zone) {
    7901           6 :                 u64 tmp;
    7902             : 
    7903           6 :                 spin_lock_irqsave(&zone->lock, flags);
    7904           6 :                 tmp = (u64)pages_min * zone_managed_pages(zone);
    7905           6 :                 do_div(tmp, lowmem_pages);
    7906           6 :                 if (is_highmem(zone)) {
    7907             :                         /*
    7908             :                          * __GFP_HIGH and PF_MEMALLOC allocations usually don't
    7909             :                          * need highmem pages, so cap pages_min to a small
    7910             :                          * value here.
    7911             :                          *
    7912             :                          * The (WMARK_HIGH - WMARK_LOW) and (WMARK_LOW - WMARK_MIN)
    7913             :                          * deltas control async page reclaim, and so should
    7914             :                          * not be capped for highmem.
    7915             :                          */
    7916             :                         unsigned long min_pages;
    7917             : 
    7918             :                         min_pages = zone_managed_pages(zone) / 1024;
    7919             :                         min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
    7920             :                         zone->_watermark[WMARK_MIN] = min_pages;
    7921             :                 } else {
    7922             :                         /*
    7923             :                          * If it's a lowmem zone, reserve a number of pages
    7924             :                          * proportionate to the zone's size.
    7925             :                          */
    7926           6 :                         zone->_watermark[WMARK_MIN] = tmp;
    7927             :                 }
    7928             : 
    7929             :                 /*
    7930             :                  * Set the kswapd watermarks distance according to the
    7931             :                  * scale factor in proportion to available memory, but
    7932             :                  * ensure a minimum size on small systems.
    7933             :                  */
    7934           6 :                 tmp = max_t(u64, tmp >> 2,
    7935             :                             mult_frac(zone_managed_pages(zone),
    7936             :                                       watermark_scale_factor, 10000));
    7937             : 
    7938           6 :                 zone->watermark_boost = 0;
    7939           6 :                 zone->_watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
    7940           6 :                 zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
    7941             : 
    7942           6 :                 spin_unlock_irqrestore(&zone->lock, flags);
    7943             :         }
    7944             : 
    7945             :         /* update totalreserve_pages */
    7946           2 :         calculate_totalreserve_pages();
    7947           2 : }
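
The watermark arithmetic above can be summarised with a tiny userspace sketch (illustrative; all input numbers are assumptions): WMARK_MIN is the zone's proportional share of min_free_kbytes, and WMARK_LOW/WMARK_HIGH sit one and two steps above it, where the step is the larger of WMARK_MIN/4 and managed_pages * watermark_scale_factor / 10000.

        #include <stdio.h>

        int main(void)
        {
                unsigned long min_free_kbytes = 4096;                   /* assumed, ~1 GiB machine */
                unsigned long pages_min = min_free_kbytes >> (12 - 10); /* 4 KiB pages */
                unsigned long lowmem_pages = 262144;                    /* all !HIGHMEM managed pages */
                unsigned long zone_managed = 262144;                    /* single lowmem zone */
                unsigned long watermark_scale_factor = 10;              /* default */

                unsigned long long tmp = (unsigned long long)pages_min *
                                         zone_managed / lowmem_pages;
                unsigned long long step = tmp >> 2;
                unsigned long long scaled = (unsigned long long)zone_managed *
                                            watermark_scale_factor / 10000;

                if (scaled > step)
                        step = scaled;

                printf("min=%llu low=%llu high=%llu pages\n",
                       tmp, tmp + step, tmp + 2 * step);
                return 0;
        }

For these numbers the zone ends up with min=1024, low=1286 and high=1548 pages, so kswapd is woken when free memory drops below roughly 5 MiB and stops reclaiming once it climbs back above roughly 6 MiB.
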
    7948             : 
    7949             : /**
    7950             :  * setup_per_zone_wmarks - called when min_free_kbytes changes
    7951             :  * or when memory is hot-{added|removed}
    7952             :  *
    7953             :  * Ensures that the watermark[min,low,high] values for each zone are set
    7954             :  * correctly with respect to min_free_kbytes.
    7955             :  */
    7956           2 : void setup_per_zone_wmarks(void)
    7957             : {
    7958           2 :         static DEFINE_SPINLOCK(lock);
    7959             : 
    7960           2 :         spin_lock(&lock);
    7961           2 :         __setup_per_zone_wmarks();
    7962           2 :         spin_unlock(&lock);
    7963           2 : }
    7964             : 
    7965             : /*
    7966             :  * Initialise min_free_kbytes.
    7967             :  *
    7968             :  * For small machines we want it small (128k min).  For large machines
    7969             :  * we want it large (256MB max).  But it is not linear, because network
    7970             :  * bandwidth does not increase linearly with machine size.  We use
    7971             :  *
    7972             :  *      min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
    7973             :  *      min_free_kbytes = sqrt(lowmem_kbytes * 16)
    7974             :  *
    7975             :  * which yields
    7976             :  *
    7977             :  * 16MB:        512k
    7978             :  * 32MB:        724k
    7979             :  * 64MB:        1024k
    7980             :  * 128MB:       1448k
    7981             :  * 256MB:       2048k
    7982             :  * 512MB:       2896k
    7983             :  * 1024MB:      4096k
    7984             :  * 2048MB:      5792k
    7985             :  * 4096MB:      8192k
    7986             :  * 8192MB:      11584k
    7987             :  * 16384MB:     16384k
    7988             :  */
    7989           1 : int __meminit init_per_zone_wmark_min(void)
    7990             : {
    7991           1 :         unsigned long lowmem_kbytes;
    7992           1 :         int new_min_free_kbytes;
    7993             : 
    7994           1 :         lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
    7995           1 :         new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
    7996             : 
    7997           1 :         if (new_min_free_kbytes > user_min_free_kbytes) {
    7998           1 :                 min_free_kbytes = new_min_free_kbytes;
    7999           1 :                 if (min_free_kbytes < 128)
    8000           0 :                         min_free_kbytes = 128;
    8001           1 :                 if (min_free_kbytes > 262144)
    8002           0 :                         min_free_kbytes = 262144;
    8003             :         } else {
    8004           0 :                 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
    8005             :                                 new_min_free_kbytes, user_min_free_kbytes);
    8006             :         }
    8007           1 :         setup_per_zone_wmarks();
    8008           1 :         refresh_zone_stat_thresholds();
    8009           1 :         setup_per_zone_lowmem_reserve();
    8010             : 
    8011             : #ifdef CONFIG_NUMA
    8012           1 :         setup_min_unmapped_ratio();
    8013           1 :         setup_min_slab_ratio();
    8014             : #endif
    8015             : 
    8016           1 :         khugepaged_min_free_kbytes_update();
    8017             : 
    8018           1 :         return 0;
    8019             : }
    8020             : postcore_initcall(init_per_zone_wmark_min)
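
The table in the comment above follows directly from the formula; this standalone sketch (illustrative, with a plain integer square root standing in for the kernel's int_sqrt()) reproduces a few of its rows, including the clamping applied by init_per_zone_wmark_min().

        #include <stdio.h>

        /* Bit-by-bit integer square root, in the spirit of the kernel's int_sqrt(). */
        static unsigned long isqrt(unsigned long long x)
        {
                unsigned long long r = 0, bit = 1ULL << 62;

                while (bit > x)
                        bit >>= 2;
                while (bit) {
                        if (x >= r + bit) {
                                x -= r + bit;
                                r = (r >> 1) + bit;
                        } else {
                                r >>= 1;
                        }
                        bit >>= 2;
                }
                return (unsigned long)r;
        }

        int main(void)
        {
                unsigned long lowmem_mb[] = { 16, 128, 1024, 16384 };

                for (int i = 0; i < 4; i++) {
                        unsigned long long lowmem_kbytes = lowmem_mb[i] * 1024ULL;
                        unsigned long min_free = isqrt(lowmem_kbytes * 16);

                        /* Clamp exactly as init_per_zone_wmark_min() does. */
                        if (min_free < 128)
                                min_free = 128;
                        if (min_free > 262144)
                                min_free = 262144;
                        printf("%6luMB lowmem -> min_free_kbytes = %luk\n",
                               lowmem_mb[i], min_free);
                }
                return 0;
        }

The output matches the table: 512k for 16 MB, 1448k for 128 MB, 4096k for 1024 MB and 16384k for 16384 MB of lowmem.
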
    8021             : 
    8022             : /*
    8023             :  * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec_minmax() so
    8024             :  *      that we can call two helper functions whenever min_free_kbytes
    8025             :  *      changes.
    8026             :  */
    8027           0 : int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
    8028             :                 void *buffer, size_t *length, loff_t *ppos)
    8029             : {
    8030           0 :         int rc;
    8031             : 
    8032           0 :         rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
    8033           0 :         if (rc)
    8034             :                 return rc;
    8035             : 
    8036           0 :         if (write) {
    8037           0 :                 user_min_free_kbytes = min_free_kbytes;
    8038           0 :                 setup_per_zone_wmarks();
    8039             :         }
    8040             :         return 0;
    8041             : }
    8042             : 
    8043           0 : int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
    8044             :                 void *buffer, size_t *length, loff_t *ppos)
    8045             : {
    8046           0 :         int rc;
    8047             : 
    8048           0 :         rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
    8049           0 :         if (rc)
    8050             :                 return rc;
    8051             : 
    8052           0 :         if (write)
    8053           0 :                 setup_per_zone_wmarks();
    8054             : 
    8055             :         return 0;
    8056             : }
    8057             : 
    8058             : #ifdef CONFIG_NUMA
    8059           1 : static void setup_min_unmapped_ratio(void)
    8060             : {
    8061           1 :         pg_data_t *pgdat;
    8062           1 :         struct zone *zone;
    8063             : 
    8064           2 :         for_each_online_pgdat(pgdat)
    8065           1 :                 pgdat->min_unmapped_pages = 0;
    8066             : 
    8067           4 :         for_each_zone(zone)
    8068           3 :                 zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
    8069           3 :                                                          sysctl_min_unmapped_ratio) / 100;
    8070           1 : }
    8071             : 
    8072             : 
    8073           0 : int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
    8074             :                 void *buffer, size_t *length, loff_t *ppos)
    8075             : {
    8076           0 :         int rc;
    8077             : 
    8078           0 :         rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
    8079           0 :         if (rc)
    8080             :                 return rc;
    8081             : 
    8082           0 :         setup_min_unmapped_ratio();
    8083             : 
    8084           0 :         return 0;
    8085             : }
    8086             : 
    8087           1 : static void setup_min_slab_ratio(void)
    8088             : {
    8089           1 :         pg_data_t *pgdat;
    8090           1 :         struct zone *zone;
    8091             : 
    8092           2 :         for_each_online_pgdat(pgdat)
    8093           1 :                 pgdat->min_slab_pages = 0;
    8094             : 
    8095           4 :         for_each_zone(zone)
    8096           3 :                 zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
    8097           3 :                                                      sysctl_min_slab_ratio) / 100;
    8098           1 : }
    8099             : 
    8100           0 : int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
    8101             :                 void *buffer, size_t *length, loff_t *ppos)
    8102             : {
    8103           0 :         int rc;
    8104             : 
    8105           0 :         rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
    8106           0 :         if (rc)
    8107             :                 return rc;
    8108             : 
    8109           0 :         setup_min_slab_ratio();
    8110             : 
    8111           0 :         return 0;
    8112             : }
    8113             : #endif
    8114             : 
    8115             : /*
    8116             :  * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
    8117             :  *      proc_dointvec_minmax() so that we can call setup_per_zone_lowmem_reserve()
    8118             :  *      whenever sysctl_lowmem_reserve_ratio changes.
    8119             :  *
    8120             :  * The reserve ratio has no relation to the minimum watermarks. The
    8121             :  * lowmem reserve ratio only makes sense as a function of the
    8122             :  * boot-time zone sizes.
    8123             :  */
    8124           0 : int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
    8125             :                 void *buffer, size_t *length, loff_t *ppos)
    8126             : {
    8127           0 :         int i;
    8128             : 
    8129           0 :         proc_dointvec_minmax(table, write, buffer, length, ppos);
    8130             : 
    8131           0 :         for (i = 0; i < MAX_NR_ZONES; i++) {
    8132           0 :                 if (sysctl_lowmem_reserve_ratio[i] < 1)
    8133           0 :                         sysctl_lowmem_reserve_ratio[i] = 0;
    8134             :         }
    8135             : 
    8136           0 :         setup_per_zone_lowmem_reserve();
    8137           0 :         return 0;
    8138             : }
    8139             : 
    8140             : /*
    8141             :  * percpu_pagelist_fraction - changes the pcp->high for each zone on each
    8142             :  * cpu.  It is the fraction of total pages in each zone that a hot per cpu
    8143             :  * pagelist can have before it gets flushed back to the buddy allocator.
    8144             :  */
    8145           0 : int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
    8146             :                 void *buffer, size_t *length, loff_t *ppos)
    8147             : {
    8148           0 :         struct zone *zone;
    8149           0 :         int old_percpu_pagelist_fraction;
    8150           0 :         int ret;
    8151             : 
    8152           0 :         mutex_lock(&pcp_batch_high_lock);
    8153           0 :         old_percpu_pagelist_fraction = percpu_pagelist_fraction;
    8154             : 
    8155           0 :         ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
    8156           0 :         if (!write || ret < 0)
    8157           0 :                 goto out;
    8158             : 
    8159             :         /* Sanity checking to avoid pcp imbalance */
    8160           0 :         if (percpu_pagelist_fraction &&
    8161             :             percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
    8162           0 :                 percpu_pagelist_fraction = old_percpu_pagelist_fraction;
    8163           0 :                 ret = -EINVAL;
    8164           0 :                 goto out;
    8165             :         }
    8166             : 
    8167             :         /* No change? */
    8168           0 :         if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
    8169           0 :                 goto out;
    8170             : 
    8171           0 :         for_each_populated_zone(zone)
    8172           0 :                 zone_set_pageset_high_and_batch(zone);
    8173           0 : out:
    8174           0 :         mutex_unlock(&pcp_batch_high_lock);
    8175           0 :         return ret;
    8176             : }
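
A back-of-the-envelope illustration of that fraction (standalone sketch; the zone size is invented, and the calculation is simplified to exactly what the comment above states rather than quoting zone_set_pageset_high_and_batch()):

        #include <stdio.h>

        int main(void)
        {
                unsigned long managed_pages = 1048576;  /* invented: ~4 GiB of 4 KiB pages */
                unsigned long fraction = 8;             /* as written to /proc/sys/vm/percpu_pagelist_fraction */

                /* Each hot per-cpu list may grow to this many pages before being flushed. */
                unsigned long pcp_high = managed_pages / fraction;

                printf("pcp->high ~= %lu pages (%lu MiB)\n",
                       pcp_high, pcp_high >> (20 - 12));
                return 0;
        }
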
    8177             : 
    8178             : #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
    8179             : /*
    8180             :  * Returns the number of pages that the arch has reserved but
    8181             :  * that are not known to alloc_large_system_hash().
    8182             :  */
    8183           8 : static unsigned long __init arch_reserved_kernel_pages(void)
    8184             : {
    8185           8 :         return 0;
    8186             : }
    8187             : #endif
    8188             : 
    8189             : /*
    8190             :  * Adaptive scale is meant to reduce sizes of hash tables on large memory
    8191             :  * machines. As memory size increases, the scale also increases, but at a
    8192             :  * slower pace.  Starting from ADAPT_SCALE_BASE (64G), every time memory
    8193             :  * quadruples the scale is increased by one, which means the size of hash table
    8194             :  * only doubles, instead of quadrupling as well.
    8195             :  * Because 32-bit systems cannot have large physical memory, where this scaling
    8196             :  * makes sense, it is disabled on such platforms.
    8197             :  */
    8198             : #if __BITS_PER_LONG > 32
    8199             : #define ADAPT_SCALE_BASE        (64ul << 30)
    8200             : #define ADAPT_SCALE_SHIFT       2
    8201             : #define ADAPT_SCALE_NPAGES      (ADAPT_SCALE_BASE >> PAGE_SHIFT)
    8202             : #endif
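
A quick standalone illustration of that adaptive scaling (the page size and memory sizes are assumptions): the extra shift grows by one each time memory quadruples beyond 64 GiB, so the resulting hash table only doubles per quadrupling of RAM.

        #include <stdio.h>

        #define ADAPT_SCALE_BASE        (64ULL << 30)   /* 64 GiB */
        #define ADAPT_SCALE_SHIFT       2
        #define EXAMPLE_PAGE_SHIFT      12              /* assumed 4 KiB pages */
        #define ADAPT_SCALE_NPAGES      (ADAPT_SCALE_BASE >> EXAMPLE_PAGE_SHIFT)

        int main(void)
        {
                unsigned long long mem_gib[] = { 32, 64, 256, 1024, 4096 };

                for (int i = 0; i < 5; i++) {
                        unsigned long long numentries = (mem_gib[i] << 30) >> EXAMPLE_PAGE_SHIFT;
                        int extra_scale = 0;

                        for (unsigned long long adapt = ADAPT_SCALE_NPAGES;
                             adapt < numentries; adapt <<= ADAPT_SCALE_SHIFT)
                                extra_scale++;
                        printf("%5lluGiB -> scale increased by %d\n",
                               mem_gib[i], extra_scale);
                }
                return 0;
        }
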
    8203             : 
    8204             : /*
    8205             :  * allocate a large system hash table from bootmem
    8206             :  * - it is assumed that the hash table must contain an exact power-of-2
    8207             :  *   quantity of entries
    8208             :  * - limit is the number of hash buckets, not the total allocation size
    8209             :  */
    8210          11 : void *__init alloc_large_system_hash(const char *tablename,
    8211             :                                      unsigned long bucketsize,
    8212             :                                      unsigned long numentries,
    8213             :                                      int scale,
    8214             :                                      int flags,
    8215             :                                      unsigned int *_hash_shift,
    8216             :                                      unsigned int *_hash_mask,
    8217             :                                      unsigned long low_limit,
    8218             :                                      unsigned long high_limit)
    8219             : {
    8220          11 :         unsigned long long max = high_limit;
    8221          11 :         unsigned long log2qty, size;
    8222          11 :         void *table = NULL;
    8223          11 :         gfp_t gfp_flags;
    8224          11 :         bool virt;
    8225             : 
    8226             :         /* allow the kernel cmdline to have a say */
    8227          11 :         if (!numentries) {
    8228             :                 /* round applicable memory size up to nearest megabyte */
    8229           8 :                 numentries = nr_kernel_pages;
    8230           8 :                 numentries -= arch_reserved_kernel_pages();
    8231             : 
    8232             :                 /* It isn't necessary when PAGE_SIZE >= 1MB */
    8233           8 :                 if (PAGE_SHIFT < 20)
    8234           8 :                         numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
    8235             : 
    8236             : #if __BITS_PER_LONG > 32
    8237           8 :                 if (!high_limit) {
    8238             :                         unsigned long adapt;
    8239             : 
    8240           4 :                         for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
    8241           0 :                              adapt <<= ADAPT_SCALE_SHIFT)
    8242           0 :                                 scale++;
    8243             :                 }
    8244             : #endif
    8245             : 
    8246             :                 /* limit to 1 bucket per 2^scale bytes of low memory */
    8247           8 :                 if (scale > PAGE_SHIFT)
    8248           8 :                         numentries >>= (scale - PAGE_SHIFT);
    8249             :                 else
    8250           0 :                         numentries <<= (PAGE_SHIFT - scale);
    8251             : 
    8252             :                 /* Make sure we've got at least a 0-order allocation. */
    8253           8 :                 if (unlikely(flags & HASH_SMALL)) {
    8254             :                         /* Makes no sense without HASH_EARLY */
    8255           0 :                         WARN_ON(!(flags & HASH_EARLY));
    8256           0 :                         if (!(numentries >> *_hash_shift)) {
    8257           0 :                                 numentries = 1UL << *_hash_shift;
    8258           0 :                                 BUG_ON(!numentries);
    8259             :                         }
    8260           8 :                 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
    8261           0 :                         numentries = PAGE_SIZE / bucketsize;
    8262             :         }
    8263          11 :         numentries = roundup_pow_of_two(numentries);
    8264             : 
    8265             :         /* limit allocation size to 1/16 total memory by default */
    8266          11 :         if (max == 0) {
    8267           4 :                 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
    8268           4 :                 do_div(max, bucketsize);
    8269             :         }
    8270          11 :         max = min(max, 0x80000000ULL);
    8271             : 
    8272          11 :         if (numentries < low_limit)
    8273             :                 numentries = low_limit;
    8274          11 :         if (numentries > max)
    8275             :                 numentries = max;
    8276             : 
    8277          11 :         log2qty = ilog2(numentries);
    8278             : 
    8279          17 :         gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
    8280          11 :         do {
    8281          11 :                 virt = false;
    8282          11 :                 size = bucketsize << log2qty;
    8283          11 :                 if (flags & HASH_EARLY) {
    8284           3 :                         if (flags & HASH_ZERO)
    8285           3 :                                 table = memblock_alloc(size, SMP_CACHE_BYTES);
    8286             :                         else
    8287           0 :                                 table = memblock_alloc_raw(size,
    8288             :                                                            SMP_CACHE_BYTES);
    8289          16 :                 } else if (get_order(size) >= MAX_ORDER || hashdist) {
    8290           0 :                         table = __vmalloc(size, gfp_flags);
    8291           0 :                         virt = true;
    8292             :                 } else {
    8293             :                         /*
    8294             :                          * If bucketsize is not a power of two, we may free
    8295             :                          * some pages at the end of the hash table, which
    8296             :                          * alloc_pages_exact() does automatically.
    8297             :                          */
    8298           8 :                         table = alloc_pages_exact(size, gfp_flags);
    8299           8 :                         kmemleak_alloc(table, size, 1, gfp_flags);
    8300             :                 }
    8301          11 :         } while (!table && size > PAGE_SIZE && --log2qty);
    8302             : 
    8303          11 :         if (!table)
    8304           0 :                 panic("Failed to allocate %s hash table\n", tablename);
    8305             : 
    8306          22 :         pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
    8307             :                 tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
    8308             :                 virt ? "vmalloc" : "linear");
    8309             : 
    8310          11 :         if (_hash_shift)
    8311           9 :                 *_hash_shift = log2qty;
    8312          11 :         if (_hash_mask)
    8313           7 :                 *_hash_mask = (1 << log2qty) - 1;
    8314             : 
    8315          11 :         return table;
    8316             : }
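
As a usage sketch only (the table name, element type and scale are hypothetical, not something defined in this file, and the includes reflect where these helpers are commonly declared): a boot-time caller typically passes numentries == 0 and lets the function size the table from the amount of kernel memory.

        #include <linux/init.h>
        #include <linux/memblock.h>
        #include <linux/types.h>

        static unsigned int example_hash_shift;         /* hypothetical */
        static struct hlist_head *example_hash_table;   /* hypothetical */

        static void __init example_hash_init(void)
        {
                example_hash_table =
                        alloc_large_system_hash("example",
                                                sizeof(struct hlist_head),
                                                0,              /* size from memory */
                                                14,             /* 1 bucket per 2^14 bytes of low memory */
                                                HASH_ZERO,      /* zero the buckets */
                                                &example_hash_shift,
                                                NULL,           /* hash mask not needed */
                                                0,              /* no lower limit */
                                                0);             /* no upper limit */
        }
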
    8317             : 
    8318             : /*
    8319             :  * This function checks whether the pageblock includes unmovable pages or not.
    8320             :  *
    8321             :  * A PageLRU check without isolation or lru_lock could race, so that a
    8322             :  * MIGRATE_MOVABLE block might include unmovable pages. And a __PageMovable
    8323             :  * check without lock_page may also miss some movable non-LRU pages under
    8324             :  * race conditions. So you can't expect this function to be exact.
    8325             :  *
    8326             :  * Returns a page without holding a reference. If the caller wants to
    8327             :  * dereference that page (e.g., dumping), it has to make sure that it
    8328             :  * cannot get removed (e.g., via memory unplug) concurrently.
    8329             :  *
    8330             :  */
    8331           0 : struct page *has_unmovable_pages(struct zone *zone, struct page *page,
    8332             :                                  int migratetype, int flags)
    8333             : {
    8334           0 :         unsigned long iter = 0;
    8335           0 :         unsigned long pfn = page_to_pfn(page);
    8336           0 :         unsigned long offset = pfn % pageblock_nr_pages;
    8337             : 
    8338           0 :         if (is_migrate_cma_page(page)) {
    8339             :                 /*
    8340             :                  * CMA allocations (alloc_contig_range) really need to mark
    8341             :                  * isolate CMA pageblocks even when they are not movable in fact
    8342             :                  * so consider them movable here.
    8343             :                  */
    8344             :                 if (is_migrate_cma(migratetype))
    8345             :                         return NULL;
    8346             : 
    8347             :                 return page;
    8348             :         }
    8349             : 
    8350           0 :         for (; iter < pageblock_nr_pages - offset; iter++) {
    8351           0 :                 if (!pfn_valid_within(pfn + iter))
    8352             :                         continue;
    8353             : 
    8354           0 :                 page = pfn_to_page(pfn + iter);
    8355             : 
    8356             :                 /*
    8357             :                  * Both bootmem allocations and memory holes are marked
    8358             :                  * PG_reserved and are unmovable. We can even have unmovable
    8359             :                  * allocations inside ZONE_MOVABLE, for example when
    8360             :                  * specifying "movablecore".
    8361             :                  */
    8362           0 :                 if (PageReserved(page))
    8363           0 :                         return page;
    8364             : 
    8365             :                 /*
    8366             :                  * If the zone is movable and we have ruled out all reserved
    8367             :                  * pages then it should be reasonably safe to assume the rest
    8368             :                  * is movable.
    8369             :                  */
    8370           0 :                 if (zone_idx(zone) == ZONE_MOVABLE)
    8371           0 :                         continue;
    8372             : 
    8373             :                 /*
    8374             :                  * Hugepages are not in LRU lists, but they're movable.
    8375             :                  * THPs are on the LRU, but need to be counted as their number of base pages.
    8376             :                  * We need not scan over tail pages because we don't
    8377             :                  * handle each tail page individually in migration.
    8378             :                  */
    8379           0 :                 if (PageHuge(page) || PageTransCompound(page)) {
    8380           0 :                         struct page *head = compound_head(page);
    8381           0 :                         unsigned int skip_pages;
    8382             : 
    8383           0 :                         if (PageHuge(page)) {
    8384             :                                 if (!hugepage_migration_supported(page_hstate(head)))
    8385             :                                         return page;
    8386           0 :                         } else if (!PageLRU(head) && !__PageMovable(head)) {
    8387           0 :                                 return page;
    8388             :                         }
    8389             : 
    8390           0 :                         skip_pages = compound_nr(head) - (page - head);
    8391           0 :                         iter += skip_pages - 1;
    8392           0 :                         continue;
    8393             :                 }
    8394             : 
    8395             :                 /*
    8396             :                  * We can't use page_count without pinning the page
    8397             :                  * because another CPU can free the compound page.
    8398             :                  * This check already skips compound tails of THPs
    8399             :                  * because their page->_refcount is zero at all times.
    8400             :                  */
    8401           0 :                 if (!page_ref_count(page)) {
    8402           0 :                         if (PageBuddy(page))
    8403           0 :                                 iter += (1 << buddy_order(page)) - 1;
    8404           0 :                         continue;
    8405             :                 }
    8406             : 
    8407             :                 /*
    8408             :                  * The HWPoisoned page may be not in buddy system, and
    8409             :                  * page_count() is not 0.
    8410             :                  */
    8411           0 :                 if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
    8412             :                         continue;
    8413             : 
    8414             :                 /*
    8415             :                  * We treat all PageOffline() pages as movable when offlining
    8416             :                  * to give drivers a chance to decrement their reference count
    8417             :                  * in MEM_GOING_OFFLINE in order to indicate that these pages
    8418             :                  * can be offlined as there are no direct references anymore.
    8419             :                  * For actually unmovable PageOffline() where the driver does
    8420             :                  * not support this, we will fail later when trying to actually
    8421             :                  * move these pages that still have a reference count > 0.
    8422             :                  * (false negatives in this function only)
    8423             :                  */
    8424           0 :                 if ((flags & MEMORY_OFFLINE) && PageOffline(page))
    8425           0 :                         continue;
    8426             : 
    8427           0 :                 if (__PageMovable(page) || PageLRU(page))
    8428           0 :                         continue;
    8429             : 
    8430             :                 /*
    8431             :                  * If there are RECLAIMABLE pages, we need to check
    8432             :                  * them.  But for now, memory offline itself doesn't call
    8433             :                  * shrink_node_slabs(), and this still needs to be fixed.
    8434             :                  */
    8435             :                 return page;
    8436             :         }
    8437             :         return NULL;
    8438             : }
    8439             : 
    8440             : #ifdef CONFIG_CONTIG_ALLOC
    8441             : static unsigned long pfn_max_align_down(unsigned long pfn)
    8442             : {
    8443             :         return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
    8444             :                              pageblock_nr_pages) - 1);
    8445             : }
    8446             : 
    8447             : static unsigned long pfn_max_align_up(unsigned long pfn)
    8448             : {
    8449             :         return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
    8450             :                                 pageblock_nr_pages));
    8451             : }
    8452             : 
    8453             : /* [start, end) must belong to a single zone. */
    8454             : static int __alloc_contig_migrate_range(struct compact_control *cc,
    8455             :                                         unsigned long start, unsigned long end)
    8456             : {
    8457             :         /* This function is based on compact_zone() from compaction.c. */
    8458             :         unsigned int nr_reclaimed;
    8459             :         unsigned long pfn = start;
    8460             :         unsigned int tries = 0;
    8461             :         int ret = 0;
    8462             :         struct migration_target_control mtc = {
    8463             :                 .nid = zone_to_nid(cc->zone),
    8464             :                 .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
    8465             :         };
    8466             : 
    8467             :         migrate_prep();
    8468             : 
    8469             :         while (pfn < end || !list_empty(&cc->migratepages)) {
    8470             :                 if (fatal_signal_pending(current)) {
    8471             :                         ret = -EINTR;
    8472             :                         break;
    8473             :                 }
    8474             : 
    8475             :                 if (list_empty(&cc->migratepages)) {
    8476             :                         cc->nr_migratepages = 0;
    8477             :                         pfn = isolate_migratepages_range(cc, pfn, end);
    8478             :                         if (!pfn) {
    8479             :                                 ret = -EINTR;
    8480             :                                 break;
    8481             :                         }
    8482             :                         tries = 0;
    8483             :                 } else if (++tries == 5) {
    8484             :                         ret = ret < 0 ? ret : -EBUSY;
    8485             :                         break;
    8486             :                 }
    8487             : 
    8488             :                 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
    8489             :                                                         &cc->migratepages);
    8490             :                 cc->nr_migratepages -= nr_reclaimed;
    8491             : 
    8492             :                 ret = migrate_pages(&cc->migratepages, alloc_migration_target,
    8493             :                                 NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE);
    8494             :         }
    8495             :         if (ret < 0) {
    8496             :                 putback_movable_pages(&cc->migratepages);
    8497             :                 return ret;
    8498             :         }
    8499             :         return 0;
    8500             : }
    8501             : 
    8502             : /**
    8503             :  * alloc_contig_range() -- tries to allocate given range of pages
    8504             :  * @start:      start PFN to allocate
    8505             :  * @end:        one-past-the-last PFN to allocate
    8506             :  * @migratetype:        migratetype of the underlying pageblocks (either
    8507             :  *                      #MIGRATE_MOVABLE or #MIGRATE_CMA).  All pageblocks
    8508             :  *                      in range must have the same migratetype and it must
    8509             :  *                      be either of the two.
    8510             :  * @gfp_mask:   GFP mask to use during compaction
    8511             :  *
    8512             :  * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
    8513             :  * aligned.  The PFN range must belong to a single zone.
    8514             :  *
    8515             :  * The first thing this routine does is attempt to MIGRATE_ISOLATE all
    8516             :  * pageblocks in the range.  Once isolated, the pageblocks should not
    8517             :  * be modified by others.
    8518             :  *
    8519             :  * Return: zero on success or negative error code.  On success all
    8520             :  * pages which PFN is in [start, end) are allocated for the caller and
    8521             :  * pages whose PFN is in [start, end) are allocated for the caller and
    8522             :  */
    8523             : int alloc_contig_range(unsigned long start, unsigned long end,
    8524             :                        unsigned migratetype, gfp_t gfp_mask)
    8525             : {
    8526             :         unsigned long outer_start, outer_end;
    8527             :         unsigned int order;
    8528             :         int ret = 0;
    8529             : 
    8530             :         struct compact_control cc = {
    8531             :                 .nr_migratepages = 0,
    8532             :                 .order = -1,
    8533             :                 .zone = page_zone(pfn_to_page(start)),
    8534             :                 .mode = MIGRATE_SYNC,
    8535             :                 .ignore_skip_hint = true,
    8536             :                 .no_set_skip_hint = true,
    8537             :                 .gfp_mask = current_gfp_context(gfp_mask),
    8538             :                 .alloc_contig = true,
    8539             :         };
    8540             :         INIT_LIST_HEAD(&cc.migratepages);
    8541             : 
    8542             :         /*
    8543             :          * What we do here is mark all pageblocks in the range as
    8544             :          * MIGRATE_ISOLATE.  Because pageblock and max order pages may
    8545             :          * have different sizes, and due to the way the page allocator
    8546             :          * works, we align the range to the biggest of the two so
    8547             :          * that the page allocator won't try to merge buddies from
    8548             :          * different pageblocks and change MIGRATE_ISOLATE to some
    8549             :          * other migration type.
    8550             :          *
    8551             :          * Once the pageblocks are marked as MIGRATE_ISOLATE, we
    8552             :          * migrate the pages from the unaligned range (i.e. the pages
    8553             :          * we are interested in).  This will put all the pages in the
    8554             :          * range back to the page allocator as MIGRATE_ISOLATE.
    8555             :          *
    8556             :          * When this is done, we take the pages in the range from the
    8557             :          * page allocator, removing them from the buddy system.  This
    8558             :          * way the page allocator will never consider using them.
    8559             :          *
    8560             :          * This lets us mark the pageblocks back as
    8561             :          * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
    8562             :          * aligned range but not in the unaligned, original range are
    8563             :          * put back to the page allocator so that the buddy allocator can use them.
    8564             :          */
    8565             : 
    8566             :         ret = start_isolate_page_range(pfn_max_align_down(start),
    8567             :                                        pfn_max_align_up(end), migratetype, 0);
    8568             :         if (ret)
    8569             :                 return ret;
    8570             : 
    8571             :         drain_all_pages(cc.zone);
    8572             : 
    8573             :         /*
    8574             :          * In case of -EBUSY, we'd like to know which page causes the problem.
    8575             :          * So, just fall through. test_pages_isolated() has a tracepoint
    8576             :          * which will report the busy page.
    8577             :          *
    8578             :          * It is possible that busy pages could become available before
    8579             :          * the call to test_pages_isolated(), and the range will actually be
    8580             :          * allocated.  So, if we fall through, be sure to clear ret so that
    8581             :          * -EBUSY is not accidentally used or returned to the caller.
    8582             :          */
    8583             :         ret = __alloc_contig_migrate_range(&cc, start, end);
    8584             :         if (ret && ret != -EBUSY)
    8585             :                 goto done;
    8586             :         ret = 0;
    8587             : 
    8588             :         /*
    8589             :          * Pages from [start, end) are within MAX_ORDER_NR_PAGES-aligned
    8590             :          * blocks that are marked as MIGRATE_ISOLATE.  What's more, all
    8591             :          * pages in [start, end) are free in the page allocator.  What we
    8592             :          * are going to do is allocate all pages from [start, end)
    8593             :          * (that is, remove them from the page allocator).
    8594             :          *
    8595             :          * The only problem is that pages at the beginning and at the end
    8596             :          * of the interesting range may not be aligned with pages that the
    8597             :          * page allocator holds, i.e. they can be part of higher-order
    8598             :          * pages.  Because of this, we reserve the bigger range and, once
    8599             :          * this is done, free the pages we are not interested in.
    8600             :          *
    8601             :          * We don't have to hold zone->lock here because the pages are
    8602             :          * isolated and thus won't get removed from the buddy allocator.
    8603             :          */
    8604             : 
    8605             :         lru_add_drain_all();
    8606             : 
    8607             :         order = 0;
    8608             :         outer_start = start;
    8609             :         while (!PageBuddy(pfn_to_page(outer_start))) {
    8610             :                 if (++order >= MAX_ORDER) {
    8611             :                         outer_start = start;
    8612             :                         break;
    8613             :                 }
    8614             :                 outer_start &= ~0UL << order;
    8615             :         }
    8616             : 
    8617             :         if (outer_start != start) {
    8618             :                 order = buddy_order(pfn_to_page(outer_start));
    8619             : 
    8620             :                 /*
    8621             :                  * outer_start page could be small order buddy page and
    8622             :                  * it doesn't include start page. Adjust outer_start
    8623             :                  * in this case to report failed page properly
    8624             :                  * on tracepoint in test_pages_isolated()
    8625             :                  */
    8626             :                 if (outer_start + (1UL << order) <= start)
    8627             :                         outer_start = start;
    8628             :         }
    8629             : 
    8630             :         /* Make sure the range is really isolated. */
    8631             :         if (test_pages_isolated(outer_start, end, 0)) {
    8632             :                 pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
    8633             :                         __func__, outer_start, end);
    8634             :                 ret = -EBUSY;
    8635             :                 goto done;
    8636             :         }
    8637             : 
    8638             :         /* Grab isolated pages from freelists. */
    8639             :         outer_end = isolate_freepages_range(&cc, outer_start, end);
    8640             :         if (!outer_end) {
    8641             :                 ret = -EBUSY;
    8642             :                 goto done;
    8643             :         }
    8644             : 
    8645             :         /* Free head and tail (if any) */
    8646             :         if (start != outer_start)
    8647             :                 free_contig_range(outer_start, start - outer_start);
    8648             :         if (end != outer_end)
    8649             :                 free_contig_range(end, outer_end - end);
    8650             : 
    8651             : done:
    8652             :         undo_isolate_page_range(pfn_max_align_down(start),
    8653             :                                 pfn_max_align_up(end), migratetype);
    8654             :         return ret;
    8655             : }
    8656             : EXPORT_SYMBOL(alloc_contig_range);
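
A minimal, hypothetical caller (the helper name, PFN range and count are placeholders; a real user must pick a range that lies within a single zone and whose pageblocks are MIGRATE_MOVABLE or MIGRATE_CMA):

        #include <linux/gfp.h>
        #include <linux/mm.h>

        /* Hypothetical helper: try to take `count` contiguous movable pages. */
        static int example_grab_range(unsigned long start_pfn, unsigned long count)
        {
                int ret;

                ret = alloc_contig_range(start_pfn, start_pfn + count,
                                         MIGRATE_MOVABLE, GFP_KERNEL);
                if (ret)
                        return ret;     /* typically -EBUSY or -EINTR */

                /* ... use pfn_to_page(start_pfn) .. pfn_to_page(start_pfn + count - 1) ... */

                free_contig_range(start_pfn, count);
                return 0;
        }
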
    8657             : 
    8658             : static int __alloc_contig_pages(unsigned long start_pfn,
    8659             :                                 unsigned long nr_pages, gfp_t gfp_mask)
    8660             : {
    8661             :         unsigned long end_pfn = start_pfn + nr_pages;
    8662             : 
    8663             :         return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
    8664             :                                   gfp_mask);
    8665             : }
    8666             : 
    8667             : static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
    8668             :                                    unsigned long nr_pages)
    8669             : {
    8670             :         unsigned long i, end_pfn = start_pfn + nr_pages;
    8671             :         struct page *page;
    8672             : 
    8673             :         for (i = start_pfn; i < end_pfn; i++) {
    8674             :                 page = pfn_to_online_page(i);
    8675             :                 if (!page)
    8676             :                         return false;
    8677             : 
    8678             :                 if (page_zone(page) != z)
    8679             :                         return false;
    8680             : 
    8681             :                 if (PageReserved(page))
    8682             :                         return false;
    8683             : 
    8684             :                 if (page_count(page) > 0)
    8685             :                         return false;
    8686             : 
    8687             :                 if (PageHuge(page))
    8688             :                         return false;
    8689             :         }
    8690             :         return true;
    8691             : }
    8692             : 
    8693             : static bool zone_spans_last_pfn(const struct zone *zone,
    8694             :                                 unsigned long start_pfn, unsigned long nr_pages)
    8695             : {
    8696             :         unsigned long last_pfn = start_pfn + nr_pages - 1;
    8697             : 
    8698             :         return zone_spans_pfn(zone, last_pfn);
    8699             : }
    8700             : 
    8701             : /**
    8702             :  * alloc_contig_pages() -- tries to find and allocate a contiguous range of pages
    8703             :  * @nr_pages:   Number of contiguous pages to allocate
    8704             :  * @gfp_mask:   GFP mask to limit search and used during compaction
    8705             :  * @nid:        Target node
    8706             :  * @nodemask:   Mask for other possible nodes
    8707             :  *
    8708             :  * This routine is a wrapper around alloc_contig_range(). It scans over zones
    8709             :  * on an applicable zonelist to find a contiguous pfn range which can then be
    8710             :  * tried for allocation with alloc_contig_range(). This routine is intended
    8711             :  * for allocation requests which cannot be fulfilled with the buddy allocator.
    8712             :  *
    8713             :  * The allocated memory is always aligned to a page boundary. If nr_pages is a
    8714             :  * power of two, then the result is guaranteed to be aligned to nr_pages pages
    8715             :  * (e.g. a 1GB request is aligned to a 1GB boundary).
    8716             :  *
    8717             :  * Allocated pages can be freed with free_contig_range() or by manually calling
    8718             :  * __free_page() on each allocated page.
    8719             :  *
    8720             :  * Return: pointer to contiguous pages on success, or NULL if not successful.
    8721             :  */
    8722             : struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
    8723             :                                 int nid, nodemask_t *nodemask)
    8724             : {
    8725             :         unsigned long ret, pfn, flags;
    8726             :         struct zonelist *zonelist;
    8727             :         struct zone *zone;
    8728             :         struct zoneref *z;
    8729             : 
    8730             :         zonelist = node_zonelist(nid, gfp_mask);
    8731             :         for_each_zone_zonelist_nodemask(zone, z, zonelist,
    8732             :                                         gfp_zone(gfp_mask), nodemask) {
    8733             :                 spin_lock_irqsave(&zone->lock, flags);
    8734             : 
    8735             :                 pfn = ALIGN(zone->zone_start_pfn, nr_pages);
    8736             :                 while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
    8737             :                         if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
    8738             :                                 /*
    8739             :                                  * We release the zone lock here because
    8740             :                                  * alloc_contig_range() will also lock the zone
    8741             :                                  * at some point. If there's an allocation
    8742             :                                  * spinning on this lock, it may win the race
    8743             :                                  * and cause alloc_contig_range() to fail...
    8744             :                                  */
    8745             :                                 spin_unlock_irqrestore(&zone->lock, flags);
    8746             :                                 ret = __alloc_contig_pages(pfn, nr_pages,
    8747             :                                                         gfp_mask);
    8748             :                                 if (!ret)
    8749             :                                         return pfn_to_page(pfn);
    8750             :                                 spin_lock_irqsave(&zone->lock, flags);
    8751             :                         }
    8752             :                         pfn += nr_pages;
    8753             :                 }
    8754             :                 spin_unlock_irqrestore(&zone->lock, flags);
    8755             :         }
    8756             :         return NULL;
    8757             : }
    8758             : #endif /* CONFIG_CONTIG_ALLOC */
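
/*
 * Illustrative sketch (not part of page_alloc.c): using alloc_contig_pages()
 * as documented above, assuming CONFIG_CONTIG_ALLOC and a caller that is
 * content with memory from its current node; the helper name is a placeholder.
 */
static int example_use_contig_block(unsigned long nr_pages)
{
	struct page *page;

	page = alloc_contig_pages(nr_pages, GFP_KERNEL | __GFP_NOWARN,
				  numa_node_id(), NULL);
	if (!page)
		return -ENOMEM;

	/*
	 * When nr_pages is a power of two, page_to_pfn(page) is aligned to
	 * nr_pages, per the alignment guarantee in the comment above.
	 */

	/* ... use the nr_pages physically contiguous pages starting at page ... */

	free_contig_range(page_to_pfn(page), nr_pages);
	return 0;
}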
    8759             : 
    8760           0 : void free_contig_range(unsigned long pfn, unsigned int nr_pages)
    8761             : {
    8762           0 :         unsigned int count = 0;
    8763             : 
    8764           0 :         for (; nr_pages--; pfn++) {
    8765           0 :                 struct page *page = pfn_to_page(pfn);
    8766             : 
    8767           0 :                 count += page_count(page) != 1;
    8768           0 :                 __free_page(page);
    8769             :         }
    8770           0 :         WARN(count != 0, "%d pages are still in use!\n", count);
    8771           0 : }
    8772             : EXPORT_SYMBOL(free_contig_range);
    8773             : 
    8774             : /*
    8775             :  * The zone indicated has a new number of managed_pages; batch sizes and percpu
    8776             :  * page high values need to be recalculated.
    8777             :  */
    8778           0 : void __meminit zone_pcp_update(struct zone *zone)
    8779             : {
    8780           0 :         mutex_lock(&pcp_batch_high_lock);
    8781           0 :         zone_set_pageset_high_and_batch(zone);
    8782           0 :         mutex_unlock(&pcp_batch_high_lock);
    8783           0 : }
    8784             : 
    8785             : /*
    8786             :  * Effectively disable pcplists for the zone by setting the high limit to 0
    8787             :  * and draining all cpus. A concurrent page freeing on another CPU that's about
    8788             :  * to put the page on pcplist will either finish before the drain and the page
    8789             :  * will be drained, or observe the new high limit and skip the pcplist.
    8790             :  *
    8791             :  * Must be paired with a call to zone_pcp_enable().
    8792             :  */
    8793           0 : void zone_pcp_disable(struct zone *zone)
    8794             : {
    8795           0 :         mutex_lock(&pcp_batch_high_lock);
    8796           0 :         __zone_set_pageset_high_and_batch(zone, 0, 1);
    8797           0 :         __drain_all_pages(zone, true);
    8798           0 : }
    8799             : 
    8800           0 : void zone_pcp_enable(struct zone *zone)
    8801             : {
    8802           0 :         __zone_set_pageset_high_and_batch(zone, zone->pageset_high, zone->pageset_batch);
    8803           0 :         mutex_unlock(&pcp_batch_high_lock);
    8804           0 : }
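
/*
 * Illustrative sketch (not part of page_alloc.c): the pairing described in
 * the comment above zone_pcp_disable(). The work in the middle is a
 * placeholder for an operation that must not race with pcplist frees.
 */
static void example_with_pcplists_disabled(struct zone *zone)
{
	zone_pcp_disable(zone);		/* high = 0, batch = 1, then drain all cpus */

	/* ... isolate or offline pages without pcplist interference ... */

	zone_pcp_enable(zone);		/* restore saved high/batch, release the mutex */
}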
    8805             : 
    8806           0 : void zone_pcp_reset(struct zone *zone)
    8807             : {
    8808           0 :         unsigned long flags;
    8809           0 :         int cpu;
    8810           0 :         struct per_cpu_pageset *pset;
    8811             : 
    8812             :         /* avoid races with drain_pages()  */
    8813           0 :         local_irq_save(flags);
    8814           0 :         if (zone->pageset != &boot_pageset) {
    8815           0 :                 for_each_online_cpu(cpu) {
    8816           0 :                         pset = per_cpu_ptr(zone->pageset, cpu);
    8817           0 :                         drain_zonestat(zone, pset);
    8818             :                 }
    8819           0 :                 free_percpu(zone->pageset);
    8820           0 :                 zone->pageset = &boot_pageset;
    8821             :         }
    8822           0 :         local_irq_restore(flags);
    8823           0 : }
    8824             : 
    8825             : #ifdef CONFIG_MEMORY_HOTREMOVE
    8826             : /*
    8827             :  * The range must lie in a single zone and contain no holes, must span full
    8828             :  * sections, and all of its pages must be isolated before calling this function.
    8829             :  */
    8830             : void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
    8831             : {
    8832             :         unsigned long pfn = start_pfn;
    8833             :         struct page *page;
    8834             :         struct zone *zone;
    8835             :         unsigned int order;
    8836             :         unsigned long flags;
    8837             : 
    8838             :         offline_mem_sections(pfn, end_pfn);
    8839             :         zone = page_zone(pfn_to_page(pfn));
    8840             :         spin_lock_irqsave(&zone->lock, flags);
    8841             :         while (pfn < end_pfn) {
    8842             :                 page = pfn_to_page(pfn);
    8843             :                 /*
    8844             :                  * The HWPoisoned page may not be in the buddy system,
    8845             :                  * and its page_count() may not be 0.
    8846             :                  */
    8847             :                 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
    8848             :                         pfn++;
    8849             :                         continue;
    8850             :                 }
    8851             :                 /*
    8852             :                  * At this point all remaining PageOffline() pages have a
    8853             :                  * reference count of 0 and can simply be skipped.
    8854             :                  */
    8855             :                 if (PageOffline(page)) {
    8856             :                         BUG_ON(page_count(page));
    8857             :                         BUG_ON(PageBuddy(page));
    8858             :                         pfn++;
    8859             :                         continue;
    8860             :                 }
    8861             : 
    8862             :                 BUG_ON(page_count(page));
    8863             :                 BUG_ON(!PageBuddy(page));
    8864             :                 order = buddy_order(page);
    8865             :                 del_page_from_free_list(page, zone, order);
    8866             :                 pfn += (1 << order);
    8867             :         }
    8868             :         spin_unlock_irqrestore(&zone->lock, flags);
    8869             : }
    8870             : #endif
    8871             : 
    8872           0 : bool is_free_buddy_page(struct page *page)
    8873             : {
    8874           0 :         struct zone *zone = page_zone(page);
    8875           0 :         unsigned long pfn = page_to_pfn(page);
    8876           0 :         unsigned long flags;
    8877           0 :         unsigned int order;
    8878             : 
    8879           0 :         spin_lock_irqsave(&zone->lock, flags);
    8880           0 :         for (order = 0; order < MAX_ORDER; order++) {
    8881           0 :                 struct page *page_head = page - (pfn & ((1 << order) - 1));
    8882             : 
    8883           0 :                 if (PageBuddy(page_head) && buddy_order(page_head) >= order)
    8884             :                         break;
    8885             :         }
    8886           0 :         spin_unlock_irqrestore(&zone->lock, flags);
    8887             : 
    8888           0 :         return order < MAX_ORDER;
    8889             : }
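
/*
 * Illustrative worked example (not part of page_alloc.c): the order-aligned
 * head computation used by the scan above, expressed on raw pfns. For pfn
 * 0x1235 and order 3 this yields 0x1230, the start of the order-3 block that
 * could contain the page; it is equivalent to page - (pfn & ((1 << order) - 1)).
 */
static inline unsigned long example_buddy_head_pfn(unsigned long pfn,
						   unsigned int order)
{
	return pfn & ~((1UL << order) - 1);
}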
    8890             : 
    8891             : #ifdef CONFIG_MEMORY_FAILURE
    8892             : /*
    8893             :  * Break a higher-order page down into sub-pages, and keep our target page
    8894             :  * out of the buddy allocator.
    8895             :  */
    8896             : static void break_down_buddy_pages(struct zone *zone, struct page *page,
    8897             :                                    struct page *target, int low, int high,
    8898             :                                    int migratetype)
    8899             : {
    8900             :         unsigned long size = 1 << high;
    8901             :         struct page *current_buddy, *next_page;
    8902             : 
    8903             :         while (high > low) {
    8904             :                 high--;
    8905             :                 size >>= 1;
    8906             : 
    8907             :                 if (target >= &page[size]) {
    8908             :                         next_page = page + size;
    8909             :                         current_buddy = page;
    8910             :                 } else {
    8911             :                         next_page = page;
    8912             :                         current_buddy = page + size;
    8913             :                 }
    8914             : 
    8915             :                 if (set_page_guard(zone, current_buddy, high, migratetype))
    8916             :                         continue;
    8917             : 
    8918             :                 if (current_buddy != target) {
    8919             :                         add_to_free_list(current_buddy, zone, high, migratetype);
    8920             :                         set_buddy_order(current_buddy, high);
    8921             :                         page = next_page;
    8922             :                 }
    8923             :         }
    8924             : }
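
/*
 * Illustrative worked example (not part of page_alloc.c), assuming no debug
 * page guards: breaking down the order-3 block starting at pfn 0 around a
 * target at pfn 5 (low = 0, high = 3) puts pfns 0..3 back on the free lists
 * as one order-2 block, pfns 6..7 as one order-1 block and pfn 4 as an
 * order-0 page, leaving only the target at pfn 5 out of the buddy allocator.
 */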
    8925             : 
    8926             : /*
    8927             :  * Take a page that will be marked as poisoned off the buddy allocator.
    8928             :  */
    8929             : bool take_page_off_buddy(struct page *page)
    8930             : {
    8931             :         struct zone *zone = page_zone(page);
    8932             :         unsigned long pfn = page_to_pfn(page);
    8933             :         unsigned long flags;
    8934             :         unsigned int order;
    8935             :         bool ret = false;
    8936             : 
    8937             :         spin_lock_irqsave(&zone->lock, flags);
    8938             :         for (order = 0; order < MAX_ORDER; order++) {
    8939             :                 struct page *page_head = page - (pfn & ((1 << order) - 1));
    8940             :                 int page_order = buddy_order(page_head);
    8941             : 
    8942             :                 if (PageBuddy(page_head) && page_order >= order) {
    8943             :                         unsigned long pfn_head = page_to_pfn(page_head);
    8944             :                         int migratetype = get_pfnblock_migratetype(page_head,
    8945             :                                                                    pfn_head);
    8946             : 
    8947             :                         del_page_from_free_list(page_head, zone, page_order);
    8948             :                         break_down_buddy_pages(zone, page_head, page, 0,
    8949             :                                                 page_order, migratetype);
    8950             :                         ret = true;
    8951             :                         break;
    8952             :                 }
    8953             :                 if (page_count(page_head) > 0)
    8954             :                         break;
    8955             :         }
    8956             :         spin_unlock_irqrestore(&zone->lock, flags);
    8957             :         return ret;
    8958             : }
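
/*
 * Illustrative sketch (not part of page_alloc.c): the kind of memory-failure
 * caller take_page_off_buddy() is written for. Error accounting and the rest
 * of the hwpoison handling are omitted; the helper name is a placeholder.
 */
static bool example_poison_free_page(struct page *page)
{
	if (!take_page_off_buddy(page))
		return false;	/* page is not (or is no longer) a free buddy page */

	SetPageHWPoison(page);	/* keep it marked so it is never handed out again */
	return true;
}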
    8959             : #endif

Generated by: LCOV version 1.14