LCOV - code coverage report
Current view: top level - mm - percpu.c
Test: landlock.info
Date: 2021-04-22 12:43:58

                          Hit     Total   Coverage
Lines:                    870      1116     78.0 %
Functions:                 39        46     84.8 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : /*
       3             :  * mm/percpu.c - percpu memory allocator
       4             :  *
       5             :  * Copyright (C) 2009           SUSE Linux Products GmbH
       6             :  * Copyright (C) 2009           Tejun Heo <tj@kernel.org>
       7             :  *
       8             :  * Copyright (C) 2017           Facebook Inc.
       9             :  * Copyright (C) 2017           Dennis Zhou <dennis@kernel.org>
      10             :  *
      11             :  * The percpu allocator handles both static and dynamic areas.  Percpu
      12             :  * areas are allocated in chunks which are divided into units.  There is
      13             :  * a 1-to-1 mapping for units to possible cpus.  These units are grouped
      14             :  * based on NUMA properties of the machine.
      15             :  *
      16             :  *  c0                           c1                         c2
      17             :  *  -------------------          -------------------        ------------
      18             :  * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
      19             :  *  -------------------  ......  -------------------  ....  ------------
      20             :  *
       21             :  * Allocation is done by offsets into a unit's address space.  I.e., an
      22             :  * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
      23             :  * c1:u1, c1:u2, etc.  On NUMA machines, the mapping may be non-linear
      24             :  * and even sparse.  Access is handled by configuring percpu base
      25             :  * registers according to the cpu to unit mappings and offsetting the
      26             :  * base address using pcpu_unit_size.
      27             :  *
      28             :  * There is special consideration for the first chunk which must handle
      29             :  * the static percpu variables in the kernel image as allocation services
      30             :  * are not online yet.  In short, the first chunk is structured like so:
      31             :  *
      32             :  *                  <Static | [Reserved] | Dynamic>
      33             :  *
      34             :  * The static data is copied from the original section managed by the
      35             :  * linker.  The reserved section, if non-zero, primarily manages static
      36             :  * percpu variables from kernel modules.  Finally, the dynamic section
      37             :  * takes care of normal allocations.
      38             :  *
      39             :  * The allocator organizes chunks into lists according to free size and
      40             :  * memcg-awareness.  To make a percpu allocation memcg-aware the __GFP_ACCOUNT
       41             :  * flag should be passed.  All memcg-aware allocations share one set of
       42             :  * chunks, while unaccounted allocations and allocations performed by
       43             :  * processes belonging to the root memory cgroup use the second set.
      44             :  *
      45             :  * The allocator tries to allocate from the fullest chunk first. Each chunk
      46             :  * is managed by a bitmap with metadata blocks.  The allocation map is updated
      47             :  * on every allocation and free to reflect the current state while the boundary
      48             :  * map is only updated on allocation.  Each metadata block contains
      49             :  * information to help mitigate the need to iterate over large portions
      50             :  * of the bitmap.  The reverse mapping from page to chunk is stored in
      51             :  * the page's index.  Lastly, units are lazily backed and grow in unison.
      52             :  *
      53             :  * There is a unique conversion that goes on here between bytes and bits.
      54             :  * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE.  The chunk
      55             :  * tracks the number of pages it is responsible for in nr_pages.  Helper
       56             :  * functions are used to convert between bytes, bits, and blocks (see the sketch below).
      57             :  * All hints are managed in bits unless explicitly stated.
      58             :  *
      59             :  * To use this allocator, arch code should do the following:
      60             :  *
      61             :  * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
      62             :  *   regular address to percpu pointer and back if they need to be
      63             :  *   different from the default
      64             :  *
      65             :  * - use pcpu_setup_first_chunk() during percpu area initialization to
       66             :  *   set up the first chunk containing the kernel static percpu area
      67             :  */
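/*
 * A small conversion sketch (an illustration, not part of the kernel source),
 * assuming the common values PCPU_MIN_ALLOC_SIZE == 4 and PAGE_SIZE == 4096:
 * each allocation-map bit then covers 4 bytes and each metadata block
 * (PCPU_BITMAP_BLOCK_BITS == PAGE_SIZE / PCPU_MIN_ALLOC_SIZE == 1024 bits)
 * covers exactly one page.
 *
 *	size_t bytes = 512;                                     // requested size
 *	int bits     = bytes / PCPU_MIN_ALLOC_SIZE;             // 128 map bits
 *	int blocks   = DIV_ROUND_UP(bits, PCPU_BITMAP_BLOCK_BITS);  // 1 block
 */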
      68             : 
      69             : #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
      70             : 
      71             : #include <linux/bitmap.h>
      72             : #include <linux/cpumask.h>
      73             : #include <linux/memblock.h>
      74             : #include <linux/err.h>
      75             : #include <linux/lcm.h>
      76             : #include <linux/list.h>
      77             : #include <linux/log2.h>
      78             : #include <linux/mm.h>
      79             : #include <linux/module.h>
      80             : #include <linux/mutex.h>
      81             : #include <linux/percpu.h>
      82             : #include <linux/pfn.h>
      83             : #include <linux/slab.h>
      84             : #include <linux/spinlock.h>
      85             : #include <linux/vmalloc.h>
      86             : #include <linux/workqueue.h>
      87             : #include <linux/kmemleak.h>
      88             : #include <linux/sched.h>
      89             : #include <linux/sched/mm.h>
      90             : #include <linux/memcontrol.h>
      91             : 
      92             : #include <asm/cacheflush.h>
      93             : #include <asm/sections.h>
      94             : #include <asm/tlbflush.h>
      95             : #include <asm/io.h>
      96             : 
      97             : #define CREATE_TRACE_POINTS
      98             : #include <trace/events/percpu.h>
      99             : 
     100             : #include "percpu-internal.h"
     101             : 
     102             : /* the slots are sorted by free bytes left, 1-31 bytes share the same slot */
     103             : #define PCPU_SLOT_BASE_SHIFT            5
     104             : /* chunks in slots below this are subject to being sidelined on failed alloc */
     105             : #define PCPU_SLOT_FAIL_THRESHOLD        3
     106             : 
     107             : #define PCPU_EMPTY_POP_PAGES_LOW        2
     108             : #define PCPU_EMPTY_POP_PAGES_HIGH       4
     109             : 
     110             : #ifdef CONFIG_SMP
     111             : /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
     112             : #ifndef __addr_to_pcpu_ptr
     113             : #define __addr_to_pcpu_ptr(addr)                                        \
     114             :         (void __percpu *)((unsigned long)(addr) -                       \
     115             :                           (unsigned long)pcpu_base_addr +               \
     116             :                           (unsigned long)__per_cpu_start)
     117             : #endif
     118             : #ifndef __pcpu_ptr_to_addr
     119             : #define __pcpu_ptr_to_addr(ptr)                                         \
     120             :         (void __force *)((unsigned long)(ptr) +                         \
     121             :                          (unsigned long)pcpu_base_addr -                \
     122             :                          (unsigned long)__per_cpu_start)
     123             : #endif
     124             : #else   /* CONFIG_SMP */
     125             : /* on UP, it's always identity mapped */
     126             : #define __addr_to_pcpu_ptr(addr)        (void __percpu *)(addr)
     127             : #define __pcpu_ptr_to_addr(ptr)         (void __force *)(ptr)
     128             : #endif  /* CONFIG_SMP */
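/*
 * Illustrative sketch (not part of the original file): the two translations
 * above are inverses of each other, since one subtracts and the other adds
 * the same delta (pcpu_base_addr - __per_cpu_start), so a round trip is the
 * identity:
 *
 *	void *addr = ...;                       // some address inside a chunk
 *	void __percpu *ptr = __addr_to_pcpu_ptr(addr);
 *	WARN_ON(__pcpu_ptr_to_addr(ptr) != addr);
 */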
     129             : 
     130             : static int pcpu_unit_pages __ro_after_init;
     131             : static int pcpu_unit_size __ro_after_init;
     132             : static int pcpu_nr_units __ro_after_init;
     133             : static int pcpu_atom_size __ro_after_init;
     134             : int pcpu_nr_slots __ro_after_init;
     135             : static size_t pcpu_chunk_struct_size __ro_after_init;
     136             : 
     137             : /* cpus with the lowest and highest unit addresses */
     138             : static unsigned int pcpu_low_unit_cpu __ro_after_init;
     139             : static unsigned int pcpu_high_unit_cpu __ro_after_init;
     140             : 
     141             : /* the address of the first chunk which starts with the kernel static area */
     142             : void *pcpu_base_addr __ro_after_init;
     143             : EXPORT_SYMBOL_GPL(pcpu_base_addr);
     144             : 
     145             : static const int *pcpu_unit_map __ro_after_init;                /* cpu -> unit */
     146             : const unsigned long *pcpu_unit_offsets __ro_after_init; /* cpu -> unit offset */
     147             : 
     148             : /* group information, used for vm allocation */
     149             : static int pcpu_nr_groups __ro_after_init;
     150             : static const unsigned long *pcpu_group_offsets __ro_after_init;
     151             : static const size_t *pcpu_group_sizes __ro_after_init;
     152             : 
     153             : /*
     154             :  * The first chunk which always exists.  Note that unlike other
     155             :  * chunks, this one can be allocated and mapped in several different
     156             :  * ways and thus often doesn't live in the vmalloc area.
     157             :  */
     158             : struct pcpu_chunk *pcpu_first_chunk __ro_after_init;
     159             : 
     160             : /*
     161             :  * Optional reserved chunk.  This chunk reserves part of the first
     162             :  * chunk and serves it for reserved allocations.  When the reserved
     163             :  * region doesn't exist, the following variable is NULL.
     164             :  */
     165             : struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;
     166             : 
     167             : DEFINE_SPINLOCK(pcpu_lock);     /* all internal data structures */
     168             : static DEFINE_MUTEX(pcpu_alloc_mutex);  /* chunk create/destroy, [de]pop, map ext */
     169             : 
     170             : struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */
     171             : 
     172             : /* chunks which need their map areas extended, protected by pcpu_lock */
     173             : static LIST_HEAD(pcpu_map_extend_chunks);
     174             : 
     175             : /*
     176             :  * The number of empty populated pages, protected by pcpu_lock.  The
     177             :  * reserved chunk doesn't contribute to the count.
     178             :  */
     179             : int pcpu_nr_empty_pop_pages;
     180             : 
     181             : /*
     182             :  * The number of populated pages in use by the allocator, protected by
      183             :  * pcpu_lock.  This number is kept per unit per chunk (i.e. when a page gets
     184             :  * allocated/deallocated, it is allocated/deallocated in all units of a chunk
     185             :  * and increments/decrements this count by 1).
     186             :  */
     187             : static unsigned long pcpu_nr_populated;
     188             : 
     189             : /*
     190             :  * Balance work is used to populate or destroy chunks asynchronously.  We
     191             :  * try to keep the number of populated free pages between
     192             :  * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
     193             :  * empty chunk.
     194             :  */
     195             : static void pcpu_balance_workfn(struct work_struct *work);
     196             : static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
     197             : static bool pcpu_async_enabled __read_mostly;
     198             : static bool pcpu_atomic_alloc_failed;
     199             : 
     200           4 : static void pcpu_schedule_balance_work(void)
     201             : {
     202           4 :         if (pcpu_async_enabled)
     203           4 :                 schedule_work(&pcpu_balance_work);
     204           4 : }
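/*
 * A sketch of the typical trigger (the allocation path itself sits further
 * down in this file, outside the excerpt shown here): once the pool of empty
 * populated pages drops under the low watermark, the work item is scheduled
 * so the pool is refilled asynchronously.
 *
 *	if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
 *		pcpu_schedule_balance_work();
 */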
     205             : 
     206             : /**
     207             :  * pcpu_addr_in_chunk - check if the address is served from this chunk
     208             :  * @chunk: chunk of interest
     209             :  * @addr: percpu address
     210             :  *
     211             :  * RETURNS:
     212             :  * True if the address is served from this chunk.
     213             :  */
     214        1685 : static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
     215             : {
     216        1685 :         void *start_addr, *end_addr;
     217             : 
     218        1685 :         if (!chunk)
     219             :                 return false;
     220             : 
     221        1558 :         start_addr = chunk->base_addr + chunk->start_offset;
     222        1558 :         end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
     223        1558 :                    chunk->end_offset;
     224             : 
     225        1558 :         return addr >= start_addr && addr < end_addr;
     226             : }
     227             : 
     228        9635 : static int __pcpu_size_to_slot(int size)
     229             : {
     230        9635 :         int highbit = fls(size);        /* size is in bytes */
     231        9634 :         return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
     232             : }
     233             : 
     234        9636 : static int pcpu_size_to_slot(int size)
     235             : {
     236        9636 :         if (size == pcpu_unit_size)
     237           2 :                 return pcpu_nr_slots - 1;
     238        9634 :         return __pcpu_size_to_slot(size);
     239             : }
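/*
 * A worked example of the slot math above (illustrative values, with
 * PCPU_SLOT_BASE_SHIFT == 5): a chunk whose largest contiguous free area is
 * 1024 bytes maps to slot max(fls(1024) - 5 + 2, 1) == max(8, 1) == 8, while
 * an 8-byte area maps to slot max(fls(8) - 5 + 2, 1) == max(1, 1) == 1.
 * Only a fully free chunk (size == pcpu_unit_size) goes into the last slot,
 * pcpu_nr_slots - 1.
 */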
     240             : 
     241        7462 : static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
     242             : {
     243        7462 :         const struct pcpu_block_md *chunk_md = &chunk->chunk_md;
     244             : 
     245        7462 :         if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE ||
     246        7462 :             chunk_md->contig_hint == 0)
     247             :                 return 0;
     248             : 
     249        7462 :         return pcpu_size_to_slot(chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE);
     250             : }
     251             : 
     252             : /* set the pointer to a chunk in a page struct */
     253          12 : static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
     254             : {
     255          12 :         page->index = (unsigned long)pcpu;
     256             : }
     257             : 
     258             : /* obtain pointer to a chunk from a page struct */
     259         127 : static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
     260             : {
     261         127 :         return (struct pcpu_chunk *)page->index;
     262             : }
     263             : 
     264          28 : static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
     265             : {
     266          28 :         return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
     267             : }
     268             : 
     269        8756 : static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
     270             : {
     271        8756 :         return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT);
     272             : }
     273             : 
     274        8692 : static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
     275             :                                      unsigned int cpu, int page_idx)
     276             : {
     277        8692 :         return (unsigned long)chunk->base_addr +
     278           4 :                pcpu_unit_page_offset(cpu, page_idx);
     279             : }
     280             : 
     281             : /*
     282             :  * The following are helper functions to help access bitmaps and convert
      283             :  * between bitmap offsets and address offsets.
     284             :  */
     285        5181 : static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
     286             : {
     287        5181 :         return chunk->alloc_map +
     288        5181 :                (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG);
     289             : }
     290             : 
     291        7002 : static unsigned long pcpu_off_to_block_index(int off)
     292             : {
     293        7002 :         return off / PCPU_BITMAP_BLOCK_BITS;
     294             : }
     295             : 
     296        7002 : static unsigned long pcpu_off_to_block_off(int off)
     297             : {
     298        7002 :         return off & (PCPU_BITMAP_BLOCK_BITS - 1);
     299             : }
     300             : 
     301        4024 : static unsigned long pcpu_block_off_to_off(int index, int off)
     302             : {
     303        4024 :         return index * PCPU_BITMAP_BLOCK_BITS + off;
     304             : }
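/*
 * A quick worked conversion with the helpers above, assuming
 * PCPU_BITMAP_BLOCK_BITS == 1024 (4 KiB pages, 4-byte minimum allocations):
 *
 *	int off   = 2500;                               // chunk bit offset
 *	int index = pcpu_off_to_block_index(off);       // 2500 / 1024 == 2
 *	int b_off = pcpu_off_to_block_off(off);         // 2500 & 1023 == 452
 *
 * and pcpu_block_off_to_off(index, b_off) == 2 * 1024 + 452 == 2500 again.
 */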
     305             : 
     306             : /*
     307             :  * pcpu_next_hint - determine which hint to use
     308             :  * @block: block of interest
     309             :  * @alloc_bits: size of allocation
     310             :  *
     311             :  * This determines if we should scan based on the scan_hint or first_free.
     312             :  * In general, we want to scan from first_free to fulfill allocations by
     313             :  * first fit.  However, if we know a scan_hint at position scan_hint_start
     314             :  * cannot fulfill an allocation, we can begin scanning from there knowing
     315             :  * the contig_hint will be our fallback.
     316             :  */
     317        4339 : static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits)
     318             : {
     319             :         /*
     320             :          * The three conditions below determine if we can skip past the
     321             :          * scan_hint.  First, does the scan hint exist.  Second, is the
     322             :          * contig_hint after the scan_hint (possibly not true iff
     323             :          * contig_hint == scan_hint).  Third, is the allocation request
     324             :          * larger than the scan_hint.
     325             :          */
     326        6089 :         if (block->scan_hint &&
     327        1750 :             block->contig_hint_start > block->scan_hint_start &&
     328             :             alloc_bits > block->scan_hint)
     329         843 :                 return block->scan_hint_start + block->scan_hint;
     330             : 
     331        3496 :         return block->first_free;
     332             : }
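/*
 * A numeric sketch of the rule above (values are made up): with
 * scan_hint == 4 at scan_hint_start == 100, contig_hint_start == 200 and a
 * request of alloc_bits == 8, all three conditions hold and the scan starts
 * at 100 + 4 == 104 rather than at first_free; the contig_hint at 200 stays
 * available as the fallback if nothing fits before it.
 */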
     333             : 
     334             : /**
     335             :  * pcpu_next_md_free_region - finds the next hint free area
     336             :  * @chunk: chunk of interest
     337             :  * @bit_off: chunk offset
     338             :  * @bits: size of free area
     339             :  *
     340             :  * Helper function for pcpu_for_each_md_free_region.  It checks
     341             :  * block->contig_hint and performs aggregation across blocks to find the
     342             :  * next hint.  It modifies bit_off and bits in-place to be consumed in the
     343             :  * loop.
     344             :  */
     345         824 : static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
     346             :                                      int *bits)
     347             : {
     348         824 :         int i = pcpu_off_to_block_index(*bit_off);
     349         824 :         int block_off = pcpu_off_to_block_off(*bit_off);
     350         824 :         struct pcpu_block_md *block;
     351             : 
     352         824 :         *bits = 0;
     353       10434 :         for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
     354        9610 :              block++, i++) {
     355             :                 /* handles contig area across blocks */
     356        9639 :                 if (*bits) {
     357        9000 :                         *bits += block->left_free;
     358        9000 :                         if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
     359        9000 :                                 continue;
     360             :                         return;
     361             :                 }
     362             : 
     363             :                 /*
      364             :                  * This checks three things.  First, is there a contig_hint to
     365             :                  * check.  Second, have we checked this hint before by
     366             :                  * comparing the block_off.  Third, is this the same as the
     367             :                  * right contig hint.  In the last case, it spills over into
     368             :                  * the next block and should be handled by the contig area
     369             :                  * across blocks code.
     370             :                  */
     371         639 :                 *bits = block->contig_hint;
     372         639 :                 if (*bits && block->contig_hint_start >= block_off &&
     373         426 :                     *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
     374          29 :                         *bit_off = pcpu_block_off_to_off(i,
     375             :                                         block->contig_hint_start);
     376          29 :                         return;
     377             :                 }
     378             :                 /* reset to satisfy the second predicate above */
     379         610 :                 block_off = 0;
     380             : 
     381         610 :                 *bits = block->right_free;
     382         610 :                 *bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
     383             :         }
     384             : }
     385             : 
     386             : /**
     387             :  * pcpu_next_fit_region - finds fit areas for a given allocation request
     388             :  * @chunk: chunk of interest
     389             :  * @alloc_bits: size of allocation
     390             :  * @align: alignment of area (max PAGE_SIZE)
     391             :  * @bit_off: chunk offset
     392             :  * @bits: size of free area
     393             :  *
     394             :  * Finds the next free region that is viable for use with a given size and
     395             :  * alignment.  This only returns if there is a valid area to be used for this
     396             :  * allocation.  block->first_free is returned if the allocation request fits
     397             :  * within the block to see if the request can be fulfilled prior to the contig
     398             :  * hint.
     399             :  */
     400        2172 : static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
     401             :                                  int align, int *bit_off, int *bits)
     402             : {
     403        2172 :         int i = pcpu_off_to_block_index(*bit_off);
     404        2172 :         int block_off = pcpu_off_to_block_off(*bit_off);
     405        2172 :         struct pcpu_block_md *block;
     406             : 
     407        2172 :         *bits = 0;
     408        2442 :         for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
     409         270 :              block++, i++) {
     410             :                 /* handles contig area across blocks */
     411        2442 :                 if (*bits) {
     412           5 :                         *bits += block->left_free;
     413           5 :                         if (*bits >= alloc_bits)
     414             :                                 return;
     415           0 :                         if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
     416           0 :                                 continue;
     417             :                 }
     418             : 
     419             :                 /* check block->contig_hint */
     420        2437 :                 *bits = ALIGN(block->contig_hint_start, align) -
     421             :                         block->contig_hint_start;
     422             :                 /*
     423             :                  * This uses the block offset to determine if this has been
     424             :                  * checked in the prior iteration.
     425             :                  */
     426        2437 :                 if (block->contig_hint &&
     427        2379 :                     block->contig_hint_start >= block_off &&
     428        2273 :                     block->contig_hint >= *bits + alloc_bits) {
     429        2167 :                         int start = pcpu_next_hint(block, alloc_bits);
     430             : 
     431        2167 :                         *bits += alloc_bits + block->contig_hint_start -
     432             :                                  start;
     433        2167 :                         *bit_off = pcpu_block_off_to_off(i, start);
     434        2167 :                         return;
     435             :                 }
     436             :                 /* reset to satisfy the second predicate above */
     437         270 :                 block_off = 0;
     438             : 
     439         270 :                 *bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
     440             :                                  align);
     441         270 :                 *bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
     442         270 :                 *bit_off = pcpu_block_off_to_off(i, *bit_off);
     443         270 :                 if (*bits >= alloc_bits)
     444             :                         return;
     445             :         }
     446             : 
     447             :         /* no valid offsets were found - fail condition */
     448           0 :         *bit_off = pcpu_chunk_map_bits(chunk);
     449             : }
     450             : 
     451             : /*
     452             :  * Metadata free area iterators.  These perform aggregation of free areas
     453             :  * based on the metadata blocks and return the offset @bit_off and size in
     454             :  * bits of the free area @bits.  pcpu_for_each_fit_region only returns when
     455             :  * a fit is found for the allocation request.
     456             :  */
     457             : #define pcpu_for_each_md_free_region(chunk, bit_off, bits)              \
     458             :         for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits));    \
     459             :              (bit_off) < pcpu_chunk_map_bits((chunk));                       \
     460             :              (bit_off) += (bits) + 1,                                   \
     461             :              pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))
     462             : 
     463             : #define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits)     \
     464             :         for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
     465             :                                   &(bits));                               \
     466             :              (bit_off) < pcpu_chunk_map_bits((chunk));                             \
     467             :              (bit_off) += (bits),                                             \
     468             :              pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
     469             :                                   &(bits)))
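/*
 * Typical use of the iterators above (a sketch mirroring the chunk refresh
 * path later in this file): walk every free region of a chunk and feed it
 * into the chunk-level hint update.
 *
 *	int bit_off = 0, bits = 0;
 *
 *	pcpu_for_each_md_free_region(chunk, bit_off, bits)
 *		pcpu_block_update(&chunk->chunk_md, bit_off, bit_off + bits);
 */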
     470             : 
     471             : /**
     472             :  * pcpu_mem_zalloc - allocate memory
     473             :  * @size: bytes to allocate
     474             :  * @gfp: allocation flags
     475             :  *
     476             :  * Allocate @size bytes.  If @size is smaller than PAGE_SIZE,
     477             :  * kzalloc() is used; otherwise, the equivalent of vzalloc() is used.
     478             :  * This is to facilitate passing through whitelisted flags.  The
     479             :  * returned memory is always zeroed.
     480             :  *
     481             :  * RETURNS:
     482             :  * Pointer to the allocated area on success, NULL on failure.
     483             :  */
     484           5 : static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
     485             : {
     486           5 :         if (WARN_ON_ONCE(!slab_is_available()))
     487             :                 return NULL;
     488             : 
     489           5 :         if (size <= PAGE_SIZE)
     490           3 :                 return kzalloc(size, gfp);
     491             :         else
     492           2 :                 return __vmalloc(size, gfp | __GFP_ZERO);
     493             : }
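/*
 * Usage sketch for the helper above (sizes are illustrative): a small
 * request is served by the slab allocator, a multi-page one by vmalloc,
 * and both come back zeroed or NULL; pcpu_mem_free() handles either case
 * via kvfree().
 *
 *	void *md  = pcpu_mem_zalloc(64, GFP_KERNEL);            // kzalloc()
 *	void *map = pcpu_mem_zalloc(2 * PAGE_SIZE, GFP_KERNEL); // __vmalloc()
 *	...
 *	pcpu_mem_free(map);
 *	pcpu_mem_free(md);
 */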
     494             : 
     495             : /**
     496             :  * pcpu_mem_free - free memory
     497             :  * @ptr: memory to free
     498             :  *
     499             :  * Free @ptr.  @ptr should have been allocated using pcpu_mem_zalloc().
     500             :  */
     501           0 : static void pcpu_mem_free(void *ptr)
     502             : {
     503           0 :         kvfree(ptr);
     504           0 : }
     505             : 
     506          15 : static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
     507             :                               bool move_front)
     508             : {
     509          15 :         if (chunk != pcpu_reserved_chunk) {
     510          15 :                 struct list_head *pcpu_slot;
     511             : 
     512          15 :                 pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk));
     513          15 :                 if (move_front)
     514           3 :                         list_move(&chunk->list, &pcpu_slot[slot]);
     515             :                 else
     516          12 :                         list_move_tail(&chunk->list, &pcpu_slot[slot]);
     517             :         }
     518          15 : }
     519             : 
     520           1 : static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot)
     521             : {
     522           1 :         __pcpu_chunk_move(chunk, slot, true);
     523           1 : }
     524             : 
     525             : /**
     526             :  * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
     527             :  * @chunk: chunk of interest
     528             :  * @oslot: the previous slot it was on
     529             :  *
     530             :  * This function is called after an allocation or free changed @chunk.
     531             :  * New slot according to the changed state is determined and @chunk is
     532             :  * moved to the slot.  Note that the reserved chunk is never put on
     533             :  * chunk slots.
     534             :  *
     535             :  * CONTEXT:
     536             :  * pcpu_lock.
     537             :  */
     538        3732 : static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
     539             : {
     540        3732 :         int nslot = pcpu_chunk_slot(chunk);
     541             : 
     542        3732 :         if (oslot != nslot)
     543          14 :                 __pcpu_chunk_move(chunk, nslot, oslot < nslot);
     544        3732 : }
     545             : 
     546             : /*
     547             :  * pcpu_update_empty_pages - update empty page counters
     548             :  * @chunk: chunk of interest
     549             :  * @nr: nr of empty pages
     550             :  *
      551             :  * This is used to keep track of the empty populated pages, based on the premise
     552             :  * a md_block covers a page.  The hint update functions recognize if a block
     553             :  * is made full or broken to calculate deltas for keeping track of free pages.
     554             :  */
     555          10 : static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
     556             : {
     557          10 :         chunk->nr_empty_pop_pages += nr;
     558          10 :         if (chunk != pcpu_reserved_chunk)
     559          10 :                 pcpu_nr_empty_pop_pages += nr;
     560             : }
     561             : 
     562             : /*
     563             :  * pcpu_region_overlap - determines if two regions overlap
     564             :  * @a: start of first region, inclusive
     565             :  * @b: end of first region, exclusive
     566             :  * @x: start of second region, inclusive
     567             :  * @y: end of second region, exclusive
     568             :  *
     569             :  * This is used to determine if the hint region [a, b) overlaps with the
     570             :  * allocated region [x, y).
     571             :  */
     572        8692 : static inline bool pcpu_region_overlap(int a, int b, int x, int y)
     573             : {
     574        8692 :         return (a < y) && (x < b);
     575             : }
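/*
 * For example (illustrative values): [0, 10) and [5, 15) overlap because
 * 0 < 15 && 5 < 10, while [0, 10) and [10, 20) do not, since region ends
 * are exclusive (10 < 10 is false).
 */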
     576             : 
     577             : /**
     578             :  * pcpu_block_update - updates a block given a free area
     579             :  * @block: block of interest
     580             :  * @start: start offset in block
     581             :  * @end: end offset in block
     582             :  *
     583             :  * Updates a block given a known free area.  The region [start, end) is
     584             :  * expected to be the entirety of the free area within a block.  Chooses
     585             :  * the best starting offset if the contig hints are equal.
     586             :  */
     587        5107 : static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
     588             : {
     589        5107 :         int contig = end - start;
     590             : 
     591        5107 :         block->first_free = min(block->first_free, start);
     592        5107 :         if (start == 0)
     593           2 :                 block->left_free = contig;
     594             : 
     595        5107 :         if (end == block->nr_bits)
     596        1198 :                 block->right_free = contig;
     597             : 
     598        5107 :         if (contig > block->contig_hint) {
     599             :                 /* promote the old contig_hint to be the new scan_hint */
     600        1296 :                 if (start > block->contig_hint_start) {
     601        1118 :                         if (block->contig_hint > block->scan_hint) {
     602         409 :                                 block->scan_hint_start =
     603             :                                         block->contig_hint_start;
     604         409 :                                 block->scan_hint = block->contig_hint;
     605         709 :                         } else if (start < block->scan_hint_start) {
     606             :                                 /*
     607             :                                  * The old contig_hint == scan_hint.  But, the
     608             :                                  * new contig is larger so hold the invariant
     609             :                                  * scan_hint_start < contig_hint_start.
     610             :                                  */
     611           9 :                                 block->scan_hint = 0;
     612             :                         }
     613             :                 } else {
     614         178 :                         block->scan_hint = 0;
     615             :                 }
     616        1296 :                 block->contig_hint_start = start;
     617        1296 :                 block->contig_hint = contig;
     618        3811 :         } else if (contig == block->contig_hint) {
     619         312 :                 if (block->contig_hint_start &&
     620         312 :                     (!start ||
     621         312 :                      __ffs(start) > __ffs(block->contig_hint_start))) {
     622             :                         /* start has a better alignment so use it */
     623           6 :                         block->contig_hint_start = start;
     624           6 :                         if (start < block->scan_hint_start &&
     625           1 :                             block->contig_hint > block->scan_hint)
     626           1 :                                 block->scan_hint = 0;
     627         306 :                 } else if (start > block->scan_hint_start ||
     628          13 :                            block->contig_hint > block->scan_hint) {
     629             :                         /*
     630             :                          * Knowing contig == contig_hint, update the scan_hint
     631             :                          * if it is farther than or larger than the current
     632             :                          * scan_hint.
     633             :                          */
     634         306 :                         block->scan_hint_start = start;
     635         306 :                         block->scan_hint = contig;
     636             :                 }
     637             :         } else {
     638             :                 /*
     639             :                  * The region is smaller than the contig_hint.  So only update
     640             :                  * the scan_hint if it is larger than or equal and farther than
     641             :                  * the current scan_hint.
     642             :                  */
     643        3499 :                 if ((start < block->contig_hint_start &&
     644        3033 :                      (contig > block->scan_hint ||
     645         186 :                       (contig == block->scan_hint &&
     646         186 :                        start > block->scan_hint_start)))) {
     647         886 :                         block->scan_hint_start = start;
     648         886 :                         block->scan_hint = contig;
     649             :                 }
     650             :         }
     651        5107 : }
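/*
 * A short numeric sketch of the promotion above (values are made up): a
 * block with contig_hint == 8 at contig_hint_start == 0 and no scan_hint
 * sees a new free area [16, 32).  The new area is larger (16 > 8) and
 * starts after the old hint, so the old hint is demoted to the scan_hint
 * (start 0, size 8) and [16, 32) becomes the new contig_hint.
 */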
     652             : 
     653             : /*
     654             :  * pcpu_block_update_scan - update a block given a free area from a scan
     655             :  * @chunk: chunk of interest
     656             :  * @bit_off: chunk offset
     657             :  * @bits: size of free area
     658             :  *
     659             :  * Finding the final allocation spot first goes through pcpu_find_block_fit()
     660             :  * to find a block that can hold the allocation and then pcpu_alloc_area()
     661             :  * where a scan is used.  When allocations require specific alignments,
     662             :  * we can inadvertently create holes which will not be seen in the alloc
     663             :  * or free paths.
     664             :  *
     665             :  * This takes a given free area hole and updates a block as it may change the
     666             :  * scan_hint.  We need to scan backwards to ensure we don't miss free bits
     667             :  * from alignment.
     668             :  */
     669         275 : static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off,
     670             :                                    int bits)
     671             : {
     672         275 :         int s_off = pcpu_off_to_block_off(bit_off);
     673         275 :         int e_off = s_off + bits;
     674         275 :         int s_index, l_bit;
     675         275 :         struct pcpu_block_md *block;
     676             : 
     677         275 :         if (e_off > PCPU_BITMAP_BLOCK_BITS)
     678             :                 return;
     679             : 
     680         275 :         s_index = pcpu_off_to_block_index(bit_off);
     681         275 :         block = chunk->md_blocks + s_index;
     682             : 
     683             :         /* scan backwards in case of alignment skipping free bits */
     684         275 :         l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), s_off);
     685         275 :         s_off = (s_off == l_bit) ? 0 : l_bit + 1;
     686             : 
     687         275 :         pcpu_block_update(block, s_off, e_off);
     688             : }
     689             : 
     690             : /**
     691             :  * pcpu_chunk_refresh_hint - updates metadata about a chunk
     692             :  * @chunk: chunk of interest
     693             :  * @full_scan: if we should scan from the beginning
     694             :  *
     695             :  * Iterates over the metadata blocks to find the largest contig area.
     696             :  * A full scan can be avoided on the allocation path as this is triggered
     697             :  * if we broke the contig_hint.  In doing so, the scan_hint will be before
     698             :  * the contig_hint or after if the scan_hint == contig_hint.  This cannot
     699             :  * be prevented on freeing as we want to find the largest area possibly
     700             :  * spanning blocks.
     701             :  */
     702         398 : static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
     703             : {
     704         398 :         struct pcpu_block_md *chunk_md = &chunk->chunk_md;
     705         398 :         int bit_off, bits;
     706             : 
     707             :         /* promote scan_hint to contig_hint */
     708         398 :         if (!full_scan && chunk_md->scan_hint) {
     709         150 :                 bit_off = chunk_md->scan_hint_start + chunk_md->scan_hint;
     710         150 :                 chunk_md->contig_hint_start = chunk_md->scan_hint_start;
     711         150 :                 chunk_md->contig_hint = chunk_md->scan_hint;
     712         150 :                 chunk_md->scan_hint = 0;
     713             :         } else {
     714         248 :                 bit_off = chunk_md->first_free;
     715         248 :                 chunk_md->contig_hint = 0;
     716             :         }
     717             : 
     718         398 :         bits = 0;
     719         824 :         pcpu_for_each_md_free_region(chunk, bit_off, bits)
     720         426 :                 pcpu_block_update(chunk_md, bit_off, bit_off + bits);
     721         398 : }
     722             : 
     723             : /**
      724             :  * pcpu_block_refresh_hint - refresh the hints of a metadata block
     725             :  * @chunk: chunk of interest
     726             :  * @index: index of the metadata block
     727             :  *
     728             :  * Scans over the block beginning at first_free and updates the block
     729             :  * metadata accordingly.
     730             :  */
     731         741 : static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
     732             : {
     733         741 :         struct pcpu_block_md *block = chunk->md_blocks + index;
     734         741 :         unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
     735         741 :         unsigned int rs, re, start;     /* region start, region end */
     736             : 
     737             :         /* promote scan_hint to contig_hint */
     738         741 :         if (block->scan_hint) {
     739         374 :                 start = block->scan_hint_start + block->scan_hint;
     740         374 :                 block->contig_hint_start = block->scan_hint_start;
     741         374 :                 block->contig_hint = block->scan_hint;
     742         374 :                 block->scan_hint = 0;
     743             :         } else {
     744         367 :                 start = block->first_free;
     745         367 :                 block->contig_hint = 0;
     746             :         }
     747             : 
     748         741 :         block->right_free = 0;
     749             : 
     750             :         /* iterate over free areas and update the contig hints */
     751        2031 :         bitmap_for_each_clear_region(alloc_map, rs, re, start,
     752             :                                      PCPU_BITMAP_BLOCK_BITS)
     753        1290 :                 pcpu_block_update(block, rs, re);
     754         741 : }
     755             : 
     756             : /**
     757             :  * pcpu_block_update_hint_alloc - update hint on allocation path
     758             :  * @chunk: chunk of interest
     759             :  * @bit_off: chunk offset
     760             :  * @bits: size of request
     761             :  *
     762             :  * Updates metadata for the allocation path.  The metadata only has to be
     763             :  * refreshed by a full scan iff the chunk's contig hint is broken.  Block level
     764             :  * scans are required if the block's contig hint is broken.
     765             :  */
     766        2173 : static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
     767             :                                          int bits)
     768             : {
     769        2173 :         struct pcpu_block_md *chunk_md = &chunk->chunk_md;
     770        2173 :         int nr_empty_pages = 0;
     771        2173 :         struct pcpu_block_md *s_block, *e_block, *block;
     772        2173 :         int s_index, e_index;   /* block indexes of the freed allocation */
     773        2173 :         int s_off, e_off;       /* block offsets of the freed allocation */
     774             : 
     775             :         /*
     776             :          * Calculate per block offsets.
     777             :          * The calculation uses an inclusive range, but the resulting offsets
     778             :          * are [start, end).  e_index always points to the last block in the
     779             :          * range.
     780             :          */
     781        2173 :         s_index = pcpu_off_to_block_index(bit_off);
     782        2173 :         e_index = pcpu_off_to_block_index(bit_off + bits - 1);
     783        2173 :         s_off = pcpu_off_to_block_off(bit_off);
     784        2173 :         e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
     785             : 
     786        2173 :         s_block = chunk->md_blocks + s_index;
     787        2173 :         e_block = chunk->md_blocks + e_index;
     788             : 
     789             :         /*
     790             :          * Update s_block.
     791             :          * block->first_free must be updated if the allocation takes its place.
     792             :          * If the allocation breaks the contig_hint, a scan is required to
     793             :          * restore this hint.
     794             :          */
     795        2173 :         if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
     796           4 :                 nr_empty_pages++;
     797             : 
     798        2173 :         if (s_off == s_block->first_free)
     799        1175 :                 s_block->first_free = find_next_zero_bit(
     800        1175 :                                         pcpu_index_alloc_map(chunk, s_index),
     801             :                                         PCPU_BITMAP_BLOCK_BITS,
     802        1175 :                                         s_off + bits);
     803             : 
     804        2173 :         if (pcpu_region_overlap(s_block->scan_hint_start,
     805        2173 :                                 s_block->scan_hint_start + s_block->scan_hint,
     806             :                                 s_off,
     807             :                                 s_off + bits))
     808         140 :                 s_block->scan_hint = 0;
     809             : 
     810        2173 :         if (pcpu_region_overlap(s_block->contig_hint_start,
     811        2173 :                                 s_block->contig_hint_start +
     812        2173 :                                 s_block->contig_hint,
     813             :                                 s_off,
     814             :                                 s_off + bits)) {
     815             :                 /* block contig hint is broken - scan to fix it */
     816         736 :                 if (!s_off)
     817           4 :                         s_block->left_free = 0;
     818         736 :                 pcpu_block_refresh_hint(chunk, s_index);
     819             :         } else {
     820             :                 /* update left and right contig manually */
     821        1437 :                 s_block->left_free = min(s_block->left_free, s_off);
     822        1437 :                 if (s_index == e_index)
     823        1437 :                         s_block->right_free = min_t(int, s_block->right_free,
     824             :                                         PCPU_BITMAP_BLOCK_BITS - e_off);
     825             :                 else
     826           0 :                         s_block->right_free = 0;
     827             :         }
     828             : 
     829             :         /*
     830             :          * Update e_block.
     831             :          */
     832        2173 :         if (s_index != e_index) {
     833           5 :                 if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
     834           5 :                         nr_empty_pages++;
     835             : 
     836             :                 /*
     837             :                  * When the allocation is across blocks, the end is along
     838             :                  * the left part of the e_block.
     839             :                  */
     840          10 :                 e_block->first_free = find_next_zero_bit(
     841           5 :                                 pcpu_index_alloc_map(chunk, e_index),
     842             :                                 PCPU_BITMAP_BLOCK_BITS, e_off);
     843             : 
     844           5 :                 if (e_off == PCPU_BITMAP_BLOCK_BITS) {
     845             :                         /* reset the block */
     846           0 :                         e_block++;
     847             :                 } else {
     848           5 :                         if (e_off > e_block->scan_hint_start)
     849           5 :                                 e_block->scan_hint = 0;
     850             : 
     851           5 :                         e_block->left_free = 0;
     852           5 :                         if (e_off > e_block->contig_hint_start) {
     853             :                                 /* contig hint is broken - scan to fix it */
     854           5 :                                 pcpu_block_refresh_hint(chunk, e_index);
     855             :                         } else {
     856           0 :                                 e_block->right_free =
     857           0 :                                         min_t(int, e_block->right_free,
     858             :                                               PCPU_BITMAP_BLOCK_BITS - e_off);
     859             :                         }
     860             :                 }
     861             : 
     862             :                 /* update in-between md_blocks */
     863           5 :                 nr_empty_pages += (e_index - s_index - 1);
     864           5 :                 for (block = s_block + 1; block < e_block; block++) {
     865           0 :                         block->scan_hint = 0;
     866           0 :                         block->contig_hint = 0;
     867           0 :                         block->left_free = 0;
     868           0 :                         block->right_free = 0;
     869             :                 }
     870             :         }
     871             : 
     872        2173 :         if (nr_empty_pages)
     873           9 :                 pcpu_update_empty_pages(chunk, -nr_empty_pages);
     874             : 
     875        2173 :         if (pcpu_region_overlap(chunk_md->scan_hint_start,
     876        2173 :                                 chunk_md->scan_hint_start +
     877        2173 :                                 chunk_md->scan_hint,
     878             :                                 bit_off,
     879             :                                 bit_off + bits))
     880          30 :                 chunk_md->scan_hint = 0;
     881             : 
     882             :         /*
     883             :          * The only time a full chunk scan is required is if the chunk
     884             :          * contig hint is broken.  Otherwise, it means a smaller space
     885             :          * was used and therefore the chunk contig hint is still correct.
     886             :          */
     887        2173 :         if (pcpu_region_overlap(chunk_md->contig_hint_start,
     888        2173 :                                 chunk_md->contig_hint_start +
     889        2173 :                                 chunk_md->contig_hint,
     890             :                                 bit_off,
     891             :                                 bit_off + bits))
     892         398 :                 pcpu_chunk_refresh_hint(chunk, false);
     893        2173 : }
     894             : 
     895             : /**
     896             :  * pcpu_block_update_hint_free - updates the block hints on the free path
     897             :  * @chunk: chunk of interest
     898             :  * @bit_off: chunk offset
     899             :  * @bits: size of request
     900             :  *
      901             :  * Updates metadata for the free path.  This avoids a blind block
     902             :  * refresh by making use of the block contig hints.  If this fails, it scans
     903             :  * forward and backward to determine the extent of the free area.  This is
     904             :  * capped at the boundary of blocks.
     905             :  *
     906             :  * A chunk update is triggered if a page becomes free, a block becomes free,
     907             :  * or the free spans across blocks.  This tradeoff is to minimize iterating
     908             :  * over the block metadata to update chunk_md->contig_hint.
     909             :  * chunk_md->contig_hint may be off by up to a page, but it will never be more
     910             :  * than the available space.  If the contig hint is contained in one block, it
     911             :  * will be accurate.
     912             :  */
     913        1558 : static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
     914             :                                         int bits)
     915             : {
     916        1558 :         int nr_empty_pages = 0;
     917        1558 :         struct pcpu_block_md *s_block, *e_block, *block;
     918        1558 :         int s_index, e_index;   /* block indexes of the freed allocation */
     919        1558 :         int s_off, e_off;       /* block offsets of the freed allocation */
     920        1558 :         int start, end;         /* start and end of the whole free area */
     921             : 
     922             :         /*
     923             :          * Calculate per block offsets.
     924             :          * The calculation uses an inclusive range, but the resulting offsets
     925             :          * are [start, end).  e_index always points to the last block in the
     926             :          * range.
     927             :          */
     928        1558 :         s_index = pcpu_off_to_block_index(bit_off);
     929        1558 :         e_index = pcpu_off_to_block_index(bit_off + bits - 1);
     930        1558 :         s_off = pcpu_off_to_block_off(bit_off);
     931        1558 :         e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
     932             : 
     933        1558 :         s_block = chunk->md_blocks + s_index;
     934        1558 :         e_block = chunk->md_blocks + e_index;
     935             : 
     936             :         /*
     937             :          * Check if the freed area aligns with the block->contig_hint.
     938             :          * If it does, then the scan to find the beginning/end of the
     939             :          * larger free area can be avoided.
     940             :          *
     941             :          * start and end refer to beginning and end of the free area
     942             :  * within their respective blocks.  This is not necessarily
     943             :  * the entire free area, which may extend past the beginning
     944             :  * or end of these blocks.
     945             :          */
     946        1558 :         start = s_off;
     947        1558 :         if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
     948             :                 start = s_block->contig_hint_start;
     949             :         } else {
     950             :                 /*
     951             :                  * Scan backwards to find the extent of the free area.
     952             :                  * find_last_bit returns the starting bit, so if the start bit
     953             :                  * is returned, that means there was no last bit and the
     954             :                  * remainder of the chunk is free.
     955             :                  */
     956        1540 :                 int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
     957             :                                           start);
     958        1540 :                 start = (start == l_bit) ? 0 : l_bit + 1;
     959             :         }
     960             : 
     961        1558 :         end = e_off;
     962        1558 :         if (e_off == e_block->contig_hint_start)
     963         113 :                 end = e_block->contig_hint_start + e_block->contig_hint;
     964             :         else
     965        1445 :                 end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
     966             :                                     PCPU_BITMAP_BLOCK_BITS, end);
     967             : 
     968             :         /* update s_block */
     969        1558 :         e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
     970        1558 :         if (!start && e_off == PCPU_BITMAP_BLOCK_BITS)
     971           0 :                 nr_empty_pages++;
     972        1558 :         pcpu_block_update(s_block, start, e_off);
     973             : 
      974             :         /* the free spans multiple blocks */
     975        1558 :         if (s_index != e_index) {
     976             :                 /* update e_block */
     977           0 :                 if (end == PCPU_BITMAP_BLOCK_BITS)
     978           0 :                         nr_empty_pages++;
     979           0 :                 pcpu_block_update(e_block, 0, end);
     980             : 
     981             :                 /* reset md_blocks in the middle */
     982           0 :                 nr_empty_pages += (e_index - s_index - 1);
     983           0 :                 for (block = s_block + 1; block < e_block; block++) {
     984           0 :                         block->first_free = 0;
     985           0 :                         block->scan_hint = 0;
     986           0 :                         block->contig_hint_start = 0;
     987           0 :                         block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
     988           0 :                         block->left_free = PCPU_BITMAP_BLOCK_BITS;
     989           0 :                         block->right_free = PCPU_BITMAP_BLOCK_BITS;
     990             :                 }
     991             :         }
     992             : 
     993        1558 :         if (nr_empty_pages)
     994           0 :                 pcpu_update_empty_pages(chunk, nr_empty_pages);
     995             : 
     996             :         /*
     997             :          * Refresh chunk metadata when the free makes a block free or spans
     998             :          * across blocks.  The contig_hint may be off by up to a page, but if
     999             :          * the contig_hint is contained in a block, it will be accurate with
    1000             :          * the else condition below.
    1001             :          */
    1002        1558 :         if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index)
    1003           0 :                 pcpu_chunk_refresh_hint(chunk, true);
    1004             :         else
    1005        1558 :                 pcpu_block_update(&chunk->chunk_md,
    1006        1558 :                                   pcpu_block_off_to_off(s_index, start),
    1007             :                                   end);
    1008        1558 : }
    1009             : 
    1010             : /**
    1011             :  * pcpu_is_populated - determines if the region is populated
    1012             :  * @chunk: chunk of interest
    1013             :  * @bit_off: chunk offset
    1014             :  * @bits: size of area
    1015             :  * @next_off: return value for the next offset to start searching
    1016             :  *
    1017             :  * For atomic allocations, check if the backing pages are populated.
    1018             :  *
    1019             :  * RETURNS:
    1020             :  * true if the backing pages are populated, false otherwise.
    1021             :  * @next_off is set so pcpu_find_block_fit() can skip over unpopulated blocks.
    1022             :  */
    1023           0 : static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
    1024             :                               int *next_off)
    1025             : {
    1026           0 :         unsigned int page_start, page_end, rs, re;
    1027             : 
    1028           0 :         page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
    1029           0 :         page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
    1030             : 
    1031           0 :         rs = page_start;
    1032           0 :         bitmap_next_clear_region(chunk->populated, &rs, &re, page_end);
    1033           0 :         if (rs >= page_end)
    1034             :                 return true;
    1035             : 
    1036           0 :         *next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
    1037           0 :         return false;
    1038             : }
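
To make the bit-to-page conversion in pcpu_is_populated() concrete, here is a small standalone C sketch of the same arithmetic. The constants (PCPU_MIN_ALLOC_SIZE of 4 bytes, 4 KiB pages) and the example numbers are assumptions for illustration, not values taken from a particular kernel configuration.

        /* Illustrative userspace model of the bit_off/bits -> page range
         * conversion in pcpu_is_populated().  Constants are assumptions. */
        #include <stdio.h>

        #define MIN_ALLOC_SIZE  4UL      /* bytes per allocation-map bit */
        #define PAGE_SIZE_      4096UL
        #define PFN_DOWN(x)     ((x) / PAGE_SIZE_)
        #define PFN_UP(x)       (((x) + PAGE_SIZE_ - 1) / PAGE_SIZE_)

        int main(void)
        {
                unsigned long bit_off = 1000, bits = 50; /* 200 bytes at byte 4000 */
                unsigned long page_start = PFN_DOWN(bit_off * MIN_ALLOC_SIZE);
                unsigned long page_end   = PFN_UP((bit_off + bits) * MIN_ALLOC_SIZE);

                /* the area spans bytes [4000, 4200) -> pages [0, 2) must be populated */
                printf("pages [%lu, %lu)\n", page_start, page_end);
                return 0;
        }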
    1039             : 
    1040             : /**
    1041             :  * pcpu_find_block_fit - finds the block index to start searching
    1042             :  * @chunk: chunk of interest
    1043             :  * @alloc_bits: size of request in allocation units
    1044             :  * @align: alignment of area (max PAGE_SIZE bytes)
    1045             :  * @pop_only: use populated regions only
    1046             :  *
    1047             :  * Given a chunk and an allocation spec, find the offset to begin searching
    1048             :  * for a free region.  This iterates over the bitmap metadata blocks to
    1049             :  * find an offset that will be guaranteed to fit the requirements.  It is
    1050             :  * not strictly first fit: if the allocation does not fit in the contig hint
    1051             :  * of a block or chunk, that block or chunk is skipped.  This errs on the
    1052             :  * side of caution to prevent excess iteration.  Poor alignment can cause
    1053             :  * the allocator to skip over blocks and chunks that have valid free areas.
    1054             :  *
    1055             :  * RETURNS:
    1056             :  * The offset in the bitmap to begin searching.
    1057             :  * -1 if no offset is found.
    1058             :  */
    1059        2173 : static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
    1060             :                                size_t align, bool pop_only)
    1061             : {
    1062        2173 :         struct pcpu_block_md *chunk_md = &chunk->chunk_md;
    1063        2173 :         int bit_off, bits, next_off;
    1064             : 
    1065             :         /*
    1066             :          * Check to see if the allocation can fit in the chunk's contig hint.
    1067             :          * This is an optimization to prevent scanning by assuming if it
    1068             :          * cannot fit in the global hint, there is memory pressure and creating
    1069             :          * a new chunk would happen soon.
    1070             :          */
    1071        2173 :         bit_off = ALIGN(chunk_md->contig_hint_start, align) -
    1072             :                   chunk_md->contig_hint_start;
    1073        2173 :         if (bit_off + alloc_bits > chunk_md->contig_hint)
    1074             :                 return -1;
    1075             : 
    1076        2172 :         bit_off = pcpu_next_hint(chunk_md, alloc_bits);
    1077        2172 :         bits = 0;
    1078        2172 :         pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
    1079        2172 :                 if (!pop_only || pcpu_is_populated(chunk, bit_off, bits,
    1080             :                                                    &next_off))
    1081             :                         break;
    1082             : 
    1083           0 :                 bit_off = next_off;
    1084           0 :                 bits = 0;
    1085             :         }
    1086             : 
    1087        2172 :         if (bit_off == pcpu_chunk_map_bits(chunk))
    1088           0 :                 return -1;
    1089             : 
    1090             :         return bit_off;
    1091             : }
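
The early-exit check at the top of pcpu_find_block_fit() can be followed with small numbers: aligning the start of the contig hint wastes some leading bits, and the request must still fit in what remains. A standalone sketch with made-up values:

        /* Illustrative model of the contig-hint check in pcpu_find_block_fit().
         * All values below are invented for the example. */
        #include <stdio.h>

        #define ALIGN_(x, a)    (((x) + (a) - 1) / (a) * (a))

        int main(void)
        {
                unsigned long contig_hint_start = 6, contig_hint = 8;
                unsigned long align = 4, alloc_bits = 7;

                /* aligning offset 6 up to 8 wastes 2 bits of the 8-bit hint */
                unsigned long bit_off = ALIGN_(contig_hint_start, align) -
                                        contig_hint_start;

                if (bit_off + alloc_bits > contig_hint)
                        printf("no fit: %lu wasted + %lu requested > %lu hint\n",
                               bit_off, alloc_bits, contig_hint);
                else
                        printf("fits with %lu bits wasted to alignment\n", bit_off);
                return 0;
        }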
    1092             : 
    1093             : /*
    1094             :  * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off()
    1095             :  * @map: the address to base the search on
    1096             :  * @size: the bitmap size in bits
    1097             :  * @start: the bitnumber to start searching at
    1098             :  * @nr: the number of zeroed bits we're looking for
    1099             :  * @align_mask: alignment mask for zero area
    1100             :  * @largest_off: offset of the largest area skipped
    1101             :  * @largest_bits: size of the largest area skipped
    1102             :  *
    1103             :  * The @align_mask should be one less than a power of 2.
    1104             :  *
    1105             :  * This is a modified version of bitmap_find_next_zero_area_off() to remember
    1106             :  * the largest area that was skipped.  This is imperfect, but in general is
    1107             :  * good enough.  The largest remembered region is the largest failed region
    1108             :  * seen.  This does not include anything we possibly skipped due to alignment.
    1109             :  * pcpu_block_update_scan() does scan backwards to try and recover what was
    1110             :  * lost to alignment.  While this can cause scanning to miss earlier possible
    1111             :  * free areas, smaller allocations will eventually fill those holes.
    1112             :  */
    1113        2172 : static unsigned long pcpu_find_zero_area(unsigned long *map,
    1114             :                                          unsigned long size,
    1115             :                                          unsigned long start,
    1116             :                                          unsigned long nr,
    1117             :                                          unsigned long align_mask,
    1118             :                                          unsigned long *largest_off,
    1119             :                                          unsigned long *largest_bits)
    1120             : {
    1121        3085 :         unsigned long index, end, i, area_off, area_bits;
    1122        3085 : again:
    1123        3085 :         index = find_next_zero_bit(map, size, start);
    1124             : 
    1125             :         /* Align allocation */
    1126        3085 :         index = __ALIGN_MASK(index, align_mask);
    1127        3085 :         area_off = index;
    1128             : 
    1129        3085 :         end = index + nr;
    1130        3085 :         if (end > size)
    1131           0 :                 return end;
    1132        3085 :         i = find_next_bit(map, end, index);
    1133        3085 :         if (i < end) {
    1134         913 :                 area_bits = i - area_off;
    1135             :                 /* remember largest unused area with best alignment */
    1136         913 :                 if (area_bits > *largest_bits ||
    1137         619 :                     (area_bits == *largest_bits && *largest_off &&
    1138         284 :                      (!area_off || __ffs(area_off) > __ffs(*largest_off)))) {
    1139         338 :                         *largest_off = area_off;
    1140         338 :                         *largest_bits = area_bits;
    1141             :                 }
    1142             : 
    1143         913 :                 start = i + 1;
    1144         913 :                 goto again;
    1145             :         }
    1146             :         return index;
    1147             : }
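
The alignment step in pcpu_find_zero_area() relies on @align_mask being one less than a power of two, so that adding the mask and clearing the low bits rounds an index up to the next aligned position. A minimal standalone illustration (the names are not the kernel's):

        /* Illustrative model of the power-of-two rounding done by the
         * alignment step above; align_mask must be one less than a power of 2. */
        #include <stdio.h>

        static unsigned long align_up_mask(unsigned long index,
                                           unsigned long align_mask)
        {
                return (index + align_mask) & ~align_mask;
        }

        int main(void)
        {
                /* rounding bit 5 up to a 4-bit boundary gives 8; 8 stays 8 */
                printf("%lu %lu\n", align_up_mask(5, 4 - 1), align_up_mask(8, 4 - 1));
                return 0;
        }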
    1148             : 
    1149             : /**
    1150             :  * pcpu_alloc_area - allocates an area from a pcpu_chunk
    1151             :  * @chunk: chunk of interest
    1152             :  * @alloc_bits: size of request in allocation units
    1153             :  * @align: alignment of area (max PAGE_SIZE)
    1154             :  * @start: bit_off to start searching
    1155             :  *
    1156             :  * This function takes in a @start offset to begin searching to fit an
    1157             :  * allocation of @alloc_bits with alignment @align.  It needs to scan
    1158             :  * the allocation map because if it fits within the block's contig hint,
    1159             :  * @start will be block->first_free. This is an attempt to fill the
    1160             :  * allocation prior to breaking the contig hint.  The allocation and
    1161             :  * boundary maps are updated accordingly if it confirms a valid
    1162             :  * free area.
    1163             :  *
    1164             :  * RETURNS:
    1165             :  * Allocated addr offset in @chunk on success.
    1166             :  * -1 if no matching area is found.
    1167             :  */
    1168        2172 : static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
    1169             :                            size_t align, int start)
    1170             : {
    1171        2172 :         struct pcpu_block_md *chunk_md = &chunk->chunk_md;
    1172        2172 :         size_t align_mask = (align) ? (align - 1) : 0;
    1173        2172 :         unsigned long area_off = 0, area_bits = 0;
    1174        2172 :         int bit_off, end, oslot;
    1175             : 
    1176        6516 :         lockdep_assert_held(&pcpu_lock);
    1177             : 
    1178        2172 :         oslot = pcpu_chunk_slot(chunk);
    1179             : 
    1180             :         /*
    1181             :          * Search to find a fit.
    1182             :          */
    1183        2172 :         end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS,
    1184             :                     pcpu_chunk_map_bits(chunk));
    1185        2172 :         bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits,
    1186             :                                       align_mask, &area_off, &area_bits);
    1187        2172 :         if (bit_off >= end)
    1188             :                 return -1;
    1189             : 
    1190        2172 :         if (area_bits)
    1191         275 :                 pcpu_block_update_scan(chunk, area_off, area_bits);
    1192             : 
    1193             :         /* update alloc map */
    1194        2172 :         bitmap_set(chunk->alloc_map, bit_off, alloc_bits);
    1195             : 
    1196             :         /* update boundary map */
    1197        2172 :         set_bit(bit_off, chunk->bound_map);
    1198        2172 :         bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
    1199        2172 :         set_bit(bit_off + alloc_bits, chunk->bound_map);
    1200             : 
    1201        2172 :         chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;
    1202             : 
    1203             :         /* update first free bit */
    1204        2172 :         if (bit_off == chunk_md->first_free)
    1205        1147 :                 chunk_md->first_free = find_next_zero_bit(
    1206        1147 :                                         chunk->alloc_map,
    1207        1147 :                                         pcpu_chunk_map_bits(chunk),
    1208             :                                         bit_off + alloc_bits);
    1209             : 
    1210        2172 :         pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits);
    1211             : 
    1212        2172 :         pcpu_chunk_relocate(chunk, oslot);
    1213             : 
    1214        2172 :         return bit_off * PCPU_MIN_ALLOC_SIZE;
    1215             : }
    1216             : 
    1217             : /**
    1218             :  * pcpu_free_area - frees the corresponding offset
    1219             :  * @chunk: chunk of interest
    1220             :  * @off: addr offset into chunk
    1221             :  *
    1222             :  * This function determines the size of an allocation to free using
    1223             :  * the boundary bitmap and clears the allocation map.
    1224             :  *
    1225             :  * RETURNS:
    1226             :  * Number of freed bytes.
    1227             :  */
    1228        1558 : static int pcpu_free_area(struct pcpu_chunk *chunk, int off)
    1229             : {
    1230        1558 :         struct pcpu_block_md *chunk_md = &chunk->chunk_md;
    1231        1558 :         int bit_off, bits, end, oslot, freed;
    1232             : 
    1233        4674 :         lockdep_assert_held(&pcpu_lock);
    1234        1558 :         pcpu_stats_area_dealloc(chunk);
    1235             : 
    1236        1558 :         oslot = pcpu_chunk_slot(chunk);
    1237             : 
    1238        1558 :         bit_off = off / PCPU_MIN_ALLOC_SIZE;
    1239             : 
    1240             :         /* find end index */
    1241        3116 :         end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
    1242        1558 :                             bit_off + 1);
    1243        1558 :         bits = end - bit_off;
    1244        1558 :         bitmap_clear(chunk->alloc_map, bit_off, bits);
    1245             : 
    1246        1558 :         freed = bits * PCPU_MIN_ALLOC_SIZE;
    1247             : 
    1248             :         /* update metadata */
    1249        1558 :         chunk->free_bytes += freed;
    1250             : 
    1251             :         /* update first free bit */
    1252        1558 :         chunk_md->first_free = min(chunk_md->first_free, bit_off);
    1253             : 
    1254        1558 :         pcpu_block_update_hint_free(chunk, bit_off, bits);
    1255             : 
    1256        1558 :         pcpu_chunk_relocate(chunk, oslot);
    1257             : 
    1258        1558 :         return freed;
    1259             : }
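
pcpu_free_area() recovers an allocation's size purely from the boundary bitmap: the allocation path set a boundary bit at the start and one past the end, so the free path only has to scan forward for the next set bit. A simplified standalone model of that bookkeeping (a plain array stands in for the kernel bitmaps; the sizes are made up):

        /* Illustrative userspace model of the boundary-bitmap bookkeeping
         * shared by pcpu_alloc_area() and pcpu_free_area(). */
        #include <stdio.h>
        #include <stdbool.h>

        #define MAP_BITS 32

        static bool bound_map[MAP_BITS + 1];

        static void mark_alloc(int bit_off, int bits)
        {
                /* the alloc path marks the start and one past the end */
                bound_map[bit_off] = true;
                bound_map[bit_off + bits] = true;
        }

        static int freed_bits(int bit_off)
        {
                /* the free path scans forward for the next boundary bit */
                int end = bit_off + 1;

                while (end <= MAP_BITS && !bound_map[end])
                        end++;
                return end - bit_off;
        }

        int main(void)
        {
                mark_alloc(8, 3);       /* a 3-bit area at offset 8 */
                mark_alloc(11, 5);      /* an adjacent 5-bit area */

                /* freeing offset 8 finds the boundary at 11 -> 3 bits freed */
                printf("freed %d bits\n", freed_bits(8));
                return 0;
        }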
    1260             : 
    1261         138 : static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits)
    1262             : {
    1263         138 :         block->scan_hint = 0;
    1264         138 :         block->contig_hint = nr_bits;
    1265         138 :         block->left_free = nr_bits;
    1266         138 :         block->right_free = nr_bits;
    1267         138 :         block->first_free = 0;
    1268         138 :         block->nr_bits = nr_bits;
    1269             : }
    1270             : 
    1271           2 : static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
    1272             : {
    1273           2 :         struct pcpu_block_md *md_block;
    1274             : 
    1275             :         /* init the chunk's block */
    1276           2 :         pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk));
    1277             : 
    1278         138 :         for (md_block = chunk->md_blocks;
    1279         138 :              md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk);
    1280         136 :              md_block++)
    1281         136 :                 pcpu_init_md_block(md_block, PCPU_BITMAP_BLOCK_BITS);
    1282           2 : }
    1283             : 
    1284             : /**
    1285             :  * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
    1286             :  * @tmp_addr: the start of the region served
    1287             :  * @map_size: size of the region served
    1288             :  *
    1289             :  * This is responsible for creating the chunks that serve the first chunk.  The
    1290             :  * base_addr is @tmp_addr rounded down to a page boundary, while the region
    1291             :  * end is rounded up.  Offsets are tracked to determine the region served.  All
    1292             :  * of this is done to appease the bitmap allocator by avoiding partial blocks.
    1293             :  *
    1294             :  * RETURNS:
    1295             :  * Chunk serving the region at @tmp_addr of @map_size.
    1296             :  */
    1297           1 : static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
    1298             :                                                          int map_size)
    1299             : {
    1300           1 :         struct pcpu_chunk *chunk;
    1301           1 :         unsigned long aligned_addr, lcm_align;
    1302           1 :         int start_offset, offset_bits, region_size, region_bits;
    1303           1 :         size_t alloc_size;
    1304             : 
    1305             :         /* region calculations */
    1306           1 :         aligned_addr = tmp_addr & PAGE_MASK;
    1307             : 
    1308           1 :         start_offset = tmp_addr - aligned_addr;
    1309             : 
    1310             :         /*
    1311             :          * Align the end of the region with the LCM of PAGE_SIZE and
    1312             :          * PCPU_BITMAP_BLOCK_SIZE.  One of these constants is a multiple of
    1313             :          * the other.
    1314             :          */
    1315           1 :         lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE);
    1316           1 :         region_size = ALIGN(start_offset + map_size, lcm_align);
    1317             : 
    1318             :         /* allocate chunk */
    1319           1 :         alloc_size = struct_size(chunk, populated,
    1320             :                                  BITS_TO_LONGS(region_size >> PAGE_SHIFT));
    1321           1 :         chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
    1322           1 :         if (!chunk)
    1323           0 :                 panic("%s: Failed to allocate %zu bytes\n", __func__,
    1324             :                       alloc_size);
    1325             : 
    1326           1 :         INIT_LIST_HEAD(&chunk->list);
    1327             : 
    1328           1 :         chunk->base_addr = (void *)aligned_addr;
    1329           1 :         chunk->start_offset = start_offset;
    1330           1 :         chunk->end_offset = region_size - chunk->start_offset - map_size;
    1331             : 
    1332           1 :         chunk->nr_pages = region_size >> PAGE_SHIFT;
    1333           1 :         region_bits = pcpu_chunk_map_bits(chunk);
    1334             : 
    1335           1 :         alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]);
    1336           1 :         chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
    1337           1 :         if (!chunk->alloc_map)
    1338           0 :                 panic("%s: Failed to allocate %zu bytes\n", __func__,
    1339             :                       alloc_size);
    1340             : 
    1341           1 :         alloc_size =
    1342           1 :                 BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]);
    1343           1 :         chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
    1344           1 :         if (!chunk->bound_map)
    1345           0 :                 panic("%s: Failed to allocate %zu bytes\n", __func__,
    1346             :                       alloc_size);
    1347             : 
    1348           1 :         alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
    1349           1 :         chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
    1350           1 :         if (!chunk->md_blocks)
    1351           0 :                 panic("%s: Failed to allocate %zu bytes\n", __func__,
    1352             :                       alloc_size);
    1353             : 
    1354             : #ifdef CONFIG_MEMCG_KMEM
    1355             :         /* first chunk isn't memcg-aware */
    1356             :         chunk->obj_cgroups = NULL;
    1357             : #endif
    1358           1 :         pcpu_init_md_blocks(chunk);
    1359             : 
    1360             :         /* manage populated page bitmap */
    1361           1 :         chunk->immutable = true;
    1362           1 :         bitmap_fill(chunk->populated, chunk->nr_pages);
    1363           1 :         chunk->nr_populated = chunk->nr_pages;
    1364           1 :         chunk->nr_empty_pop_pages = chunk->nr_pages;
    1365             : 
    1366           1 :         chunk->free_bytes = map_size;
    1367             : 
    1368           1 :         if (chunk->start_offset) {
    1369             :                 /* hide the beginning of the bitmap */
    1370           1 :                 offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
    1371           1 :                 bitmap_set(chunk->alloc_map, 0, offset_bits);
    1372           1 :                 set_bit(0, chunk->bound_map);
    1373           1 :                 set_bit(offset_bits, chunk->bound_map);
    1374             : 
    1375           1 :                 chunk->chunk_md.first_free = offset_bits;
    1376             : 
    1377           1 :                 pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
    1378             :         }
    1379             : 
    1380           1 :         if (chunk->end_offset) {
    1381             :                 /* hide the end of the bitmap */
    1382           0 :                 offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
    1383           0 :                 bitmap_set(chunk->alloc_map,
    1384           0 :                            pcpu_chunk_map_bits(chunk) - offset_bits,
    1385             :                            offset_bits);
    1386           0 :                 set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
    1387           0 :                         chunk->bound_map);
    1388           0 :                 set_bit(region_bits, chunk->bound_map);
    1389             : 
    1390           0 :                 pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
    1391             :                                              - offset_bits, offset_bits);
    1392             :         }
    1393             : 
    1394           1 :         return chunk;
    1395             : }
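
The region calculation above can be followed with concrete numbers. The sketch below assumes 4 KiB pages and PCPU_BITMAP_BLOCK_SIZE equal to PAGE_SIZE (so their lcm is also 4 KiB); the example address and map size are invented for illustration.

        /* Illustrative model of the first-chunk region calculation in
         * pcpu_alloc_first_chunk(); sizes and addresses are assumptions. */
        #include <stdio.h>

        #define PAGE_SIZE_      4096UL
        #define LCM_            4096UL  /* lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE) */
        #define ALIGN_(x, a)    (((x) + (a) - 1) / (a) * (a))

        int main(void)
        {
                unsigned long tmp_addr = 0x100A00, map_size = 0x3000;

                unsigned long aligned_addr = tmp_addr & ~(PAGE_SIZE_ - 1);
                unsigned long start_offset = tmp_addr - aligned_addr;
                unsigned long region_size  = ALIGN_(start_offset + map_size, LCM_);
                unsigned long end_offset   = region_size - start_offset - map_size;

                /* 0xa00 hidden at the front, 0x600 at the back, 4 pages total */
                printf("start_offset=%#lx region_size=%#lx end_offset=%#lx\n",
                       start_offset, region_size, end_offset);
                return 0;
        }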
    1396             : 
    1397           1 : static struct pcpu_chunk *pcpu_alloc_chunk(enum pcpu_chunk_type type, gfp_t gfp)
    1398             : {
    1399           1 :         struct pcpu_chunk *chunk;
    1400           1 :         int region_bits;
    1401             : 
    1402           1 :         chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
    1403           1 :         if (!chunk)
    1404             :                 return NULL;
    1405             : 
    1406           1 :         INIT_LIST_HEAD(&chunk->list);
    1407           1 :         chunk->nr_pages = pcpu_unit_pages;
    1408           1 :         region_bits = pcpu_chunk_map_bits(chunk);
    1409             : 
    1410           1 :         chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
    1411             :                                            sizeof(chunk->alloc_map[0]), gfp);
    1412           1 :         if (!chunk->alloc_map)
    1413           0 :                 goto alloc_map_fail;
    1414             : 
    1415           1 :         chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
    1416             :                                            sizeof(chunk->bound_map[0]), gfp);
    1417           1 :         if (!chunk->bound_map)
    1418           0 :                 goto bound_map_fail;
    1419             : 
    1420           1 :         chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) *
    1421             :                                            sizeof(chunk->md_blocks[0]), gfp);
    1422           1 :         if (!chunk->md_blocks)
    1423           0 :                 goto md_blocks_fail;
    1424             : 
    1425             : #ifdef CONFIG_MEMCG_KMEM
    1426             :         if (pcpu_is_memcg_chunk(type)) {
    1427             :                 chunk->obj_cgroups =
    1428             :                         pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) *
    1429             :                                         sizeof(struct obj_cgroup *), gfp);
    1430             :                 if (!chunk->obj_cgroups)
    1431             :                         goto objcg_fail;
    1432             :         }
    1433             : #endif
    1434             : 
    1435           1 :         pcpu_init_md_blocks(chunk);
    1436             : 
    1437             :         /* init metadata */
    1438           1 :         chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;
    1439             : 
    1440           1 :         return chunk;
    1441             : 
    1442             : #ifdef CONFIG_MEMCG_KMEM
    1443             : objcg_fail:
    1444             :         pcpu_mem_free(chunk->md_blocks);
    1445             : #endif
    1446           0 : md_blocks_fail:
    1447           0 :         pcpu_mem_free(chunk->bound_map);
    1448           0 : bound_map_fail:
    1449           0 :         pcpu_mem_free(chunk->alloc_map);
    1450           0 : alloc_map_fail:
    1451           0 :         pcpu_mem_free(chunk);
    1452             : 
    1453           0 :         return NULL;
    1454             : }
    1455             : 
    1456           0 : static void pcpu_free_chunk(struct pcpu_chunk *chunk)
    1457             : {
    1458           0 :         if (!chunk)
    1459             :                 return;
    1460             : #ifdef CONFIG_MEMCG_KMEM
    1461             :         pcpu_mem_free(chunk->obj_cgroups);
    1462             : #endif
    1463           0 :         pcpu_mem_free(chunk->md_blocks);
    1464           0 :         pcpu_mem_free(chunk->bound_map);
    1465           0 :         pcpu_mem_free(chunk->alloc_map);
    1466           0 :         pcpu_mem_free(chunk);
    1467             : }
    1468             : 
    1469             : /**
    1470             :  * pcpu_chunk_populated - post-population bookkeeping
    1471             :  * @chunk: pcpu_chunk which got populated
    1472             :  * @page_start: the start page
    1473             :  * @page_end: the end page
    1474             :  *
    1475             :  * Pages in [@page_start,@page_end) have been populated to @chunk.  Update
    1476             :  * the bookkeeping information accordingly.  Must be called after each
    1477             :  * successful population.
    1481             :  */
    1482           1 : static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
    1483             :                                  int page_end)
    1484             : {
    1485           1 :         int nr = page_end - page_start;
    1486             : 
    1487           3 :         lockdep_assert_held(&pcpu_lock);
    1488             : 
    1489           1 :         bitmap_set(chunk->populated, page_start, nr);
    1490           1 :         chunk->nr_populated += nr;
    1491           1 :         pcpu_nr_populated += nr;
    1492             : 
    1493           1 :         pcpu_update_empty_pages(chunk, nr);
    1494           1 : }
    1495             : 
    1496             : /**
    1497             :  * pcpu_chunk_depopulated - post-depopulation bookkeeping
    1498             :  * @chunk: pcpu_chunk which got depopulated
    1499             :  * @page_start: the start page
    1500             :  * @page_end: the end page
    1501             :  *
    1502             :  * Pages in [@page_start,@page_end) have been depopulated from @chunk.
    1503             :  * Update the bookkeeping information accordingly.  Must be called after
    1504             :  * each successful depopulation.
    1505             :  */
    1506           0 : static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
    1507             :                                    int page_start, int page_end)
    1508             : {
    1509           0 :         int nr = page_end - page_start;
    1510             : 
    1511           0 :         lockdep_assert_held(&pcpu_lock);
    1512             : 
    1513           0 :         bitmap_clear(chunk->populated, page_start, nr);
    1514           0 :         chunk->nr_populated -= nr;
    1515           0 :         pcpu_nr_populated -= nr;
    1516             : 
    1517           0 :         pcpu_update_empty_pages(chunk, -nr);
    1518           0 : }
    1519             : 
    1520             : /*
    1521             :  * Chunk management implementation.
    1522             :  *
    1523             :  * To allow different implementations, chunk alloc/free and
    1524             :  * [de]population are implemented in a separate file which is pulled
    1525             :  * into this file and compiled together.  The following functions
    1526             :  * should be implemented.
    1527             :  *
    1528             :  * pcpu_populate_chunk          - populate the specified range of a chunk
    1529             :  * pcpu_depopulate_chunk        - depopulate the specified range of a chunk
    1530             :  * pcpu_create_chunk            - create a new chunk
    1531             :  * pcpu_destroy_chunk           - destroy a chunk, always preceded by full depop
    1532             :  * pcpu_addr_to_page            - translate address to the backing page
    1533             :  * pcpu_verify_alloc_info       - check alloc_info is acceptable during init
    1534             :  */
    1535             : static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
    1536             :                                int page_start, int page_end, gfp_t gfp);
    1537             : static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
    1538             :                                   int page_start, int page_end);
    1539             : static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
    1540             :                                             gfp_t gfp);
    1541             : static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
    1542             : static struct page *pcpu_addr_to_page(void *addr);
    1543             : static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
    1544             : 
    1545             : #ifdef CONFIG_NEED_PER_CPU_KM
    1546             : #include "percpu-km.c"
    1547             : #else
    1548             : #include "percpu-vm.c"
    1549             : #endif
    1550             : 
    1551             : /**
    1552             :  * pcpu_chunk_addr_search - determine chunk containing specified address
    1553             :  * @addr: address for which the chunk needs to be determined.
    1554             :  *
    1555             :  * This is an internal function that handles all but static allocations.
    1556             :  * Static percpu address values should never be passed into the allocator.
    1557             :  *
    1558             :  * RETURNS:
    1559             :  * The address of the found chunk.
    1560             :  */
    1561        1558 : static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
    1562             : {
    1563             :         /* is it in the dynamic region (first chunk)? */
    1564        1558 :         if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
    1565             :                 return pcpu_first_chunk;
    1566             : 
    1567             :         /* is it in the reserved region? */
    1568         127 :         if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
    1569             :                 return pcpu_reserved_chunk;
    1570             : 
    1571             :         /*
    1572             :          * The address is relative to unit0 which might be unused and
    1573             :          * thus unmapped.  Offset the address to the unit space of the
    1574             :          * current processor before looking it up in the vmalloc
    1575             :          * space.  Note that any possible cpu id can be used here, so
    1576             :          * there's no need to worry about preemption or cpu hotplug.
    1577             :          */
    1578         127 :         addr += pcpu_unit_offsets[raw_smp_processor_id()];
    1579         127 :         return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
    1580             : }
    1581             : 
    1582             : #ifdef CONFIG_MEMCG_KMEM
    1583             : static enum pcpu_chunk_type pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
    1584             :                                                      struct obj_cgroup **objcgp)
    1585             : {
    1586             :         struct obj_cgroup *objcg;
    1587             : 
    1588             :         if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT))
    1589             :                 return PCPU_CHUNK_ROOT;
    1590             : 
    1591             :         objcg = get_obj_cgroup_from_current();
    1592             :         if (!objcg)
    1593             :                 return PCPU_CHUNK_ROOT;
    1594             : 
    1595             :         if (obj_cgroup_charge(objcg, gfp, size * num_possible_cpus())) {
    1596             :                 obj_cgroup_put(objcg);
    1597             :                 return PCPU_FAIL_ALLOC;
    1598             :         }
    1599             : 
    1600             :         *objcgp = objcg;
    1601             :         return PCPU_CHUNK_MEMCG;
    1602             : }
    1603             : 
    1604             : static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
    1605             :                                        struct pcpu_chunk *chunk, int off,
    1606             :                                        size_t size)
    1607             : {
    1608             :         if (!objcg)
    1609             :                 return;
    1610             : 
    1611             :         if (chunk) {
    1612             :                 chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg;
    1613             : 
    1614             :                 rcu_read_lock();
    1615             :                 mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
    1616             :                                 size * num_possible_cpus());
    1617             :                 rcu_read_unlock();
    1618             :         } else {
    1619             :                 obj_cgroup_uncharge(objcg, size * num_possible_cpus());
    1620             :                 obj_cgroup_put(objcg);
    1621             :         }
    1622             : }
    1623             : 
    1624             : static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
    1625             : {
    1626             :         struct obj_cgroup *objcg;
    1627             : 
    1628             :         if (!pcpu_is_memcg_chunk(pcpu_chunk_type(chunk)))
    1629             :                 return;
    1630             : 
    1631             :         objcg = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT];
    1632             :         chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL;
    1633             : 
    1634             :         obj_cgroup_uncharge(objcg, size * num_possible_cpus());
    1635             : 
    1636             :         rcu_read_lock();
    1637             :         mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
    1638             :                         -(size * num_possible_cpus()));
    1639             :         rcu_read_unlock();
    1640             : 
    1641             :         obj_cgroup_put(objcg);
    1642             : }
    1643             : 
    1644             : #else /* CONFIG_MEMCG_KMEM */
    1645             : static enum pcpu_chunk_type
    1646        2172 : pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp)
    1647             : {
    1648        2172 :         return PCPU_CHUNK_ROOT;
    1649             : }
    1650             : 
    1651        2172 : static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
    1652             :                                        struct pcpu_chunk *chunk, int off,
    1653             :                                        size_t size)
    1654             : {
    1655        2172 : }
    1656             : 
    1657        1558 : static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
    1658             : {
    1659        1558 : }
    1660             : #endif /* CONFIG_MEMCG_KMEM */
    1661             : 
    1662             : /**
    1663             :  * pcpu_alloc - the percpu allocator
    1664             :  * @size: size of area to allocate in bytes
    1665             :  * @align: alignment of area (max PAGE_SIZE)
    1666             :  * @reserved: allocate from the reserved chunk if available
    1667             :  * @gfp: allocation flags
    1668             :  *
    1669             :  * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
    1670             :  * contain %GFP_KERNEL, the allocation is atomic. If @gfp has __GFP_NOWARN
    1671             :  * then no warning will be triggered on invalid or failed allocation
    1672             :  * requests.
    1673             :  *
    1674             :  * RETURNS:
    1675             :  * Percpu pointer to the allocated area on success, NULL on failure.
    1676             :  */
    1677        2172 : static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
    1678             :                                  gfp_t gfp)
    1679             : {
    1680        2172 :         gfp_t pcpu_gfp;
    1681        2172 :         bool is_atomic;
    1682        2172 :         bool do_warn;
    1683        2172 :         enum pcpu_chunk_type type;
    1684        2172 :         struct list_head *pcpu_slot;
    1685        2172 :         struct obj_cgroup *objcg = NULL;
    1686        2172 :         static int warn_limit = 10;
    1687        2172 :         struct pcpu_chunk *chunk, *next;
    1688        2172 :         const char *err;
    1689        2172 :         int slot, off, cpu, ret;
    1690        2172 :         unsigned long flags;
    1691        2172 :         void __percpu *ptr;
    1692        2172 :         size_t bits, bit_align;
    1693             : 
    1694        2172 :         gfp = current_gfp_context(gfp);
    1695             :         /* whitelisted flags that can be passed to the backing allocators */
    1696        2172 :         pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
    1697        2172 :         is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
    1698        2172 :         do_warn = !(gfp & __GFP_NOWARN);
    1699             : 
    1700             :         /*
    1701             :          * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
    1702             :          * therefore alignment must be a minimum of that many bytes.
    1703             :          * An allocation may have internal fragmentation from rounding up
    1704             :  * by up to PCPU_MIN_ALLOC_SIZE - 1 bytes.
    1705             :          */
    1706        2172 :         if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
    1707           1 :                 align = PCPU_MIN_ALLOC_SIZE;
    1708             : 
    1709        2172 :         size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
    1710        2172 :         bits = size >> PCPU_MIN_ALLOC_SHIFT;
    1711        2172 :         bit_align = align >> PCPU_MIN_ALLOC_SHIFT;
    1712             : 
    1713        4344 :         if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
    1714             :                      !is_power_of_2(align))) {
    1715           0 :                 WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n",
    1716             :                      size, align);
    1717           0 :                 return NULL;
    1718             :         }
    1719             : 
    1720        2172 :         type = pcpu_memcg_pre_alloc_hook(size, gfp, &objcg);
    1721        2172 :         if (unlikely(type == PCPU_FAIL_ALLOC))
    1722             :                 return NULL;
    1723        2172 :         pcpu_slot = pcpu_chunk_list(type);
    1724             : 
    1725        2172 :         if (!is_atomic) {
    1726             :                 /*
    1727             :                  * pcpu_balance_workfn() allocates memory under this mutex,
    1728             :                  * and it may wait for memory reclaim. Allow current task
    1729             :                  * to become OOM victim, in case of memory pressure.
    1730             :                  */
    1731        2172 :                 if (gfp & __GFP_NOFAIL) {
    1732           0 :                         mutex_lock(&pcpu_alloc_mutex);
    1733        2172 :                 } else if (mutex_lock_killable(&pcpu_alloc_mutex)) {
    1734        2172 :                         pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
    1735             :                         return NULL;
    1736             :                 }
    1737             :         }
    1738             : 
    1739        2172 :         spin_lock_irqsave(&pcpu_lock, flags);
    1740             : 
    1741             :         /* serve reserved allocations from the reserved chunk if available */
    1742        2172 :         if (reserved && pcpu_reserved_chunk) {
    1743           0 :                 chunk = pcpu_reserved_chunk;
    1744             : 
    1745           0 :                 off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
    1746           0 :                 if (off < 0) {
    1747           0 :                         err = "alloc from reserved chunk failed";
    1748           0 :                         goto fail_unlock;
    1749             :                 }
    1750             : 
    1751           0 :                 off = pcpu_alloc_area(chunk, bits, bit_align, off);
    1752           0 :                 if (off >= 0)
    1753           0 :                         goto area_found;
    1754             : 
    1755           0 :                 err = "alloc from reserved chunk failed";
    1756           0 :                 goto fail_unlock;
    1757             :         }
    1758             : 
    1759        2172 : restart:
    1760             :         /* search through normal chunks */
    1761       14801 :         for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
    1762       12630 :                 list_for_each_entry_safe(chunk, next, &pcpu_slot[slot], list) {
    1763        2173 :                         off = pcpu_find_block_fit(chunk, bits, bit_align,
    1764             :                                                   is_atomic);
    1765        2173 :                         if (off < 0) {
    1766           1 :                                 if (slot < PCPU_SLOT_FAIL_THRESHOLD)
    1767           1 :                                         pcpu_chunk_move(chunk, 0);
    1768           1 :                                 continue;
    1769             :                         }
    1770             : 
    1771        2172 :                         off = pcpu_alloc_area(chunk, bits, bit_align, off);
    1772        2172 :                         if (off >= 0)
    1773        2172 :                                 goto area_found;
    1774             : 
    1775             :                 }
    1776             :         }
    1777             : 
    1778           0 :         spin_unlock_irqrestore(&pcpu_lock, flags);
    1779             : 
    1780             :         /*
    1781             :          * No space left.  Create a new chunk.  We don't want multiple
    1782             :          * tasks to create chunks simultaneously.  Serialize and create iff
    1783             :          * there's still no empty chunk after grabbing the mutex.
    1784             :          */
    1785           0 :         if (is_atomic) {
    1786           0 :                 err = "atomic alloc failed, no space left";
    1787           0 :                 goto fail;
    1788             :         }
    1789             : 
    1790           0 :         if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
    1791           0 :                 chunk = pcpu_create_chunk(type, pcpu_gfp);
    1792           0 :                 if (!chunk) {
    1793           0 :                         err = "failed to allocate new chunk";
    1794           0 :                         goto fail;
    1795             :                 }
    1796             : 
    1797           0 :                 spin_lock_irqsave(&pcpu_lock, flags);
    1798           0 :                 pcpu_chunk_relocate(chunk, -1);
    1799             :         } else {
    1800           0 :                 spin_lock_irqsave(&pcpu_lock, flags);
    1801             :         }
    1802             : 
    1803           0 :         goto restart;
    1804             : 
    1805        2172 : area_found:
    1806        2172 :         pcpu_stats_area_alloc(chunk, size);
    1807        2172 :         spin_unlock_irqrestore(&pcpu_lock, flags);
    1808             : 
    1809             :         /* populate if not all pages are already there */
    1810        2172 :         if (!is_atomic) {
    1811        2172 :                 unsigned int page_start, page_end, rs, re;
    1812             : 
    1813        2172 :                 page_start = PFN_DOWN(off);
    1814        2172 :                 page_end = PFN_UP(off + size);
    1815             : 
    1816        2172 :                 bitmap_for_each_clear_region(chunk->populated, rs, re,
    1817             :                                              page_start, page_end) {
    1818           0 :                         WARN_ON(chunk->immutable);
    1819             : 
    1820           0 :                         ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);
    1821             : 
    1822           0 :                         spin_lock_irqsave(&pcpu_lock, flags);
    1823           0 :                         if (ret) {
    1824           0 :                                 pcpu_free_area(chunk, off);
    1825           0 :                                 err = "failed to populate";
    1826           0 :                                 goto fail_unlock;
    1827             :                         }
    1828           0 :                         pcpu_chunk_populated(chunk, rs, re);
    1829           0 :                         spin_unlock_irqrestore(&pcpu_lock, flags);
    1830             :                 }
    1831             : 
    1832        2172 :                 mutex_unlock(&pcpu_alloc_mutex);
    1833             :         }
    1834             : 
    1835        2172 :         if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
    1836           4 :                 pcpu_schedule_balance_work();
    1837             : 
    1838             :         /* clear the areas and return address relative to base address */
    1839       10860 :         for_each_possible_cpu(cpu)
    1840        8688 :                 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
    1841             : 
    1842        2172 :         ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
    1843        2172 :         kmemleak_alloc_percpu(ptr, size, gfp);
    1844             : 
    1845        2172 :         trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
    1846             :                         chunk->base_addr, off, ptr);
    1847             : 
    1848        2172 :         pcpu_memcg_post_alloc_hook(objcg, chunk, off, size);
    1849             : 
    1850        2172 :         return ptr;
    1851             : 
    1852           0 : fail_unlock:
    1853           0 :         spin_unlock_irqrestore(&pcpu_lock, flags);
    1854           0 : fail:
    1855           0 :         trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
    1856             : 
    1857           0 :         if (!is_atomic && do_warn && warn_limit) {
    1858           0 :                 pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
    1859             :                         size, align, is_atomic, err);
    1860           0 :                 dump_stack();
    1861           0 :                 if (!--warn_limit)
    1862           0 :                         pr_info("limit reached, disable warning\n");
    1863             :         }
    1864           0 :         if (is_atomic) {
    1865             :                 /* see the flag handling in pcpu_balance_workfn() */
    1866           0 :                 pcpu_atomic_alloc_failed = true;
    1867           0 :                 pcpu_schedule_balance_work();
    1868             :         } else {
    1869           0 :                 mutex_unlock(&pcpu_alloc_mutex);
    1870             :         }
    1871             : 
    1872        2172 :         pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
    1873             : 
    1874             :         return NULL;
    1875             : }
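
The size/alignment normalization near the top of pcpu_alloc() turns byte quantities into allocation-map bits. A small worked example, assuming the usual PCPU_MIN_ALLOC_SIZE of 4 bytes (a shift of 2), which is an assumption here rather than a value read from a configuration:

        /* Illustrative model of pcpu_alloc()'s size/align normalization. */
        #include <stdio.h>

        #define MIN_ALLOC_SHIFT 2
        #define MIN_ALLOC_SIZE  (1UL << MIN_ALLOC_SHIFT)
        #define ALIGN_(x, a)    (((x) + (a) - 1) & ~((a) - 1))

        int main(void)
        {
                size_t size = 10, align = 8;

                size = ALIGN_(size, MIN_ALLOC_SIZE);            /* 10 -> 12 bytes */
                size_t bits = size >> MIN_ALLOC_SHIFT;          /* 3 allocation-map bits */
                size_t bit_align = align >> MIN_ALLOC_SHIFT;    /* 8-byte align -> 2 bits */

                printf("bits=%zu bit_align=%zu\n", bits, bit_align);
                return 0;
        }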
    1876             : 
    1877             : /**
    1878             :  * __alloc_percpu_gfp - allocate dynamic percpu area
    1879             :  * @size: size of area to allocate in bytes
    1880             :  * @align: alignment of area (max PAGE_SIZE)
    1881             :  * @gfp: allocation flags
    1882             :  *
    1883             :  * Allocate zero-filled percpu area of @size bytes aligned at @align.  If
    1884             :  * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
    1885             :  * be called from any context but is a lot more likely to fail. If @gfp
    1886             :  * has __GFP_NOWARN then no warning will be triggered on invalid or failed
    1887             :  * allocation requests.
    1888             :  *
    1889             :  * RETURNS:
    1890             :  * Percpu pointer to the allocated area on success, NULL on failure.
    1891             :  */
    1892         323 : void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
    1893             : {
    1894         323 :         return pcpu_alloc(size, align, false, gfp);
    1895             : }
    1896             : EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
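
Usage note: the gfp argument mainly matters for callers that cannot sleep.  A minimal
sketch, assuming a hypothetical driver-owned counter (none of these names come from
percpu.c), of an allocation from a context that must not block:

        #include <linux/errno.h>
        #include <linux/gfp.h>
        #include <linux/percpu.h>

        /* hypothetical per-CPU counter owned by some driver */
        static unsigned long __percpu *example_counters;

        static int example_init_nosleep(void)
        {
                /*
                 * GFP_NOWAIT does not contain GFP_KERNEL, so the request is
                 * treated as atomic: it is served only from already populated
                 * pages and is therefore much more likely to return NULL.
                 */
                example_counters = __alloc_percpu_gfp(sizeof(unsigned long),
                                                      __alignof__(unsigned long),
                                                      GFP_NOWAIT);
                return example_counters ? 0 : -ENOMEM;
        }
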
    1897             : 
    1898             : /**
    1899             :  * __alloc_percpu - allocate dynamic percpu area
    1900             :  * @size: size of area to allocate in bytes
    1901             :  * @align: alignment of area (max PAGE_SIZE)
    1902             :  *
    1903             :  * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
    1904             :  */
    1905        1849 : void __percpu *__alloc_percpu(size_t size, size_t align)
    1906             : {
    1907        1849 :         return pcpu_alloc(size, align, false, GFP_KERNEL);
    1908             : }
    1909             : EXPORT_SYMBOL_GPL(__alloc_percpu);
    1910             : 
    1911             : /**
    1912             :  * __alloc_reserved_percpu - allocate reserved percpu area
    1913             :  * @size: size of area to allocate in bytes
    1914             :  * @align: alignment of area (max PAGE_SIZE)
    1915             :  *
    1916             :  * Allocate zero-filled percpu area of @size bytes aligned at @align
    1917             :  * from reserved percpu area if arch has set it up; otherwise,
    1918             :  * allocation is served from the same dynamic area.  Might sleep.
    1919             :  * Might trigger writeouts.
    1920             :  *
    1921             :  * CONTEXT:
    1922             :  * Does GFP_KERNEL allocation.
    1923             :  *
    1924             :  * RETURNS:
    1925             :  * Percpu pointer to the allocated area on success, NULL on failure.
    1926             :  */
    1927           0 : void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
    1928             : {
    1929           0 :         return pcpu_alloc(size, align, true, GFP_KERNEL);
    1930             : }
    1931             : 
    1932             : /**
     1933             :  * __pcpu_balance_workfn - manage the number of free chunks and populated pages
    1934             :  * @type: chunk type
    1935             :  *
    1936             :  * Reclaim all fully free chunks except for the first one.  This is also
    1937             :  * responsible for maintaining the pool of empty populated pages.  However,
     1938             :  * it is possible that this is called when physical memory is scarce, causing
     1939             :  * the OOM killer to be triggered.  We should avoid doing so until an actual
     1940             :  * allocation causes the failure, as it is possible that requests can be
     1941             :  * serviced from already backed regions.
    1942             :  */
    1943           1 : static void __pcpu_balance_workfn(enum pcpu_chunk_type type)
    1944             : {
    1945             :         /* gfp flags passed to underlying allocators */
    1946           1 :         const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
    1947           1 :         LIST_HEAD(to_free);
    1948           1 :         struct list_head *pcpu_slot = pcpu_chunk_list(type);
    1949           1 :         struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
    1950           1 :         struct pcpu_chunk *chunk, *next;
    1951           1 :         int slot, nr_to_pop, ret;
    1952             : 
    1953             :         /*
    1954             :          * There's no reason to keep around multiple unused chunks and VM
    1955             :          * areas can be scarce.  Destroy all free chunks except for one.
    1956             :          */
    1957           1 :         mutex_lock(&pcpu_alloc_mutex);
    1958           1 :         spin_lock_irq(&pcpu_lock);
    1959             : 
    1960           1 :         list_for_each_entry_safe(chunk, next, free_head, list) {
    1961           0 :                 WARN_ON(chunk->immutable);
    1962             : 
    1963             :                 /* spare the first one */
    1964           0 :                 if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
    1965           0 :                         continue;
    1966             : 
    1967           0 :                 list_move(&chunk->list, &to_free);
    1968             :         }
    1969             : 
    1970           1 :         spin_unlock_irq(&pcpu_lock);
    1971             : 
    1972           1 :         list_for_each_entry_safe(chunk, next, &to_free, list) {
    1973           0 :                 unsigned int rs, re;
    1974             : 
    1975           0 :                 bitmap_for_each_set_region(chunk->populated, rs, re, 0,
    1976             :                                            chunk->nr_pages) {
    1977           0 :                         pcpu_depopulate_chunk(chunk, rs, re);
    1978           0 :                         spin_lock_irq(&pcpu_lock);
    1979           0 :                         pcpu_chunk_depopulated(chunk, rs, re);
    1980           0 :                         spin_unlock_irq(&pcpu_lock);
    1981             :                 }
    1982           0 :                 pcpu_destroy_chunk(chunk);
    1983           0 :                 cond_resched();
    1984             :         }
    1985             : 
    1986             :         /*
    1987             :          * Ensure there are certain number of free populated pages for
    1988             :          * atomic allocs.  Fill up from the most packed so that atomic
    1989             :          * allocs don't increase fragmentation.  If atomic allocation
    1990             :          * failed previously, always populate the maximum amount.  This
    1991             :          * should prevent atomic allocs larger than PAGE_SIZE from keeping
    1992             :          * failing indefinitely; however, large atomic allocs are not
    1993             :          * something we support properly and can be highly unreliable and
    1994             :          * inefficient.
    1995             :          */
    1996           1 : retry_pop:
    1997           2 :         if (pcpu_atomic_alloc_failed) {
    1998           0 :                 nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
    1999             :                 /* best effort anyway, don't worry about synchronization */
    2000           0 :                 pcpu_atomic_alloc_failed = false;
    2001             :         } else {
    2002           2 :                 nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
    2003             :                                   pcpu_nr_empty_pop_pages,
    2004             :                                   0, PCPU_EMPTY_POP_PAGES_HIGH);
    2005             :         }
    2006             : 
    2007          22 :         for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
    2008          18 :                 unsigned int nr_unpop = 0, rs, re;
    2009             : 
    2010          18 :                 if (!nr_to_pop)
    2011             :                         break;
    2012             : 
    2013          18 :                 spin_lock_irq(&pcpu_lock);
    2014          20 :                 list_for_each_entry(chunk, &pcpu_slot[slot], list) {
    2015           3 :                         nr_unpop = chunk->nr_pages - chunk->nr_populated;
    2016           3 :                         if (nr_unpop)
    2017             :                                 break;
    2018             :                 }
    2019          18 :                 spin_unlock_irq(&pcpu_lock);
    2020             : 
    2021          18 :                 if (!nr_unpop)
    2022          17 :                         continue;
    2023             : 
    2024             :                 /* @chunk can't go away while pcpu_alloc_mutex is held */
    2025           1 :                 bitmap_for_each_clear_region(chunk->populated, rs, re, 0,
    2026             :                                              chunk->nr_pages) {
    2027           1 :                         int nr = min_t(int, re - rs, nr_to_pop);
    2028             : 
    2029           1 :                         ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
    2030           1 :                         if (!ret) {
    2031           1 :                                 nr_to_pop -= nr;
    2032           1 :                                 spin_lock_irq(&pcpu_lock);
    2033           1 :                                 pcpu_chunk_populated(chunk, rs, rs + nr);
    2034           2 :                                 spin_unlock_irq(&pcpu_lock);
    2035             :                         } else {
    2036             :                                 nr_to_pop = 0;
    2037             :                         }
    2038             : 
    2039           1 :                         if (!nr_to_pop)
    2040             :                                 break;
    2041             :                 }
    2042             :         }
    2043             : 
    2044           2 :         if (nr_to_pop) {
    2045             :                 /* ran out of chunks to populate, create a new one and retry */
    2046           1 :                 chunk = pcpu_create_chunk(type, gfp);
    2047           1 :                 if (chunk) {
    2048           1 :                         spin_lock_irq(&pcpu_lock);
    2049           1 :                         pcpu_chunk_relocate(chunk, -1);
    2050           1 :                         spin_unlock_irq(&pcpu_lock);
    2051           1 :                         goto retry_pop;
    2052             :                 }
    2053             :         }
    2054             : 
    2055           1 :         mutex_unlock(&pcpu_alloc_mutex);
    2056           1 : }
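
Editorial illustration: the depopulation loop above visits chunk->populated one
contiguous run of set bits at a time.  A standalone sketch of that iteration pattern
(not code from percpu.c; the 16-page bitmap size is assumed):

        #include <linux/bitmap.h>
        #include <linux/printk.h>

        /* dump each contiguous populated region [rs, re) of a 16-page chunk bitmap */
        static void example_dump_populated(const unsigned long *populated)
        {
                unsigned int rs, re;

                bitmap_for_each_set_region(populated, rs, re, 0, 16)
                        pr_info("pages [%u, %u) are populated\n", rs, re);
        }
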
    2057             : 
    2058             : /**
     2059             :  * pcpu_balance_workfn - manage the number of free chunks and populated pages
    2060             :  * @work: unused
    2061             :  *
    2062             :  * Call __pcpu_balance_workfn() for each chunk type.
    2063             :  */
    2064           1 : static void pcpu_balance_workfn(struct work_struct *work)
    2065             : {
    2066           1 :         enum pcpu_chunk_type type;
    2067             : 
    2068           2 :         for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
    2069           1 :                 __pcpu_balance_workfn(type);
    2070           1 : }
    2071             : 
    2072             : /**
    2073             :  * free_percpu - free percpu area
    2074             :  * @ptr: pointer to area to free
    2075             :  *
    2076             :  * Free percpu area @ptr.
    2077             :  *
    2078             :  * CONTEXT:
    2079             :  * Can be called from atomic context.
    2080             :  */
    2081        1653 : void free_percpu(void __percpu *ptr)
    2082             : {
    2083        1653 :         void *addr;
    2084        1653 :         struct pcpu_chunk *chunk;
    2085        1653 :         unsigned long flags;
    2086        1653 :         int size, off;
    2087        1653 :         bool need_balance = false;
    2088        1653 :         struct list_head *pcpu_slot;
    2089             : 
    2090        1653 :         if (!ptr)
    2091             :                 return;
    2092             : 
    2093        1558 :         kmemleak_free_percpu(ptr);
    2094             : 
    2095        1558 :         addr = __pcpu_ptr_to_addr(ptr);
    2096             : 
    2097        1558 :         spin_lock_irqsave(&pcpu_lock, flags);
    2098             : 
    2099        1558 :         chunk = pcpu_chunk_addr_search(addr);
    2100        1558 :         off = addr - chunk->base_addr;
    2101             : 
    2102        1558 :         size = pcpu_free_area(chunk, off);
    2103             : 
    2104        1558 :         pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk));
    2105             : 
    2106        1558 :         pcpu_memcg_free_hook(chunk, off, size);
    2107             : 
     2108             :         /* if there is more than one fully free chunk, wake up the grim reaper */
    2109        1558 :         if (chunk->free_bytes == pcpu_unit_size) {
    2110           0 :                 struct pcpu_chunk *pos;
    2111             : 
    2112           0 :                 list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
    2113           0 :                         if (pos != chunk) {
    2114             :                                 need_balance = true;
    2115             :                                 break;
    2116             :                         }
    2117             :         }
    2118             : 
    2119        1558 :         trace_percpu_free_percpu(chunk->base_addr, off, ptr);
    2120             : 
    2121        1558 :         spin_unlock_irqrestore(&pcpu_lock, flags);
    2122             : 
    2123        1558 :         if (need_balance)
    2124           0 :                 pcpu_schedule_balance_work();
    2125             : }
    2126             : EXPORT_SYMBOL_GPL(free_percpu);
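
Putting the allocation and free paths together, a typical dynamic percpu lifecycle
looks roughly like the sketch below (struct and function names are illustrative, not
from percpu.c):

        #include <linux/cpumask.h>
        #include <linux/errno.h>
        #include <linux/percpu.h>

        struct example_stats {
                u64 packets;
                u64 bytes;
        };

        static struct example_stats __percpu *example_stats;

        static int example_start(void)
        {
                /* alloc_percpu() expands to __alloc_percpu(sizeof, __alignof__) */
                example_stats = alloc_percpu(struct example_stats);
                return example_stats ? 0 : -ENOMEM;
        }

        static void example_count(unsigned int bytes)
        {
                /* get_cpu_ptr() disables preemption and returns this CPU's copy */
                struct example_stats *s = get_cpu_ptr(example_stats);

                s->packets++;
                s->bytes += bytes;
                put_cpu_ptr(example_stats);
        }

        static u64 example_total_packets(void)
        {
                u64 sum = 0;
                int cpu;

                /* readers walk every possible CPU's copy */
                for_each_possible_cpu(cpu)
                        sum += per_cpu_ptr(example_stats, cpu)->packets;
                return sum;
        }

        static void example_stop(void)
        {
                /* free_percpu() is NULL-safe and may be called from atomic context */
                free_percpu(example_stats);
                example_stats = NULL;
        }
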
    2127             : 
    2128         333 : bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
    2129             : {
    2130             : #ifdef CONFIG_SMP
    2131         333 :         const size_t static_size = __per_cpu_end - __per_cpu_start;
    2132         333 :         void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
    2133         333 :         unsigned int cpu;
    2134             : 
    2135        1553 :         for_each_possible_cpu(cpu) {
    2136        1244 :                 void *start = per_cpu_ptr(base, cpu);
    2137        1244 :                 void *va = (void *)addr;
    2138             : 
    2139        1244 :                 if (va >= start && va < start + static_size) {
    2140          24 :                         if (can_addr) {
    2141          24 :                                 *can_addr = (unsigned long) (va - start);
    2142          24 :                                 *can_addr += (unsigned long)
    2143          24 :                                         per_cpu_ptr(base, get_boot_cpu_id());
    2144             :                         }
    2145          24 :                         return true;
    2146             :                 }
    2147             :         }
    2148             : #endif
    2149             :         /* on UP, can't distinguish from other static vars, always false */
    2150             :         return false;
    2151             : }
    2152             : 
    2153             : /**
    2154             :  * is_kernel_percpu_address - test whether address is from static percpu area
    2155             :  * @addr: address to test
    2156             :  *
     2157             :  * Test whether @addr belongs to the in-kernel static percpu area.  Module
    2158             :  * static percpu areas are not considered.  For those, use
    2159             :  * is_module_percpu_address().
    2160             :  *
    2161             :  * RETURNS:
    2162             :  * %true if @addr is from in-kernel static percpu area, %false otherwise.
    2163             :  */
    2164          85 : bool is_kernel_percpu_address(unsigned long addr)
    2165             : {
    2166          85 :         return __is_kernel_percpu_address(addr, NULL);
    2167             : }
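
For example, a debugging helper might use this to classify a raw address found on a
stack (a hedged sketch; the wrapper name is made up):

        #include <linux/percpu.h>
        #include <linux/types.h>

        /*
         * hypothetical debug helper: true only for in-kernel static percpu
         * variables, never for module percpu or dynamically allocated percpu memory
         */
        static bool example_is_static_percpu(const void *p)
        {
                return is_kernel_percpu_address((unsigned long)p);
        }
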
    2168             : 
    2169             : /**
    2170             :  * per_cpu_ptr_to_phys - convert translated percpu address to physical address
    2171             :  * @addr: the address to be converted to physical address
    2172             :  *
     2173             :  * Given @addr, which is a dereferenceable address obtained via one of
     2174             :  * the percpu access macros, this function translates it into its physical
     2175             :  * address.  The caller is responsible for ensuring @addr stays valid
    2176             :  * until this function finishes.
    2177             :  *
     2178             :  * The percpu allocator has a special setup for the first chunk, which
     2179             :  * currently supports either embedding in the linear address space or a
     2180             :  * vmalloc mapping; from the second chunk onwards, the backing allocator
     2181             :  * (currently either vm or km) provides the translation.
     2182             :  *
     2183             :  * The address could be translated without checking whether it falls into
     2184             :  * the first chunk, but the current code better reflects how the percpu
     2185             :  * allocator actually works, and the verification can catch bugs both in
     2186             :  * the percpu allocator itself and in per_cpu_ptr_to_phys() callers.  So
     2187             :  * we keep the current code.
    2188             :  *
    2189             :  * RETURNS:
    2190             :  * The physical address for @addr.
    2191             :  */
    2192          64 : phys_addr_t per_cpu_ptr_to_phys(void *addr)
    2193             : {
    2194          64 :         void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
    2195          64 :         bool in_first_chunk = false;
    2196          64 :         unsigned long first_low, first_high;
    2197          64 :         unsigned int cpu;
    2198             : 
    2199             :         /*
    2200             :          * The following test on unit_low/high isn't strictly
    2201             :          * necessary but will speed up lookups of addresses which
    2202             :          * aren't in the first chunk.
    2203             :          *
    2204             :          * The address check is against full chunk sizes.  pcpu_base_addr
    2205             :          * points to the beginning of the first chunk including the
    2206             :          * static region.  Assumes good intent as the first chunk may
     2207             :  * not be full (i.e. < pcpu_unit_pages in size).
    2208             :          */
    2209          64 :         first_low = (unsigned long)pcpu_base_addr +
    2210          64 :                     pcpu_unit_page_offset(pcpu_low_unit_cpu, 0);
    2211          64 :         first_high = (unsigned long)pcpu_base_addr +
    2212          64 :                      pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages);
    2213          64 :         if ((unsigned long)addr >= first_low &&
    2214          64 :             (unsigned long)addr < first_high) {
    2215         160 :                 for_each_possible_cpu(cpu) {
    2216         160 :                         void *start = per_cpu_ptr(base, cpu);
    2217             : 
    2218         160 :                         if (addr >= start && addr < start + pcpu_unit_size) {
    2219             :                                 in_first_chunk = true;
    2220             :                                 break;
    2221             :                         }
    2222             :                 }
    2223             :         }
    2224             : 
    2225          64 :         if (in_first_chunk) {
    2226          64 :                 if (!is_vmalloc_addr(addr))
    2227          64 :                         return __pa(addr);
    2228             :                 else
    2229           0 :                         return page_to_phys(vmalloc_to_page(addr)) +
    2230           0 :                                offset_in_page(addr);
    2231             :         } else
    2232           0 :                 return page_to_phys(pcpu_addr_to_page(addr)) +
    2233           0 :                        offset_in_page(addr);
    2234             : }
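
As an example, a caller that must hand per-CPU buffer addresses to hardware could
translate each CPU's copy as below (a sketch under the assumption that example_buf
was set up with alloc_percpu(); the names are not from percpu.c):

        #include <linux/cpumask.h>
        #include <linux/percpu.h>
        #include <linux/printk.h>

        static u32 __percpu *example_buf;       /* assumed to be allocated elsewhere */

        static void example_report_phys(void)
        {
                int cpu;

                for_each_possible_cpu(cpu) {
                        /*
                         * per_cpu_ptr() yields the dereferenceable address this
                         * function expects; the raw __percpu cookie is not one.
                         */
                        phys_addr_t pa = per_cpu_ptr_to_phys(per_cpu_ptr(example_buf, cpu));

                        pr_info("cpu%d buffer at %pa\n", cpu, &pa);
                }
        }
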
    2235             : 
    2236             : /**
    2237             :  * pcpu_alloc_alloc_info - allocate percpu allocation info
    2238             :  * @nr_groups: the number of groups
    2239             :  * @nr_units: the number of units
    2240             :  *
    2241             :  * Allocate ai which is large enough for @nr_groups groups containing
    2242             :  * @nr_units units.  The returned ai's groups[0].cpu_map points to the
    2243             :  * cpu_map array which is long enough for @nr_units and filled with
    2244             :  * NR_CPUS.  It's the caller's responsibility to initialize cpu_map
    2245             :  * pointer of other groups.
    2246             :  *
    2247             :  * RETURNS:
    2248             :  * Pointer to the allocated pcpu_alloc_info on success, NULL on
    2249             :  * failure.
    2250             :  */
    2251           1 : struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
    2252             :                                                       int nr_units)
    2253             : {
    2254           1 :         struct pcpu_alloc_info *ai;
    2255           1 :         size_t base_size, ai_size;
    2256           1 :         void *ptr;
    2257           1 :         int unit;
    2258             : 
    2259           0 :         base_size = ALIGN(struct_size(ai, groups, nr_groups),
    2260             :                           __alignof__(ai->groups[0].cpu_map[0]));
    2261           1 :         ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
    2262             : 
    2263           1 :         ptr = memblock_alloc(PFN_ALIGN(ai_size), PAGE_SIZE);
    2264           1 :         if (!ptr)
    2265             :                 return NULL;
    2266           1 :         ai = ptr;
    2267           1 :         ptr += base_size;
    2268             : 
    2269           1 :         ai->groups[0].cpu_map = ptr;
    2270             : 
    2271           5 :         for (unit = 0; unit < nr_units; unit++)
    2272           4 :                 ai->groups[0].cpu_map[unit] = NR_CPUS;
    2273             : 
    2274           1 :         ai->nr_groups = nr_groups;
    2275           1 :         ai->__ai_size = PFN_ALIGN(ai_size);
    2276             : 
    2277           0 :         return ai;
    2278             : }
    2279             : 
    2280             : /**
    2281             :  * pcpu_free_alloc_info - free percpu allocation info
    2282             :  * @ai: pcpu_alloc_info to free
    2283             :  *
    2284             :  * Free @ai which was allocated by pcpu_alloc_alloc_info().
    2285             :  */
    2286           1 : void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
    2287             : {
    2288           1 :         memblock_free_early(__pa(ai), ai->__ai_size);
    2289           1 : }
    2290             : 
    2291             : /**
    2292             :  * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
    2293             :  * @lvl: loglevel
    2294             :  * @ai: allocation info to dump
    2295             :  *
    2296             :  * Print out information about @ai using loglevel @lvl.
    2297             :  */
    2298           1 : static void pcpu_dump_alloc_info(const char *lvl,
    2299             :                                  const struct pcpu_alloc_info *ai)
    2300             : {
    2301           1 :         int group_width = 1, cpu_width = 1, width;
    2302           1 :         char empty_str[] = "--------";
    2303           1 :         int alloc = 0, alloc_end = 0;
    2304           1 :         int group, v;
    2305           1 :         int upa, apl;   /* units per alloc, allocs per line */
    2306             : 
    2307           1 :         v = ai->nr_groups;
    2308           1 :         while (v /= 10)
    2309           0 :                 group_width++;
    2310             : 
    2311           1 :         v = num_possible_cpus();
    2312           1 :         while (v /= 10)
    2313           0 :                 cpu_width++;
    2314           1 :         empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
    2315             : 
    2316           1 :         upa = ai->alloc_size / ai->unit_size;
    2317           1 :         width = upa * (cpu_width + 1) + group_width + 3;
    2318           1 :         apl = rounddown_pow_of_two(max(60 / width, 1));
    2319             : 
    2320           1 :         printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
    2321             :                lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
    2322             :                ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
    2323             : 
    2324           3 :         for (group = 0; group < ai->nr_groups; group++) {
    2325           1 :                 const struct pcpu_group_info *gi = &ai->groups[group];
    2326           1 :                 int unit = 0, unit_end = 0;
    2327             : 
    2328           1 :                 BUG_ON(gi->nr_units % upa);
    2329           1 :                 for (alloc_end += gi->nr_units / upa;
    2330           2 :                      alloc < alloc_end; alloc++) {
    2331           1 :                         if (!(alloc % apl)) {
    2332           1 :                                 pr_cont("\n");
    2333           1 :                                 printk("%spcpu-alloc: ", lvl);
    2334             :                         }
    2335           1 :                         pr_cont("[%0*d] ", group_width, group);
    2336             : 
    2337           5 :                         for (unit_end += upa; unit < unit_end; unit++)
    2338           4 :                                 if (gi->cpu_map[unit] != NR_CPUS)
    2339           4 :                                         pr_cont("%0*d ",
    2340             :                                                 cpu_width, gi->cpu_map[unit]);
    2341             :                                 else
    2342           0 :                                         pr_cont("%s ", empty_str);
    2343             :                 }
    2344             :         }
    2345           1 :         pr_cont("\n");
    2346           1 : }
    2347             : 
    2348             : /**
    2349             :  * pcpu_setup_first_chunk - initialize the first percpu chunk
     2350             :  * @ai: pcpu_alloc_info describing how the percpu area is shaped
    2351             :  * @base_addr: mapped address
    2352             :  *
    2353             :  * Initialize the first percpu chunk which contains the kernel static
    2354             :  * percpu area.  This function is to be called from arch percpu area
    2355             :  * setup path.
    2356             :  *
    2357             :  * @ai contains all information necessary to initialize the first
    2358             :  * chunk and prime the dynamic percpu allocator.
    2359             :  *
    2360             :  * @ai->static_size is the size of static percpu area.
    2361             :  *
     2362             :  * @ai->reserved_size, if non-zero, specifies the number of bytes to
    2363             :  * reserve after the static area in the first chunk.  This reserves
    2364             :  * the first chunk such that it's available only through reserved
    2365             :  * percpu allocation.  This is primarily used to serve module percpu
    2366             :  * static areas on architectures where the addressing model has
    2367             :  * limited offset range for symbol relocations to guarantee module
    2368             :  * percpu symbols fall inside the relocatable range.
    2369             :  *
    2370             :  * @ai->dyn_size determines the number of bytes available for dynamic
    2371             :  * allocation in the first chunk.  The area between @ai->static_size +
    2372             :  * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
    2373             :  *
    2374             :  * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
    2375             :  * and equal to or larger than @ai->static_size + @ai->reserved_size +
    2376             :  * @ai->dyn_size.
    2377             :  *
    2378             :  * @ai->atom_size is the allocation atom size and used as alignment
    2379             :  * for vm areas.
    2380             :  *
     2381             :  * @ai->alloc_size is the allocation size and is always a multiple of
    2382             :  * @ai->atom_size.  This is larger than @ai->atom_size if
    2383             :  * @ai->unit_size is larger than @ai->atom_size.
    2384             :  *
    2385             :  * @ai->nr_groups and @ai->groups describe virtual memory layout of
    2386             :  * percpu areas.  Units which should be colocated are put into the
    2387             :  * same group.  Dynamic VM areas will be allocated according to these
    2388             :  * groupings.  If @ai->nr_groups is zero, a single group containing
    2389             :  * all units is assumed.
    2390             :  *
    2391             :  * The caller should have mapped the first chunk at @base_addr and
    2392             :  * copied static data to each unit.
    2393             :  *
    2394             :  * The first chunk will always contain a static and a dynamic region.
    2395             :  * However, the static region is not managed by any chunk.  If the first
    2396             :  * chunk also contains a reserved region, it is served by two chunks -
    2397             :  * one for the reserved region and one for the dynamic region.  They
    2398             :  * share the same vm, but use offset regions in the area allocation map.
    2399             :  * The chunk serving the dynamic region is circulated in the chunk slots
    2400             :  * and available for dynamic allocation like any other chunk.
    2401             :  */
    2402           1 : void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
    2403             :                                    void *base_addr)
    2404             : {
    2405           1 :         size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
    2406           1 :         size_t static_size, dyn_size;
    2407           1 :         struct pcpu_chunk *chunk;
    2408           1 :         unsigned long *group_offsets;
    2409           1 :         size_t *group_sizes;
    2410           1 :         unsigned long *unit_off;
    2411           1 :         unsigned int cpu;
    2412           1 :         int *unit_map;
    2413           1 :         int group, unit, i;
    2414           1 :         int map_size;
    2415           1 :         unsigned long tmp_addr;
    2416           1 :         size_t alloc_size;
    2417           1 :         enum pcpu_chunk_type type;
    2418             : 
    2419             : #define PCPU_SETUP_BUG_ON(cond) do {                                    \
    2420             :         if (unlikely(cond)) {                                           \
    2421             :                 pr_emerg("failed to initialize, %s\n", #cond);                \
    2422             :                 pr_emerg("cpu_possible_mask=%*pb\n",                  \
    2423             :                          cpumask_pr_args(cpu_possible_mask));           \
    2424             :                 pcpu_dump_alloc_info(KERN_EMERG, ai);                   \
    2425             :                 BUG();                                                  \
    2426             :         }                                                               \
    2427             : } while (0)
    2428             : 
    2429             :         /* sanity checks */
    2430           1 :         PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
    2431             : #ifdef CONFIG_SMP
    2432           1 :         PCPU_SETUP_BUG_ON(!ai->static_size);
    2433           1 :         PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
    2434             : #endif
    2435           1 :         PCPU_SETUP_BUG_ON(!base_addr);
    2436           1 :         PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
    2437           1 :         PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
    2438           1 :         PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
    2439           1 :         PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
    2440           1 :         PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
    2441           1 :         PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
    2442           1 :         PCPU_SETUP_BUG_ON(!ai->dyn_size);
    2443           1 :         PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
    2444           1 :         PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
    2445             :                             IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
    2446           1 :         PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
    2447             : 
    2448             :         /* process group information and build config tables accordingly */
    2449           1 :         alloc_size = ai->nr_groups * sizeof(group_offsets[0]);
    2450           1 :         group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
    2451           1 :         if (!group_offsets)
    2452           0 :                 panic("%s: Failed to allocate %zu bytes\n", __func__,
    2453             :                       alloc_size);
    2454             : 
    2455           1 :         alloc_size = ai->nr_groups * sizeof(group_sizes[0]);
    2456           1 :         group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
    2457           1 :         if (!group_sizes)
    2458           0 :                 panic("%s: Failed to allocate %zu bytes\n", __func__,
    2459             :                       alloc_size);
    2460             : 
    2461           1 :         alloc_size = nr_cpu_ids * sizeof(unit_map[0]);
    2462           1 :         unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
    2463           1 :         if (!unit_map)
    2464           0 :                 panic("%s: Failed to allocate %zu bytes\n", __func__,
    2465             :                       alloc_size);
    2466             : 
    2467           1 :         alloc_size = nr_cpu_ids * sizeof(unit_off[0]);
    2468           1 :         unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
    2469           1 :         if (!unit_off)
    2470           0 :                 panic("%s: Failed to allocate %zu bytes\n", __func__,
    2471             :                       alloc_size);
    2472             : 
    2473           5 :         for (cpu = 0; cpu < nr_cpu_ids; cpu++)
    2474           4 :                 unit_map[cpu] = UINT_MAX;
    2475             : 
    2476           1 :         pcpu_low_unit_cpu = NR_CPUS;
    2477           1 :         pcpu_high_unit_cpu = NR_CPUS;
    2478             : 
    2479           2 :         for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
    2480           1 :                 const struct pcpu_group_info *gi = &ai->groups[group];
    2481             : 
    2482           1 :                 group_offsets[group] = gi->base_offset;
    2483           1 :                 group_sizes[group] = gi->nr_units * ai->unit_size;
    2484             : 
    2485           5 :                 for (i = 0; i < gi->nr_units; i++) {
    2486           4 :                         cpu = gi->cpu_map[i];
    2487           4 :                         if (cpu == NR_CPUS)
    2488           0 :                                 continue;
    2489             : 
    2490           4 :                         PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
    2491           4 :                         PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
    2492           4 :                         PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
    2493             : 
    2494           4 :                         unit_map[cpu] = unit + i;
    2495           4 :                         unit_off[cpu] = gi->base_offset + i * ai->unit_size;
    2496             : 
    2497             :                         /* determine low/high unit_cpu */
    2498           4 :                         if (pcpu_low_unit_cpu == NR_CPUS ||
    2499           3 :                             unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
    2500           1 :                                 pcpu_low_unit_cpu = cpu;
    2501           4 :                         if (pcpu_high_unit_cpu == NR_CPUS ||
    2502           3 :                             unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
    2503           4 :                                 pcpu_high_unit_cpu = cpu;
    2504             :                 }
    2505             :         }
    2506           1 :         pcpu_nr_units = unit;
    2507             : 
    2508           6 :         for_each_possible_cpu(cpu)
    2509           5 :                 PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
    2510             : 
    2511             :         /* we're done parsing the input, undefine BUG macro and dump config */
    2512             : #undef PCPU_SETUP_BUG_ON
    2513           1 :         pcpu_dump_alloc_info(KERN_DEBUG, ai);
    2514             : 
    2515           1 :         pcpu_nr_groups = ai->nr_groups;
    2516           1 :         pcpu_group_offsets = group_offsets;
    2517           1 :         pcpu_group_sizes = group_sizes;
    2518           1 :         pcpu_unit_map = unit_map;
    2519           1 :         pcpu_unit_offsets = unit_off;
    2520             : 
    2521             :         /* determine basic parameters */
    2522           1 :         pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
    2523           1 :         pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
    2524           1 :         pcpu_atom_size = ai->atom_size;
    2525           1 :         pcpu_chunk_struct_size = struct_size(chunk, populated,
    2526             :                                              BITS_TO_LONGS(pcpu_unit_pages));
    2527             : 
    2528           1 :         pcpu_stats_save_ai(ai);
    2529             : 
    2530             :         /*
    2531             :          * Allocate chunk slots.  The additional last slot is for
    2532             :          * empty chunks.
    2533             :          */
    2534           1 :         pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
    2535           2 :         pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
    2536           1 :                                           sizeof(pcpu_chunk_lists[0]) *
    2537             :                                           PCPU_NR_CHUNK_TYPES,
    2538             :                                           SMP_CACHE_BYTES);
    2539           1 :         if (!pcpu_chunk_lists)
    2540           0 :                 panic("%s: Failed to allocate %zu bytes\n", __func__,
    2541             :                       pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]) *
    2542             :                       PCPU_NR_CHUNK_TYPES);
    2543             : 
    2544           2 :         for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
    2545          20 :                 for (i = 0; i < pcpu_nr_slots; i++)
    2546          19 :                         INIT_LIST_HEAD(&pcpu_chunk_list(type)[i]);
    2547             : 
    2548             :         /*
    2549             :          * The end of the static region needs to be aligned with the
    2550             :          * minimum allocation size as this offsets the reserved and
    2551             :          * dynamic region.  The first chunk ends page aligned by
    2552             :          * expanding the dynamic region, therefore the dynamic region
    2553             :          * can be shrunk to compensate while still staying above the
    2554             :          * configured sizes.
    2555             :          */
    2556           1 :         static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE);
    2557           1 :         dyn_size = ai->dyn_size - (static_size - ai->static_size);
    2558             : 
    2559             :         /*
    2560             :          * Initialize first chunk.
    2561             :          * If the reserved_size is non-zero, this initializes the reserved
    2562             :          * chunk.  If the reserved_size is zero, the reserved chunk is NULL
    2563             :          * and the dynamic region is initialized here.  The first chunk,
    2564             :          * pcpu_first_chunk, will always point to the chunk that serves
    2565             :          * the dynamic region.
    2566             :          */
    2567           1 :         tmp_addr = (unsigned long)base_addr + static_size;
    2568           1 :         map_size = ai->reserved_size ?: dyn_size;
    2569           1 :         chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
    2570             : 
    2571             :         /* init dynamic chunk if necessary */
    2572           1 :         if (ai->reserved_size) {
    2573           0 :                 pcpu_reserved_chunk = chunk;
    2574             : 
    2575           0 :                 tmp_addr = (unsigned long)base_addr + static_size +
    2576             :                            ai->reserved_size;
    2577           0 :                 map_size = dyn_size;
    2578           0 :                 chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
    2579             :         }
    2580             : 
    2581             :         /* link the first chunk in */
    2582           1 :         pcpu_first_chunk = chunk;
    2583           1 :         pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
    2584           1 :         pcpu_chunk_relocate(pcpu_first_chunk, -1);
    2585             : 
    2586             :         /* include all regions of the first chunk */
    2587           1 :         pcpu_nr_populated += PFN_DOWN(size_sum);
    2588             : 
    2589           1 :         pcpu_stats_chunk_alloc();
    2590           1 :         trace_percpu_create_chunk(base_addr);
    2591             : 
    2592             :         /* we're done */
    2593           1 :         pcpu_base_addr = base_addr;
    2594           1 : }
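
For orientation only: a hedged sketch of how an architecture's percpu setup path
typically reaches pcpu_setup_first_chunk() via pcpu_embed_first_chunk() and then
derives the per-CPU offsets from pcpu_base_addr.  The example_fc_alloc/example_fc_free
callbacks and the exact reserve sizes are assumptions that vary per architecture:

        static void __init example_setup_per_cpu_areas(void)
        {
                unsigned long delta;
                unsigned int cpu;
                int rc;

                /*
                 * builds the alloc_info, maps the first chunk and ends up calling
                 * pcpu_setup_first_chunk()
                 */
                rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
                                            PERCPU_DYNAMIC_RESERVE, PAGE_SIZE,
                                            NULL, example_fc_alloc, example_fc_free);
                if (rc < 0)
                        panic("Failed to initialize percpu areas.");

                /* point the static percpu symbols at their new home */
                delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
                for_each_possible_cpu(cpu)
                        __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
        }
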
    2595             : 
    2596             : #ifdef CONFIG_SMP
    2597             : 
    2598             : const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
    2599             :         [PCPU_FC_AUTO]  = "auto",
    2600             :         [PCPU_FC_EMBED] = "embed",
    2601             :         [PCPU_FC_PAGE]  = "page",
    2602             : };
    2603             : 
    2604             : enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
    2605             : 
    2606           0 : static int __init percpu_alloc_setup(char *str)
    2607             : {
    2608           0 :         if (!str)
    2609             :                 return -EINVAL;
    2610             : 
    2611           0 :         if (0)
    2612             :                 /* nada */;
    2613             : #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
    2614           0 :         else if (!strcmp(str, "embed"))
    2615           0 :                 pcpu_chosen_fc = PCPU_FC_EMBED;
    2616             : #endif
    2617             : #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
    2618           0 :         else if (!strcmp(str, "page"))
    2619           0 :                 pcpu_chosen_fc = PCPU_FC_PAGE;
    2620             : #endif
    2621             :         else
    2622           0 :                 pr_warn("unknown allocator %s specified\n", str);
    2623             : 
    2624             :         return 0;
    2625             : }
    2626             : early_param("percpu_alloc", percpu_alloc_setup);
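
In practice this means the first-chunk allocator can be overridden on the kernel
command line, provided the corresponding CONFIG_NEED_PER_CPU_*_FIRST_CHUNK option is
built in, for example:

        percpu_alloc=embed
        percpu_alloc=page

The first selects the embed allocator explicitly; the second selects the page-based
allocator.  Any other string only produces the "unknown allocator" warning above.
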
    2627             : 
    2628             : /*
    2629             :  * pcpu_embed_first_chunk() is used by the generic percpu setup.
    2630             :  * Build it if needed by the arch config or the generic setup is going
    2631             :  * to be used.
    2632             :  */
    2633             : #if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
    2634             :         !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
    2635             : #define BUILD_EMBED_FIRST_CHUNK
    2636             : #endif
    2637             : 
    2638             : /* build pcpu_page_first_chunk() iff needed by the arch config */
    2639             : #if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
    2640             : #define BUILD_PAGE_FIRST_CHUNK
    2641             : #endif
    2642             : 
    2643             : /* pcpu_build_alloc_info() is used by both embed and page first chunk */
    2644             : #if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
    2645             : /**
    2646             :  * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
    2647             :  * @reserved_size: the size of reserved percpu area in bytes
    2648             :  * @dyn_size: minimum free size for dynamic allocation in bytes
    2649             :  * @atom_size: allocation atom size
    2650             :  * @cpu_distance_fn: callback to determine distance between cpus, optional
    2651             :  *
    2652             :  * This function determines grouping of units, their mappings to cpus
    2653             :  * and other parameters considering needed percpu size, allocation
    2654             :  * atom size and distances between CPUs.
    2655             :  *
    2656             :  * Groups are always multiples of atom size and CPUs which are of
    2657             :  * LOCAL_DISTANCE both ways are grouped together and share space for
    2658             :  * units in the same group.  The returned configuration is guaranteed
    2659             :  * to have CPUs on different nodes on different groups and >=75% usage
    2660             :  * of allocated virtual address space.
    2661             :  *
    2662             :  * RETURNS:
    2663             :  * On success, pointer to the new allocation_info is returned.  On
    2664             :  * failure, ERR_PTR value is returned.
    2665             :  */
    2666           1 : static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
    2667             :                                 size_t reserved_size, size_t dyn_size,
    2668             :                                 size_t atom_size,
    2669             :                                 pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
    2670             : {
    2671           1 :         static int group_map[NR_CPUS] __initdata;
    2672           1 :         static int group_cnt[NR_CPUS] __initdata;
    2673           1 :         static struct cpumask mask __initdata;
    2674           1 :         const size_t static_size = __per_cpu_end - __per_cpu_start;
    2675           1 :         int nr_groups = 1, nr_units = 0;
    2676           1 :         size_t size_sum, min_unit_size, alloc_size;
    2677           1 :         int upa, max_upa, best_upa;     /* units_per_alloc */
    2678           1 :         int last_allocs, group, unit;
    2679           1 :         unsigned int cpu, tcpu;
    2680           1 :         struct pcpu_alloc_info *ai;
    2681           1 :         unsigned int *cpu_map;
    2682             : 
    2683             :         /* this function may be called multiple times */
    2684           1 :         memset(group_map, 0, sizeof(group_map));
    2685           1 :         memset(group_cnt, 0, sizeof(group_cnt));
    2686           1 :         cpumask_clear(&mask);
    2687             : 
    2688             :         /* calculate size_sum and ensure dyn_size is enough for early alloc */
    2689           1 :         size_sum = PFN_ALIGN(static_size + reserved_size +
    2690             :                             max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
    2691           1 :         dyn_size = size_sum - static_size - reserved_size;
    2692             : 
    2693             :         /*
    2694             :          * Determine min_unit_size, alloc_size and max_upa such that
    2695             :          * alloc_size is multiple of atom_size and is the smallest
    2696             :          * which can accommodate 4k aligned segments which are equal to
    2697             :          * or larger than min_unit_size.
    2698             :          */
    2699           1 :         min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
    2700             : 
    2701             :         /* determine the maximum # of units that can fit in an allocation */
    2702           1 :         alloc_size = roundup(min_unit_size, atom_size);
    2703           1 :         upa = alloc_size / min_unit_size;
    2704           3 :         while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
    2705           2 :                 upa--;
    2706           1 :         max_upa = upa;
    2707             : 
    2708           1 :         cpumask_copy(&mask, cpu_possible_mask);
    2709             : 
    2710             :         /* group cpus according to their proximity */
    2711           2 :         for (group = 0; !cpumask_empty(&mask); group++) {
    2712             :                 /* pop the group's first cpu */
    2713           1 :                 cpu = cpumask_first(&mask);
    2714           1 :                 group_map[cpu] = group;
    2715           1 :                 group_cnt[group]++;
    2716           1 :                 cpumask_clear_cpu(cpu, &mask);
    2717             : 
    2718           4 :                 for_each_cpu(tcpu, &mask) {
    2719           6 :                         if (!cpu_distance_fn ||
    2720           6 :                             (cpu_distance_fn(cpu, tcpu) == LOCAL_DISTANCE &&
    2721           3 :                              cpu_distance_fn(tcpu, cpu) == LOCAL_DISTANCE)) {
    2722           3 :                                 group_map[tcpu] = group;
    2723           3 :                                 group_cnt[group]++;
    2724           7 :                                 cpumask_clear_cpu(tcpu, &mask);
    2725             :                         }
    2726             :                 }
    2727             :         }
    2728           3 :         nr_groups = group;
    2729             : 
    2730             :         /*
    2731             :          * Wasted space is caused by a ratio imbalance of upa to group_cnt.
    2732             :          * Expand the unit_size until we use >= 75% of the units allocated.
    2733             :          * Related to atom_size, which could be much larger than the unit_size.
    2734             :          */
    2735             :         last_allocs = INT_MAX;
    2736           3 :         for (upa = max_upa; upa; upa--) {
    2737           3 :                 int allocs = 0, wasted = 0;
    2738             : 
    2739           3 :                 if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
    2740           1 :                         continue;
    2741             : 
    2742           4 :                 for (group = 0; group < nr_groups; group++) {
    2743           2 :                         int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
    2744           2 :                         allocs += this_allocs;
    2745           2 :                         wasted += this_allocs * upa - group_cnt[group];
    2746             :                 }
    2747             : 
    2748             :                 /*
    2749             :                  * Don't accept if wastage is over 1/3.  The
    2750             :                  * greater-than comparison ensures upa==1 always
    2751             :                  * passes the following check.
    2752             :                  */
    2753           4 :                 if (wasted > num_possible_cpus() / 3)
    2754           0 :                         continue;
    2755             : 
    2756             :                 /* and then don't consume more memory */
    2757           2 :                 if (allocs > last_allocs)
    2758             :                         break;
    2759             :                 last_allocs = allocs;
    2760             :                 best_upa = upa;
    2761             :         }
    2762           1 :         upa = best_upa;
    2763             : 
    2764             :         /* allocate and fill alloc_info */
    2765           2 :         for (group = 0; group < nr_groups; group++)
    2766           1 :                 nr_units += roundup(group_cnt[group], upa);
    2767             : 
    2768           1 :         ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
    2769           1 :         if (!ai)
    2770           1 :                 return ERR_PTR(-ENOMEM);
    2771           1 :         cpu_map = ai->groups[0].cpu_map;
    2772             : 
    2773           2 :         for (group = 0; group < nr_groups; group++) {
    2774           1 :                 ai->groups[group].cpu_map = cpu_map;
    2775           1 :                 cpu_map += roundup(group_cnt[group], upa);
    2776             :         }
    2777             : 
    2778           1 :         ai->static_size = static_size;
    2779           1 :         ai->reserved_size = reserved_size;
    2780           1 :         ai->dyn_size = dyn_size;
    2781           1 :         ai->unit_size = alloc_size / upa;
    2782           1 :         ai->atom_size = atom_size;
    2783           1 :         ai->alloc_size = alloc_size;
    2784             : 
    2785           2 :         for (group = 0, unit = 0; group < nr_groups; group++) {
    2786           1 :                 struct pcpu_group_info *gi = &ai->groups[group];
    2787             : 
    2788             :                 /*
    2789             :                  * Initialize base_offset as if all groups are located
    2790             :                  * back-to-back.  The caller should update this to
    2791             :                  * reflect actual allocation.
    2792             :                  */
    2793           1 :                 gi->base_offset = unit * ai->unit_size;
    2794             : 
    2795           5 :                 for_each_possible_cpu(cpu)
    2796           4 :                         if (group_map[cpu] == group)
    2797           4 :                                 gi->cpu_map[gi->nr_units++] = cpu;
    2798           1 :                 gi->nr_units = roundup(gi->nr_units, upa);
    2799           1 :                 unit += gi->nr_units;
    2800             :         }
    2801           1 :         BUG_ON(unit != nr_units);
    2802             : 
    2803             :         return ai;
    2804             : }
    2805             : #endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
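
A worked example of the sizing logic above, under assumed numbers:

        /*
         * Worked example (assumed values): size_sum = 44 KiB, atom_size = 2 MiB.
         * min_unit_size = max(44 KiB, PCPU_MIN_UNIT_SIZE = 32 KiB) = 44 KiB,
         * alloc_size = roundup(44 KiB, 2 MiB) = 2 MiB, and the initial
         * upa = 2 MiB / 44 KiB = 46.  The while loop decrements upa until
         * alloc_size divides evenly into page-aligned units; the first such
         * value is upa = 32, i.e. max_upa = 32 with a 64 KiB candidate unit.
         */
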
    2806             : 
    2807             : #if defined(BUILD_EMBED_FIRST_CHUNK)
    2808             : /**
    2809             :  * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
    2810             :  * @reserved_size: the size of reserved percpu area in bytes
    2811             :  * @dyn_size: minimum free size for dynamic allocation in bytes
    2812             :  * @atom_size: allocation atom size
    2813             :  * @cpu_distance_fn: callback to determine distance between cpus, optional
    2814             :  * @alloc_fn: function to allocate percpu page
    2815             :  * @free_fn: function to free percpu page
    2816             :  *
     2817             :  * This is a helper to ease setting up the embedded first percpu chunk and
    2818             :  * can be called where pcpu_setup_first_chunk() is expected.
    2819             :  *
    2820             :  * If this function is used to set up the first chunk, it is allocated
    2821             :  * by calling @alloc_fn and used as-is without being mapped into the
    2822             :  * vmalloc area.  Allocations are always whole multiples of @atom_size
    2823             :  * aligned to @atom_size.
    2824             :  *
    2825             :  * This enables the first chunk to piggyback on the linear physical
    2826             :  * mapping, which often uses larger page sizes.  Please note that this
    2827             :  * can result in a very sparse cpu->unit mapping on NUMA machines, thus
    2828             :  * requiring a large vmalloc address space.  Don't use this allocator if
    2829             :  * vmalloc space is not orders of magnitude larger than the distances
    2830             :  * between node memory addresses (i.e. 32-bit NUMA machines).
    2831             :  *
    2832             :  * @dyn_size specifies the minimum dynamic area size.
    2833             :  *
    2834             :  * If the needed size is smaller than the minimum or specified unit
    2835             :  * size, the leftover is returned using @free_fn.
    2836             :  *
    2837             :  * RETURNS:
    2838             :  * 0 on success, -errno on failure.
    2839             :  */
    2840           1 : int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
    2841             :                                   size_t atom_size,
    2842             :                                   pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
    2843             :                                   pcpu_fc_alloc_fn_t alloc_fn,
    2844             :                                   pcpu_fc_free_fn_t free_fn)
    2845             : {
    2846           1 :         void *base = (void *)ULONG_MAX;
    2847           1 :         void **areas = NULL;
    2848           1 :         struct pcpu_alloc_info *ai;
    2849           1 :         size_t size_sum, areas_size;
    2850           1 :         unsigned long max_distance;
    2851           1 :         int group, i, highest_group, rc = 0;
    2852             : 
    2853           1 :         ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
    2854             :                                    cpu_distance_fn);
    2855           1 :         if (IS_ERR(ai))
    2856           0 :                 return PTR_ERR(ai);
    2857             : 
    2858           1 :         size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
    2859           1 :         areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
    2860             : 
    2861           1 :         areas = memblock_alloc(areas_size, SMP_CACHE_BYTES);
    2862           1 :         if (!areas) {
    2863           0 :                 rc = -ENOMEM;
    2864           0 :                 goto out_free;
    2865             :         }
    2866             : 
    2867             :         /* allocate, copy and determine base address & max_distance */
    2868             :         highest_group = 0;
    2869           2 :         for (group = 0; group < ai->nr_groups; group++) {
    2870           2 :                 struct pcpu_group_info *gi = &ai->groups[group];
    2871             :                 unsigned int cpu = NR_CPUS;
    2872             :                 void *ptr;
    2873             : 
    2874           2 :                 for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
    2875           1 :                         cpu = gi->cpu_map[i];
    2876           1 :                 BUG_ON(cpu == NR_CPUS);
    2877             : 
    2878             :                 /* allocate space for the whole group */
    2879           1 :                 ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
    2880           1 :                 if (!ptr) {
    2881           0 :                         rc = -ENOMEM;
    2882           0 :                         goto out_free_areas;
    2883             :                 }
    2884             :                 /* kmemleak tracks the percpu allocations separately */
    2885           1 :                 kmemleak_free(ptr);
    2886           1 :                 areas[group] = ptr;
    2887             : 
    2888           1 :                 base = min(ptr, base);
    2889           1 :                 if (ptr > areas[highest_group])
    2890           0 :                         highest_group = group;
    2891             :         }
    2892           1 :         max_distance = areas[highest_group] - base;
    2893           1 :         max_distance += ai->unit_size * ai->groups[highest_group].nr_units;
    2894             : 
    2895             :         /* warn if maximum distance is further than 75% of vmalloc space */
    2896           1 :         if (max_distance > VMALLOC_TOTAL * 3 / 4) {
    2897           0 :                 pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
    2898             :                                 max_distance, VMALLOC_TOTAL);
    2899             : #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
    2900             :                 /* and fail if we have fallback */
    2901           0 :                 rc = -EINVAL;
    2902           0 :                 goto out_free_areas;
    2903             : #endif
    2904             :         }
    2905             : 
    2906             :         /*
    2907             :          * Copy data and free unused parts.  This should happen after all
    2908             :          * allocations are complete; otherwise, we may end up with
    2909             :          * overlapping groups.
    2910             :          */
    2911           2 :         for (group = 0; group < ai->nr_groups; group++) {
    2912           1 :                 struct pcpu_group_info *gi = &ai->groups[group];
    2913           1 :                 void *ptr = areas[group];
    2914             : 
    2915           5 :                 for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
    2916           4 :                         if (gi->cpu_map[i] == NR_CPUS) {
    2917             :                                 /* unused unit, free whole */
    2918           0 :                                 free_fn(ptr, ai->unit_size);
    2919           0 :                                 continue;
    2920             :                         }
    2921             :                         /* copy and return the unused part */
    2922           4 :                         memcpy(ptr, __per_cpu_load, ai->static_size);
    2923           4 :                         free_fn(ptr + size_sum, ai->unit_size - size_sum);
    2924             :                 }
    2925             :         }
    2926             : 
    2927             :         /* base address is now known, determine group base offsets */
    2928           2 :         for (group = 0; group < ai->nr_groups; group++) {
    2929           1 :                 ai->groups[group].base_offset = areas[group] - base;
    2930             :         }
    2931             : 
    2932           1 :         pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n",
    2933             :                 PFN_DOWN(size_sum), ai->static_size, ai->reserved_size,
    2934             :                 ai->dyn_size, ai->unit_size);
    2935             : 
    2936           1 :         pcpu_setup_first_chunk(ai, base);
    2937           1 :         goto out_free;
    2938             : 
    2939           0 : out_free_areas:
    2940           0 :         for (group = 0; group < ai->nr_groups; group++)
    2941           0 :                 if (areas[group])
    2942           0 :                         free_fn(areas[group],
    2943           0 :                                 ai->groups[group].nr_units * ai->unit_size);
    2944           0 : out_free:
    2945           1 :         pcpu_free_alloc_info(ai);
    2946           1 :         if (areas)
    2947           1 :                 memblock_free_early(__pa(areas), areas_size);
    2948             :         return rc;
    2949             : }
    2950             : #endif /* BUILD_EMBED_FIRST_CHUNK */
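
/*
 * A minimal sketch of an arch-side caller of pcpu_embed_first_chunk(),
 * loosely modeled on NUMA-capable setup code; it is illustrative only
 * and not part of this file.  early_cpu_to_node() is assumed to be
 * provided by the arch; LOCAL_DISTANCE, REMOTE_DISTANCE, PMD_SIZE,
 * memblock_alloc_try_nid() and memblock_free_early() are generic
 * kernel infrastructure.
 */
static int __init example_pcpu_cpu_distance(unsigned int from, unsigned int to)
{
        /* same node => local, otherwise remote; drives group formation */
        return early_cpu_to_node(from) == early_cpu_to_node(to) ?
                LOCAL_DISTANCE : REMOTE_DISTANCE;
}

static void * __init example_pcpu_fc_alloc(unsigned int cpu, size_t size,
                                           size_t align)
{
        /* back each group with memory local to its first cpu's node */
        return memblock_alloc_try_nid(size, align, __pa(MAX_DMA_ADDRESS),
                                      MEMBLOCK_ALLOC_ACCESSIBLE,
                                      early_cpu_to_node(cpu));
}

static void __init example_pcpu_fc_free(void *ptr, size_t size)
{
        memblock_free_early(__pa(ptr), size);
}

void __init example_setup_per_cpu_areas(void)
{
        int rc;

        /* PMD_SIZE atoms let the chunk ride on large linear-map pages */
        rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
                                    PERCPU_DYNAMIC_RESERVE, PMD_SIZE,
                                    example_pcpu_cpu_distance,
                                    example_pcpu_fc_alloc,
                                    example_pcpu_fc_free);
        if (rc < 0)
                panic("cannot initialize percpu area (err=%d)", rc);
        /*
         * The arch would then fill __per_cpu_offset[] as in the generic
         * setup_per_cpu_areas() further below.
         */
}
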
    2951             : 
    2952             : #ifdef BUILD_PAGE_FIRST_CHUNK
    2953             : /**
    2954             :  * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
    2955             :  * @reserved_size: the size of reserved percpu area in bytes
    2956             :  * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
    2957             :  * @free_fn: function to free percpu page, always called with PAGE_SIZE
    2958             :  * @populate_pte_fn: function to populate pte
    2959             :  *
    2960             :  * This is a helper to ease setting up a page-remapped first percpu
    2961             :  * chunk and can be called where pcpu_setup_first_chunk() is expected.
    2962             :  *
    2963             :  * This is the basic allocator.  The static percpu area is allocated
    2964             :  * page-by-page into the vmalloc area.
    2965             :  *
    2966             :  * RETURNS:
    2967             :  * 0 on success, -errno on failure.
    2968             :  */
    2969           0 : int __init pcpu_page_first_chunk(size_t reserved_size,
    2970             :                                  pcpu_fc_alloc_fn_t alloc_fn,
    2971             :                                  pcpu_fc_free_fn_t free_fn,
    2972             :                                  pcpu_fc_populate_pte_fn_t populate_pte_fn)
    2973             : {
    2974           0 :         static struct vm_struct vm;
    2975           0 :         struct pcpu_alloc_info *ai;
    2976           0 :         char psize_str[16];
    2977           0 :         int unit_pages;
    2978           0 :         size_t pages_size;
    2979           0 :         struct page **pages;
    2980           0 :         int unit, i, j, rc = 0;
    2981           0 :         int upa;
    2982           0 :         int nr_g0_units;
    2983             : 
    2984           0 :         snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
    2985             : 
    2986           0 :         ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
    2987           0 :         if (IS_ERR(ai))
    2988           0 :                 return PTR_ERR(ai);
    2989           0 :         BUG_ON(ai->nr_groups != 1);
    2990           0 :         upa = ai->alloc_size/ai->unit_size;
    2991           0 :         nr_g0_units = roundup(num_possible_cpus(), upa);
    2992           0 :         if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) {
    2993           0 :                 pcpu_free_alloc_info(ai);
    2994           0 :                 return -EINVAL;
    2995             :         }
    2996             : 
    2997           0 :         unit_pages = ai->unit_size >> PAGE_SHIFT;
    2998             : 
    2999             :         /* unaligned allocations can't be freed, round up to page size */
    3000           0 :         pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
    3001             :                                sizeof(pages[0]));
    3002           0 :         pages = memblock_alloc(pages_size, SMP_CACHE_BYTES);
    3003           0 :         if (!pages)
    3004           0 :                 panic("%s: Failed to allocate %zu bytes\n", __func__,
    3005             :                       pages_size);
    3006             : 
    3007             :         /* allocate pages */
    3008             :         j = 0;
    3009           0 :         for (unit = 0; unit < num_possible_cpus(); unit++) {
    3010           0 :                 unsigned int cpu = ai->groups[0].cpu_map[unit];
    3011           0 :                 for (i = 0; i < unit_pages; i++) {
    3012           0 :                         void *ptr;
    3013             : 
    3014           0 :                         ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
    3015           0 :                         if (!ptr) {
    3016           0 :                                 pr_warn("failed to allocate %s page for cpu%u\n",
    3017             :                                                 psize_str, cpu);
    3018           0 :                                 goto enomem;
    3019             :                         }
    3020             :                         /* kmemleak tracks the percpu allocations separately */
    3021           0 :                         kmemleak_free(ptr);
    3022           0 :                         pages[j++] = virt_to_page(ptr);
    3023             :                 }
    3024             :         }
    3025             : 
    3026             :         /* allocate vm area, map the pages and copy static data */
    3027           0 :         vm.flags = VM_ALLOC;
    3028           0 :         vm.size = num_possible_cpus() * ai->unit_size;
    3029           0 :         vm_area_register_early(&vm, PAGE_SIZE);
    3030             : 
    3031           0 :         for (unit = 0; unit < num_possible_cpus(); unit++) {
    3032           0 :                 unsigned long unit_addr =
    3033           0 :                         (unsigned long)vm.addr + unit * ai->unit_size;
    3034             : 
    3035           0 :                 for (i = 0; i < unit_pages; i++)
    3036           0 :                         populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
    3037             : 
    3038             :                 /* pte already populated, the following shouldn't fail */
    3039           0 :                 rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
    3040             :                                       unit_pages);
    3041           0 :                 if (rc < 0)
    3042           0 :                         panic("failed to map percpu area, err=%d\n", rc);
    3043             : 
    3044             :                 /*
    3045             :                  * FIXME: Archs with virtual cache should flush local
    3046             :                  * cache for the linear mapping here - something
    3047             :                  * equivalent to flush_cache_vmap() on the local cpu.
    3048             :                  * flush_cache_vmap() can't be used as most supporting
    3049             :                  * data structures are not set up yet.
    3050             :                  */
    3051             : 
    3052             :                 /* copy static data */
    3053           0 :                 memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
    3054             :         }
    3055             : 
    3056             :         /* we're ready, commit */
    3057           0 :         pr_info("%d %s pages/cpu s%zu r%zu d%zu\n",
    3058             :                 unit_pages, psize_str, ai->static_size,
    3059             :                 ai->reserved_size, ai->dyn_size);
    3060             : 
    3061           0 :         pcpu_setup_first_chunk(ai, vm.addr);
    3062           0 :         goto out_free_ar;
    3063             : 
    3064           0 : enomem:
    3065           0 :         while (--j >= 0)
    3066           0 :                 free_fn(page_address(pages[j]), PAGE_SIZE);
    3067             :         rc = -ENOMEM;
    3068           0 : out_free_ar:
    3069           0 :         memblock_free_early(__pa(pages), pages_size);
    3070           0 :         pcpu_free_alloc_info(ai);
    3071           0 :         return rc;
    3072             : }
    3073             : #endif /* BUILD_PAGE_FIRST_CHUNK */
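
/*
 * A minimal sketch of an arch-side caller of pcpu_page_first_chunk(),
 * illustrative only and not part of this file.  It is the fallback for
 * configurations where the embed helper is unsuitable (e.g. the sparse
 * NUMA layout warned about above would exhaust vmalloc space).  The
 * example_pcpu_populate_pte() helper is assumed to be arch code that
 * makes sure page tables exist for the given vmalloc address; the alloc
 * and free callbacks are only ever called with PAGE_SIZE here.
 */
void example_pcpu_populate_pte(unsigned long addr);    /* assumed arch helper */

static void * __init example_pcpu_alloc_page(unsigned int cpu, size_t size,
                                             size_t align)
{
        return memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS));
}

static void __init example_pcpu_free_page(void *ptr, size_t size)
{
        memblock_free_early(__pa(ptr), size);
}

void __init example_page_first_chunk_setup(void)
{
        int rc;

        rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE,
                                   example_pcpu_alloc_page,
                                   example_pcpu_free_page,
                                   example_pcpu_populate_pte);
        if (rc < 0)
                panic("cannot map percpu first chunk (err=%d)", rc);
}
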
    3074             : 
    3075             : #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
    3076             : /*
    3077             :  * Generic SMP percpu area setup.
    3078             :  *
    3079             :  * The embedding helper is used because its behavior closely resembles
    3080             :  * the original non-dynamic generic percpu area setup.  This is
    3081             :  * important because many archs have addressing restrictions and might
    3082             :  * fail if the percpu area is located far away from the previous
    3083             :  * location.  As an added bonus, in non-NUMA cases, embedding is
    3084             :  * generally a good idea TLB-wise because the percpu area can piggyback
    3085             :  * on the physical linear memory mapping, which uses large page
    3086             :  * mappings on applicable archs.
    3087             :  */
    3088             : unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
    3089             : EXPORT_SYMBOL(__per_cpu_offset);
    3090             : 
    3091             : static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
    3092             :                                        size_t align)
    3093             : {
    3094             :         return  memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS));
    3095             : }
    3096             : 
    3097             : static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
    3098             : {
    3099             :         memblock_free_early(__pa(ptr), size);
    3100             : }
    3101             : 
    3102             : void __init setup_per_cpu_areas(void)
    3103             : {
    3104             :         unsigned long delta;
    3105             :         unsigned int cpu;
    3106             :         int rc;
    3107             : 
    3108             :         /*
    3109             :          * Always reserve area for module percpu variables.  That's
    3110             :          * what the legacy allocator did.
    3111             :          */
    3112             :         rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
    3113             :                                     PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
    3114             :                                     pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
    3115             :         if (rc < 0)
    3116             :                 panic("Failed to initialize percpu areas.");
    3117             : 
    3118             :         delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
    3119             :         for_each_possible_cpu(cpu)
    3120             :                 __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
    3121             : }
    3122             : #endif  /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
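
/*
 * A small usage sketch, illustrative only, of what the offsets computed
 * above buy us: after setup_per_cpu_areas(), a static percpu variable is
 * reached by shifting its link-time address by __per_cpu_offset[cpu],
 * which is roughly what per_cpu_ptr()/per_cpu() expand to on SMP.  The
 * demo_counter variable exists only for this illustration.
 */
static DEFINE_PER_CPU(unsigned long, demo_counter);

static void demo_percpu_access(void)
{
        unsigned int cpu;

        /* remote access: &demo_counter shifted by __per_cpu_offset[cpu] */
        for_each_possible_cpu(cpu)
                *per_cpu_ptr(&demo_counter, cpu) = 0;

        /* local fast path: the running cpu's copy via its own offset */
        this_cpu_inc(demo_counter);
}
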
    3123             : 
    3124             : #else   /* CONFIG_SMP */
    3125             : 
    3126             : /*
    3127             :  * UP percpu area setup.
    3128             :  *
    3129             :  * UP always uses the km-based percpu allocator with identity mapping.
    3130             :  * Static percpu variables are indistinguishable from the usual static
    3131             :  * variables and don't require any special preparation.
    3132             :  */
    3133             : void __init setup_per_cpu_areas(void)
    3134             : {
    3135             :         const size_t unit_size =
    3136             :                 roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
    3137             :                                          PERCPU_DYNAMIC_RESERVE));
    3138             :         struct pcpu_alloc_info *ai;
    3139             :         void *fc;
    3140             : 
    3141             :         ai = pcpu_alloc_alloc_info(1, 1);
    3142             :         fc = memblock_alloc_from(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
    3143             :         if (!ai || !fc)
    3144             :                 panic("Failed to allocate memory for percpu areas.");
    3145             :         /* kmemleak tracks the percpu allocations separately */
    3146             :         kmemleak_free(fc);
    3147             : 
    3148             :         ai->dyn_size = unit_size;
    3149             :         ai->unit_size = unit_size;
    3150             :         ai->atom_size = unit_size;
    3151             :         ai->alloc_size = unit_size;
    3152             :         ai->groups[0].nr_units = 1;
    3153             :         ai->groups[0].cpu_map[0] = 0;
    3154             : 
    3155             :         pcpu_setup_first_chunk(ai, fc);
    3156             :         pcpu_free_alloc_info(ai);
    3157             : }
    3158             : 
    3159             : #endif  /* CONFIG_SMP */
    3160             : 
    3161             : /*
    3162             :  * pcpu_nr_pages - calculate total number of populated backing pages
    3163             :  *
    3164             :  * This reflects the number of pages populated to back chunks.  Metadata is
    3165             :  * excluded from the number exposed in meminfo because the number of backing
    3166             :  * pages scales with the number of cpus and can quickly outweigh the memory
    3167             :  * used for metadata.  Excluding it also keeps this calculation simple.
    3168             :  *
    3169             :  * RETURNS:
    3170             :  * Total number of populated backing pages in use by the allocator.
    3171             :  */
    3172           1 : unsigned long pcpu_nr_pages(void)
    3173             : {
    3174           1 :         return pcpu_nr_populated * pcpu_nr_units;
    3175             : }
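
/*
 * Worked example for the product above, using assumed numbers that are
 * not taken from this report: a machine with 16 possible cpus has
 * pcpu_nr_units = 16; if 384 backing pages are currently populated per
 * unit (pcpu_nr_populated = 384), the "Percpu" line in /proc/meminfo
 * accounts for 16 * 384 = 6144 pages, i.e. 24 MB with 4 KB pages, while
 * chunk metadata is deliberately left out of that figure.
 */
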
    3176             : 
    3177             : /*
    3178             :  * The percpu allocator is initialized early during boot, when neither slab
    3179             :  * nor workqueue is available.  Plug async management until everything is up
    3180             :  * and running.
    3181             :  */
    3182           1 : static int __init percpu_enable_async(void)
    3183             : {
    3184           1 :         pcpu_async_enabled = true;
    3185           1 :         return 0;
    3186             : }
    3187             : subsys_initcall(percpu_enable_async);

Generated by: LCOV version 1.14