Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * Copyright (C) 1993 Linus Torvalds
4 : * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
5 : * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
6 : * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
7 : * Numa awareness, Christoph Lameter, SGI, June 2005
8 : * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
9 : */
10 :
11 : #include <linux/vmalloc.h>
12 : #include <linux/mm.h>
13 : #include <linux/module.h>
14 : #include <linux/highmem.h>
15 : #include <linux/sched/signal.h>
16 : #include <linux/slab.h>
17 : #include <linux/spinlock.h>
18 : #include <linux/interrupt.h>
19 : #include <linux/proc_fs.h>
20 : #include <linux/seq_file.h>
21 : #include <linux/set_memory.h>
22 : #include <linux/debugobjects.h>
23 : #include <linux/kallsyms.h>
24 : #include <linux/list.h>
25 : #include <linux/notifier.h>
26 : #include <linux/rbtree.h>
27 : #include <linux/xarray.h>
28 : #include <linux/rcupdate.h>
29 : #include <linux/pfn.h>
30 : #include <linux/kmemleak.h>
31 : #include <linux/atomic.h>
32 : #include <linux/compiler.h>
33 : #include <linux/llist.h>
34 : #include <linux/bitops.h>
35 : #include <linux/rbtree_augmented.h>
36 : #include <linux/overflow.h>
37 :
38 : #include <linux/uaccess.h>
39 : #include <asm/tlbflush.h>
40 : #include <asm/shmparam.h>
41 :
42 : #include "internal.h"
43 : #include "pgalloc-track.h"
44 :
45 47131 : bool is_vmalloc_addr(const void *x)
46 : {
47 47131 : unsigned long addr = (unsigned long)x;
48 :
49 47131 : return addr >= VMALLOC_START && addr < VMALLOC_END;
50 : }
51 : EXPORT_SYMBOL(is_vmalloc_addr);
52 :
53 : struct vfree_deferred {
54 : struct llist_head list;
55 : struct work_struct wq;
56 : };
57 : static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
58 :
59 : static void __vunmap(const void *, int);
60 :
61 0 : static void free_work(struct work_struct *w)
62 : {
63 0 : struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
64 0 : struct llist_node *t, *llnode;
65 :
66 0 : llist_for_each_safe(llnode, t, llist_del_all(&p->list))
67 0 : __vunmap((void *)llnode, 1);
68 0 : }
69 :
70 : /*** Page table manipulation functions ***/
71 :
72 10598 : static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
73 : pgtbl_mod_mask *mask)
74 : {
75 10598 : pte_t *pte;
76 :
77 21196 : pte = pte_offset_kernel(pmd, addr);
78 21655 : do {
79 21655 : pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
80 32725 : WARN_ON(!pte_none(ptent) && !pte_present(ptent));
81 21655 : } while (pte++, addr += PAGE_SIZE, addr != end);
82 10598 : *mask |= PGTBL_PTE_MODIFIED;
83 10598 : }
84 :
85 10585 : static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
86 : pgtbl_mod_mask *mask)
87 : {
88 10585 : pmd_t *pmd;
89 10585 : unsigned long next;
90 10585 : int cleared;
91 :
92 21170 : pmd = pmd_offset(pud, addr);
93 10598 : do {
94 10598 : next = pmd_addr_end(addr, end);
95 :
96 10598 : cleared = pmd_clear_huge(pmd);
97 21196 : if (cleared || pmd_bad(*pmd))
98 0 : *mask |= PGTBL_PMD_MODIFIED;
99 :
100 10598 : if (cleared)
101 0 : continue;
102 10598 : if (pmd_none_or_clear_bad(pmd))
103 0 : continue;
104 10598 : vunmap_pte_range(pmd, addr, next, mask);
105 :
106 10598 : cond_resched();
107 10598 : } while (pmd++, addr = next, addr != end);
108 10585 : }
109 :
110 10585 : static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
111 : pgtbl_mod_mask *mask)
112 : {
113 10585 : pud_t *pud;
114 10585 : unsigned long next;
115 10585 : int cleared;
116 :
117 10585 : pud = pud_offset(p4d, addr);
118 10585 : do {
119 10585 : next = pud_addr_end(addr, end);
120 :
121 10585 : cleared = pud_clear_huge(pud);
122 21170 : if (cleared || pud_bad(*pud))
123 0 : *mask |= PGTBL_PUD_MODIFIED;
124 :
125 10585 : if (cleared)
126 0 : continue;
127 10585 : if (pud_none_or_clear_bad(pud))
128 0 : continue;
129 10585 : vunmap_pmd_range(pud, addr, next, mask);
130 10585 : } while (pud++, addr = next, addr != end);
131 10585 : }
132 :
133 10585 : static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
134 : pgtbl_mod_mask *mask)
135 : {
136 10585 : p4d_t *p4d;
137 10585 : unsigned long next;
138 10585 : int cleared;
139 :
140 10585 : p4d = p4d_offset(pgd, addr);
141 10585 : do {
142 10585 : next = p4d_addr_end(addr, end);
143 :
144 10585 : cleared = p4d_clear_huge(p4d);
145 10585 : if (cleared || p4d_bad(*p4d))
146 0 : *mask |= PGTBL_P4D_MODIFIED;
147 :
148 10585 : if (cleared)
149 : continue;
150 10585 : if (p4d_none_or_clear_bad(p4d))
151 0 : continue;
152 10585 : vunmap_pud_range(p4d, addr, next, mask);
153 10585 : } while (p4d++, addr = next, addr != end);
154 10585 : }
155 :
156 : /**
157 : * unmap_kernel_range_noflush - unmap kernel VM area
158 : * @start: start of the VM area to unmap
159 : * @size: size of the VM area to unmap
160 : *
161 : * Unmap PFN_UP(@size) pages at @start. The VM area that @start and @size
162 : * specify should have been allocated using get_vm_area() and its friends.
163 : *
164 : * NOTE:
165 : * This function does NOT do any cache flushing. The caller is responsible
166 : * for calling flush_cache_vunmap() on to-be-unmapped areas before calling this
167 : * function and flush_tlb_kernel_range() after.
168 : */
169 10585 : void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
170 : {
171 10585 : unsigned long end = start + size;
172 10585 : unsigned long next;
173 10585 : pgd_t *pgd;
174 10585 : unsigned long addr = start;
175 10585 : pgtbl_mod_mask mask = 0;
176 :
177 10585 : BUG_ON(addr >= end);
178 10585 : pgd = pgd_offset_k(addr);
179 10585 : do {
180 10585 : next = pgd_addr_end(addr, end);
181 10585 : if (pgd_bad(*pgd))
182 : mask |= PGTBL_PGD_MODIFIED;
183 10585 : if (pgd_none_or_clear_bad(pgd))
184 : continue;
185 10585 : vunmap_p4d_range(pgd, addr, next, &mask);
186 10585 : } while (pgd++, addr = next, addr != end);
187 :
188 10585 : if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
189 : arch_sync_kernel_mappings(start, end);
190 10585 : }
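
A minimal calling sketch of the cache/TLB contract described above, mirroring what free_unmap_vmap_area() later in this file does; addr and size are hypothetical values assumed to describe a range obtained from get_vm_area() and friends:

	/* illustrative only: addr/size are assumed to describe a mapped range */
	flush_cache_vunmap(addr, addr + size);		/* before unmapping */
	unmap_kernel_range_noflush(addr, size);		/* tear down the PTEs */
	flush_tlb_kernel_range(addr, addr + size);	/* after unmapping */
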
191 :
192 10688 : static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
193 : unsigned long end, pgprot_t prot, struct page **pages, int *nr,
194 : pgtbl_mod_mask *mask)
195 : {
196 10688 : pte_t *pte;
197 :
198 : /*
199 : * nr is a running index into the array which helps higher level
200 : * callers keep track of where we're up to.
201 : */
202 :
203 10688 : pte = pte_alloc_kernel_track(pmd, addr, mask);
204 10688 : if (!pte)
205 0 : return -ENOMEM;
206 11192 : do {
207 11192 : struct page *page = pages[*nr];
208 :
209 11192 : if (WARN_ON(!pte_none(*pte)))
210 : return -EBUSY;
211 11192 : if (WARN_ON(!page))
212 : return -ENOMEM;
213 11192 : set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
214 11192 : (*nr)++;
215 11192 : } while (pte++, addr += PAGE_SIZE, addr != end);
216 10688 : *mask |= PGTBL_PTE_MODIFIED;
217 10688 : return 0;
218 : }
219 :
220 10687 : static int vmap_pmd_range(pud_t *pud, unsigned long addr,
221 : unsigned long end, pgprot_t prot, struct page **pages, int *nr,
222 : pgtbl_mod_mask *mask)
223 : {
224 10687 : pmd_t *pmd;
225 10687 : unsigned long next;
226 :
227 10687 : pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
228 10687 : if (!pmd)
229 : return -ENOMEM;
230 10687 : do {
231 10687 : next = pmd_addr_end(addr, end);
232 10687 : if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask))
233 : return -ENOMEM;
234 10687 : } while (pmd++, addr = next, addr != end);
235 : return 0;
236 : }
237 :
238 10686 : static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
239 : unsigned long end, pgprot_t prot, struct page **pages, int *nr,
240 : pgtbl_mod_mask *mask)
241 : {
242 10686 : pud_t *pud;
243 10686 : unsigned long next;
244 :
245 10686 : pud = pud_alloc_track(&init_mm, p4d, addr, mask);
246 10686 : if (!pud)
247 : return -ENOMEM;
248 10686 : do {
249 10686 : next = pud_addr_end(addr, end);
250 10686 : if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask))
251 : return -ENOMEM;
252 10686 : } while (pud++, addr = next, addr != end);
253 : return 0;
254 : }
255 :
256 10685 : static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
257 : unsigned long end, pgprot_t prot, struct page **pages, int *nr,
258 : pgtbl_mod_mask *mask)
259 : {
260 10685 : p4d_t *p4d;
261 10685 : unsigned long next;
262 :
263 10685 : p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
264 10685 : if (!p4d)
265 : return -ENOMEM;
266 10685 : do {
267 10685 : next = p4d_addr_end(addr, end);
268 10685 : if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask))
269 0 : return -ENOMEM;
270 10685 : } while (p4d++, addr = next, addr != end);
271 : return 0;
272 : }
273 :
274 : /**
275 : * map_kernel_range_noflush - map kernel VM area with the specified pages
276 : * @addr: start of the VM area to map
277 : * @size: size of the VM area to map
278 : * @prot: page protection flags to use
279 : * @pages: pages to map
280 : *
281 : * Map PFN_UP(@size) pages at @addr. The VM area that @addr and @size specify
282 : * should have been allocated using get_vm_area() and its friends.
283 : *
284 : * NOTE:
285 : * This function does NOT do any cache flushing. The caller is responsible for
286 : * calling flush_cache_vmap() on to-be-mapped areas before calling this
287 : * function.
288 : *
289 : * RETURNS:
290 : * 0 on success, -errno on failure.
291 : */
292 10684 : int map_kernel_range_noflush(unsigned long addr, unsigned long size,
293 : pgprot_t prot, struct page **pages)
294 : {
295 10684 : unsigned long start = addr;
296 10684 : unsigned long end = addr + size;
297 10684 : unsigned long next;
298 10684 : pgd_t *pgd;
299 10684 : int err = 0;
300 10684 : int nr = 0;
301 10684 : pgtbl_mod_mask mask = 0;
302 :
303 10684 : BUG_ON(addr >= end);
304 10684 : pgd = pgd_offset_k(addr);
305 10684 : do {
306 10684 : next = pgd_addr_end(addr, end);
307 10684 : if (pgd_bad(*pgd))
308 : mask |= PGTBL_PGD_MODIFIED;
309 10684 : err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
310 10684 : if (err)
311 0 : return err;
312 10684 : } while (pgd++, addr = next, addr != end);
313 :
314 : if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
315 : arch_sync_kernel_mappings(start, end);
316 :
317 : return 0;
318 : }
319 :
320 10626 : int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
321 : struct page **pages)
322 : {
323 10626 : int ret;
324 :
325 0 : ret = map_kernel_range_noflush(start, size, prot, pages);
326 10626 : flush_cache_vmap(start, start + size);
327 10626 : return ret;
328 : }
329 :
330 31956 : int is_vmalloc_or_module_addr(const void *x)
331 : {
332 : /*
333 : * ARM, x86-64 and sparc64 put modules in a special place,
334 : * and fall back on vmalloc() if that fails. Others
335 : * just put them in the vmalloc space.
336 : */
337 : #if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
338 : unsigned long addr = (unsigned long)x;
339 : if (addr >= MODULES_VADDR && addr < MODULES_END)
340 : return 1;
341 : #endif
342 31956 : return is_vmalloc_addr(x);
343 : }
344 :
345 : /*
346 : * Walk a vmap address to the struct page it maps.
347 : */
348 127 : struct page *vmalloc_to_page(const void *vmalloc_addr)
349 : {
350 127 : unsigned long addr = (unsigned long) vmalloc_addr;
351 127 : struct page *page = NULL;
352 127 : pgd_t *pgd = pgd_offset_k(addr);
353 127 : p4d_t *p4d;
354 127 : pud_t *pud;
355 127 : pmd_t *pmd;
356 127 : pte_t *ptep, pte;
357 :
358 : /*
359 : * XXX we might need to change this if we add VIRTUAL_BUG_ON for
360 : * architectures that do not vmalloc module space
361 : */
362 127 : VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
363 :
364 127 : if (pgd_none(*pgd))
365 : return NULL;
366 127 : p4d = p4d_offset(pgd, addr);
367 127 : if (p4d_none(*p4d))
368 : return NULL;
369 127 : pud = pud_offset(p4d, addr);
370 :
371 : /*
372 : * Don't dereference bad PUD or PMD (below) entries. This will also
373 : * identify huge mappings, which we may encounter on architectures
374 : * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
375 : * identified as vmalloc addresses by is_vmalloc_addr(), but are
376 : * not [unambiguously] associated with a struct page, so there is
377 : * no correct value to return for them.
378 : */
379 254 : WARN_ON_ONCE(pud_bad(*pud));
380 127 : if (pud_none(*pud) || pud_bad(*pud))
381 : return NULL;
382 127 : pmd = pmd_offset(pud, addr);
383 254 : WARN_ON_ONCE(pmd_bad(*pmd));
384 127 : if (pmd_none(*pmd) || pmd_bad(*pmd))
385 : return NULL;
386 :
387 127 : ptep = pte_offset_map(pmd, addr);
388 127 : pte = *ptep;
389 127 : if (pte_present(pte))
390 127 : page = pte_page(pte);
391 : pte_unmap(ptep);
392 : return page;
393 : }
394 : EXPORT_SYMBOL(vmalloc_to_page);
395 :
396 : /*
397 : * Map a vmalloc()-space virtual address to the physical page frame number.
398 : */
399 0 : unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
400 : {
401 0 : return page_to_pfn(vmalloc_to_page(vmalloc_addr));
402 : }
403 : EXPORT_SYMBOL(vmalloc_to_pfn);
404 :
405 :
406 : /*** Global kva allocator ***/
407 :
408 : #define DEBUG_AUGMENT_PROPAGATE_CHECK 0
409 : #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
410 :
411 :
412 : static DEFINE_SPINLOCK(vmap_area_lock);
413 : static DEFINE_SPINLOCK(free_vmap_area_lock);
414 : /* Export for kexec only */
415 : LIST_HEAD(vmap_area_list);
416 : static struct rb_root vmap_area_root = RB_ROOT;
417 : static bool vmap_initialized __read_mostly;
418 :
419 : static struct rb_root purge_vmap_area_root = RB_ROOT;
420 : static LIST_HEAD(purge_vmap_area_list);
421 : static DEFINE_SPINLOCK(purge_vmap_area_lock);
422 :
423 : /*
424 : * This kmem_cache is used for vmap_area objects. Instead of
425 : * allocating from slab we reuse an object from this cache to
426 : * make things faster. Especially in "no edge" splitting of
427 : * free block.
428 : */
429 : static struct kmem_cache *vmap_area_cachep;
430 :
431 : /*
432 : * This linked list is used in pair with free_vmap_area_root.
433 : * It gives O(1) access to prev/next to perform fast coalescing.
434 : */
435 : static LIST_HEAD(free_vmap_area_list);
436 :
437 : /*
438 : * This augmented red-black tree represents the free vmap space.
439 : * All vmap_area objects in this tree are sorted by va->va_start
440 : * address. It is used for allocation and merging when a vmap
441 : * object is released.
442 : *
443 : * Each vmap_area node contains a maximum available free block
444 : * of its sub-tree, right or left. Therefore it is possible to
445 : * find a lowest match of free area.
446 : */
447 : static struct rb_root free_vmap_area_root = RB_ROOT;
448 :
449 : /*
450 : * Preload a CPU with one object for "no edge" split case. The
451 : * aim is to get rid of allocations from the atomic context, thus
452 : * to use more permissive allocation masks.
453 : */
454 : static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
455 :
456 : static __always_inline unsigned long
457 10928 : va_size(struct vmap_area *va)
458 : {
459 10928 : return (va->va_end - va->va_start);
460 : }
461 :
462 : static __always_inline unsigned long
463 53369 : get_subtree_max_size(struct rb_node *node)
464 : {
465 53369 : struct vmap_area *va;
466 :
467 138656 : va = rb_entry_safe(node, struct vmap_area, rb_node);
468 31918 : return va ? va->subtree_max_size : 0;
469 : }
470 :
471 : /*
472 : * Gets called when remove the node and rotate.
473 : */
474 : static __always_inline unsigned long
475 : compute_subtree_max_size(struct vmap_area *va)
476 : {
477 : return max3(va_size(va),
478 : get_subtree_max_size(va->rb_node.rb_left),
479 : get_subtree_max_size(va->rb_node.rb_right));
480 : }
481 :
482 21859 : RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
483 : struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
484 :
485 : static void purge_vmap_area_lazy(void);
486 : static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
487 : static unsigned long lazy_max_pages(void);
488 :
489 : static atomic_long_t nr_vmalloc_pages;
490 :
491 1 : unsigned long vmalloc_nr_pages(void)
492 : {
493 1 : return atomic_long_read(&nr_vmalloc_pages);
494 : }
495 :
496 21177 : static struct vmap_area *__find_vmap_area(unsigned long addr)
497 : {
498 21177 : struct rb_node *n = vmap_area_root.rb_node;
499 :
500 65205 : while (n) {
501 65205 : struct vmap_area *va;
502 :
503 65205 : va = rb_entry(n, struct vmap_area, rb_node);
504 65205 : if (addr < va->va_start)
505 27323 : n = n->rb_left;
506 37882 : else if (addr >= va->va_end)
507 16705 : n = n->rb_right;
508 : else
509 21177 : return va;
510 : }
511 :
512 : return NULL;
513 : }
514 :
515 : /*
516 : * This function returns the address of the parent node
517 : * and its left or right link for further processing.
518 : *
519 : * Otherwise NULL is returned. In that case all further
520 : * steps of inserting a conflicting/overlapping range have
521 : * to be declined and treated as a bug.
522 : */
523 : static __always_inline struct rb_node **
524 21269 : find_va_links(struct vmap_area *va,
525 : struct rb_root *root, struct rb_node *from,
526 : struct rb_node **parent)
527 : {
528 21269 : struct vmap_area *tmp_va;
529 21269 : struct rb_node **link;
530 :
531 21269 : if (root) {
532 21267 : link = &root->rb_node;
533 10673 : if (unlikely(!*link)) {
534 : *parent = NULL;
535 : return link;
536 : }
537 : } else {
538 : link = &from;
539 : }
540 :
541 : /*
542 : * Go to the bottom of the tree. When we hit the last point
543 : * we end up with the parent rb_node and the correct direction,
544 : * named "link" here, which is where the new va->rb_node will be attached.
545 : */
546 127280 : do {
547 127280 : tmp_va = rb_entry(*link, struct vmap_area, rb_node);
548 :
549 : /*
550 : * During the traversal we also do some sanity checking.
551 : * Trigger a WARN() if the new range overlaps an existing
552 : * one partially (left/right) or fully.
553 : */
554 127280 : if (va->va_start < tmp_va->va_end &&
555 20934 : va->va_end <= tmp_va->va_start)
556 20934 : link = &(*link)->rb_left;
557 106346 : else if (va->va_end > tmp_va->va_start &&
558 : va->va_start >= tmp_va->va_end)
559 106346 : link = &(*link)->rb_right;
560 : else {
561 0 : WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
562 : va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);
563 :
564 0 : return NULL;
565 : }
566 127280 : } while (*link);
567 :
568 21261 : *parent = &tmp_va->rb_node;
569 21261 : return link;
570 : }
571 :
572 : static __always_inline struct list_head *
573 10594 : get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
574 : {
575 10594 : struct list_head *list;
576 :
577 10594 : if (unlikely(!parent))
578 : /*
579 : * The red-black tree where we try to find VA neighbors
580 : * before merging or inserting is empty, i.e. it means
581 : * there is no free vmap space. Normally this does not
582 : * happen, but we handle the case anyway.
583 : */
584 : return NULL;
585 :
586 10588 : list = &rb_entry(parent, struct vmap_area, rb_node)->list;
587 10588 : return (&parent->rb_right == link ? list->next : list);
588 : }
589 :
590 : static __always_inline void
591 10905 : link_va(struct vmap_area *va, struct rb_root *root,
592 : struct rb_node *parent, struct rb_node **link, struct list_head *head)
593 : {
594 : /*
595 : * VA is still not in the list, but we can
596 : * identify its future previous list_head node.
597 : */
598 10905 : if (likely(parent)) {
599 10897 : head = &rb_entry(parent, struct vmap_area, rb_node)->list;
600 10897 : if (&parent->rb_right != link)
601 5272 : head = head->prev;
602 : }
603 :
604 : /* Insert to the rb-tree */
605 10905 : rb_link_node(&va->rb_node, parent, link);
606 10675 : if (root == &free_vmap_area_root) {
607 : /*
608 : * Some explanation here. Just perform simple insertion
609 : * to the tree. We do not set va->subtree_max_size to
610 : * its current size before calling rb_insert_augmented().
611 : * This is because we populate the tree from the bottom
612 : * up to the parent levels when the node _is_ in the tree.
613 : *
614 : * Therefore we set subtree_max_size to zero after insertion,
615 : * to let __augment_tree_propagate_from() put everything in
616 : * the correct order later on.
617 : */
618 8 : rb_insert_augmented(&va->rb_node,
619 : root, &free_vmap_area_rb_augment_cb);
620 3 : va->subtree_max_size = 0;
621 : } else {
622 10897 : rb_insert_color(&va->rb_node, root);
623 : }
624 :
625 : /* Address-sort this list */
626 10905 : list_add(&va->list, head);
627 : }
628 :
629 : static __always_inline void
630 10800 : unlink_va(struct vmap_area *va, struct rb_root *root)
631 : {
632 10800 : if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
633 : return;
634 :
635 10800 : if (root == &free_vmap_area_root)
636 3 : rb_erase_augmented(&va->rb_node,
637 : root, &free_vmap_area_rb_augment_cb);
638 : else
639 10797 : rb_erase(&va->rb_node, root);
640 :
641 10800 : list_del(&va->list);
642 10800 : RB_CLEAR_NODE(&va->rb_node);
643 : }
644 :
645 : #if DEBUG_AUGMENT_PROPAGATE_CHECK
646 : static void
647 : augment_tree_propagate_check(void)
648 : {
649 : struct vmap_area *va;
650 : unsigned long computed_size;
651 :
652 : list_for_each_entry(va, &free_vmap_area_list, list) {
653 : computed_size = compute_subtree_max_size(va);
654 : if (computed_size != va->subtree_max_size)
655 : pr_emerg("tree is corrupted: %lu, %lu\n",
656 : va_size(va), va->subtree_max_size);
657 : }
658 : }
659 : #endif
660 :
661 : /*
662 : * This function populates subtree_max_size from bottom to upper
663 : * levels starting from the VA point. The propagation must be done
664 : * when the VA size is modified by changing its va_start/va_end,
665 : * or when a new VA is inserted into the tree.
666 : *
667 : * It means that __augment_tree_propagate_from() must be called:
668 : * - After VA has been inserted to the tree(free path);
668 : * - After VA has been inserted into the tree (free path);
669 : * - After VA has been shrunk (allocation path);
670 : * - After VA has been increased (merging path).
672 : * Please note that, it does not mean that upper parent nodes
673 : * and their subtree_max_size are recalculated all the time up
674 : * to the root node.
675 : *
676 : * 4--8
677 : * /\
678 : * / \
679 : * / \
680 : * 2--2 8--8
681 : *
682 : * For example if we modify the node 4, shrinking it to 2, then
683 : * no modification is required. If we shrink the node 2 to 1,
684 : * only its subtree_max_size is updated and set to 1. If we shrink
685 : * the node 8 to 6, then its subtree_max_size is set to 6 and the parent
686 : * node becomes 4--6.
687 : */
688 : static __always_inline void
689 10678 : augment_tree_propagate_from(struct vmap_area *va)
690 : {
691 : /*
692 : * Populate the tree from bottom towards the root until
693 : * the calculated maximum available size of checked node
694 : * is equal to its current one.
695 : */
696 10678 : free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);
697 :
698 : #if DEBUG_AUGMENT_PROPAGATE_CHECK
699 : augment_tree_propagate_check();
700 : #endif
701 12 : }
702 :
703 : static void
704 10672 : insert_vmap_area(struct vmap_area *va,
705 : struct rb_root *root, struct list_head *head)
706 : {
707 10672 : struct rb_node **link;
708 10672 : struct rb_node *parent;
709 :
710 10672 : link = find_va_links(va, root, NULL, &parent);
711 10672 : if (link)
712 10672 : link_va(va, root, parent, link, head);
713 10672 : }
714 :
715 : static void
716 3 : insert_vmap_area_augment(struct vmap_area *va,
717 : struct rb_node *from, struct rb_root *root,
718 : struct list_head *head)
719 : {
720 3 : struct rb_node **link;
721 3 : struct rb_node *parent;
722 :
723 3 : if (from)
724 4 : link = find_va_links(va, NULL, from, &parent);
725 : else
726 2 : link = find_va_links(va, root, NULL, &parent);
727 :
728 3 : if (link) {
729 3 : link_va(va, root, parent, link, head);
730 3 : augment_tree_propagate_from(va);
731 : }
732 3 : }
733 :
734 : /*
735 : * Merge a de-allocated chunk of VA memory with the previous
736 : * and the next free blocks. If no coalescing is done, a new
737 : * free area is inserted. If the VA has been merged, it is
738 : * freed.
739 : *
740 : * Please note, it can return NULL in case of overlapping
741 : * ranges, followed by a WARN() report. Despite being a
742 : * buggy behaviour, the system can stay alive and keep
743 : * going.
744 : */
745 : static __always_inline struct vmap_area *
746 10594 : merge_or_add_vmap_area(struct vmap_area *va,
747 : struct rb_root *root, struct list_head *head)
748 : {
749 10594 : struct vmap_area *sibling;
750 10594 : struct list_head *next;
751 10594 : struct rb_node **link;
752 10594 : struct rb_node *parent;
753 10594 : bool merged = false;
754 :
755 : /*
756 : * Find a place in the tree where VA potentially will be
757 : * inserted, unless it is merged with its sibling/siblings.
758 : */
759 21188 : link = find_va_links(va, root, NULL, &parent);
760 10594 : if (!link)
761 : return NULL;
762 :
763 : /*
764 : * Get next node of VA to check if merging can be done.
765 : */
766 10594 : next = get_va_next_sibling(parent, link);
767 10594 : if (unlikely(next == NULL))
768 6 : goto insert;
769 :
770 : /*
771 : * start end
772 : * | |
773 : * |<------VA------>|<-----Next----->|
774 : * | |
775 : * start end
776 : */
777 10588 : if (next != head) {
778 10237 : sibling = list_entry(next, struct vmap_area, list);
779 10237 : if (sibling->va_start == va->va_end) {
780 220 : sibling->va_start = va->va_start;
781 :
782 : /* Free vmap_area object. */
783 220 : kmem_cache_free(vmap_area_cachep, va);
784 :
785 : /* Point to the new merged area. */
786 220 : va = sibling;
787 220 : merged = true;
788 : }
789 : }
790 :
791 : /*
792 : * start end
793 : * | |
794 : * |<-----Prev----->|<------VA------>|
795 : * | |
796 : * start end
797 : */
798 10588 : if (next->prev != head) {
799 10586 : sibling = list_entry(next->prev, struct vmap_area, list);
800 10586 : if (sibling->va_end == va->va_start) {
801 : /*
802 : * If both neighbors are coalesced, it is important
803 : * to unlink the "next" node first, followed by merging
804 : * with "previous" one. Otherwise the tree might not be
805 : * fully populated if a sibling's augmented value is
806 : * "normalized" because of rotation operations.
807 : */
808 10356 : if (merged)
809 212 : unlink_va(va, root);
810 :
811 10356 : sibling->va_end = va->va_end;
812 :
813 : /* Free vmap_area object. */
814 10356 : kmem_cache_free(vmap_area_cachep, va);
815 :
816 : /* Point to the new merged area. */
817 10356 : va = sibling;
818 10356 : merged = true;
819 : }
820 : }
821 :
822 232 : insert:
823 10594 : if (!merged)
824 230 : link_va(va, root, parent, link, head);
825 :
826 : return va;
827 : }
828 :
829 : static __always_inline struct vmap_area *
830 9 : merge_or_add_vmap_area_augment(struct vmap_area *va,
831 : struct rb_root *root, struct list_head *head)
832 : {
833 27 : va = merge_or_add_vmap_area(va, root, head);
834 9 : if (va)
835 9 : augment_tree_propagate_from(va);
836 :
837 9 : return va;
838 : }
839 :
840 : static __always_inline bool
841 31878 : is_within_this_va(struct vmap_area *va, unsigned long size,
842 : unsigned long align, unsigned long vstart)
843 : {
844 31878 : unsigned long nva_start_addr;
845 :
846 31878 : if (va->va_start > vstart)
847 21210 : nva_start_addr = ALIGN(va->va_start, align);
848 : else
849 10668 : nva_start_addr = ALIGN(vstart, align);
850 :
851 : /* Can be overflowed due to big size or alignment. */
852 31878 : if (nva_start_addr + size < nva_start_addr ||
853 : nva_start_addr < vstart)
854 : return false;
855 :
856 31878 : return (nva_start_addr + size <= va->va_end);
857 : }
858 :
859 : /*
860 : * Find the first free block (lowest start address) in the tree
861 : * that will satisfy the request for the given
862 : * parameters.
863 : */
864 : static __always_inline struct vmap_area *
865 10668 : find_vmap_lowest_match(unsigned long size,
866 : unsigned long align, unsigned long vstart)
867 : {
868 10668 : struct vmap_area *va;
869 10668 : struct rb_node *node;
870 10668 : unsigned long length;
871 :
872 : /* Start from the root. */
873 10668 : node = free_vmap_area_root.rb_node;
874 :
875 : /* Adjust the search size for alignment overhead. */
876 10668 : length = size + align - 1;
877 :
878 32159 : while (node) {
879 32159 : va = rb_entry(node, struct vmap_area, rb_node);
880 :
881 53422 : if (get_subtree_max_size(node->rb_left) >= length &&
882 21263 : vstart < va->va_start) {
883 : node = node->rb_left;
884 : } else {
885 21792 : if (is_within_this_va(va, size, align, vstart))
886 : return va;
887 :
888 : /*
889 : * Does not make sense to go deeper towards the right
890 : * sub-tree if it does not have a free block that is
891 : * equal to or bigger than the requested search length.
892 : */
893 10779 : if (get_subtree_max_size(node->rb_right) >= length) {
894 13 : node = node->rb_right;
895 13 : continue;
896 : }
897 :
898 : /*
899 : * OK. We roll back and find the first right sub-tree,
900 : * that will satisfy the search criteria. It can happen
901 : * only once due to "vstart" restriction.
902 : */
903 20982 : while ((node = rb_parent(node))) {
904 20982 : va = rb_entry(node, struct vmap_area, rb_node);
905 41964 : if (is_within_this_va(va, size, align, vstart))
906 : return va;
907 :
908 21086 : if (get_subtree_max_size(node->rb_right) >= length &&
909 : vstart <= va->va_start) {
910 : node = node->rb_right;
911 : break;
912 : }
913 : }
914 : }
915 : }
916 :
917 : return NULL;
918 : }
919 :
920 : #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
921 : #include <linux/random.h>
922 :
923 : static struct vmap_area *
924 : find_vmap_lowest_linear_match(unsigned long size,
925 : unsigned long align, unsigned long vstart)
926 : {
927 : struct vmap_area *va;
928 :
929 : list_for_each_entry(va, &free_vmap_area_list, list) {
930 : if (!is_within_this_va(va, size, align, vstart))
931 : continue;
932 :
933 : return va;
934 : }
935 :
936 : return NULL;
937 : }
938 :
939 : static void
940 : find_vmap_lowest_match_check(unsigned long size)
941 : {
942 : struct vmap_area *va_1, *va_2;
943 : unsigned long vstart;
944 : unsigned int rnd;
945 :
946 : get_random_bytes(&rnd, sizeof(rnd));
947 : vstart = VMALLOC_START + rnd;
948 :
949 : va_1 = find_vmap_lowest_match(size, 1, vstart);
950 : va_2 = find_vmap_lowest_linear_match(size, 1, vstart);
951 :
952 : if (va_1 != va_2)
953 : pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
954 : va_1, va_2, vstart);
955 : }
956 : #endif
957 :
958 : enum fit_type {
959 : NOTHING_FIT = 0,
960 : FL_FIT_TYPE = 1, /* full fit */
961 : LE_FIT_TYPE = 2, /* left edge fit */
962 : RE_FIT_TYPE = 3, /* right edge fit */
963 : NE_FIT_TYPE = 4 /* no edge fit */
964 : };
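
As a rough illustration of how classify_va_fit_type() below maps a request onto these types, assume a hypothetical free VA spanning [0x1000, 0x9000):

	/* hypothetical free VA: va_start == 0x1000, va_end == 0x9000            */
	/* nva_start_addr 0x1000, size 0x8000 -> FL_FIT_TYPE (consumes whole VA) */
	/* nva_start_addr 0x1000, size 0x2000 -> LE_FIT_TYPE (remainder on right)*/
	/* nva_start_addr 0x7000, size 0x2000 -> RE_FIT_TYPE (remainder on left) */
	/* nva_start_addr 0x3000, size 0x2000 -> NE_FIT_TYPE (two remainders)    */
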
965 :
966 : static __always_inline enum fit_type
967 10669 : classify_va_fit_type(struct vmap_area *va,
968 : unsigned long nva_start_addr, unsigned long size)
969 : {
970 10669 : enum fit_type type;
971 :
972 : /* Check if it is within VA. */
973 21338 : if (nva_start_addr < va->va_start ||
974 10669 : nva_start_addr + size > va->va_end)
975 : return NOTHING_FIT;
976 :
977 : /* Now classify. */
978 10669 : if (va->va_start == nva_start_addr) {
979 10667 : if (va->va_end == nva_start_addr + size)
980 : type = FL_FIT_TYPE;
981 : else
982 10664 : type = LE_FIT_TYPE;
983 2 : } else if (va->va_end == nva_start_addr + size) {
984 : type = RE_FIT_TYPE;
985 : } else {
986 2 : type = NE_FIT_TYPE;
987 : }
988 :
989 : return type;
990 : }
991 :
992 : static __always_inline int
993 10669 : adjust_va_to_fit_type(struct vmap_area *va,
994 : unsigned long nva_start_addr, unsigned long size,
995 : enum fit_type type)
996 : {
997 10669 : struct vmap_area *lva = NULL;
998 :
999 10669 : if (type == FL_FIT_TYPE) {
1000 : /*
1001 : * No need to split VA, it fully fits.
1002 : *
1003 : * | |
1004 : * V NVA V
1005 : * |---------------|
1006 : */
1007 3 : unlink_va(va, &free_vmap_area_root);
1008 3 : kmem_cache_free(vmap_area_cachep, va);
1009 10666 : } else if (type == LE_FIT_TYPE) {
1010 : /*
1011 : * Split left edge of fit VA.
1012 : *
1013 : * | |
1014 : * V NVA V R
1015 : * |-------|-------|
1016 : */
1017 10664 : va->va_start += size;
1018 2 : } else if (type == RE_FIT_TYPE) {
1019 : /*
1020 : * Split right edge of fit VA.
1021 : *
1022 : * | |
1023 : * L V NVA V
1024 : * |-------|-------|
1025 : */
1026 0 : va->va_end = nva_start_addr;
1027 2 : } else if (type == NE_FIT_TYPE) {
1028 : /*
1029 : * Split no edge of fit VA.
1030 : *
1031 : * | |
1032 : * L V NVA V R
1033 : * |---|-------|---|
1034 : */
1035 2 : lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
1036 2 : if (unlikely(!lva)) {
1037 : /*
1038 : * For the percpu allocator we do not do any pre-allocation
1039 : * and leave it as it is. The reason is that it most likely
1040 : * never ends up with NE_FIT_TYPE splitting. For percpu
1041 : * allocations, offsets and sizes are aligned to a fixed
1042 : * alignment request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
1043 : * are its main fitting cases.
1044 : *
1045 : * There are a few exceptions though; one example is the
1046 : * first allocation (early boot up) when we have "one"
1047 : * big free space that has to be split.
1048 : *
1049 : * We can also hit this path in case of regular "vmap"
1050 : * allocations, if "this" current CPU was not preloaded.
1051 : * See the comment in alloc_vmap_area() for why. If so,
1052 : * GFP_NOWAIT is used instead to get an extra object for
1053 : * the split. That is rare and most of the time does not
1054 : * occur.
1055 : *
1056 : * What happens if an allocation fails? Basically, an
1057 : * "overflow" path is triggered to purge lazily freed
1058 : * areas to free some memory, then the "retry" path is
1059 : * triggered to repeat one more time. See more details
1060 : * in the alloc_vmap_area() function.
1061 : */
1062 0 : lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
1063 0 : if (!lva)
1064 : return -1;
1065 : }
1066 :
1067 : /*
1068 : * Build the remainder.
1069 : */
1070 2 : lva->va_start = va->va_start;
1071 2 : lva->va_end = nva_start_addr;
1072 :
1073 : /*
1074 : * Shrink this VA to remaining size.
1075 : */
1076 2 : va->va_start = nva_start_addr + size;
1077 : } else {
1078 : return -1;
1079 : }
1080 :
1081 10669 : if (type != FL_FIT_TYPE) {
1082 10666 : augment_tree_propagate_from(va);
1083 :
1084 10666 : if (lva) /* type == NE_FIT_TYPE */
1085 1 : insert_vmap_area_augment(lva, &va->rb_node,
1086 : &free_vmap_area_root, &free_vmap_area_list);
1087 : }
1088 :
1089 : return 0;
1090 : }
1091 :
1092 : /*
1093 : * Returns the start address of the newly allocated area on success.
1094 : * Otherwise vend is returned, indicating failure.
1095 : */
1096 : static __always_inline unsigned long
1097 10668 : __alloc_vmap_area(unsigned long size, unsigned long align,
1098 : unsigned long vstart, unsigned long vend)
1099 : {
1100 10668 : unsigned long nva_start_addr;
1101 10668 : struct vmap_area *va;
1102 10668 : enum fit_type type;
1103 10668 : int ret;
1104 :
1105 21336 : va = find_vmap_lowest_match(size, align, vstart);
1106 10668 : if (unlikely(!va))
1107 : return vend;
1108 :
1109 10668 : if (va->va_start > vstart)
1110 10667 : nva_start_addr = ALIGN(va->va_start, align);
1111 : else
1112 1 : nva_start_addr = ALIGN(vstart, align);
1113 :
1114 : /* Check the "vend" restriction. */
1115 10668 : if (nva_start_addr + size > vend)
1116 : return vend;
1117 :
1118 : /* Classify what we have found. */
1119 10668 : type = classify_va_fit_type(va, nva_start_addr, size);
1120 10668 : if (WARN_ON_ONCE(type == NOTHING_FIT))
1121 : return vend;
1122 :
1123 : /* Update the free vmap_area. */
1124 10668 : ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
1125 1 : if (ret)
1126 : return vend;
1127 :
1128 : #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
1129 : find_vmap_lowest_match_check(size);
1130 : #endif
1131 :
1132 : return nva_start_addr;
1133 : }
1134 :
1135 : /*
1136 : * Free a region of KVA allocated by alloc_vmap_area
1137 : */
1138 0 : static void free_vmap_area(struct vmap_area *va)
1139 : {
1140 : /*
1141 : * Remove from the busy tree/list.
1142 : */
1143 0 : spin_lock(&vmap_area_lock);
1144 0 : unlink_va(va, &vmap_area_root);
1145 0 : spin_unlock(&vmap_area_lock);
1146 :
1147 : /*
1148 : * Insert/Merge it back to the free tree/list.
1149 : */
1150 0 : spin_lock(&free_vmap_area_lock);
1151 0 : merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
1152 0 : spin_unlock(&free_vmap_area_lock);
1153 0 : }
1154 :
1155 : /*
1156 : * Allocate a region of KVA of the specified size and alignment, within the
1157 : * vstart and vend.
1158 : */
1159 10668 : static struct vmap_area *alloc_vmap_area(unsigned long size,
1160 : unsigned long align,
1161 : unsigned long vstart, unsigned long vend,
1162 : int node, gfp_t gfp_mask)
1163 : {
1164 10668 : struct vmap_area *va, *pva;
1165 10668 : unsigned long addr;
1166 10668 : int purged = 0;
1167 10668 : int ret;
1168 :
1169 10668 : BUG_ON(!size);
1170 10668 : BUG_ON(offset_in_page(size));
1171 21336 : BUG_ON(!is_power_of_2(align));
1172 :
1173 10668 : if (unlikely(!vmap_initialized))
1174 10668 : return ERR_PTR(-EBUSY);
1175 :
1176 10668 : might_sleep();
1177 10668 : gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
1178 :
1179 10668 : va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
1180 10668 : if (unlikely(!va))
1181 10668 : return ERR_PTR(-ENOMEM);
1182 :
1183 : /*
1184 : * Only scan the relevant parts containing pointers to other objects
1185 : * to avoid false negatives.
1186 : */
1187 10668 : kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
1188 :
1189 : retry:
1190 : /*
1191 : * Preload this CPU with one extra vmap_area object. It is used
1192 : * when the fit type of the free area is NE_FIT_TYPE. Please note
1193 : * that it does not guarantee an allocation occurs on a CPU that
1194 : * is preloaded; instead we minimize the case when it is not.
1195 : * That can happen because of CPU migration, since there is a
1196 : * race until the spinlock below is taken.
1197 : *
1198 : * The preload is done in non-atomic context, thus it allows us
1199 : * to use more permissive allocation masks to be more stable under
1200 : * low memory conditions and high memory pressure. In the rare case
1201 : * that we are not preloaded, GFP_NOWAIT is used.
1202 : *
1203 : * Set "pva" to NULL here, because of "retry" path.
1204 : */
1205 10668 : pva = NULL;
1206 :
1207 10668 : if (!this_cpu_read(ne_fit_preload_node))
1208 : /*
1209 : * Even if it fails we do not really care about that.
1210 : * Just proceed as it is. If needed "overflow" path
1211 : * will refill the cache we allocate from.
1212 : */
1213 6 : pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
1214 :
1215 10668 : spin_lock(&free_vmap_area_lock);
1216 :
1217 10668 : if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva))
1218 0 : kmem_cache_free(vmap_area_cachep, pva);
1219 :
1220 : /*
1221 : * If an allocation fails, the "vend" address is
1222 : * returned. Therefore trigger the overflow path.
1223 : */
1224 10668 : addr = __alloc_vmap_area(size, align, vstart, vend);
1225 10668 : spin_unlock(&free_vmap_area_lock);
1226 :
1227 10668 : if (unlikely(addr == vend))
1228 0 : goto overflow;
1229 :
1230 10668 : va->va_start = addr;
1231 10668 : va->va_end = addr + size;
1232 10668 : va->vm = NULL;
1233 :
1234 :
1235 10668 : spin_lock(&vmap_area_lock);
1236 10668 : insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
1237 10668 : spin_unlock(&vmap_area_lock);
1238 :
1239 10668 : BUG_ON(!IS_ALIGNED(va->va_start, align));
1240 10668 : BUG_ON(va->va_start < vstart);
1241 10668 : BUG_ON(va->va_end > vend);
1242 :
1243 10668 : ret = kasan_populate_vmalloc(addr, size);
1244 10668 : if (ret) {
1245 0 : free_vmap_area(va);
1246 0 : return ERR_PTR(ret);
1247 : }
1248 :
1249 : return va;
1250 :
1251 0 : overflow:
1252 0 : if (!purged) {
1253 0 : purge_vmap_area_lazy();
1254 0 : purged = 1;
1255 0 : goto retry;
1256 : }
1257 :
1258 0 : if (gfpflags_allow_blocking(gfp_mask)) {
1259 0 : unsigned long freed = 0;
1260 0 : blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
1261 0 : if (freed > 0) {
1262 0 : purged = 0;
1263 0 : goto retry;
1264 : }
1265 : }
1266 :
1267 0 : if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
1268 0 : pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
1269 : size);
1270 :
1271 0 : kmem_cache_free(vmap_area_cachep, va);
1272 0 : return ERR_PTR(-EBUSY);
1273 : }
1274 :
1275 0 : int register_vmap_purge_notifier(struct notifier_block *nb)
1276 : {
1277 0 : return blocking_notifier_chain_register(&vmap_notify_list, nb);
1278 : }
1279 : EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);
1280 :
1281 0 : int unregister_vmap_purge_notifier(struct notifier_block *nb)
1282 : {
1283 0 : return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
1284 : }
1285 : EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
1286 :
1287 : /*
1288 : * lazy_max_pages is the maximum amount of virtual address space we gather up
1289 : * before attempting to purge with a TLB flush.
1290 : *
1291 : * There is a tradeoff here: a larger number will cover more kernel page tables
1292 : * and take slightly longer to purge, but it will linearly reduce the number of
1293 : * global TLB flushes that must be performed. It would seem natural to scale
1294 : * this number up linearly with the number of CPUs (because vmapping activity
1295 : * could also scale linearly with the number of CPUs), however it is likely
1296 : * that in practice, workloads might be constrained in other ways that mean
1297 : * vmap activity will not scale linearly with CPUs. Also, I want to be
1298 : * conservative and not introduce a big latency on huge systems, so go with
1299 : * a less aggressive log scale. It will still be an improvement over the old
1300 : * code, and it will be simple to change the scale factor if we find that it
1301 : * becomes a problem on bigger systems.
1302 : */
1303 10590 : static unsigned long lazy_max_pages(void)
1304 : {
1305 10590 : unsigned int log;
1306 :
1307 10590 : log = fls(num_online_cpus());
1308 :
1309 10590 : return log * (32UL * 1024 * 1024 / PAGE_SIZE);
1310 : }
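
A worked instance of the formula above, assuming (purely for illustration) 4 KiB pages and 4 online CPUs:

	fls(4)            = 3
	32 MiB / 4 KiB    = 8192 pages
	lazy_max_pages()  = 3 * 8192 = 24576 pages

so roughly 96 MiB of lazily freed virtual address space may accumulate before a purge is attempted.
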
1311 :
1312 : static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
1313 :
1314 : /*
1315 : * Serialize vmap purging. There is no actual critical section protected
1316 : * by this lock, but we want to avoid concurrent calls for performance
1317 : * reasons and to make pcpu_get_vm_areas() more deterministic.
1318 : */
1319 : static DEFINE_MUTEX(vmap_purge_lock);
1320 :
1321 : /* for per-CPU blocks */
1322 : static void purge_fragmented_blocks_allcpus(void);
1323 :
1324 : /*
1325 : * called before a call to iounmap() if the caller wants vm_area_struct's
1326 : * immediately freed.
1327 : */
1328 0 : void set_iounmap_nonlazy(void)
1329 : {
1330 0 : atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1);
1331 0 : }
1332 :
1333 : /*
1334 : * Purges all lazily-freed vmap areas.
1335 : */
1336 21 : static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
1337 : {
1338 21 : unsigned long resched_threshold;
1339 21 : struct list_head local_pure_list;
1340 21 : struct vmap_area *va, *n_va;
1341 :
1342 63 : lockdep_assert_held(&vmap_purge_lock);
1343 :
1344 21 : spin_lock(&purge_vmap_area_lock);
1345 21 : purge_vmap_area_root = RB_ROOT;
1346 21 : list_replace_init(&purge_vmap_area_list, &local_pure_list);
1347 21 : spin_unlock(&purge_vmap_area_lock);
1348 :
1349 21 : if (unlikely(list_empty(&local_pure_list)))
1350 : return false;
1351 :
1352 5 : start = min(start,
1353 : list_first_entry(&local_pure_list,
1354 : struct vmap_area, list)->va_start);
1355 :
1356 5 : end = max(end,
1357 : list_last_entry(&local_pure_list,
1358 : struct vmap_area, list)->va_end);
1359 :
1360 5 : flush_tlb_kernel_range(start, end);
1361 5 : resched_threshold = lazy_max_pages() << 1;
1362 :
1363 5 : spin_lock(&free_vmap_area_lock);
1364 14 : list_for_each_entry_safe(va, n_va, &local_pure_list, list) {
1365 9 : unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
1366 9 : unsigned long orig_start = va->va_start;
1367 9 : unsigned long orig_end = va->va_end;
1368 :
1369 : /*
1370 : * Finally insert or merge lazily-freed area. It is
1371 : * detached and there is no need to "unlink" it from
1372 : * anything.
1373 : */
1374 9 : va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root,
1375 : &free_vmap_area_list);
1376 :
1377 9 : if (!va)
1378 0 : continue;
1379 :
1380 9 : if (is_vmalloc_or_module_addr((void *)orig_start))
1381 9 : kasan_release_vmalloc(orig_start, orig_end,
1382 : va->va_start, va->va_end);
1383 :
1384 9 : atomic_long_sub(nr, &vmap_lazy_nr);
1385 :
1386 9 : if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
1387 9 : cond_resched_lock(&free_vmap_area_lock);
1388 : }
1389 5 : spin_unlock(&free_vmap_area_lock);
1390 5 : return true;
1391 : }
1392 :
1393 : /*
1394 : * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
1395 : * is already purging.
1396 : */
1397 0 : static void try_purge_vmap_area_lazy(void)
1398 : {
1399 0 : if (mutex_trylock(&vmap_purge_lock)) {
1400 0 : __purge_vmap_area_lazy(ULONG_MAX, 0);
1401 0 : mutex_unlock(&vmap_purge_lock);
1402 : }
1403 0 : }
1404 :
1405 : /*
1406 : * Kick off a purge of the outstanding lazy areas.
1407 : */
1408 0 : static void purge_vmap_area_lazy(void)
1409 : {
1410 0 : mutex_lock(&vmap_purge_lock);
1411 0 : purge_fragmented_blocks_allcpus();
1412 0 : __purge_vmap_area_lazy(ULONG_MAX, 0);
1413 0 : mutex_unlock(&vmap_purge_lock);
1414 0 : }
1415 :
1416 : /*
1417 : * Free a vmap area, caller ensuring that the area has been unmapped
1418 : * and flush_cache_vunmap has been called for the correct range
1419 : * previously.
1420 : */
1421 10585 : static void free_vmap_area_noflush(struct vmap_area *va)
1422 : {
1423 10585 : unsigned long nr_lazy;
1424 :
1425 10585 : spin_lock(&vmap_area_lock);
1426 10585 : unlink_va(va, &vmap_area_root);
1427 10585 : spin_unlock(&vmap_area_lock);
1428 :
1429 10585 : nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
1430 : PAGE_SHIFT, &vmap_lazy_nr);
1431 :
1432 : /*
1433 : * Merge or place it to the purge tree/list.
1434 : */
1435 10585 : spin_lock(&purge_vmap_area_lock);
1436 10585 : merge_or_add_vmap_area(va,
1437 : &purge_vmap_area_root, &purge_vmap_area_list);
1438 10585 : spin_unlock(&purge_vmap_area_lock);
1439 :
1440 : /* After this point, we may free va at any time */
1441 10585 : if (unlikely(nr_lazy > lazy_max_pages()))
1442 0 : try_purge_vmap_area_lazy();
1443 10585 : }
1444 :
1445 : /*
1446 : * Free and unmap a vmap area
1447 : */
1448 10585 : static void free_unmap_vmap_area(struct vmap_area *va)
1449 : {
1450 10585 : flush_cache_vunmap(va->va_start, va->va_end);
1451 10585 : unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start);
1452 10585 : if (debug_pagealloc_enabled_static())
1453 : flush_tlb_kernel_range(va->va_start, va->va_end);
1454 :
1455 10585 : free_vmap_area_noflush(va);
1456 10585 : }
1457 :
1458 10592 : static struct vmap_area *find_vmap_area(unsigned long addr)
1459 : {
1460 10592 : struct vmap_area *va;
1461 :
1462 10592 : spin_lock(&vmap_area_lock);
1463 10592 : va = __find_vmap_area(addr);
1464 10592 : spin_unlock(&vmap_area_lock);
1465 :
1466 10592 : return va;
1467 : }
1468 :
1469 : /*** Per cpu kva allocator ***/
1470 :
1471 : /*
1472 : * vmap space is limited especially on 32-bit architectures. Ensure there is
1473 : * room for at least 16 percpu vmap blocks per CPU.
1474 : */
1475 : /*
1476 : * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
1477 : * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess
1478 : * instead (we just need a rough idea)
1479 : */
1480 : #if BITS_PER_LONG == 32
1481 : #define VMALLOC_SPACE (128UL*1024*1024)
1482 : #else
1483 : #define VMALLOC_SPACE (128UL*1024*1024*1024)
1484 : #endif
1485 :
1486 : #define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE)
1487 : #define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */
1488 : #define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */
1489 : #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2)
1490 : #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */
1491 : #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */
1492 : #define VMAP_BBMAP_BITS \
1493 : VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
1494 : VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
1495 : VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
1496 :
1497 : #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
1498 :
1499 : struct vmap_block_queue {
1500 : spinlock_t lock;
1501 : struct list_head free;
1502 : };
1503 :
1504 : struct vmap_block {
1505 : spinlock_t lock;
1506 : struct vmap_area *va;
1507 : unsigned long free, dirty;
1508 : unsigned long dirty_min, dirty_max; /*< dirty range */
1509 : struct list_head free_list;
1510 : struct rcu_head rcu_head;
1511 : struct list_head purge;
1512 : };
1513 :
1514 : /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
1515 : static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
1516 :
1517 : /*
1518 : * XArray of vmap blocks, indexed by address, to quickly find a vmap block
1519 : * in the free path. Could get rid of this if we change the API to return a
1520 : * "cookie" from alloc, to be passed to free. But no big deal yet.
1521 : */
1522 : static DEFINE_XARRAY(vmap_blocks);
1523 :
1524 : /*
1525 : * We should probably have a fallback mechanism to allocate virtual memory
1526 : * out of partially filled vmap blocks. However vmap block sizing should be
1527 : * fairly reasonable according to the vmalloc size, so it shouldn't be a
1528 : * big problem.
1529 : */
1530 :
1531 0 : static unsigned long addr_to_vb_idx(unsigned long addr)
1532 : {
1533 0 : addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
1534 0 : addr /= VMAP_BLOCK_SIZE;
1535 0 : return addr;
1536 : }
1537 :
1538 0 : static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
1539 : {
1540 0 : unsigned long addr;
1541 :
1542 0 : addr = va_start + (pages_off << PAGE_SHIFT);
1543 0 : BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
1544 0 : return (void *)addr;
1545 : }
1546 :
1547 : /**
1548 : * new_vmap_block - allocate a new vmap_block and occupy 2^order pages in this
1549 : * block. Of course the number of pages can't exceed VMAP_BBMAP_BITS
1550 : * @order: how many 2^order pages should be occupied in newly allocated block
1551 : * @gfp_mask: flags for the page level allocator
1552 : *
1553 : * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
1554 : */
1555 0 : static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
1556 : {
1557 0 : struct vmap_block_queue *vbq;
1558 0 : struct vmap_block *vb;
1559 0 : struct vmap_area *va;
1560 0 : unsigned long vb_idx;
1561 0 : int node, err;
1562 0 : void *vaddr;
1563 :
1564 0 : node = numa_node_id();
1565 :
1566 0 : vb = kmalloc_node(sizeof(struct vmap_block),
1567 : gfp_mask & GFP_RECLAIM_MASK, node);
1568 0 : if (unlikely(!vb))
1569 0 : return ERR_PTR(-ENOMEM);
1570 :
1571 0 : va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
1572 : VMALLOC_START, VMALLOC_END,
1573 : node, gfp_mask);
1574 0 : if (IS_ERR(va)) {
1575 0 : kfree(vb);
1576 0 : return ERR_CAST(va);
1577 : }
1578 :
1579 0 : vaddr = vmap_block_vaddr(va->va_start, 0);
1580 0 : spin_lock_init(&vb->lock);
1581 0 : vb->va = va;
1582 : /* At least something should be left free */
1583 0 : BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
1584 0 : vb->free = VMAP_BBMAP_BITS - (1UL << order);
1585 0 : vb->dirty = 0;
1586 0 : vb->dirty_min = VMAP_BBMAP_BITS;
1587 0 : vb->dirty_max = 0;
1588 0 : INIT_LIST_HEAD(&vb->free_list);
1589 :
1590 0 : vb_idx = addr_to_vb_idx(va->va_start);
1591 0 : err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask);
1592 0 : if (err) {
1593 0 : kfree(vb);
1594 0 : free_vmap_area(va);
1595 0 : return ERR_PTR(err);
1596 : }
1597 :
1598 0 : vbq = &get_cpu_var(vmap_block_queue);
1599 0 : spin_lock(&vbq->lock);
1600 0 : list_add_tail_rcu(&vb->free_list, &vbq->free);
1601 0 : spin_unlock(&vbq->lock);
1602 0 : put_cpu_var(vmap_block_queue);
1603 :
1604 0 : return vaddr;
1605 : }
1606 :
1607 0 : static void free_vmap_block(struct vmap_block *vb)
1608 : {
1609 0 : struct vmap_block *tmp;
1610 :
1611 0 : tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
1612 0 : BUG_ON(tmp != vb);
1613 :
1614 0 : free_vmap_area_noflush(vb->va);
1615 0 : kfree_rcu(vb, rcu_head);
1616 0 : }
1617 :
1618 84 : static void purge_fragmented_blocks(int cpu)
1619 : {
1620 84 : LIST_HEAD(purge);
1621 84 : struct vmap_block *vb;
1622 84 : struct vmap_block *n_vb;
1623 84 : struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
1624 :
1625 84 : rcu_read_lock();
1626 84 : list_for_each_entry_rcu(vb, &vbq->free, free_list) {
1627 :
1628 0 : if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
1629 0 : continue;
1630 :
1631 0 : spin_lock(&vb->lock);
1632 0 : if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
1633 0 : vb->free = 0; /* prevent further allocs after releasing lock */
1634 0 : vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
1635 0 : vb->dirty_min = 0;
1636 0 : vb->dirty_max = VMAP_BBMAP_BITS;
1637 0 : spin_lock(&vbq->lock);
1638 0 : list_del_rcu(&vb->free_list);
1639 0 : spin_unlock(&vbq->lock);
1640 0 : spin_unlock(&vb->lock);
1641 0 : list_add_tail(&vb->purge, &purge);
1642 : } else
1643 0 : spin_unlock(&vb->lock);
1644 : }
1645 84 : rcu_read_unlock();
1646 :
1647 84 : list_for_each_entry_safe(vb, n_vb, &purge, purge) {
1648 0 : list_del(&vb->purge);
1649 0 : free_vmap_block(vb);
1650 : }
1651 84 : }
1652 :
1653 21 : static void purge_fragmented_blocks_allcpus(void)
1654 : {
1655 21 : int cpu;
1656 :
1657 105 : for_each_possible_cpu(cpu)
1658 84 : purge_fragmented_blocks(cpu);
1659 21 : }
1660 :
1661 0 : static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
1662 : {
1663 0 : struct vmap_block_queue *vbq;
1664 0 : struct vmap_block *vb;
1665 0 : void *vaddr = NULL;
1666 0 : unsigned int order;
1667 :
1668 0 : BUG_ON(offset_in_page(size));
1669 0 : BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
1670 0 : if (WARN_ON(size == 0)) {
1671 : /*
1672 : * Allocating 0 bytes isn't what the caller wants since
1673 : * get_order(0) returns a funny result. Just warn and terminate
1674 : * early.
1675 : */
1676 : return NULL;
1677 : }
1678 0 : order = get_order(size);
1679 :
1680 0 : rcu_read_lock();
1681 0 : vbq = &get_cpu_var(vmap_block_queue);
1682 0 : list_for_each_entry_rcu(vb, &vbq->free, free_list) {
1683 0 : unsigned long pages_off;
1684 :
1685 0 : spin_lock(&vb->lock);
1686 0 : if (vb->free < (1UL << order)) {
1687 0 : spin_unlock(&vb->lock);
1688 0 : continue;
1689 : }
1690 :
1691 0 : pages_off = VMAP_BBMAP_BITS - vb->free;
1692 0 : vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
1693 0 : vb->free -= 1UL << order;
1694 0 : if (vb->free == 0) {
1695 0 : spin_lock(&vbq->lock);
1696 0 : list_del_rcu(&vb->free_list);
1697 0 : spin_unlock(&vbq->lock);
1698 : }
1699 :
1700 0 : spin_unlock(&vb->lock);
1701 : break;
1702 : }
1703 :
1704 0 : put_cpu_var(vmap_block_queue);
1705 0 : rcu_read_unlock();
1706 :
1707 : /* Allocate new block if nothing was found */
1708 0 : if (!vaddr)
1709 0 : vaddr = new_vmap_block(order, gfp_mask);
1710 :
1711 : return vaddr;
1712 : }
1713 :
1714 0 : static void vb_free(unsigned long addr, unsigned long size)
1715 : {
1716 0 : unsigned long offset;
1717 0 : unsigned int order;
1718 0 : struct vmap_block *vb;
1719 :
1720 0 : BUG_ON(offset_in_page(size));
1721 0 : BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
1722 :
1723 0 : flush_cache_vunmap(addr, addr + size);
1724 :
1725 0 : order = get_order(size);
1726 0 : offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
1727 0 : vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));
1728 :
1729 0 : unmap_kernel_range_noflush(addr, size);
1730 :
1731 0 : if (debug_pagealloc_enabled_static())
1732 : flush_tlb_kernel_range(addr, addr + size);
1733 :
1734 0 : spin_lock(&vb->lock);
1735 :
1736 : /* Expand dirty range */
1737 0 : vb->dirty_min = min(vb->dirty_min, offset);
1738 0 : vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
1739 :
1740 0 : vb->dirty += 1UL << order;
1741 0 : if (vb->dirty == VMAP_BBMAP_BITS) {
1742 0 : BUG_ON(vb->free);
1743 0 : spin_unlock(&vb->lock);
1744 0 : free_vmap_block(vb);
1745 : } else
1746 0 : spin_unlock(&vb->lock);
1747 0 : }
1748 :
1749 21 : static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
1750 : {
1751 21 : int cpu;
1752 :
1753 21 : if (unlikely(!vmap_initialized))
1754 : return;
1755 :
1756 21 : might_sleep();
1757 :
1758 126 : for_each_possible_cpu(cpu) {
1759 84 : struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
1760 84 : struct vmap_block *vb;
1761 :
1762 84 : rcu_read_lock();
1763 84 : list_for_each_entry_rcu(vb, &vbq->free, free_list) {
1764 0 : spin_lock(&vb->lock);
1765 0 : if (vb->dirty) {
1766 0 : unsigned long va_start = vb->va->va_start;
1767 0 : unsigned long s, e;
1768 :
1769 0 : s = va_start + (vb->dirty_min << PAGE_SHIFT);
1770 0 : e = va_start + (vb->dirty_max << PAGE_SHIFT);
1771 :
1772 0 : start = min(s, start);
1773 0 : end = max(e, end);
1774 :
1775 0 : flush = 1;
1776 : }
1777 0 : spin_unlock(&vb->lock);
1778 : }
1779 84 : rcu_read_unlock();
1780 : }
1781 :
1782 21 : mutex_lock(&vmap_purge_lock);
1783 21 : purge_fragmented_blocks_allcpus();
1784 21 : if (!__purge_vmap_area_lazy(start, end) && flush)
1785 0 : flush_tlb_kernel_range(start, end);
1786 21 : mutex_unlock(&vmap_purge_lock);
1787 : }
1788 :
1789 : /**
1790 : * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
1791 : *
1792 : * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
1793 : * to amortize TLB flushing overheads. What this means is that any page you
1794 : * have now may, in a former life, have been mapped into the kernel virtual
1795 : * address space by the vmap layer, so some CPUs may still hold TLB entries
1796 : * referencing that page (in addition to the regular 1:1 kernel mapping).
1797 : *
1798 : * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
1799 : * be sure that none of the pages we have control over will have any aliases
1800 : * from the vmap layer.
1801 : */
1802 21 : void vm_unmap_aliases(void)
1803 : {
1804 21 : unsigned long start = ULONG_MAX, end = 0;
1805 21 : int flush = 0;
1806 :
1807 21 : _vm_unmap_aliases(start, end, flush);
1808 21 : }
1809 : EXPORT_SYMBOL_GPL(vm_unmap_aliases);
1810 :
1811 : /**
1812 : * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
1813 : * @mem: the pointer returned by vm_map_ram
1814 : * @count: the count passed to that vm_map_ram call (cannot unmap partial)
1815 : */
1816 0 : void vm_unmap_ram(const void *mem, unsigned int count)
1817 : {
1818 0 : unsigned long size = (unsigned long)count << PAGE_SHIFT;
1819 0 : unsigned long addr = (unsigned long)mem;
1820 0 : struct vmap_area *va;
1821 :
1822 0 : might_sleep();
1823 0 : BUG_ON(!addr);
1824 0 : BUG_ON(addr < VMALLOC_START);
1825 0 : BUG_ON(addr > VMALLOC_END);
1826 0 : BUG_ON(!PAGE_ALIGNED(addr));
1827 :
1828 0 : kasan_poison_vmalloc(mem, size);
1829 :
1830 0 : if (likely(count <= VMAP_MAX_ALLOC)) {
1831 0 : debug_check_no_locks_freed(mem, size);
1832 0 : vb_free(addr, size);
1833 0 : return;
1834 : }
1835 :
1836 0 : va = find_vmap_area(addr);
1837 0 : BUG_ON(!va);
1838 0 : debug_check_no_locks_freed((void *)va->va_start,
1839 0 : (va->va_end - va->va_start));
1840 0 : free_unmap_vmap_area(va);
1841 : }
1842 : EXPORT_SYMBOL(vm_unmap_ram);
1843 :
1844 : /**
1845 : * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
1846 : * @pages: an array of pointers to the pages to be mapped
1847 : * @count: number of pages
1848 : * @node: prefer to allocate data structures on this node
1849 : *
1850 : * If you use this function for less than VMAP_MAX_ALLOC pages, it could be
1851 : * faster than vmap so it's good. But if you mix long-life and short-life
1852 : * objects with vm_map_ram(), it could consume lots of address space through
1853 : * fragmentation (especially on a 32bit machine). You could see failures in
1854 : * the end. Please use this function for short-lived objects.
1855 : *
1856 : * Returns: a pointer to the address that has been mapped, or %NULL on failure
1857 : */
1858 0 : void *vm_map_ram(struct page **pages, unsigned int count, int node)
1859 : {
1860 0 : unsigned long size = (unsigned long)count << PAGE_SHIFT;
1861 0 : unsigned long addr;
1862 0 : void *mem;
1863 :
1864 0 : if (likely(count <= VMAP_MAX_ALLOC)) {
1865 0 : mem = vb_alloc(size, GFP_KERNEL);
1866 0 : if (IS_ERR(mem))
1867 : return NULL;
1868 : addr = (unsigned long)mem;
1869 : } else {
1870 0 : struct vmap_area *va;
1871 0 : va = alloc_vmap_area(size, PAGE_SIZE,
1872 : VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
1873 0 : if (IS_ERR(va))
1874 : return NULL;
1875 :
1876 0 : addr = va->va_start;
1877 0 : mem = (void *)addr;
1878 : }
1879 :
1880 0 : kasan_unpoison_vmalloc(mem, size);
1881 :
1882 0 : if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) {
1883 0 : vm_unmap_ram(mem, count);
1884 0 : return NULL;
1885 : }
1886 : return mem;
1887 : }
1888 : EXPORT_SYMBOL(vm_map_ram);
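
A minimal usage sketch of the vm_map_ram()/vm_unmap_ram() pairing described above, assuming a kernel-module context; the page count and the demo_* names are illustrative, not part of this file:

	#include <linux/errno.h>
	#include <linux/gfp.h>
	#include <linux/mm.h>
	#include <linux/numa.h>
	#include <linux/string.h>
	#include <linux/vmalloc.h>

	#define DEMO_NR_PAGES 4	/* illustrative; small enough for the fast per-cpu block path */

	static int demo_vm_map_ram(void)
	{
		struct page *pages[DEMO_NR_PAGES];
		void *vaddr = NULL;
		int i;

		for (i = 0; i < DEMO_NR_PAGES; i++) {
			pages[i] = alloc_page(GFP_KERNEL);
			if (!pages[i])
				goto out_free;
		}

		/* Map the pages into a short-lived, virtually contiguous window. */
		vaddr = vm_map_ram(pages, DEMO_NR_PAGES, NUMA_NO_NODE);
		if (!vaddr)
			goto out_free;

		memset(vaddr, 0, DEMO_NR_PAGES * PAGE_SIZE);	/* use the mapping */

		/* The count must match the one given to vm_map_ram(). */
		vm_unmap_ram(vaddr, DEMO_NR_PAGES);

	out_free:
		while (i--)
			__free_page(pages[i]);	/* unmapping does not free the pages */
		return vaddr ? 0 : -ENOMEM;
	}
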
1889 :
1890 : static struct vm_struct *vmlist __initdata;
1891 :
1892 : /**
1893 : * vm_area_add_early - add vmap area early during boot
1894 : * @vm: vm_struct to add
1895 : *
1896 : * This function is used to add fixed kernel vm area to vmlist before
1897 : * vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags
1898 : * should contain proper values and the other fields should be zero.
1899 : *
1900 : * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
1901 : */
1902 0 : void __init vm_area_add_early(struct vm_struct *vm)
1903 : {
1904 0 : struct vm_struct *tmp, **p;
1905 :
1906 0 : BUG_ON(vmap_initialized);
1907 0 : for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1908 0 : if (tmp->addr >= vm->addr) {
1909 0 : BUG_ON(tmp->addr < vm->addr + vm->size);
1910 : break;
1911 : } else
1912 0 : BUG_ON(tmp->addr + tmp->size > vm->addr);
1913 : }
1914 0 : vm->next = *p;
1915 0 : *p = vm;
1916 0 : }
1917 :
1918 : /**
1919 : * vm_area_register_early - register vmap area early during boot
1920 : * @vm: vm_struct to register
1921 : * @align: requested alignment
1922 : *
1923 : * This function is used to register kernel vm area before
1924 : * vmalloc_init() is called. @vm->size and @vm->flags should contain
1925 : * proper values on entry and other fields should be zero. On return,
1926 : * vm->addr contains the allocated address.
1927 : *
1928 : * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
1929 : */
1930 0 : void __init vm_area_register_early(struct vm_struct *vm, size_t align)
1931 : {
1932 0 : static size_t vm_init_off __initdata;
1933 0 : unsigned long addr;
1934 :
1935 0 : addr = ALIGN(VMALLOC_START + vm_init_off, align);
1936 0 : vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
1937 :
1938 0 : vm->addr = (void *)addr;
1939 :
1940 0 : vm_area_add_early(vm);
1941 0 : }
1942 :
1943 1 : static void vmap_init_free_space(void)
1944 : {
1945 1 : unsigned long vmap_start = 1;
1946 1 : const unsigned long vmap_end = ULONG_MAX;
1947 1 : struct vmap_area *busy, *free;
1948 :
1949 : /*
1950 : * B F B B B F
1951 : * -|-----|.....|-----|-----|-----|.....|-
1952 : * | The KVA space |
1953 : * |<--------------------------------->|
1954 : */
1955 1 : list_for_each_entry(busy, &vmap_area_list, list) {
1956 0 : if (busy->va_start - vmap_start > 0) {
1957 0 : free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
1958 0 : if (!WARN_ON_ONCE(!free)) {
1959 0 : free->va_start = vmap_start;
1960 0 : free->va_end = busy->va_start;
1961 :
1962 0 : insert_vmap_area_augment(free, NULL,
1963 : &free_vmap_area_root,
1964 : &free_vmap_area_list);
1965 : }
1966 : }
1967 :
1968 0 : vmap_start = busy->va_end;
1969 : }
1970 :
1971 1 : if (vmap_end - vmap_start > 0) {
1972 1 : free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
1973 1 : if (!WARN_ON_ONCE(!free)) {
1974 1 : free->va_start = vmap_start;
1975 1 : free->va_end = vmap_end;
1976 :
1977 1 : insert_vmap_area_augment(free, NULL,
1978 : &free_vmap_area_root,
1979 : &free_vmap_area_list);
1980 : }
1981 : }
1982 1 : }
1983 :
1984 1 : void __init vmalloc_init(void)
1985 : {
1986 1 : struct vmap_area *va;
1987 1 : struct vm_struct *tmp;
1988 1 : int i;
1989 :
1990 : /*
1991 : * Create the cache for vmap_area objects.
1992 : */
1993 1 : vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
1994 :
1995 6 : for_each_possible_cpu(i) {
1996 4 : struct vmap_block_queue *vbq;
1997 4 : struct vfree_deferred *p;
1998 :
1999 4 : vbq = &per_cpu(vmap_block_queue, i);
2000 4 : spin_lock_init(&vbq->lock);
2001 4 : INIT_LIST_HEAD(&vbq->free);
2002 4 : p = &per_cpu(vfree_deferred, i);
2003 4 : init_llist_head(&p->list);
2004 5 : INIT_WORK(&p->wq, free_work);
2005 : }
2006 :
2007 : /* Import existing vmlist entries. */
2008 1 : for (tmp = vmlist; tmp; tmp = tmp->next) {
2009 0 : va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
2010 0 : if (WARN_ON_ONCE(!va))
2011 0 : continue;
2012 :
2013 0 : va->va_start = (unsigned long)tmp->addr;
2014 0 : va->va_end = va->va_start + tmp->size;
2015 0 : va->vm = tmp;
2016 0 : insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
2017 : }
2018 :
2019 : /*
2020 : * Now we can initialize a free vmap space.
2021 : */
2022 1 : vmap_init_free_space();
2023 1 : vmap_initialized = true;
2024 1 : }
2025 :
2026 : /**
2027 : * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
2028 : * @addr: start of the VM area to unmap
2029 : * @size: size of the VM area to unmap
2030 : *
2031 : * Similar to unmap_kernel_range_noflush() but flushes vcache before
2032 : * the unmapping and tlb after.
2033 : */
2034 0 : void unmap_kernel_range(unsigned long addr, unsigned long size)
2035 : {
2036 0 : unsigned long end = addr + size;
2037 :
2038 0 : flush_cache_vunmap(addr, end);
2039 0 : unmap_kernel_range_noflush(addr, size);
2040 0 : flush_tlb_kernel_range(addr, end);
2041 0 : }
2042 :
2043 10645 : static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
2044 : struct vmap_area *va, unsigned long flags, const void *caller)
2045 : {
2046 10645 : vm->flags = flags;
2047 10645 : vm->addr = (void *)va->va_start;
2048 10645 : vm->size = va->va_end - va->va_start;
2049 10645 : vm->caller = caller;
2050 10645 : va->vm = vm;
2051 : }
2052 :
2053 10644 : static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
2054 : unsigned long flags, const void *caller)
2055 : {
2056 10644 : spin_lock(&vmap_area_lock);
2057 10644 : setup_vmalloc_vm_locked(vm, va, flags, caller);
2058 10644 : spin_unlock(&vmap_area_lock);
2059 10644 : }
2060 :
2061 10625 : static void clear_vm_uninitialized_flag(struct vm_struct *vm)
2062 : {
2063 : /*
2064 : * Before removing VM_UNINITIALIZED,
2065 : * we should make sure that vm has proper values.
2066 : * Pair with smp_rmb() in show_numa_info().
2067 : */
2068 10625 : smp_wmb();
2069 10625 : vm->flags &= ~VM_UNINITIALIZED;
2070 : }
2071 :
2072 10643 : static struct vm_struct *__get_vm_area_node(unsigned long size,
2073 : unsigned long align, unsigned long flags, unsigned long start,
2074 : unsigned long end, int node, gfp_t gfp_mask, const void *caller)
2075 : {
2076 10643 : struct vmap_area *va;
2077 10643 : struct vm_struct *area;
2078 10643 : unsigned long requested_size = size;
2079 :
2080 10643 : BUG_ON(in_interrupt());
2081 10643 : size = PAGE_ALIGN(size);
2082 10643 : if (unlikely(!size))
2083 : return NULL;
2084 :
2085 10643 : if (flags & VM_IOREMAP)
2086 4 : align = 1ul << clamp_t(int, get_count_order_long(size),
2087 : PAGE_SHIFT, IOREMAP_MAX_ORDER);
2088 :
2089 10643 : area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
2090 10643 : if (unlikely(!area))
2091 : return NULL;
2092 :
2093 10643 : if (!(flags & VM_NO_GUARD))
2094 10643 : size += PAGE_SIZE;
2095 :
2096 10643 : va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
2097 10643 : if (IS_ERR(va)) {
2098 0 : kfree(area);
2099 0 : return NULL;
2100 : }
2101 :
2102 10643 : kasan_unpoison_vmalloc((void *)va->va_start, requested_size);
2103 :
2104 10643 : setup_vmalloc_vm(area, va, flags, caller);
2105 :
2106 10643 : return area;
2107 : }
2108 :
2109 0 : struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
2110 : unsigned long start, unsigned long end,
2111 : const void *caller)
2112 : {
2113 0 : return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
2114 : GFP_KERNEL, caller);
2115 : }
2116 :
2117 : /**
2118 : * get_vm_area - reserve a contiguous kernel virtual area
2119 : * @size: size of the area
2120 : * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
2121 : *
2122 : * Search for an area of @size in the kernel virtual mapping area,
2123 : * and reserve it for our purposes. Returns the area descriptor
2124 : * on success or %NULL on failure.
2125 : *
2126 : * Return: the area descriptor on success or %NULL on failure.
2127 : */
2128 0 : struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
2129 : {
2130 0 : return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
2131 : NUMA_NO_NODE, GFP_KERNEL,
2132 0 : __builtin_return_address(0));
2133 : }
2134 :
2135 2 : struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
2136 : const void *caller)
2137 : {
2138 2 : return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
2139 : NUMA_NO_NODE, GFP_KERNEL, caller);
2140 : }
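
A hedged sketch of what a get_vm_area() caller does: reserve a hole in vmalloc space without backing it with pages (roughly the ioremap pattern) and hand it back with free_vm_area(). The SZ_64K size and the demo_* name are illustrative assumptions:

	#include <linux/sizes.h>
	#include <linux/vmalloc.h>

	static void demo_reserve_kva(void)
	{
		struct vm_struct *area;

		area = get_vm_area(SZ_64K, VM_IOREMAP);
		if (!area)
			return;

		/*
		 * area->addr .. area->addr + area->size is now reserved kernel
		 * virtual address space (including the trailing guard page); a
		 * real user would install page-table entries before touching it.
		 */

		free_vm_area(area);	/* give the range back */
	}
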
2141 :
2142 : /**
2143 : * find_vm_area - find a contiguous kernel virtual area
2144 : * @addr: base address
2145 : *
2146 : * Search for the kernel VM area starting at @addr, and return it.
2147 : * It is up to the caller to do all required locking to keep the returned
2148 : * pointer valid.
2149 : *
2150 : * Return: the area descriptor on success or %NULL on failure.
2151 : */
2152 10592 : struct vm_struct *find_vm_area(const void *addr)
2153 : {
2154 10592 : struct vmap_area *va;
2155 :
2156 7 : va = find_vmap_area((unsigned long)addr);
2157 10592 : if (!va)
2158 : return NULL;
2159 :
2160 10592 : return va->vm;
2161 : }
2162 :
2163 : /**
2164 : * remove_vm_area - find and remove a contiguous kernel virtual area
2165 : * @addr: base address
2166 : *
2167 : * Search for the kernel VM area starting at @addr, and remove it.
2168 : * This function returns the found VM area, but using it is NOT safe
2169 : * on SMP machines, except for its size or flags.
2170 : *
2171 : * Return: the area descriptor on success or %NULL on failure.
2172 : */
2173 10585 : struct vm_struct *remove_vm_area(const void *addr)
2174 : {
2175 10585 : struct vmap_area *va;
2176 :
2177 10585 : might_sleep();
2178 :
2179 10585 : spin_lock(&vmap_area_lock);
2180 10585 : va = __find_vmap_area((unsigned long)addr);
2181 10585 : if (va && va->vm) {
2182 10585 : struct vm_struct *vm = va->vm;
2183 :
2184 10585 : va->vm = NULL;
2185 10585 : spin_unlock(&vmap_area_lock);
2186 :
2187 10585 : kasan_free_shadow(vm);
2188 10585 : free_unmap_vmap_area(va);
2189 :
2190 10585 : return vm;
2191 : }
2192 :
2193 0 : spin_unlock(&vmap_area_lock);
2194 0 : return NULL;
2195 : }
2196 :
2197 0 : static inline void set_area_direct_map(const struct vm_struct *area,
2198 : int (*set_direct_map)(struct page *page))
2199 : {
2200 0 : int i;
2201 :
2202 0 : for (i = 0; i < area->nr_pages; i++)
2203 0 : if (page_address(area->pages[i]))
2204 0 : set_direct_map(area->pages[i]);
2205 0 : }
2206 :
2207 : /* Handle removing and resetting vm mappings related to the vm_struct. */
2208 10585 : static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
2209 : {
2210 10585 : unsigned long start = ULONG_MAX, end = 0;
2211 10585 : int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
2212 10585 : int flush_dmap = 0;
2213 10585 : int i;
2214 :
2215 10585 : remove_vm_area(area->addr);
2216 :
2217 : /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */
2218 10585 : if (!flush_reset)
2219 : return;
2220 :
2221 : /*
2222 : * If not deallocating pages, just do the flush of the VM area and
2223 : * return.
2224 : */
2225 0 : if (!deallocate_pages) {
2226 0 : vm_unmap_aliases();
2227 0 : return;
2228 : }
2229 :
2230 : /*
2231 : * If execution gets here, flush the vm mapping and reset the direct
2232 : * map. Find the start and end range of the direct mappings to make sure
2233 : * the vm_unmap_aliases() flush includes the direct map.
2234 : */
2235 0 : for (i = 0; i < area->nr_pages; i++) {
2236 0 : unsigned long addr = (unsigned long)page_address(area->pages[i]);
2237 0 : if (addr) {
2238 0 : start = min(addr, start);
2239 0 : end = max(addr + PAGE_SIZE, end);
2240 0 : flush_dmap = 1;
2241 : }
2242 : }
2243 :
2244 : /*
2245 : * Set direct map to something invalid so that it won't be cached if
2246 : * there are any accesses after the TLB flush, then flush the TLB and
2247 : * reset the direct map permissions to the default.
2248 : */
2249 0 : set_area_direct_map(area, set_direct_map_invalid_noflush);
2250 0 : _vm_unmap_aliases(start, end, flush_dmap);
2251 0 : set_area_direct_map(area, set_direct_map_default_noflush);
2252 : }
2253 :
2254 10585 : static void __vunmap(const void *addr, int deallocate_pages)
2255 : {
2256 10585 : struct vm_struct *area;
2257 :
2258 10585 : if (!addr)
2259 : return;
2260 :
2261 10585 : if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
2262 : addr))
2263 : return;
2264 :
2265 10585 : area = find_vm_area(addr);
2266 10585 : if (unlikely(!area)) {
2267 0 : WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
2268 : addr);
2269 0 : return;
2270 : }
2271 :
2272 21170 : debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
2273 21170 : debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
2274 :
2275 21170 : kasan_poison_vmalloc(area->addr, get_vm_area_size(area));
2276 :
2277 10585 : vm_remove_mappings(area, deallocate_pages);
2278 :
2279 10585 : if (deallocate_pages) {
2280 : int i;
2281 :
2282 21655 : for (i = 0; i < area->nr_pages; i++) {
2283 11070 : struct page *page = area->pages[i];
2284 :
2285 11070 : BUG_ON(!page);
2286 11070 : __free_pages(page, 0);
2287 : }
2288 10585 : atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
2289 :
2290 10585 : kvfree(area->pages);
2291 : }
2292 :
2293 10585 : kfree(area);
2294 : }
2295 :
2296 0 : static inline void __vfree_deferred(const void *addr)
2297 : {
2298 : /*
2299 : * Use raw_cpu_ptr() because this can be called from preemptible
2300 : * context. Preemption is absolutely fine here, because the llist_add()
2301 : * implementation is lockless, so it works even if we are adding to
2302 : * another cpu's list. schedule_work() should be fine with this too.
2303 : */
2304 0 : struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
2305 :
2306 0 : if (llist_add((struct llist_node *)addr, &p->list))
2307 0 : schedule_work(&p->wq);
2308 0 : }
2309 :
2310 : /**
2311 : * vfree_atomic - release memory allocated by vmalloc()
2312 : * @addr: memory base address
2313 : *
2314 : * This one is just like vfree() but can be called in any atomic context
2315 : * except NMIs.
2316 : */
2317 0 : void vfree_atomic(const void *addr)
2318 : {
2319 0 : BUG_ON(in_nmi());
2320 :
2321 0 : kmemleak_free(addr);
2322 :
2323 0 : if (!addr)
2324 : return;
2325 0 : __vfree_deferred(addr);
2326 : }
2327 :
2328 10585 : static void __vfree(const void *addr)
2329 : {
2330 10585 : if (unlikely(in_interrupt()))
2331 0 : __vfree_deferred(addr);
2332 : else
2333 10585 : __vunmap(addr, 1);
2334 10585 : }
2335 :
2336 : /**
2337 : * vfree - Release memory allocated by vmalloc()
2338 : * @addr: Memory base address
2339 : *
2340 : * Free the virtually contiguous memory area starting at @addr, as obtained
2341 : * from one of the vmalloc() family of APIs. This will usually also free the
2342 : * physical memory underlying the virtual allocation, but that memory is
2343 : * reference counted, so it will not be freed until the last user goes away.
2344 : *
2345 : * If @addr is NULL, no operation is performed.
2346 : *
2347 : * Context:
2348 : * May sleep if called *not* from interrupt context.
2349 : * Must not be called in NMI context (strictly speaking, it could be
2350 : * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
2351 : * conventions for vfree() arch-depenedent would be a really bad idea).
2352 : */
2353 10585 : void vfree(const void *addr)
2354 : {
2355 10585 : BUG_ON(in_nmi());
2356 :
2357 10585 : kmemleak_free(addr);
2358 :
2359 10585 : might_sleep_if(!in_interrupt());
2360 :
2361 10585 : if (!addr)
2362 : return;
2363 :
2364 10585 : __vfree(addr);
2365 : }
2366 : EXPORT_SYMBOL(vfree);
2367 :
2368 : /**
2369 : * vunmap - release virtual mapping obtained by vmap()
2370 : * @addr: memory base address
2371 : *
2372 : * Free the virtually contiguous memory area starting at @addr,
2373 : * which was created from the page array passed to vmap().
2374 : *
2375 : * Must not be called in interrupt context.
2376 : */
2377 0 : void vunmap(const void *addr)
2378 : {
2379 0 : BUG_ON(in_interrupt());
2380 0 : might_sleep();
2381 0 : if (addr)
2382 0 : __vunmap(addr, 0);
2383 0 : }
2384 : EXPORT_SYMBOL(vunmap);
2385 :
2386 : /**
2387 : * vmap - map an array of pages into virtually contiguous space
2388 : * @pages: array of page pointers
2389 : * @count: number of pages to map
2390 : * @flags: vm_area->flags
2391 : * @prot: page protection for the mapping
2392 : *
2393 : * Maps @count pages from @pages into contiguous kernel virtual space.
2394 : * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself
2395 : * (which must be kmalloc or vmalloc memory) and one reference per page in it
2396 : * are transferred from the caller to vmap(), and will be freed / dropped when
2397 : * vfree() is called on the return value.
2398 : *
2399 : * Return: the address of the area or %NULL on failure
2400 : */
2401 0 : void *vmap(struct page **pages, unsigned int count,
2402 : unsigned long flags, pgprot_t prot)
2403 : {
2404 0 : struct vm_struct *area;
2405 0 : unsigned long size; /* In bytes */
2406 :
2407 0 : might_sleep();
2408 :
2409 0 : if (count > totalram_pages())
2410 : return NULL;
2411 :
2412 0 : size = (unsigned long)count << PAGE_SHIFT;
2413 0 : area = get_vm_area_caller(size, flags, __builtin_return_address(0));
2414 0 : if (!area)
2415 : return NULL;
2416 :
2417 0 : if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot),
2418 : pages) < 0) {
2419 0 : vunmap(area->addr);
2420 0 : return NULL;
2421 : }
2422 :
2423 0 : if (flags & VM_MAP_PUT_PAGES) {
2424 0 : area->pages = pages;
2425 0 : area->nr_pages = count;
2426 : }
2427 0 : return area->addr;
2428 : }
2429 : EXPORT_SYMBOL(vmap);
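
A minimal sketch of vmap() as described above: stitch pages that need not be physically contiguous into one virtually contiguous mapping. The demo_* name is illustrative; the pages are assumed to be owned by the caller:

	#include <linux/mm.h>
	#include <linux/vmalloc.h>

	static void *demo_vmap(struct page **pages, unsigned int count)
	{
		/*
		 * VM_MAP marks the area as a vmap() mapping; the pages remain
		 * owned by the caller unless VM_MAP_PUT_PAGES is used instead.
		 */
		return vmap(pages, count, VM_MAP, PAGE_KERNEL);
	}

The mapping is released with vunmap() and the pages are then freed by the caller; with VM_MAP_PUT_PAGES the return value is instead handed to vfree(), which also drops the page references.
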
2430 :
2431 : #ifdef CONFIG_VMAP_PFN
2432 : struct vmap_pfn_data {
2433 : unsigned long *pfns;
2434 : pgprot_t prot;
2435 : unsigned int idx;
2436 : };
2437 :
2438 : static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
2439 : {
2440 : struct vmap_pfn_data *data = private;
2441 :
2442 : if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx])))
2443 : return -EINVAL;
2444 : *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot));
2445 : return 0;
2446 : }
2447 :
2448 : /**
2449 : * vmap_pfn - map an array of PFNs into virtually contiguous space
2450 : * @pfns: array of PFNs
2451 : * @count: number of pages to map
2452 : * @prot: page protection for the mapping
2453 : *
2454 : * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
2455 : * the start address of the mapping.
2456 : */
2457 : void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
2458 : {
2459 : struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
2460 : struct vm_struct *area;
2461 :
2462 : area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP,
2463 : __builtin_return_address(0));
2464 : if (!area)
2465 : return NULL;
2466 : if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
2467 : count * PAGE_SIZE, vmap_pfn_apply, &data)) {
2468 : free_vm_area(area);
2469 : return NULL;
2470 : }
2471 : return area->addr;
2472 : }
2473 : EXPORT_SYMBOL_GPL(vmap_pfn);
2474 : #endif /* CONFIG_VMAP_PFN */
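
A sketch only, assuming CONFIG_VMAP_PFN and PFNs that are not backed by struct page (e.g. device memory); pgprot_noncached() is used here as a typical, but not mandatory, choice and the demo_* name is illustrative:

	#ifdef CONFIG_VMAP_PFN
	#include <linux/mm.h>
	#include <linux/vmalloc.h>

	static void *demo_map_pfns(unsigned long *pfns, unsigned int count)
	{
		/* Non-cached access is assumed to suit device memory here. */
		return vmap_pfn(pfns, count, pgprot_noncached(PAGE_KERNEL));
	}
	#endif

Such a mapping is typically torn down again with vunmap().
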
2475 :
2476 10626 : static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
2477 : pgprot_t prot, int node)
2478 : {
2479 10626 : const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
2480 10626 : unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
2481 10626 : unsigned long array_size;
2482 10626 : unsigned int i;
2483 10626 : struct page **pages;
2484 :
2485 10626 : array_size = (unsigned long)nr_pages * sizeof(struct page *);
2486 10626 : gfp_mask |= __GFP_NOWARN;
2487 10626 : if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
2488 10626 : gfp_mask |= __GFP_HIGHMEM;
2489 :
2490 : /* Please note that the recursion is strictly bounded. */
2491 10626 : if (array_size > PAGE_SIZE) {
2492 0 : pages = __vmalloc_node(array_size, 1, nested_gfp, node,
2493 : area->caller);
2494 : } else {
2495 10626 : pages = kmalloc_node(array_size, nested_gfp, node);
2496 : }
2497 :
2498 10626 : if (!pages) {
2499 0 : free_vm_area(area);
2500 0 : return NULL;
2501 : }
2502 :
2503 10626 : area->pages = pages;
2504 10626 : area->nr_pages = nr_pages;
2505 :
2506 21748 : for (i = 0; i < area->nr_pages; i++) {
2507 11122 : struct page *page;
2508 :
2509 11122 : if (node == NUMA_NO_NODE)
2510 11122 : page = alloc_page(gfp_mask);
2511 : else
2512 0 : page = alloc_pages_node(node, gfp_mask, 0);
2513 :
2514 11122 : if (unlikely(!page)) {
2515 : /* Successfully allocated i pages, free them in __vfree() */
2516 0 : area->nr_pages = i;
2517 0 : atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
2518 0 : goto fail;
2519 : }
2520 11122 : area->pages[i] = page;
2521 11122 : if (gfpflags_allow_blocking(gfp_mask))
2522 11122 : cond_resched();
2523 : }
2524 10626 : atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
2525 :
2526 21252 : if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area),
2527 : prot, pages) < 0)
2528 0 : goto fail;
2529 :
2530 10626 : return area->addr;
2531 :
2532 0 : fail:
2533 0 : warn_alloc(gfp_mask, NULL,
2534 : "vmalloc: allocation failure, allocated %ld of %ld bytes",
2535 0 : (area->nr_pages*PAGE_SIZE), area->size);
2536 0 : __vfree(area->addr);
2537 0 : return NULL;
2538 : }
2539 :
2540 : /**
2541 : * __vmalloc_node_range - allocate virtually contiguous memory
2542 : * @size: allocation size
2543 : * @align: desired alignment
2544 : * @start: vm area range start
2545 : * @end: vm area range end
2546 : * @gfp_mask: flags for the page level allocator
2547 : * @prot: protection mask for the allocated pages
2548 : * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
2549 : * @node: node to use for allocation or NUMA_NO_NODE
2550 : * @caller: caller's return address
2551 : *
2552 : * Allocate enough pages to cover @size from the page level
2553 : * allocator with @gfp_mask flags. Map them into contiguous
2554 : * kernel virtual space, using a pagetable protection of @prot.
2555 : *
2556 : * Return: the address of the area or %NULL on failure
2557 : */
2558 10625 : void *__vmalloc_node_range(unsigned long size, unsigned long align,
2559 : unsigned long start, unsigned long end, gfp_t gfp_mask,
2560 : pgprot_t prot, unsigned long vm_flags, int node,
2561 : const void *caller)
2562 : {
2563 10625 : struct vm_struct *area;
2564 10625 : void *addr;
2565 10625 : unsigned long real_size = size;
2566 :
2567 10625 : size = PAGE_ALIGN(size);
2568 10625 : if (!size || (size >> PAGE_SHIFT) > totalram_pages())
2569 0 : goto fail;
2570 :
2571 10625 : area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED |
2572 : vm_flags, start, end, node, gfp_mask, caller);
2573 10625 : if (!area)
2574 0 : goto fail;
2575 :
2576 10625 : addr = __vmalloc_area_node(area, gfp_mask, prot, node);
2577 10625 : if (!addr)
2578 : return NULL;
2579 :
2580 : /*
2581 : * The newly allocated vm_struct carries the VM_UNINITIALIZED flag,
2582 : * which means it is not yet fully initialized. At this point it is,
2583 : * so remove the flag here.
2584 : */
2585 10625 : clear_vm_uninitialized_flag(area);
2586 :
2587 10625 : kmemleak_vmalloc(area, size, gfp_mask);
2588 :
2589 10625 : return addr;
2590 :
2591 0 : fail:
2592 0 : warn_alloc(gfp_mask, NULL,
2593 : "vmalloc: allocation failure: %lu bytes", real_size);
2594 0 : return NULL;
2595 : }
2596 :
2597 : /**
2598 : * __vmalloc_node - allocate virtually contiguous memory
2599 : * @size: allocation size
2600 : * @align: desired alignment
2601 : * @gfp_mask: flags for the page level allocator
2602 : * @node: node to use for allocation or NUMA_NO_NODE
2603 : * @caller: caller's return address
2604 : *
2605 : * Allocate enough pages to cover @size from the page level allocator with
2606 : * @gfp_mask flags. Map them into contiguous kernel virtual space.
2607 : *
2608 : * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
2609 : * and __GFP_NOFAIL are not supported
2610 : *
2611 : * Any use of gfp flags outside of GFP_KERNEL should be consulted
2612 : * with mm people.
2613 : *
2614 : * Return: pointer to the allocated memory or %NULL on error
2615 : */
2616 10624 : void *__vmalloc_node(unsigned long size, unsigned long align,
2617 : gfp_t gfp_mask, int node, const void *caller)
2618 : {
2619 21248 : return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
2620 10624 : gfp_mask, PAGE_KERNEL, 0, node, caller);
2621 : }
2622 : /*
2623 : * This is only for performance analysis and stress testing of vmalloc.
2624 : * It is required by the vmalloc test module; do not use it for anything
2625 : * else.
2626 : */
2627 : #ifdef CONFIG_TEST_VMALLOC_MODULE
2628 : EXPORT_SYMBOL_GPL(__vmalloc_node);
2629 : #endif
2630 :
2631 9 : void *__vmalloc(unsigned long size, gfp_t gfp_mask)
2632 : {
2633 9 : return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE,
2634 9 : __builtin_return_address(0));
2635 : }
2636 : EXPORT_SYMBOL(__vmalloc);
2637 :
2638 : /**
2639 : * vmalloc - allocate virtually contiguous memory
2640 : * @size: allocation size
2641 : *
2642 : * Allocate enough pages to cover @size from the page level
2643 : * allocator and map them into contiguous kernel virtual space.
2644 : *
2645 : * For tight control over page level allocator and protection flags
2646 : * use __vmalloc() instead.
2647 : *
2648 : * Return: pointer to the allocated memory or %NULL on error
2649 : */
2650 10448 : void *vmalloc(unsigned long size)
2651 : {
2652 10448 : return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE,
2653 10448 : __builtin_return_address(0));
2654 : }
2655 : EXPORT_SYMBOL(vmalloc);
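
The canonical pairing, as a sketch with an arbitrary size and an illustrative demo_* name: a large scratch buffer that must be virtually, but not physically, contiguous:

	#include <linux/errno.h>
	#include <linux/mm.h>
	#include <linux/types.h>
	#include <linux/vmalloc.h>

	static int demo_vmalloc(void)
	{
		u32 *buf = vmalloc(1024 * PAGE_SIZE);	/* arbitrary example size */

		if (!buf)
			return -ENOMEM;

		buf[0] = 0xdeadbeef;	/* accessed like any other kernel pointer */

		vfree(buf);	/* may sleep when not called from interrupt context */
		return 0;
	}
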
2656 :
2657 : /**
2658 : * vzalloc - allocate virtually contiguous memory with zero fill
2659 : * @size: allocation size
2660 : *
2661 : * Allocate enough pages to cover @size from the page level
2662 : * allocator and map them into contiguous kernel virtual space.
2663 : * The memory allocated is set to zero.
2664 : *
2665 : * For tight control over page level allocator and protection flags
2666 : * use __vmalloc() instead.
2667 : *
2668 : * Return: pointer to the allocated memory or %NULL on error
2669 : */
2670 165 : void *vzalloc(unsigned long size)
2671 : {
2672 165 : return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
2673 165 : __builtin_return_address(0));
2674 : }
2675 : EXPORT_SYMBOL(vzalloc);
2676 :
2677 : /**
2678 : * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
2679 : * @size: allocation size
2680 : *
2681 : * The resulting memory area is zeroed so it can be mapped to userspace
2682 : * without leaking data.
2683 : *
2684 : * Return: pointer to the allocated memory or %NULL on error
2685 : */
2686 0 : void *vmalloc_user(unsigned long size)
2687 : {
2688 0 : return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
2689 0 : GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
2690 : VM_USERMAP, NUMA_NO_NODE,
2691 0 : __builtin_return_address(0));
2692 : }
2693 : EXPORT_SYMBOL(vmalloc_user);
2694 :
2695 : /**
2696 : * vmalloc_node - allocate memory on a specific node
2697 : * @size: allocation size
2698 : * @node: numa node
2699 : *
2700 : * Allocate enough pages to cover @size from the page level
2701 : * allocator and map them into contiguous kernel virtual space.
2702 : *
2703 : * For tight control over page level allocator and protection flags
2704 : * use __vmalloc() instead.
2705 : *
2706 : * Return: pointer to the allocated memory or %NULL on error
2707 : */
2708 0 : void *vmalloc_node(unsigned long size, int node)
2709 : {
2710 0 : return __vmalloc_node(size, 1, GFP_KERNEL, node,
2711 0 : __builtin_return_address(0));
2712 : }
2713 : EXPORT_SYMBOL(vmalloc_node);
2714 :
2715 : /**
2716 : * vzalloc_node - allocate memory on a specific node with zero fill
2717 : * @size: allocation size
2718 : * @node: numa node
2719 : *
2720 : * Allocate enough pages to cover @size from the page level
2721 : * allocator and map them into contiguous kernel virtual space.
2722 : * The memory allocated is set to zero.
2723 : *
2724 : * Return: pointer to the allocated memory or %NULL on error
2725 : */
2726 0 : void *vzalloc_node(unsigned long size, int node)
2727 : {
2728 0 : return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node,
2729 0 : __builtin_return_address(0));
2730 : }
2731 : EXPORT_SYMBOL(vzalloc_node);
2732 :
2733 : #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
2734 : #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
2735 : #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
2736 : #define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
2737 : #else
2738 : /*
2739 : * 64b systems should always have either DMA or DMA32 zones. For others
2740 : * GFP_DMA32 should do the right thing and use the normal zone.
2741 : */
2742 : #define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL
2743 : #endif
2744 :
2745 : /**
2746 : * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
2747 : * @size: allocation size
2748 : *
2749 : * Allocate enough 32bit PA addressable pages to cover @size from the
2750 : * page level allocator and map them into contiguous kernel virtual space.
2751 : *
2752 : * Return: pointer to the allocated memory or %NULL on error
2753 : */
2754 0 : void *vmalloc_32(unsigned long size)
2755 : {
2756 0 : return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
2757 0 : __builtin_return_address(0));
2758 : }
2759 : EXPORT_SYMBOL(vmalloc_32);
2760 :
2761 : /**
2762 : * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
2763 : * @size: allocation size
2764 : *
2765 : * The resulting memory area is 32bit addressable and zeroed so it can be
2766 : * mapped to userspace without leaking data.
2767 : *
2768 : * Return: pointer to the allocated memory or %NULL on error
2769 : */
2770 0 : void *vmalloc_32_user(unsigned long size)
2771 : {
2772 0 : return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
2773 0 : GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
2774 : VM_USERMAP, NUMA_NO_NODE,
2775 0 : __builtin_return_address(0));
2776 : }
2777 : EXPORT_SYMBOL(vmalloc_32_user);
2778 :
2779 : /*
2780 : * Small helper routine: copy contents from addr to buf.
2781 : * If the page is not present, fill with zeroes.
2782 : */
2783 :
2784 0 : static int aligned_vread(char *buf, char *addr, unsigned long count)
2785 : {
2786 0 : struct page *p;
2787 0 : int copied = 0;
2788 :
2789 0 : while (count) {
2790 0 : unsigned long offset, length;
2791 :
2792 0 : offset = offset_in_page(addr);
2793 0 : length = PAGE_SIZE - offset;
2794 0 : if (length > count)
2795 : length = count;
2796 0 : p = vmalloc_to_page(addr);
2797 : /*
2798 : * To access this _mapped_ area safely we would need a
2799 : * lock, but taking one here would add overhead to the
2800 : * vmalloc()/vfree() calls for this rarely used _debug_
2801 : * interface. Instead, we use kmap() and accept a small
2802 : * overhead in this access function.
2803 : */
2804 0 : if (p) {
2805 : /*
2806 : * we can expect USER0 is not used (see vread/vwrite's
2807 : * function description)
2808 : */
2809 0 : void *map = kmap_atomic(p);
2810 0 : memcpy(buf, map + offset, length);
2811 0 : kunmap_atomic(map);
2812 : } else
2813 0 : memset(buf, 0, length);
2814 :
2815 0 : addr += length;
2816 0 : buf += length;
2817 0 : copied += length;
2818 0 : count -= length;
2819 : }
2820 0 : return copied;
2821 : }
2822 :
2823 0 : static int aligned_vwrite(char *buf, char *addr, unsigned long count)
2824 : {
2825 0 : struct page *p;
2826 0 : int copied = 0;
2827 :
2828 0 : while (count) {
2829 0 : unsigned long offset, length;
2830 :
2831 0 : offset = offset_in_page(addr);
2832 0 : length = PAGE_SIZE - offset;
2833 0 : if (length > count)
2834 : length = count;
2835 0 : p = vmalloc_to_page(addr);
2836 : /*
2837 : * To access this _mapped_ area safely we would need a
2838 : * lock, but taking one here would add overhead to the
2839 : * vmalloc()/vfree() calls for this rarely used _debug_
2840 : * interface. Instead, we use kmap() and accept a small
2841 : * overhead in this access function.
2842 : */
2843 0 : if (p) {
2844 : /*
2845 : * we can expect USER0 is not used (see vread/vwrite's
2846 : * function description)
2847 : */
2848 0 : void *map = kmap_atomic(p);
2849 0 : memcpy(map + offset, buf, length);
2850 0 : kunmap_atomic(map);
2851 : }
2852 0 : addr += length;
2853 0 : buf += length;
2854 0 : copied += length;
2855 0 : count -= length;
2856 : }
2857 0 : return copied;
2858 : }
2859 :
2860 : /**
2861 : * vread() - read vmalloc area in a safe way.
2862 : * @buf: buffer for reading data
2863 : * @addr: vm address.
2864 : * @count: number of bytes to be read.
2865 : *
2866 : * This function checks that addr is a valid vmalloc'ed area, and
2867 : * copy data from that area to a given buffer. If the given memory range
2868 : * of [addr...addr+count) includes some valid address, data is copied to
2869 : * proper area of @buf. If there are memory holes, they'll be zero-filled.
2870 : * IOREMAP area is treated as memory hole and no copy is done.
2871 : *
2872 : * If [addr...addr+count) doesn't includes any intersects with alive
2873 : * vm_struct area, returns 0. @buf should be kernel's buffer.
2874 : *
2875 : * Note: In usual ops, vread() is never necessary because the caller
2876 : * should know vmalloc() area is valid and can use memcpy().
2877 : * This is for routines which have to access vmalloc area without
2878 : * any information, as /dev/kmem.
2879 : *
2880 : * Return: number of bytes for which addr and buf should be increased
2881 : * (same number as @count) or %0 if [addr...addr+count) doesn't
2882 : * include any intersection with valid vmalloc area
2883 : */
2884 0 : long vread(char *buf, char *addr, unsigned long count)
2885 : {
2886 0 : struct vmap_area *va;
2887 0 : struct vm_struct *vm;
2888 0 : char *vaddr, *buf_start = buf;
2889 0 : unsigned long buflen = count;
2890 0 : unsigned long n;
2891 :
2892 : /* Don't allow overflow */
2893 0 : if ((unsigned long) addr + count < count)
2894 0 : count = -(unsigned long) addr;
2895 :
2896 0 : spin_lock(&vmap_area_lock);
2897 0 : list_for_each_entry(va, &vmap_area_list, list) {
2898 0 : if (!count)
2899 : break;
2900 :
2901 0 : if (!va->vm)
2902 0 : continue;
2903 :
2904 0 : vm = va->vm;
2905 0 : vaddr = (char *) vm->addr;
2906 0 : if (addr >= vaddr + get_vm_area_size(vm))
2907 0 : continue;
2908 0 : while (addr < vaddr) {
2909 0 : if (count == 0)
2910 0 : goto finished;
2911 0 : *buf = '\0';
2912 0 : buf++;
2913 0 : addr++;
2914 0 : count--;
2915 : }
2916 0 : n = vaddr + get_vm_area_size(vm) - addr;
2917 0 : if (n > count)
2918 : n = count;
2919 0 : if (!(vm->flags & VM_IOREMAP))
2920 0 : aligned_vread(buf, addr, n);
2921 : else /* IOREMAP area is treated as memory hole */
2922 0 : memset(buf, 0, n);
2923 0 : buf += n;
2924 0 : addr += n;
2925 0 : count -= n;
2926 : }
2927 0 : finished:
2928 0 : spin_unlock(&vmap_area_lock);
2929 :
2930 0 : if (buf == buf_start)
2931 : return 0;
2932 : /* zero-fill memory holes */
2933 0 : if (buf != buf_start + buflen)
2934 0 : memset(buf, 0, buflen - (buf - buf_start));
2935 :
2936 0 : return buflen;
2937 : }
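
A sketch of the debug-style use described in the comment above; region is assumed to point into vmalloc space and kbuf to a PAGE_SIZE kernel buffer, both hypothetical:

	#include <linux/mm.h>
	#include <linux/vmalloc.h>

	static long demo_vread(char *region, char *kbuf)
	{
		/* Holes in the range come back zero-filled; 0 means no live area. */
		return vread(kbuf, region, PAGE_SIZE);
	}
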
2938 :
2939 : /**
2940 : * vwrite() - write vmalloc area in a safe way.
2941 : * @buf: buffer for source data
2942 : * @addr: vm address.
2943 : * @count: number of bytes to be written.
2944 : *
2945 : * This function checks that addr is a valid vmalloc'ed area and
2946 : * copies data from the buffer to the given addr. If the specified range
2947 : * [addr...addr+count) includes some valid address, data is copied from
2948 : * the proper area of @buf. Memory holes are skipped (nothing is written
2949 : * to them), and an IOREMAP area is treated as a memory hole as well.
2950 : *
2951 : * If [addr...addr+count) does not intersect any live vm_struct area,
2952 : * 0 is returned. @buf should be a kernel buffer.
2953 : *
2954 : * Note: In normal operation vwrite() is never necessary, because the caller
2955 : * should know the vmalloc() area is valid and can use memcpy().
2956 : * This is for routines which have to access a vmalloc area without
2957 : * any prior information, such as /dev/kmem.
2958 : *
2959 : * Return: number of bytes for which addr and buf should be
2960 : * increased (same number as @count) or %0 if [addr...addr+count)
2961 : * doesn't include any intersection with valid vmalloc area
2962 : */
2963 0 : long vwrite(char *buf, char *addr, unsigned long count)
2964 : {
2965 0 : struct vmap_area *va;
2966 0 : struct vm_struct *vm;
2967 0 : char *vaddr;
2968 0 : unsigned long n, buflen;
2969 0 : int copied = 0;
2970 :
2971 : /* Don't allow overflow */
2972 0 : if ((unsigned long) addr + count < count)
2973 0 : count = -(unsigned long) addr;
2974 0 : buflen = count;
2975 :
2976 0 : spin_lock(&vmap_area_lock);
2977 0 : list_for_each_entry(va, &vmap_area_list, list) {
2978 0 : if (!count)
2979 : break;
2980 :
2981 0 : if (!va->vm)
2982 0 : continue;
2983 :
2984 0 : vm = va->vm;
2985 0 : vaddr = (char *) vm->addr;
2986 0 : if (addr >= vaddr + get_vm_area_size(vm))
2987 0 : continue;
2988 0 : while (addr < vaddr) {
2989 0 : if (count == 0)
2990 0 : goto finished;
2991 0 : buf++;
2992 0 : addr++;
2993 0 : count--;
2994 : }
2995 0 : n = vaddr + get_vm_area_size(vm) - addr;
2996 0 : if (n > count)
2997 : n = count;
2998 0 : if (!(vm->flags & VM_IOREMAP)) {
2999 0 : aligned_vwrite(buf, addr, n);
3000 0 : copied++;
3001 : }
3002 0 : buf += n;
3003 0 : addr += n;
3004 0 : count -= n;
3005 : }
3006 0 : finished:
3007 0 : spin_unlock(&vmap_area_lock);
3008 0 : if (!copied)
3009 : return 0;
3010 0 : return buflen;
3011 : }
3012 :
3013 : /**
3014 : * remap_vmalloc_range_partial - map vmalloc pages to userspace
3015 : * @vma: vma to cover
3016 : * @uaddr: target user address to start at
3017 : * @kaddr: virtual address of vmalloc kernel memory
3018 : * @pgoff: offset from @kaddr to start at
3019 : * @size: size of map area
3020 : *
3021 : * Returns: 0 for success, -Exxx on failure
3022 : *
3023 : * This function checks that @kaddr is a valid vmalloc'ed area,
3024 : * and that it is big enough to cover the range starting at
3025 : * @uaddr in @vma. It returns failure if those criteria aren't
3026 : * met.
3027 : *
3028 : * Similar to remap_pfn_range() (see mm/memory.c)
3029 : */
3030 0 : int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
3031 : void *kaddr, unsigned long pgoff,
3032 : unsigned long size)
3033 : {
3034 0 : struct vm_struct *area;
3035 0 : unsigned long off;
3036 0 : unsigned long end_index;
3037 :
3038 0 : if (check_shl_overflow(pgoff, PAGE_SHIFT, &off))
3039 : return -EINVAL;
3040 :
3041 0 : size = PAGE_ALIGN(size);
3042 :
3043 0 : if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
3044 : return -EINVAL;
3045 :
3046 0 : area = find_vm_area(kaddr);
3047 0 : if (!area)
3048 : return -EINVAL;
3049 :
3050 0 : if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT)))
3051 : return -EINVAL;
3052 :
3053 0 : if (check_add_overflow(size, off, &end_index) ||
3054 0 : end_index > get_vm_area_size(area))
3055 : return -EINVAL;
3056 0 : kaddr += off;
3057 :
3058 0 : do {
3059 0 : struct page *page = vmalloc_to_page(kaddr);
3060 0 : int ret;
3061 :
3062 0 : ret = vm_insert_page(vma, uaddr, page);
3063 0 : if (ret)
3064 0 : return ret;
3065 :
3066 0 : uaddr += PAGE_SIZE;
3067 0 : kaddr += PAGE_SIZE;
3068 0 : size -= PAGE_SIZE;
3069 0 : } while (size > 0);
3070 :
3071 0 : vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
3072 :
3073 0 : return 0;
3074 : }
3075 : EXPORT_SYMBOL(remap_vmalloc_range_partial);
3076 :
3077 : /**
3078 : * remap_vmalloc_range - map vmalloc pages to userspace
3079 : * @vma: vma to cover (map full range of vma)
3080 : * @addr: vmalloc memory
3081 : * @pgoff: number of pages into addr before first page to map
3082 : *
3083 : * Returns: 0 for success, -Exxx on failure
3084 : *
3085 : * This function checks that addr is a valid vmalloc'ed area, and
3086 : * that it is big enough to cover the vma. It returns failure if
3087 : * those criteria aren't met.
3088 : *
3089 : * Similar to remap_pfn_range() (see mm/memory.c)
3090 : */
3091 0 : int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
3092 : unsigned long pgoff)
3093 : {
3094 0 : return remap_vmalloc_range_partial(vma, vma->vm_start,
3095 : addr, pgoff,
3096 0 : vma->vm_end - vma->vm_start);
3097 : }
3098 : EXPORT_SYMBOL(remap_vmalloc_range);
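
A hedged sketch of the usual pairing: a buffer allocated with vmalloc_user() (which sets VM_USERMAP) exposed to userspace from a driver's ->mmap() handler. demo_buf and demo_mmap are hypothetical names; demo_buf is assumed to have been set up earlier with vmalloc_user():

	#include <linux/fs.h>
	#include <linux/mm.h>
	#include <linux/vmalloc.h>

	static void *demo_buf;	/* assumed: demo_buf = vmalloc_user(size); */

	static int demo_mmap(struct file *file, struct vm_area_struct *vma)
	{
		/* pgoff 0: map the whole vma over the start of the buffer. */
		return remap_vmalloc_range(vma, demo_buf, 0);
	}
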
3099 :
3100 0 : void free_vm_area(struct vm_struct *area)
3101 : {
3102 0 : struct vm_struct *ret;
3103 0 : ret = remove_vm_area(area->addr);
3104 0 : BUG_ON(ret != area);
3105 0 : kfree(area);
3106 0 : }
3107 : EXPORT_SYMBOL_GPL(free_vm_area);
3108 :
3109 : #ifdef CONFIG_SMP
3110 0 : static struct vmap_area *node_to_va(struct rb_node *n)
3111 : {
3112 0 : return rb_entry_safe(n, struct vmap_area, rb_node);
3113 : }
3114 :
3115 : /**
3116 : * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to
3117 : * @addr: target address
3118 : *
3119 : * Returns: the vmap_area if it is found. If there is no such area,
3120 : * the next lower (in reverse order) vmap_area is returned, i.e. one
3121 : * with va->va_start < addr && va->va_end < addr, or NULL
3122 : * if there are no areas before @addr.
3123 : */
3124 : static struct vmap_area *
3125 2 : pvm_find_va_enclose_addr(unsigned long addr)
3126 : {
3127 2 : struct vmap_area *va, *tmp;
3128 2 : struct rb_node *n;
3129 :
3130 2 : n = free_vmap_area_root.rb_node;
3131 2 : va = NULL;
3132 :
3133 2 : while (n) {
3134 2 : tmp = rb_entry(n, struct vmap_area, rb_node);
3135 2 : if (tmp->va_start <= addr) {
3136 2 : va = tmp;
3137 2 : if (tmp->va_end >= addr)
3138 : break;
3139 :
3140 0 : n = n->rb_right;
3141 : } else {
3142 0 : n = n->rb_left;
3143 : }
3144 : }
3145 :
3146 2 : return va;
3147 : }
3148 :
3149 : /**
3150 : * pvm_determine_end_from_reverse - find the highest aligned address
3151 : * of a free block below VMALLOC_END
3152 : * @va:
3153 : * in - the VA we start the search from (reverse order);
3154 : * out - the VA with the highest aligned end address.
3155 : * @align: alignment for required highest address
3156 : *
3157 : * Returns: determined end address within vmap_area
3158 : */
3159 : static unsigned long
3160 1 : pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
3161 : {
3162 1 : unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
3163 1 : unsigned long addr;
3164 :
3165 1 : if (likely(*va)) {
3166 1 : list_for_each_entry_from_reverse((*va),
3167 : &free_vmap_area_list, list) {
3168 1 : addr = min((*va)->va_end & ~(align - 1), vmalloc_end);
3169 1 : if ((*va)->va_start < addr)
3170 1 : return addr;
3171 : }
3172 : }
3173 :
3174 : return 0;
3175 : }
3176 :
3177 : /**
3178 : * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
3179 : * @offsets: array containing offset of each area
3180 : * @sizes: array containing size of each area
3181 : * @nr_vms: the number of areas to allocate
3182 : * @align: alignment, all entries in @offsets and @sizes must be aligned to this
3183 : *
3184 : * Returns: kmalloc'd vm_struct pointer array pointing to allocated
3185 : * vm_structs on success, %NULL on failure
3186 : *
3187 : * The percpu allocator wants to use congruent vm areas so that it can
3188 : * maintain the offsets among percpu areas. This function allocates
3189 : * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
3190 : * be scattered pretty far apart, with the distance between two areas
3191 : * easily reaching gigabytes. To avoid interacting with regular vmallocs,
3192 : * these areas are allocated from the top.
3193 : *
3194 : * Despite its complicated look, this allocator is rather simple. It
3195 : * does everything top-down and scans free blocks from the end looking
3196 : * for a matching base. While scanning, if any of the areas does not fit,
3197 : * the base address is pulled down to fit that area. Scanning is repeated
3198 : * until all the areas fit, and then all necessary data structures are
3199 : * inserted and the result is returned.
3200 : */
3201 1 : struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
3202 : const size_t *sizes, int nr_vms,
3203 : size_t align)
3204 : {
3205 1 : const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
3206 1 : const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
3207 1 : struct vmap_area **vas, *va;
3208 1 : struct vm_struct **vms;
3209 1 : int area, area2, last_area, term_area;
3210 1 : unsigned long base, start, size, end, last_end, orig_start, orig_end;
3211 1 : bool purged = false;
3212 1 : enum fit_type type;
3213 :
3214 : /* verify parameters and allocate data structures */
3215 2 : BUG_ON(offset_in_page(align) || !is_power_of_2(align));
3216 2 : for (last_area = 0, area = 0; area < nr_vms; area++) {
3217 1 : start = offsets[area];
3218 1 : end = start + sizes[area];
3219 :
3220 : /* is everything aligned properly? */
3221 1 : BUG_ON(!IS_ALIGNED(offsets[area], align));
3222 1 : BUG_ON(!IS_ALIGNED(sizes[area], align));
3223 :
3224 : /* detect the area with the highest address */
3225 1 : if (start > offsets[last_area])
3226 0 : last_area = area;
3227 :
3228 1 : for (area2 = area + 1; area2 < nr_vms; area2++) {
3229 0 : unsigned long start2 = offsets[area2];
3230 0 : unsigned long end2 = start2 + sizes[area2];
3231 :
3232 0 : BUG_ON(start2 < end && start < end2);
3233 : }
3234 : }
3235 1 : last_end = offsets[last_area] + sizes[last_area];
3236 :
3237 1 : if (vmalloc_end - vmalloc_start < last_end) {
3238 0 : WARN_ON(true);
3239 0 : return NULL;
3240 : }
3241 :
3242 1 : vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
3243 1 : vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
3244 1 : if (!vas || !vms)
3245 0 : goto err_free2;
3246 :
3247 2 : for (area = 0; area < nr_vms; area++) {
3248 1 : vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
3249 1 : vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
3250 1 : if (!vas[area] || !vms[area])
3251 0 : goto err_free;
3252 : }
3253 1 : retry:
3254 1 : spin_lock(&free_vmap_area_lock);
3255 :
3256 : /* start scanning - we scan from the top, begin with the last area */
3257 1 : area = term_area = last_area;
3258 1 : start = offsets[area];
3259 1 : end = start + sizes[area];
3260 :
3261 1 : va = pvm_find_va_enclose_addr(vmalloc_end);
3262 1 : base = pvm_determine_end_from_reverse(&va, align) - end;
3263 :
3264 1 : while (true) {
3265 : /*
3266 : * base might have underflowed, add last_end before
3267 : * comparing.
3268 : */
3269 1 : if (base + last_end < vmalloc_start + last_end)
3270 0 : goto overflow;
3271 :
3272 : /*
3273 : * Fitting base has not been found.
3274 : */
3275 1 : if (va == NULL)
3276 0 : goto overflow;
3277 :
3278 : /*
3279 : * If required width exceeds current VA block, move
3280 : * base downwards and then recheck.
3281 : */
3282 1 : if (base + end > va->va_end) {
3283 0 : base = pvm_determine_end_from_reverse(&va, align) - end;
3284 0 : term_area = area;
3285 0 : continue;
3286 : }
3287 :
3288 : /*
3289 : * If this VA does not fit, move base downwards and recheck.
3290 : */
3291 1 : if (base + start < va->va_start) {
3292 0 : va = node_to_va(rb_prev(&va->rb_node));
3293 0 : base = pvm_determine_end_from_reverse(&va, align) - end;
3294 0 : term_area = area;
3295 0 : continue;
3296 : }
3297 :
3298 : /*
3299 : * This area fits, move on to the previous one. If
3300 : * the previous one is the terminal one, we're done.
3301 : */
3302 1 : area = (area + nr_vms - 1) % nr_vms;
3303 1 : if (area == term_area)
3304 : break;
3305 :
3306 0 : start = offsets[area];
3307 0 : end = start + sizes[area];
3308 0 : va = pvm_find_va_enclose_addr(base + end);
3309 : }
3310 :
3311 : /* we've found a fitting base, insert all va's */
3312 2 : for (area = 0; area < nr_vms; area++) {
3313 1 : int ret;
3314 :
3315 1 : start = base + offsets[area];
3316 1 : size = sizes[area];
3317 :
3318 1 : va = pvm_find_va_enclose_addr(start);
3319 1 : if (WARN_ON_ONCE(va == NULL))
3320 : /* It is a BUG(), but trigger recovery instead. */
3321 0 : goto recovery;
3322 :
3323 1 : type = classify_va_fit_type(va, start, size);
3324 1 : if (WARN_ON_ONCE(type == NOTHING_FIT))
3325 : /* It is a BUG(), but trigger recovery instead. */
3326 0 : goto recovery;
3327 :
3328 1 : ret = adjust_va_to_fit_type(va, start, size, type);
3329 1 : if (unlikely(ret))
3330 0 : goto recovery;
3331 :
3332 : /* Allocated area. */
3333 1 : va = vas[area];
3334 1 : va->va_start = start;
3335 1 : va->va_end = start + size;
3336 : }
3337 :
3338 1 : spin_unlock(&free_vmap_area_lock);
3339 :
3340 : /* populate the kasan shadow space */
3341 3 : for (area = 0; area < nr_vms; area++) {
3342 1 : if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
3343 0 : goto err_free_shadow;
3344 :
3345 1 : kasan_unpoison_vmalloc((void *)vas[area]->va_start,
3346 : sizes[area]);
3347 : }
3348 :
3349 : /* insert all vm's */
3350 1 : spin_lock(&vmap_area_lock);
3351 3 : for (area = 0; area < nr_vms; area++) {
3352 1 : insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
3353 :
3354 1 : setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
3355 : pcpu_get_vm_areas);
3356 : }
3357 1 : spin_unlock(&vmap_area_lock);
3358 :
3359 1 : kfree(vas);
3360 1 : return vms;
3361 :
3362 : recovery:
3363 : /*
3364 : * Remove previously allocated areas. There is no
3365 : * need to remove these areas from the busy tree,
3366 : * because they are inserted only in the final step,
3367 : * and only when pcpu_get_vm_areas() succeeds.
3368 : */
3369 0 : while (area--) {
3370 0 : orig_start = vas[area]->va_start;
3371 0 : orig_end = vas[area]->va_end;
3372 0 : va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
3373 : &free_vmap_area_list);
3374 0 : if (va)
3375 0 : kasan_release_vmalloc(orig_start, orig_end,
3376 : va->va_start, va->va_end);
3377 0 : vas[area] = NULL;
3378 : }
3379 :
3380 0 : overflow:
3381 0 : spin_unlock(&free_vmap_area_lock);
3382 0 : if (!purged) {
3383 0 : purge_vmap_area_lazy();
3384 0 : purged = true;
3385 :
3386 : /* Before "retry", check if we recover. */
3387 0 : for (area = 0; area < nr_vms; area++) {
3388 0 : if (vas[area])
3389 0 : continue;
3390 :
3391 0 : vas[area] = kmem_cache_zalloc(
3392 : vmap_area_cachep, GFP_KERNEL);
3393 0 : if (!vas[area])
3394 0 : goto err_free;
3395 : }
3396 :
3397 0 : goto retry;
3398 : }
3399 :
3400 0 : err_free:
3401 0 : for (area = 0; area < nr_vms; area++) {
3402 0 : if (vas[area])
3403 0 : kmem_cache_free(vmap_area_cachep, vas[area]);
3404 :
3405 0 : kfree(vms[area]);
3406 : }
3407 0 : err_free2:
3408 0 : kfree(vas);
3409 0 : kfree(vms);
3410 0 : return NULL;
3411 :
3412 0 : err_free_shadow:
3413 0 : spin_lock(&free_vmap_area_lock);
3414 : /*
3415 : * We release all the vmalloc shadows, even the ones for regions that
3416 : * hadn't been successfully added. This relies on kasan_release_vmalloc
3417 : * being able to tolerate this case.
3418 : */
3419 0 : for (area = 0; area < nr_vms; area++) {
3420 0 : orig_start = vas[area]->va_start;
3421 0 : orig_end = vas[area]->va_end;
3422 0 : va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
3423 : &free_vmap_area_list);
3424 0 : if (va)
3425 0 : kasan_release_vmalloc(orig_start, orig_end,
3426 : va->va_start, va->va_end);
3427 0 : vas[area] = NULL;
3428 0 : kfree(vms[area]);
3429 : }
3430 0 : spin_unlock(&free_vmap_area_lock);
3431 0 : kfree(vas);
3432 0 : kfree(vms);
3433 0 : return NULL;
3434 : }
3435 :
3436 : /**
3437 : * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
3438 : * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
3439 : * @nr_vms: the number of allocated areas
3440 : *
3441 : * Free vm_structs and the array allocated by pcpu_get_vm_areas().
3442 : */
3443 0 : void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
3444 : {
3445 0 : int i;
3446 :
3447 0 : for (i = 0; i < nr_vms; i++)
3448 0 : free_vm_area(vms[i]);
3449 0 : kfree(vms);
3450 0 : }
3451 : #endif /* CONFIG_SMP */
3452 :
3453 0 : bool vmalloc_dump_obj(void *object)
3454 : {
3455 0 : struct vm_struct *vm;
3456 0 : void *objp = (void *)PAGE_ALIGN((unsigned long)object);
3457 :
3458 0 : vm = find_vm_area(objp);
3459 0 : if (!vm)
3460 : return false;
3461 0 : pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
3462 : vm->nr_pages, (unsigned long)vm->addr, vm->caller);
3463 0 : return true;
3464 : }
3465 :
3466 : #ifdef CONFIG_PROC_FS
3467 0 : static void *s_start(struct seq_file *m, loff_t *pos)
3468 : __acquires(&vmap_purge_lock)
3469 : __acquires(&vmap_area_lock)
3470 : {
3471 0 : mutex_lock(&vmap_purge_lock);
3472 0 : spin_lock(&vmap_area_lock);
3473 :
3474 0 : return seq_list_start(&vmap_area_list, *pos);
3475 : }
3476 :
3477 0 : static void *s_next(struct seq_file *m, void *p, loff_t *pos)
3478 : {
3479 0 : return seq_list_next(p, &vmap_area_list, pos);
3480 : }
3481 :
3482 0 : static void s_stop(struct seq_file *m, void *p)
3483 : __releases(&vmap_area_lock)
3484 : __releases(&vmap_purge_lock)
3485 : {
3486 0 : spin_unlock(&vmap_area_lock);
3487 0 : mutex_unlock(&vmap_purge_lock);
3488 0 : }
3489 :
3490 0 : static void show_numa_info(struct seq_file *m, struct vm_struct *v)
3491 : {
3492 0 : if (IS_ENABLED(CONFIG_NUMA)) {
3493 0 : unsigned int nr, *counters = m->private;
3494 :
3495 0 : if (!counters)
3496 : return;
3497 :
3498 0 : if (v->flags & VM_UNINITIALIZED)
3499 : return;
3500 :                 /* Pairs with the smp_wmb() in clear_vm_uninitialized_flag() */
3501 0 : smp_rmb();
3502 :
3503 0 : memset(counters, 0, nr_node_ids * sizeof(unsigned int));
3504 :
3505 0 : for (nr = 0; nr < v->nr_pages; nr++)
3506 0 : counters[page_to_nid(v->pages[nr])]++;
3507 :
3508 0 : for_each_node_state(nr, N_HIGH_MEMORY)
3509 0 : if (counters[nr])
3510 0 : seq_printf(m, " N%u=%u", nr, counters[nr]);
3511 : }
3512 : }
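/*
 * Generic sketch, not vmalloc.c code: the publish/consume pattern that
 * the smp_rmb() in show_numa_info() relies on. The writer initialises
 * every field, issues smp_wmb(), and only then clears the
 * "uninitialized" flag; the reader tests the flag, issues smp_rmb(),
 * and only then trusts the remaining fields. All names below
 * (demo_obj, DEMO_UNINIT, demo_*) are made up for illustration.
 */
struct demo_obj {
	unsigned long flags;		/* DEMO_UNINIT set while publishing */
	unsigned int nr_pages;
	struct page **pages;
};

#define DEMO_UNINIT	0x1UL

static void __maybe_unused demo_publish(struct demo_obj *o,
					unsigned int nr, struct page **pages)
{
	o->nr_pages = nr;		/* fill in every field first */
	o->pages = pages;
	smp_wmb();			/* order the stores above before ... */
	o->flags &= ~DEMO_UNINIT;	/* ... the flag clear becomes visible */
}

static unsigned int __maybe_unused demo_consume(struct demo_obj *o)
{
	if (o->flags & DEMO_UNINIT)
		return 0;		/* not published yet; fields untrusted */
	smp_rmb();			/* order the flag read before field reads */
	return o->nr_pages;		/* now safe to read published fields */
}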
3513 :
3514 0 : static void show_purge_info(struct seq_file *m)
3515 : {
3516 0 : struct vmap_area *va;
3517 :
3518 0 : spin_lock(&purge_vmap_area_lock);
3519 0 : list_for_each_entry(va, &purge_vmap_area_list, list) {
3520 0 : seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
3521 : (void *)va->va_start, (void *)va->va_end,
3522 0 : va->va_end - va->va_start);
3523 : }
3524 0 : spin_unlock(&purge_vmap_area_lock);
3525 0 : }
3526 :
3527 0 : static int s_show(struct seq_file *m, void *p)
3528 : {
3529 0 : struct vmap_area *va;
3530 0 : struct vm_struct *v;
3531 :
3532 0 : va = list_entry(p, struct vmap_area, list);
3533 :
3534 : /*
3535 :  * s_show can race with remove_vm_area(): a NULL ->vm means the vmap
3536 :  * area is being torn down or belongs to a vm_map_ram allocation.
3537 : */
3538 0 : if (!va->vm) {
3539 0 : seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
3540 : (void *)va->va_start, (void *)va->va_end,
3541 0 : va->va_end - va->va_start);
3542 :
3543 0 : return 0;
3544 : }
3545 :
3546 0 : v = va->vm;
3547 :
3548 0 : seq_printf(m, "0x%pK-0x%pK %7ld",
3549 0 : v->addr, v->addr + v->size, v->size);
3550 :
3551 0 : if (v->caller)
3552 0 : seq_printf(m, " %pS", v->caller);
3553 :
3554 0 : if (v->nr_pages)
3555 0 : seq_printf(m, " pages=%d", v->nr_pages);
3556 :
3557 0 : if (v->phys_addr)
3558 0 : seq_printf(m, " phys=%pa", &v->phys_addr);
3559 :
3560 0 : if (v->flags & VM_IOREMAP)
3561 0 : seq_puts(m, " ioremap");
3562 :
3563 0 : if (v->flags & VM_ALLOC)
3564 0 : seq_puts(m, " vmalloc");
3565 :
3566 0 : if (v->flags & VM_MAP)
3567 0 : seq_puts(m, " vmap");
3568 :
3569 0 : if (v->flags & VM_USERMAP)
3570 0 : seq_puts(m, " user");
3571 :
3572 0 : if (v->flags & VM_DMA_COHERENT)
3573 0 : seq_puts(m, " dma-coherent");
3574 :
3575 0 : if (is_vmalloc_addr(v->pages))
3576 0 : seq_puts(m, " vpages");
3577 :
3578 0 : show_numa_info(m, v);
3579 0 : seq_putc(m, '\n');
3580 :
3581 : /*
3582 : * As a final step, dump "unpurged" areas.
3583 : */
3584 0 : if (list_is_last(&va->list, &vmap_area_list))
3585 0 : show_purge_info(m);
3586 :
3587 : return 0;
3588 : }
3589 :
3590 : static const struct seq_operations vmalloc_op = {
3591 : .start = s_start,
3592 : .next = s_next,
3593 : .stop = s_stop,
3594 : .show = s_show,
3595 : };
3596 :
3597 1 : static int __init proc_vmalloc_init(void)
3598 : {
3599 1 : if (IS_ENABLED(CONFIG_NUMA))
3600 1 : proc_create_seq_private("vmallocinfo", 0400, NULL,
3601 : &vmalloc_op,
3602 : nr_node_ids * sizeof(unsigned int), NULL);
3603 : else
3604 : proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op);
3605 1 : return 0;
3606 : }
3607 : module_init(proc_vmalloc_init);
3608 :
3609 : #endif