LCOV - code coverage report
Current view: top level - arch/x86/mm - numa.c (source / functions)
Test: landlock.info
Date: 2021-04-22 12:43:58
Coverage: Lines: 194 / 305 (63.6 %), Functions: 20 / 28 (71.4 %)

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : /* Common code for 32 and 64-bit NUMA */
       3             : #include <linux/acpi.h>
       4             : #include <linux/kernel.h>
       5             : #include <linux/mm.h>
       6             : #include <linux/string.h>
       7             : #include <linux/init.h>
       8             : #include <linux/memblock.h>
       9             : #include <linux/mmzone.h>
      10             : #include <linux/ctype.h>
      11             : #include <linux/nodemask.h>
      12             : #include <linux/sched.h>
      13             : #include <linux/topology.h>
      14             : 
      15             : #include <asm/e820/api.h>
      16             : #include <asm/proto.h>
      17             : #include <asm/dma.h>
      18             : #include <asm/amd_nb.h>
      19             : 
      20             : #include "numa_internal.h"
      21             : 
      22             : int numa_off;
      23             : nodemask_t numa_nodes_parsed __initdata;
      24             : 
      25             : struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
      26             : EXPORT_SYMBOL(node_data);
      27             : 
      28             : static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
      29             : static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;
      30             : 
      31             : static int numa_distance_cnt;
      32             : static u8 *numa_distance;
      33             : 
      34           0 : static __init int numa_setup(char *opt)
      35             : {
      36           0 :         if (!opt)
      37             :                 return -EINVAL;
      38           0 :         if (!strncmp(opt, "off", 3))
      39           0 :                 numa_off = 1;
      40           0 :         if (!strncmp(opt, "fake=", 5))
      41           0 :                 return numa_emu_cmdline(opt + 5);
      42             :         if (!strncmp(opt, "noacpi", 6))
      43             :                 disable_srat();
      44             :         if (!strncmp(opt, "nohmat", 6))
      45             :                 disable_hmat();
      46             :         return 0;
      47             : }
      48             : early_param("numa", numa_setup);
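For reference, the options handled by numa_setup() above correspond to boot-time
command-line settings such as the following (the fake= value is only an example):

	numa=off        disable NUMA handling entirely (sets numa_off = 1)
	numa=fake=4     hand "4" to numa_emu_cmdline() for NUMA emulation
	numa=noacpi     ignore the ACPI SRAT table (disable_srat())
	numa=nohmat     ignore the ACPI HMAT table (disable_hmat())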
      49             : 
      50             : /*
      51             :  * apicid, cpu, node mappings
      52             :  */
      53             : s16 __apicid_to_node[MAX_LOCAL_APIC] = {
      54             :         [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
      55             : };
      56             : 
      57           8 : int numa_cpu_node(int cpu)
      58             : {
      59           8 :         int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
      60             : 
      61           8 :         if (apicid != BAD_APICID)
      62           8 :                 return __apicid_to_node[apicid];
      63             :         return NUMA_NO_NODE;
      64             : }
      65             : 
      66             : cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
      67             : EXPORT_SYMBOL(node_to_cpumask_map);
      68             : 
      69             : /*
      70             :  * Map cpu index to node index
      71             :  */
      72             : DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
      73             : EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
      74             : 
      75          20 : void numa_set_node(int cpu, int node)
      76             : {
      77          20 :         int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
      78             : 
      79             :         /* early setting, no percpu area yet */
      80          20 :         if (cpu_to_node_map) {
      81          16 :                 cpu_to_node_map[cpu] = node;
      82          16 :                 return;
      83             :         }
      84             : 
      85             : #ifdef CONFIG_DEBUG_PER_CPU_MAPS
      86             :         if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
      87             :                 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
      88             :                 dump_stack();
      89             :                 return;
      90             :         }
      91             : #endif
      92           4 :         per_cpu(x86_cpu_to_node_map, cpu) = node;
      93             : 
      94           4 :         set_cpu_numa_node(cpu, node);
      95             : }
      96             : 
      97           0 : void numa_clear_node(int cpu)
      98             : {
      99           0 :         numa_set_node(cpu, NUMA_NO_NODE);
     100           0 : }
     101             : 
     102             : /*
     103             :  * Allocate node_to_cpumask_map based on number of available nodes
     104             :  * Requires node_possible_map to be valid.
     105             :  *
     106             :  * Note: cpumask_of_node() is not valid until after this is done.
     107             :  * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
     108             :  */
     109           1 : void __init setup_node_to_cpumask_map(void)
     110             : {
     111           1 :         unsigned int node;
     112             : 
     113             :         /* setup nr_node_ids if not done yet */
     114           1 :         if (nr_node_ids == MAX_NUMNODES)
     115           0 :                 setup_nr_node_ids();
     116             : 
     117             :         /* allocate the map */
     118           2 :         for (node = 0; node < nr_node_ids; node++)
     119           1 :                 alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
     120             : 
     121             :         /* cpumask_of_node() will now work */
     122           1 :         pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
     123           1 : }
     124             : 
     125           1 : static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
     126             :                                      struct numa_meminfo *mi)
     127             : {
     128             :         /* ignore zero length blks */
     129           1 :         if (start == end)
     130             :                 return 0;
     131             : 
     132             :         /* whine about and ignore invalid blks */
     133           1 :         if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
     134           0 :                 pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
     135             :                         nid, start, end - 1);
     136           0 :                 return 0;
     137             :         }
     138             : 
     139           1 :         if (mi->nr_blks >= NR_NODE_MEMBLKS) {
     140           0 :                 pr_err("too many memblk ranges\n");
     141           0 :                 return -EINVAL;
     142             :         }
     143             : 
     144           1 :         mi->blk[mi->nr_blks].start = start;
     145           1 :         mi->blk[mi->nr_blks].end = end;
     146           1 :         mi->blk[mi->nr_blks].nid = nid;
     147           1 :         mi->nr_blks++;
     148           1 :         return 0;
     149             : }
     150             : 
     151             : /**
     152             :  * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
     153             :  * @idx: Index of memblk to remove
     154             :  * @mi: numa_meminfo to remove memblk from
     155             :  *
     156             :  * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
     157             :  * decrementing @mi->nr_blks.
     158             :  */
     159           0 : void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
     160             : {
     161           0 :         mi->nr_blks--;
     162           0 :         memmove(&mi->blk[idx], &mi->blk[idx + 1],
     163           0 :                 (mi->nr_blks - idx) * sizeof(mi->blk[0]));
     164           0 : }
     165             : 
     166             : /**
     167             :  * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
     168             :  * @dst: numa_meminfo to append block to
     169             :  * @idx: Index of memblk to remove
     170             :  * @src: numa_meminfo to remove memblk from
     171             :  */
     172           0 : static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
     173             :                                          struct numa_meminfo *src)
     174             : {
     175           0 :         dst->blk[dst->nr_blks++] = src->blk[idx];
     176           0 :         numa_remove_memblk_from(idx, src);
     177           0 : }
     178             : 
     179             : /**
     180             :  * numa_add_memblk - Add one numa_memblk to numa_meminfo
     181             :  * @nid: NUMA node ID of the new memblk
     182             :  * @start: Start address of the new memblk
     183             :  * @end: End address of the new memblk
     184             :  *
     185             :  * Add a new memblk to the default numa_meminfo.
     186             :  *
     187             :  * RETURNS:
     188             :  * 0 on success, -errno on failure.
     189             :  */
     190           1 : int __init numa_add_memblk(int nid, u64 start, u64 end)
     191             : {
     192           1 :         return numa_add_memblk_to(nid, start, end, &numa_meminfo);
     193             : }
     194             : 
     195             : /* Allocate NODE_DATA for a node, preferably from node-local memory */
     196           1 : static void __init alloc_node_data(int nid)
     197             : {
     198           1 :         const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
     199           1 :         u64 nd_pa;
     200           1 :         void *nd;
     201           1 :         int tnid;
     202             : 
     203             :         /*
     204             :          * Allocate node data.  Try node-local memory and then any node.
     205             :          * Never allocate in DMA zone.
     206             :          */
     207           1 :         nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
     208           1 :         if (!nd_pa) {
     209           0 :                 pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
     210             :                        nd_size, nid);
     211           0 :                 return;
     212             :         }
     213           1 :         nd = __va(nd_pa);
     214             : 
     215             :         /* report and initialize */
     216           1 :         printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
     217             :                nd_pa, nd_pa + nd_size - 1);
     218           1 :         tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
     219           1 :         if (tnid != nid)
     220           0 :                 printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nid, tnid);
     221             : 
     222           1 :         node_data[nid] = nd;
     223           1 :         memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
     224             : 
     225           1 :         node_set_online(nid);
     226             : }
     227             : 
     228             : /**
     229             :  * numa_cleanup_meminfo - Cleanup a numa_meminfo
     230             :  * @mi: numa_meminfo to clean up
     231             :  *
     232             :  * Sanitize @mi by merging and removing unnecessary memblks.  Also check for
     233             :  * conflicts and clear unused memblks.
     234             :  *
     235             :  * RETURNS:
     236             :  * 0 on success, -errno on failure.
     237             :  */
     238           1 : int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
     239             : {
     240           1 :         const u64 low = 0;
     241           1 :         const u64 high = PFN_PHYS(max_pfn);
     242           1 :         int i, j, k;
     243             : 
     244             :         /* first, trim all entries */
     245           2 :         for (i = 0; i < mi->nr_blks; i++) {
     246           1 :                 struct numa_memblk *bi = &mi->blk[i];
     247             : 
     248             :                 /* move / save reserved memory ranges */
     249           1 :                 if (!memblock_overlaps_region(&memblock.memory,
     250           1 :                                         bi->start, bi->end - bi->start)) {
     251           0 :                         numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
     252           0 :                         continue;
     253             :                 }
     254             : 
     255             :                 /* make sure all non-reserved blocks are inside the limits */
     256           1 :                 bi->start = max(bi->start, low);
     257           1 :                 bi->end = min(bi->end, high);
     258             : 
     259             :                 /* and there's no empty block */
     260           1 :                 if (bi->start >= bi->end)
     261           0 :                         numa_remove_memblk_from(i--, mi);
     262             :         }
     263             : 
     264             :         /* merge neighboring / overlapping entries */
     265           2 :         for (i = 0; i < mi->nr_blks; i++) {
     266           1 :                 struct numa_memblk *bi = &mi->blk[i];
     267             : 
     268           1 :                 for (j = i + 1; j < mi->nr_blks; j++) {
     269           0 :                         struct numa_memblk *bj = &mi->blk[j];
     270           0 :                         u64 start, end;
     271             : 
     272             :                         /*
     273             :                          * See whether there are overlapping blocks.  Whine
     274             :                          * about but allow overlaps of the same nid.  They
     275             :                          * will be merged below.
     276             :                          */
     277           0 :                         if (bi->end > bj->start && bi->start < bj->end) {
     278           0 :                                 if (bi->nid != bj->nid) {
     279           0 :                                         pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
     280             :                                                bi->nid, bi->start, bi->end - 1,
     281             :                                                bj->nid, bj->start, bj->end - 1);
     282           0 :                                         return -EINVAL;
     283             :                                 }
     284           0 :                                 pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
     285             :                                         bi->nid, bi->start, bi->end - 1,
     286             :                                         bj->start, bj->end - 1);
     287             :                         }
     288             : 
     289             :                         /*
     290             :                          * Join together blocks on the same node, as long
     291             :                          * as the holes between them don't overlap with
     292             :                          * memory on other nodes.
     293             :                          */
     294           0 :                         if (bi->nid != bj->nid)
     295           0 :                                 continue;
     296           0 :                         start = min(bi->start, bj->start);
     297           0 :                         end = max(bi->end, bj->end);
     298           0 :                         for (k = 0; k < mi->nr_blks; k++) {
     299           0 :                                 struct numa_memblk *bk = &mi->blk[k];
     300             : 
     301           0 :                                 if (bi->nid == bk->nid)
     302           0 :                                         continue;
     303           0 :                                 if (start < bk->end && end > bk->start)
     304             :                                         break;
     305             :                         }
     306           0 :                         if (k < mi->nr_blks)
     307           0 :                                 continue;
     308           0 :                         printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
     309             :                                bi->nid, bi->start, bi->end - 1, bj->start,
     310             :                                bj->end - 1, start, end - 1);
     311           0 :                         bi->start = start;
     312           0 :                         bi->end = end;
     313           0 :                         numa_remove_memblk_from(j--, mi);
     314             :                 }
     315             :         }
     316             : 
     317             :         /* clear unused ones */
     318         128 :         for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
     319         127 :                 mi->blk[i].start = mi->blk[i].end = 0;
     320         127 :                 mi->blk[i].nid = NUMA_NO_NODE;
     321             :         }
     322             : 
     323             :         return 0;
     324             : }
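A worked illustration of the cleanup above, with made-up addresses and assuming
both ranges are covered by memblock.memory and max_pfn maps 4 GiB:

	/*
	 * blk[0] = { .start = 0x0,        .end = 0x80000000,  .nid = 0 }
	 * blk[1] = { .start = 0x80000000, .end = 0x100000000, .nid = 0 }
	 *
	 * Both blocks survive trimming, share nid 0, and no other node's
	 * memory lies between them, so the merge loop collapses them into
	 *
	 * blk[0] = { .start = 0x0,        .end = 0x100000000, .nid = 0 }
	 *
	 * and the final loop resets the unused entries to NUMA_NO_NODE.
	 */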
     325             : 
     326             : /*
     327             :  * Set the nodes which have memory in @mi in *@nodemask.
     328             :  */
     329           1 : static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
     330             :                                               const struct numa_meminfo *mi)
     331             : {
     332           1 :         int i;
     333             : 
     334         129 :         for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
     335         128 :                 if (mi->blk[i].start != mi->blk[i].end &&
     336           1 :                     mi->blk[i].nid != NUMA_NO_NODE)
     337         129 :                         node_set(mi->blk[i].nid, *nodemask);
     338           1 : }
     339             : 
     340             : /**
     341             :  * numa_reset_distance - Reset NUMA distance table
     342             :  *
     343             :  * The current table is freed.  The next numa_set_distance() call will
     344             :  * create a new one.
     345             :  */
     346           1 : void __init numa_reset_distance(void)
     347             : {
     348           1 :         size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
     349             : 
     350             :         /* numa_distance could be 1LU marking allocation failure, test cnt */
     351           1 :         if (numa_distance_cnt)
     352           0 :                 memblock_free(__pa(numa_distance), size);
     353           1 :         numa_distance_cnt = 0;
     354           1 :         numa_distance = NULL;   /* enable table creation */
     355           1 : }
     356             : 
     357           0 : static int __init numa_alloc_distance(void)
     358             : {
     359           0 :         nodemask_t nodes_parsed;
     360           0 :         size_t size;
     361           0 :         int i, j, cnt = 0;
     362           0 :         u64 phys;
     363             : 
     364             :         /* size the new table and allocate it */
     365           0 :         nodes_parsed = numa_nodes_parsed;
     366           0 :         numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
     367             : 
     368           0 :         for_each_node_mask(i, nodes_parsed)
     369           0 :                 cnt = i;
     370           0 :         cnt++;
     371           0 :         size = cnt * cnt * sizeof(numa_distance[0]);
     372             : 
     373           0 :         phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
     374             :                                       size, PAGE_SIZE);
     375           0 :         if (!phys) {
     376           0 :                 pr_warn("Warning: can't allocate distance table!\n");
     377             :                 /* don't retry until explicitly reset */
     378           0 :                 numa_distance = (void *)1LU;
     379           0 :                 return -ENOMEM;
     380             :         }
     381           0 :         memblock_reserve(phys, size);
     382             : 
     383           0 :         numa_distance = __va(phys);
     384           0 :         numa_distance_cnt = cnt;
     385             : 
     386             :         /* fill with the default distances */
     387           0 :         for (i = 0; i < cnt; i++)
     388           0 :                 for (j = 0; j < cnt; j++)
     389           0 :                         numa_distance[i * cnt + j] = i == j ?
     390             :                                 LOCAL_DISTANCE : REMOTE_DISTANCE;
     391           0 :         printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
     392             : 
     393           0 :         return 0;
     394             : }
     395             : 
     396             : /**
     397             :  * numa_set_distance - Set NUMA distance from one NUMA node to another
     398             :  * @from: the 'from' node to set distance
     399             :  * @to: the 'to' node to set distance
     400             :  * @distance: NUMA distance
     401             :  *
     402             :  * Set the distance from node @from to @to to @distance.  If distance table
     403             :  * doesn't exist, one which is large enough to accommodate all the currently
     404             :  * known nodes will be created.
     405             :  *
     406             :  * If such a table cannot be allocated, a warning is printed and further
     407             :  * calls are ignored until the distance table is reset with
     408             :  * numa_reset_distance().
     409             :  *
     410             :  * If @from or @to is higher than the highest known node or lower than zero
     411             :  * at the time of table creation, or @distance doesn't make sense, the call
     412             :  * is ignored.
     413             :  * This is to allow simplification of specific NUMA config implementations.
     414             :  */
     415           0 : void __init numa_set_distance(int from, int to, int distance)
     416             : {
     417           0 :         if (!numa_distance && numa_alloc_distance() < 0)
     418             :                 return;
     419             : 
     420           0 :         if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
     421           0 :                         from < 0 || to < 0) {
     422           0 :                 pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
     423             :                              from, to, distance);
     424           0 :                 return;
     425             :         }
     426             : 
     427           0 :         if ((u8)distance != distance ||
     428           0 :             (from == to && distance != LOCAL_DISTANCE)) {
     429           0 :                 pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
     430             :                              from, to, distance);
     431           0 :                 return;
     432             :         }
     433             : 
     434           0 :         numa_distance[from * numa_distance_cnt + to] = distance;
     435             : }
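A sketch of how a SLIT parser might fill this table for a two-node system (the
off-node distance of 21 is illustrative; LOCAL_DISTANCE is 10 and
REMOTE_DISTANCE is 20 in <linux/topology.h>):

	numa_set_distance(0, 0, LOCAL_DISTANCE);  /* from == to must be LOCAL_DISTANCE */
	numa_set_distance(0, 1, 21);              /* must fit in a u8                  */
	numa_set_distance(1, 0, 21);
	numa_set_distance(1, 1, LOCAL_DISTANCE);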
     436             : 
     437           4 : int __node_distance(int from, int to)
     438             : {
     439           4 :         if (from >= numa_distance_cnt || to >= numa_distance_cnt)
     440           4 :                 return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
     441           0 :         return numa_distance[from * numa_distance_cnt + to];
     442             : }
     443             : EXPORT_SYMBOL(__node_distance);
     444             : 
     445             : /*
     446             :  * Sanity check to catch more bad NUMA configurations (they are amazingly
     447             :  * common).  Make sure the nodes cover all memory.
     448             :  */
     449           1 : static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
     450             : {
     451           1 :         u64 numaram, e820ram;
     452           1 :         int i;
     453             : 
     454           1 :         numaram = 0;
     455           2 :         for (i = 0; i < mi->nr_blks; i++) {
     456           1 :                 u64 s = mi->blk[i].start >> PAGE_SHIFT;
     457           1 :                 u64 e = mi->blk[i].end >> PAGE_SHIFT;
     458           1 :                 numaram += e - s;
     459           1 :                 numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
     460           1 :                 if ((s64)numaram < 0)
     461           0 :                         numaram = 0;
     462             :         }
     463             : 
     464           1 :         e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
     465             : 
     466             :         /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
     467           1 :         if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
     468           0 :                 printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n",
     469           0 :                        (numaram << PAGE_SHIFT) >> 20,
     470           0 :                        (e820ram << PAGE_SHIFT) >> 20);
     471           0 :                 return false;
     472             :         }
     473             :         return true;
     474             : }
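The 1M slack in the check above works out as follows, assuming 4 KiB pages
(PAGE_SHIFT == 12):

	/*
	 * 1 << (20 - PAGE_SHIFT) == 1 << 8 == 256 pages == 1 MiB, so up to
	 * 1 MiB of e820 RAM may remain unclaimed by any node before the
	 * NUMA configuration is rejected.
	 */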
     475             : 
     476             : /*
     477             :  * Mark all currently memblock-reserved physical memory (which covers the
     478             :  * kernel's own memory ranges) as hot-unpluggable.
     479             :  */
     480           1 : static void __init numa_clear_kernel_node_hotplug(void)
     481             : {
     482           1 :         nodemask_t reserved_nodemask = NODE_MASK_NONE;
     483           1 :         struct memblock_region *mb_region;
     484           1 :         int i;
     485             : 
     486             :         /*
     487             :          * We have to do some preprocessing of memblock regions, to
     488             :          * make them suitable for reservation.
     489             :          *
     490             :          * At this time, all memory regions reserved by memblock are
     491             :          * used by the kernel, but those regions are not split up
     492             :          * along node boundaries yet, and don't necessarily have their
     493             :          * node ID set yet either.
     494             :          *
     495             :          * So iterate over all memory known to the x86 architecture,
     496             :          * and use those ranges to set the nid in memblock.reserved.
     497             :          * This will split up the memblock regions along node
     498             :          * boundaries and will set the node IDs as well.
     499             :          */
     500           2 :         for (i = 0; i < numa_meminfo.nr_blks; i++) {
     501           1 :                 struct numa_memblk *mb = numa_meminfo.blk + i;
     502           1 :                 int ret;
     503             : 
     504           1 :                 ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid);
     505           1 :                 WARN_ON_ONCE(ret);
     506             :         }
     507             : 
     508             :         /*
     509             :          * Now go over all reserved memblock regions, to construct a
     510             :          * node mask of all kernel reserved memory areas.
     511             :          *
     512             :          * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
     513             :          *   numa_meminfo might not include all memblock.reserved
     514             :          *   memory ranges, because quirks such as trim_snb_memory()
     515             :          *   reserve specific pages for Sandy Bridge graphics. ]
     516             :          */
     517           4 :         for_each_reserved_mem_region(mb_region) {
     518           3 :                 int nid = memblock_get_region_node(mb_region);
     519             : 
     520           3 :                 if (nid != MAX_NUMNODES)
     521           6 :                         node_set(nid, reserved_nodemask);
     522             :         }
     523             : 
     524             :         /*
     525             :          * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
     526             :          * belonging to the reserved node mask.
     527             :          *
     528             :          * Note that this will include memory regions that reside
     529             :          * on nodes that contain kernel memory - entire nodes
     530             :          * become hot-unpluggable:
     531             :          */
     532           2 :         for (i = 0; i < numa_meminfo.nr_blks; i++) {
     533           1 :                 struct numa_memblk *mb = numa_meminfo.blk + i;
     534             : 
     535           1 :                 if (!node_isset(mb->nid, reserved_nodemask))
     536           0 :                         continue;
     537             : 
     538           1 :                 memblock_clear_hotplug(mb->start, mb->end - mb->start);
     539             :         }
     540           1 : }
     541             : 
     542           1 : static int __init numa_register_memblks(struct numa_meminfo *mi)
     543             : {
     544           1 :         int i, nid;
     545             : 
     546             :         /* Account for nodes with cpus and no memory */
     547           1 :         node_possible_map = numa_nodes_parsed;
     548           1 :         numa_nodemask_from_meminfo(&node_possible_map, mi);
     549           1 :         if (WARN_ON(nodes_empty(node_possible_map)))
     550             :                 return -EINVAL;
     551             : 
     552           2 :         for (i = 0; i < mi->nr_blks; i++) {
     553           1 :                 struct numa_memblk *mb = &mi->blk[i];
     554           1 :                 memblock_set_node(mb->start, mb->end - mb->start,
     555             :                                   &memblock.memory, mb->nid);
     556             :         }
     557             : 
     558             :         /*
     559             :          * Very early during boot the kernel has to use some memory, for
     560             :          * example to load the kernel image. We cannot prevent this anyway,
     561             :          * so any node the kernel resides in should be un-hotpluggable.
     562             :          *
     563             :          * And by the time we get here, allocating node data won't fail.
     564             :          */
     565           1 :         numa_clear_kernel_node_hotplug();
     566             : 
     567             :         /*
     568             :          * If the sections array is going to be used for pfn -> nid mapping,
     569             :          * check whether its granularity is fine enough.
     570             :          */
     571           1 :         if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) {
     572             :                 unsigned long pfn_align = node_map_pfn_alignment();
     573             : 
     574             :                 if (pfn_align && pfn_align < PAGES_PER_SECTION) {
     575             :                         pr_warn("Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
     576             :                                 PFN_PHYS(pfn_align) >> 20,
     577             :                                 PFN_PHYS(PAGES_PER_SECTION) >> 20);
     578             :                         return -EINVAL;
     579             :                 }
     580             :         }
     581           1 :         if (!numa_meminfo_cover_memory(mi))
     582             :                 return -EINVAL;
     583             : 
     584             :         /* Finally register nodes. */
     585           2 :         for_each_node_mask(nid, node_possible_map) {
     586           1 :                 u64 start = PFN_PHYS(max_pfn);
     587           1 :                 u64 end = 0;
     588             : 
     589           2 :                 for (i = 0; i < mi->nr_blks; i++) {
     590           1 :                         if (nid != mi->blk[i].nid)
     591           0 :                                 continue;
     592           1 :                         start = min(mi->blk[i].start, start);
     593           1 :                         end = max(mi->blk[i].end, end);
     594             :                 }
     595             : 
     596           1 :                 if (start >= end)
     597           0 :                         continue;
     598             : 
     599             :                 /*
     600             :                  * Don't confuse the VM with a node that doesn't have the
     601             :                  * minimum amount of memory:
     602             :                  */
     603           1 :                 if (end && (end - start) < NODE_MIN_SIZE)
     604           0 :                         continue;
     605             : 
     606           1 :                 alloc_node_data(nid);
     607             :         }
     608             : 
     609             :         /* Dump memblock with node info and return. */
     610           1 :         memblock_dump_all();
     611           1 :         return 0;
     612             : }
     613             : 
     614             : /*
     615             :  * There are unfortunately some poorly designed mainboards around that
     616             :  * only connect memory to a single CPU. This breaks the 1:1 cpu->node
     617             :  * mapping. To avoid this, fill in the mapping for all possible CPUs,
     618             :  * as the number of CPUs is not known yet. We round-robin over the
     619             :  * existing nodes.
     620             :  */
     621           1 : static void __init numa_init_array(void)
     622             : {
     623           1 :         int rr, i;
     624             : 
     625           1 :         rr = first_node(node_online_map);
     626          17 :         for (i = 0; i < nr_cpu_ids; i++) {
     627          16 :                 if (early_cpu_to_node(i) != NUMA_NO_NODE)
     628           0 :                         continue;
     629          16 :                 numa_set_node(i, rr);
     630          16 :                 rr = next_node_in(rr, node_online_map);
     631             :         }
     632           1 : }
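For instance (hypothetical values), the round-robin above behaves as follows:

	/*
	 * node_online_map = { 0, 1 }, four possible CPUs, none of which got a
	 * node from the firmware tables:
	 *   CPU 0 -> node 0, CPU 1 -> node 1, CPU 2 -> node 0, CPU 3 -> node 1
	 */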
     633             : 
     634           1 : static int __init numa_init(int (*init_func)(void))
     635             : {
     636           1 :         int i;
     637           1 :         int ret;
     638             : 
     639       32769 :         for (i = 0; i < MAX_LOCAL_APIC; i++)
     640       32768 :                 set_apicid_to_node(i, NUMA_NO_NODE);
     641             : 
     642           1 :         nodes_clear(numa_nodes_parsed);
     643           1 :         nodes_clear(node_possible_map);
     644           1 :         nodes_clear(node_online_map);
     645           1 :         memset(&numa_meminfo, 0, sizeof(numa_meminfo));
     646           1 :         WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory,
     647             :                                   MAX_NUMNODES));
     648           1 :         WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved,
     649             :                                   MAX_NUMNODES));
     650             :         /* In case that parsing SRAT failed. */
     651           1 :         WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX));
     652           1 :         numa_reset_distance();
     653             : 
     654           1 :         ret = init_func();
     655           1 :         if (ret < 0)
     656             :                 return ret;
     657             : 
     658             :         /*
     659             :          * We reset memblock back to the top-down direction
     660             :          * here because if we configured ACPI_NUMA, we have
     661             :          * parsed SRAT in init_func(). It is ok to have the
     662             :          * reset here even if we didn't configure ACPI_NUMA
     663             :          * or acpi numa init fails and falls back to dummy
     664             :          * numa init.
     665             :          */
     666           1 :         memblock_set_bottom_up(false);
     667             : 
     668           1 :         ret = numa_cleanup_meminfo(&numa_meminfo);
     669           1 :         if (ret < 0)
     670             :                 return ret;
     671             : 
     672           1 :         numa_emulation(&numa_meminfo, numa_distance_cnt);
     673             : 
     674           1 :         ret = numa_register_memblks(&numa_meminfo);
     675           1 :         if (ret < 0)
     676             :                 return ret;
     677             : 
     678          17 :         for (i = 0; i < nr_cpu_ids; i++) {
     679          16 :                 int nid = early_cpu_to_node(i);
     680             : 
     681          16 :                 if (nid == NUMA_NO_NODE)
     682          16 :                         continue;
     683           0 :                 if (!node_online(nid))
     684          16 :                         numa_clear_node(i);
     685             :         }
     686           1 :         numa_init_array();
     687             : 
     688           1 :         return 0;
     689             : }
     690             : 
     691             : /**
     692             :  * dummy_numa_init - Fallback dummy NUMA init
     693             :  *
     694             :  * Used if there's no underlying NUMA architecture, NUMA initialization
     695             :  * fails, or NUMA is disabled on the command line.
     696             :  *
     697             :  * Must online at least one node and add memory blocks that cover all
     698             :  * allowed memory.  This function must not fail.
     699             :  */
     700           1 : static int __init dummy_numa_init(void)
     701             : {
     702           1 :         printk(KERN_INFO "%s\n",
     703           1 :                numa_off ? "NUMA turned off" : "No NUMA configuration found");
     704           1 :         printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
     705           1 :                0LLU, PFN_PHYS(max_pfn) - 1);
     706             : 
     707           1 :         node_set(0, numa_nodes_parsed);
     708           1 :         numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
     709             : 
     710           1 :         return 0;
     711             : }
     712             : 
     713             : /**
     714             :  * x86_numa_init - Initialize NUMA
     715             :  *
     716             :  * Try each configured NUMA initialization method until one succeeds.  The
     717             :  * last fallback is the dummy single-node config encompassing all of memory;
     718             :  * it never fails.
     719             :  */
     720           1 : void __init x86_numa_init(void)
     721             : {
     722           1 :         if (!numa_off) {
     723             : #ifdef CONFIG_ACPI_NUMA
     724             :                 if (!numa_init(x86_acpi_numa_init))
     725             :                         return;
     726             : #endif
     727             : #ifdef CONFIG_AMD_NUMA
     728             :                 if (!numa_init(amd_numa_init))
     729             :                         return;
     730             : #endif
     731           1 :         }
     732             : 
     733           1 :         numa_init(dummy_numa_init);
     734           1 : }
     735             : 
     736           0 : static void __init init_memory_less_node(int nid)
     737             : {
     738             :         /* Allocate and initialize node data. Memory-less node is now online. */
     739           0 :         alloc_node_data(nid);
     740           0 :         free_area_init_memoryless_node(nid);
     741             : 
     742             :         /*
     743             :          * All zonelists will be built later in start_kernel() after per cpu
     744             :          * areas are initialized.
     745             :          */
     746           0 : }
     747             : 
     748             : /*
     749             :  * A node may exist which has one or more Generic Initiators but no CPUs and no
     750             :  * memory.
     751             :  *
     752             :  * This function must be called after init_cpu_to_node(), to ensure that any
     753             :  * memoryless CPU nodes have already been brought online, and before the
     754             :  * node_data[nid] is needed for zone list setup in build_all_zonelists().
     755             :  *
     756             :  * When this function is called, any nodes containing either memory and/or CPUs
     757             :  * will already be online and there is no need to do anything extra, even if
     758             :  * they also contain one or more Generic Initiators.
     759             :  */
     760           1 : void __init init_gi_nodes(void)
     761             : {
     762           1 :         int nid;
     763             : 
     764           1 :         for_each_node_state(nid, N_GENERIC_INITIATOR)
     765           0 :                 if (!node_online(nid))
     766           0 :                         init_memory_less_node(nid);
     767           1 : }
     768             : 
     769             : /*
     770             :  * Setup early cpu_to_node.
     771             :  *
     772             :  * Populate cpu_to_node[] only if the x86_cpu_to_apicid[]
     773             :  * and apicid_to_node[] tables have valid entries for a CPU.
     774             :  * This means we skip cpu_to_node[] initialisation for NUMA
     775             :  * emulation and the fake-node case (when running a kernel compiled
     776             :  * for NUMA on a non-NUMA box), which is OK as cpu_to_node[]
     777             :  * is already initialized in a round-robin manner in numa_init_array(),
     778             :  * prior to this call, and that initialization is good enough
     779             :  * for the fake NUMA cases.
     780             :  *
     781             :  * Called before the per_cpu areas are setup.
     782             :  */
     783           1 : void __init init_cpu_to_node(void)
     784             : {
     785           1 :         int cpu;
     786           1 :         u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
     787             : 
     788           1 :         BUG_ON(cpu_to_apicid == NULL);
     789             : 
     790           5 :         for_each_possible_cpu(cpu) {
     791           4 :                 int node = numa_cpu_node(cpu);
     792             : 
     793           4 :                 if (node == NUMA_NO_NODE)
     794           4 :                         continue;
     795             : 
     796           0 :                 if (!node_online(node))
     797           0 :                         init_memory_less_node(node);
     798             : 
     799           0 :                 numa_set_node(cpu, node);
     800             :         }
     801           1 : }
     802             : 
     803             : #ifndef CONFIG_DEBUG_PER_CPU_MAPS
     804             : 
     805             : # ifndef CONFIG_NUMA_EMU
     806           4 : void numa_add_cpu(int cpu)
     807             : {
     808           4 :         cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
     809           4 : }
     810             : 
     811           0 : void numa_remove_cpu(int cpu)
     812             : {
     813           0 :         cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
     814           0 : }
     815             : # endif /* !CONFIG_NUMA_EMU */
     816             : 
     817             : #else   /* !CONFIG_DEBUG_PER_CPU_MAPS */
     818             : 
     819             : int __cpu_to_node(int cpu)
     820             : {
     821             :         if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
     822             :                 printk(KERN_WARNING
     823             :                         "cpu_to_node(%d): usage too early!\n", cpu);
     824             :                 dump_stack();
     825             :                 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
     826             :         }
     827             :         return per_cpu(x86_cpu_to_node_map, cpu);
     828             : }
     829             : EXPORT_SYMBOL(__cpu_to_node);
     830             : 
     831             : /*
     832             :  * Same function as cpu_to_node() but used if called before the
     833             :  * per_cpu areas are set up.
     834             :  */
     835             : int early_cpu_to_node(int cpu)
     836             : {
     837             :         if (early_per_cpu_ptr(x86_cpu_to_node_map))
     838             :                 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
     839             : 
     840             :         if (!cpu_possible(cpu)) {
     841             :                 printk(KERN_WARNING
     842             :                         "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
     843             :                 dump_stack();
     844             :                 return NUMA_NO_NODE;
     845             :         }
     846             :         return per_cpu(x86_cpu_to_node_map, cpu);
     847             : }
     848             : 
     849             : void debug_cpumask_set_cpu(int cpu, int node, bool enable)
     850             : {
     851             :         struct cpumask *mask;
     852             : 
     853             :         if (node == NUMA_NO_NODE) {
     854             :                 /* early_cpu_to_node() already emits a warning and trace */
     855             :                 return;
     856             :         }
     857             :         mask = node_to_cpumask_map[node];
     858             :         if (!mask) {
     859             :                 pr_err("node_to_cpumask_map[%i] NULL\n", node);
     860             :                 dump_stack();
     861             :                 return;
     862             :         }
     863             : 
     864             :         if (enable)
     865             :                 cpumask_set_cpu(cpu, mask);
     866             :         else
     867             :                 cpumask_clear_cpu(cpu, mask);
     868             : 
     869             :         printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n",
     870             :                 enable ? "numa_add_cpu" : "numa_remove_cpu",
     871             :                 cpu, node, cpumask_pr_args(mask));
     872             :         return;
     873             : }
     874             : 
     875             : # ifndef CONFIG_NUMA_EMU
     876             : static void numa_set_cpumask(int cpu, bool enable)
     877             : {
     878             :         debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
     879             : }
     880             : 
     881             : void numa_add_cpu(int cpu)
     882             : {
     883             :         numa_set_cpumask(cpu, true);
     884             : }
     885             : 
     886             : void numa_remove_cpu(int cpu)
     887             : {
     888             :         numa_set_cpumask(cpu, false);
     889             : }
     890             : # endif /* !CONFIG_NUMA_EMU */
     891             : 
     892             : /*
     893             :  * Returns a pointer to the bitmask of CPUs on Node 'node'.
     894             :  */
     895             : const struct cpumask *cpumask_of_node(int node)
     896             : {
     897             :         if ((unsigned)node >= nr_node_ids) {
     898             :                 printk(KERN_WARNING
     899             :                         "cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n",
     900             :                         node, nr_node_ids);
     901             :                 dump_stack();
     902             :                 return cpu_none_mask;
     903             :         }
     904             :         if (node_to_cpumask_map[node] == NULL) {
     905             :                 printk(KERN_WARNING
     906             :                         "cpumask_of_node(%d): no node_to_cpumask_map!\n",
     907             :                         node);
     908             :                 dump_stack();
     909             :                 return cpu_online_mask;
     910             :         }
     911             :         return node_to_cpumask_map[node];
     912             : }
     913             : EXPORT_SYMBOL(cpumask_of_node);
     914             : 
     915             : #endif  /* !CONFIG_DEBUG_PER_CPU_MAPS */
     916             : 
     917             : #ifdef CONFIG_NUMA_KEEP_MEMINFO
     918             : static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
     919             : {
     920             :         int i;
     921             : 
     922             :         for (i = 0; i < mi->nr_blks; i++)
     923             :                 if (mi->blk[i].start <= start && mi->blk[i].end > start)
     924             :                         return mi->blk[i].nid;
     925             :         return NUMA_NO_NODE;
     926             : }
     927             : 
     928             : int phys_to_target_node(phys_addr_t start)
     929             : {
     930             :         int nid = meminfo_to_nid(&numa_meminfo, start);
     931             : 
     932             :         /*
     933             :          * Prefer online nodes, but if reserved memory might be
     934             :          * hot-added, continue the search with reserved ranges.
     935             :          */
     936             :         if (nid != NUMA_NO_NODE)
     937             :                 return nid;
     938             : 
     939             :         return meminfo_to_nid(&numa_reserved_meminfo, start);
     940             : }
     941             : EXPORT_SYMBOL_GPL(phys_to_target_node);
     942             : 
     943             : int memory_add_physaddr_to_nid(u64 start)
     944             : {
     945             :         int nid = meminfo_to_nid(&numa_meminfo, start);
     946             : 
     947             :         if (nid == NUMA_NO_NODE)
     948             :                 nid = numa_meminfo.blk[0].nid;
     949             :         return nid;
     950             : }
     951             : EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
     952             : #endif
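A hedged sketch of how a memory hot-add path might use these helpers (the
resource variable and the MHP_NONE flag are illustrative, not taken from this
file):

	/*
	 * int nid = memory_add_physaddr_to_nid(res->start);
	 *
	 * add_memory(nid, res->start, resource_size(res), MHP_NONE);
	 *
	 * phys_to_target_node() additionally searches numa_reserved_meminfo,
	 * so it can resolve ranges that are not (yet) part of an online node.
	 */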

Generated by: LCOV version 1.14