Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Basic Node interface support
4 : */
5 :
6 : #include <linux/module.h>
7 : #include <linux/init.h>
8 : #include <linux/mm.h>
9 : #include <linux/memory.h>
10 : #include <linux/vmstat.h>
11 : #include <linux/notifier.h>
12 : #include <linux/node.h>
13 : #include <linux/hugetlb.h>
14 : #include <linux/compaction.h>
15 : #include <linux/cpumask.h>
16 : #include <linux/topology.h>
17 : #include <linux/nodemask.h>
18 : #include <linux/cpu.h>
19 : #include <linux/device.h>
20 : #include <linux/pm_runtime.h>
21 : #include <linux/swap.h>
22 : #include <linux/slab.h>
23 :
24 : static struct bus_type node_subsys = {
25 : .name = "node",
26 : .dev_name = "node",
27 : };
28 :
29 :
30 0 : static ssize_t node_read_cpumap(struct device *dev, bool list, char *buf)
31 : {
32 0 : ssize_t n;
33 0 : cpumask_var_t mask;
34 0 : struct node *node_dev = to_node(dev);
35 :
36 : /* 2008/04/07: buf currently PAGE_SIZE, need 9 chars per 32 bits. */
37 0 : BUILD_BUG_ON((NR_CPUS/32 * 9) > (PAGE_SIZE-1));
38 :
39 0 : if (!alloc_cpumask_var(&mask, GFP_KERNEL))
40 : return 0;
41 :
42 0 : cpumask_and(mask, cpumask_of_node(node_dev->dev.id), cpu_online_mask);
43 0 : n = cpumap_print_to_pagebuf(list, buf, mask);
44 0 : free_cpumask_var(mask);
45 :
46 0 : return n;
47 : }
48 :
49 0 : static inline ssize_t cpumap_show(struct device *dev,
50 : struct device_attribute *attr,
51 : char *buf)
52 : {
53 0 : return node_read_cpumap(dev, false, buf);
54 : }
55 :
56 : static DEVICE_ATTR_RO(cpumap);
57 :
58 0 : static inline ssize_t cpulist_show(struct device *dev,
59 : struct device_attribute *attr,
60 : char *buf)
61 : {
62 0 : return node_read_cpumap(dev, true, buf);
63 : }
64 :
65 : static DEVICE_ATTR_RO(cpulist);
66 :
67 : /**
68 : * struct node_access_nodes - Access class device to hold user visible
69 : * relationships to other nodes.
70 : * @dev: Device for this memory access class
71 : * @list_node: List element in the node's access list
72 : * @access: The access class rank
73 : * @hmem_attrs: Heterogeneous memory performance attributes
74 : */
75 : struct node_access_nodes {
76 : struct device dev;
77 : struct list_head list_node;
78 : unsigned access;
79 : #ifdef CONFIG_HMEM_REPORTING
80 : struct node_hmem_attrs hmem_attrs;
81 : #endif
82 : };
83 : #define to_access_nodes(dev) container_of(dev, struct node_access_nodes, dev)
84 :
85 : static struct attribute *node_init_access_node_attrs[] = {
86 : NULL,
87 : };
88 :
89 : static struct attribute *node_targ_access_node_attrs[] = {
90 : NULL,
91 : };
92 :
93 : static const struct attribute_group initiators = {
94 : .name = "initiators",
95 : .attrs = node_init_access_node_attrs,
96 : };
97 :
98 : static const struct attribute_group targets = {
99 : .name = "targets",
100 : .attrs = node_targ_access_node_attrs,
101 : };
102 :
103 : static const struct attribute_group *node_access_node_groups[] = {
104 : &initiators,
105 : &targets,
106 : NULL,
107 : };
108 :
109 0 : static void node_remove_accesses(struct node *node)
110 : {
111 0 : struct node_access_nodes *c, *cnext;
112 :
113 0 : list_for_each_entry_safe(c, cnext, &node->access_list, list_node) {
114 0 : list_del(&c->list_node);
115 0 : device_unregister(&c->dev);
116 : }
117 0 : }
118 :
/* Device release callback: free the node_access_nodes embedding @dev. */
static void node_access_release(struct device *dev)
{
	struct node_access_nodes *access_node = to_access_nodes(dev);

	kfree(access_node);
}
123 :
124 0 : static struct node_access_nodes *node_init_node_access(struct node *node,
125 : unsigned access)
126 : {
127 0 : struct node_access_nodes *access_node;
128 0 : struct device *dev;
129 :
130 0 : list_for_each_entry(access_node, &node->access_list, list_node)
131 0 : if (access_node->access == access)
132 0 : return access_node;
133 :
134 0 : access_node = kzalloc(sizeof(*access_node), GFP_KERNEL);
135 0 : if (!access_node)
136 : return NULL;
137 :
138 0 : access_node->access = access;
139 0 : dev = &access_node->dev;
140 0 : dev->parent = &node->dev;
141 0 : dev->release = node_access_release;
142 0 : dev->groups = node_access_node_groups;
143 0 : if (dev_set_name(dev, "access%u", access))
144 0 : goto free;
145 :
146 0 : if (device_register(dev))
147 0 : goto free_name;
148 :
149 0 : pm_runtime_no_callbacks(dev);
150 0 : list_add_tail(&access_node->list_node, &node->access_list);
151 0 : return access_node;
152 0 : free_name:
153 0 : kfree_const(dev->kobj.name);
154 0 : free:
155 0 : kfree(access_node);
156 0 : return NULL;
157 : }
158 :
159 : #ifdef CONFIG_HMEM_REPORTING
160 : #define ACCESS_ATTR(name) \
161 : static ssize_t name##_show(struct device *dev, \
162 : struct device_attribute *attr, \
163 : char *buf) \
164 : { \
165 : return sysfs_emit(buf, "%u\n", \
166 : to_access_nodes(dev)->hmem_attrs.name); \
167 : } \
168 : static DEVICE_ATTR_RO(name)
169 :
170 : ACCESS_ATTR(read_bandwidth);
171 : ACCESS_ATTR(read_latency);
172 : ACCESS_ATTR(write_bandwidth);
173 : ACCESS_ATTR(write_latency);
174 :
175 : static struct attribute *access_attrs[] = {
176 : &dev_attr_read_bandwidth.attr,
177 : &dev_attr_read_latency.attr,
178 : &dev_attr_write_bandwidth.attr,
179 : &dev_attr_write_latency.attr,
180 : NULL,
181 : };
182 :
183 : /**
184 : * node_set_perf_attrs - Set the performance values for given access class
185 : * @nid: Node identifier to be set
186 : * @hmem_attrs: Heterogeneous memory performance attributes
187 : * @access: The access class the for the given attributes
188 : */
189 : void node_set_perf_attrs(unsigned int nid, struct node_hmem_attrs *hmem_attrs,
190 : unsigned access)
191 : {
192 : struct node_access_nodes *c;
193 : struct node *node;
194 : int i;
195 :
196 : if (WARN_ON_ONCE(!node_online(nid)))
197 : return;
198 :
199 : node = node_devices[nid];
200 : c = node_init_node_access(node, access);
201 : if (!c)
202 : return;
203 :
204 : c->hmem_attrs = *hmem_attrs;
205 : for (i = 0; access_attrs[i] != NULL; i++) {
206 : if (sysfs_add_file_to_group(&c->dev.kobj, access_attrs[i],
207 : "initiators")) {
208 : pr_info("failed to add performance attribute to node %d\n",
209 : nid);
210 : break;
211 : }
212 : }
213 : }
214 :
215 : /**
216 : * struct node_cache_info - Internal tracking for memory node caches
217 : * @dev: Device represeting the cache level
218 : * @node: List element for tracking in the node
219 : * @cache_attrs:Attributes for this cache level
220 : */
221 : struct node_cache_info {
222 : struct device dev;
223 : struct list_head node;
224 : struct node_cache_attrs cache_attrs;
225 : };
226 : #define to_cache_info(device) container_of(device, struct node_cache_info, dev)
227 :
228 : #define CACHE_ATTR(name, fmt) \
229 : static ssize_t name##_show(struct device *dev, \
230 : struct device_attribute *attr, \
231 : char *buf) \
232 : { \
233 : return sysfs_emit(buf, fmt "\n", \
234 : to_cache_info(dev)->cache_attrs.name); \
235 : } \
236 : DEVICE_ATTR_RO(name);
237 :
238 : CACHE_ATTR(size, "%llu")
239 : CACHE_ATTR(line_size, "%u")
240 : CACHE_ATTR(indexing, "%u")
241 : CACHE_ATTR(write_policy, "%u")
242 :
243 : static struct attribute *cache_attrs[] = {
244 : &dev_attr_indexing.attr,
245 : &dev_attr_size.attr,
246 : &dev_attr_line_size.attr,
247 : &dev_attr_write_policy.attr,
248 : NULL,
249 : };
250 : ATTRIBUTE_GROUPS(cache);
251 :
/*
 * Release callback of the "memory_side_cache" device, which is a bare
 * struct device allocated in node_init_cache_dev().
 */
static void node_cache_release(struct device *dev)
{
	kfree(dev);
}
256 :
/* Release callback of a cache level device: free its node_cache_info. */
static void node_cacheinfo_release(struct device *dev)
{
	kfree(to_cache_info(dev));
}
262 :
263 : static void node_init_cache_dev(struct node *node)
264 : {
265 : struct device *dev;
266 :
267 : dev = kzalloc(sizeof(*dev), GFP_KERNEL);
268 : if (!dev)
269 : return;
270 :
271 : dev->parent = &node->dev;
272 : dev->release = node_cache_release;
273 : if (dev_set_name(dev, "memory_side_cache"))
274 : goto free_dev;
275 :
276 : if (device_register(dev))
277 : goto free_name;
278 :
279 : pm_runtime_no_callbacks(dev);
280 : node->cache_dev = dev;
281 : return;
282 : free_name:
283 : kfree_const(dev->kobj.name);
284 : free_dev:
285 : kfree(dev);
286 : }
287 :
288 : /**
289 : * node_add_cache() - add cache attribute to a memory node
290 : * @nid: Node identifier that has new cache attributes
291 : * @cache_attrs: Attributes for the cache being added
292 : */
293 : void node_add_cache(unsigned int nid, struct node_cache_attrs *cache_attrs)
294 : {
295 : struct node_cache_info *info;
296 : struct device *dev;
297 : struct node *node;
298 :
299 : if (!node_online(nid) || !node_devices[nid])
300 : return;
301 :
302 : node = node_devices[nid];
303 : list_for_each_entry(info, &node->cache_attrs, node) {
304 : if (info->cache_attrs.level == cache_attrs->level) {
305 : dev_warn(&node->dev,
306 : "attempt to add duplicate cache level:%d\n",
307 : cache_attrs->level);
308 : return;
309 : }
310 : }
311 :
312 : if (!node->cache_dev)
313 : node_init_cache_dev(node);
314 : if (!node->cache_dev)
315 : return;
316 :
317 : info = kzalloc(sizeof(*info), GFP_KERNEL);
318 : if (!info)
319 : return;
320 :
321 : dev = &info->dev;
322 : dev->parent = node->cache_dev;
323 : dev->release = node_cacheinfo_release;
324 : dev->groups = cache_groups;
325 : if (dev_set_name(dev, "index%d", cache_attrs->level))
326 : goto free_cache;
327 :
328 : info->cache_attrs = *cache_attrs;
329 : if (device_register(dev)) {
330 : dev_warn(&node->dev, "failed to add cache level:%d\n",
331 : cache_attrs->level);
332 : goto free_name;
333 : }
334 : pm_runtime_no_callbacks(dev);
335 : list_add_tail(&info->node, &node->cache_attrs);
336 : return;
337 : free_name:
338 : kfree_const(dev->kobj.name);
339 : free_cache:
340 : kfree(info);
341 : }
342 :
343 : static void node_remove_caches(struct node *node)
344 : {
345 : struct node_cache_info *info, *next;
346 :
347 : if (!node->cache_dev)
348 : return;
349 :
350 : list_for_each_entry_safe(info, next, &node->cache_attrs, node) {
351 : list_del(&info->node);
352 : device_unregister(&info->dev);
353 : }
354 : device_unregister(node->cache_dev);
355 : }
356 :
357 : static void node_init_caches(unsigned int nid)
358 : {
359 : INIT_LIST_HEAD(&node_devices[nid]->cache_attrs);
360 : }
361 : #else
/* Without CONFIG_HMEM_REPORTING there is no cache state to manage. */
static void node_init_caches(unsigned int nid)
{
}

static void node_remove_caches(struct node *node)
{
}
364 : #endif
365 :
/*
 * K(x) shifts a page count left by (PAGE_SHIFT - 10), i.e. converts pages
 * to kB. It is local to node_read_meminfo() and undefined right after it.
 */
366 : #define K(x) ((x) << (PAGE_SHIFT - 10))
/*
 * sysfs show handler of the node "meminfo" attribute.
 *
 * Emits one "Node <nid> <Field>: <value> kB" line per statistic of node
 * dev->id into @buf through a sequence of sysfs_emit_at() calls, then lets
 * hugetlb_report_node_meminfo() append its part. Returns the accumulated
 * length. Which lines appear depends on CONFIG_SWAP (value only),
 * CONFIG_HIGHMEM, CONFIG_SHADOW_CALL_STACK and CONFIG_TRANSPARENT_HUGEPAGE.
 * The order of the arguments must match the order of the format lines.
 */
367 0 : static ssize_t node_read_meminfo(struct device *dev,
368 : struct device_attribute *attr, char *buf)
369 : {
370 0 : int len = 0;
371 0 : int nid = dev->id;
372 0 : struct pglist_data *pgdat = NODE_DATA(nid);
373 0 : struct sysinfo i;
374 0 : unsigned long sreclaimable, sunreclaimable;
375 0 : unsigned long swapcached = 0;
376 :
/* Snapshot the values that are reported on more than one line. */
377 0 : si_meminfo_node(&i, nid);
378 0 : sreclaimable = node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B);
379 0 : sunreclaimable = node_page_state_pages(pgdat, NR_SLAB_UNRECLAIMABLE_B);
380 : #ifdef CONFIG_SWAP
381 : swapcached = node_page_state_pages(pgdat, NR_SWAPCACHE);
382 : #endif
383 0 : len = sysfs_emit_at(buf, len,
384 : "Node %d MemTotal: %8lu kB\n"
385 : "Node %d MemFree: %8lu kB\n"
386 : "Node %d MemUsed: %8lu kB\n"
387 : "Node %d SwapCached: %8lu kB\n"
388 : "Node %d Active: %8lu kB\n"
389 : "Node %d Inactive: %8lu kB\n"
390 : "Node %d Active(anon): %8lu kB\n"
391 : "Node %d Inactive(anon): %8lu kB\n"
392 : "Node %d Active(file): %8lu kB\n"
393 : "Node %d Inactive(file): %8lu kB\n"
394 : "Node %d Unevictable: %8lu kB\n"
395 : "Node %d Mlocked: %8lu kB\n",
396 : nid, K(i.totalram),
397 : nid, K(i.freeram),
398 0 : nid, K(i.totalram - i.freeram),
399 : nid, K(swapcached),
400 0 : nid, K(node_page_state(pgdat, NR_ACTIVE_ANON) +
401 : node_page_state(pgdat, NR_ACTIVE_FILE)),
402 0 : nid, K(node_page_state(pgdat, NR_INACTIVE_ANON) +
403 : node_page_state(pgdat, NR_INACTIVE_FILE)),
404 0 : nid, K(node_page_state(pgdat, NR_ACTIVE_ANON)),
405 0 : nid, K(node_page_state(pgdat, NR_INACTIVE_ANON)),
406 0 : nid, K(node_page_state(pgdat, NR_ACTIVE_FILE)),
407 0 : nid, K(node_page_state(pgdat, NR_INACTIVE_FILE)),
408 0 : nid, K(node_page_state(pgdat, NR_UNEVICTABLE)),
409 0 : nid, K(sum_zone_node_page_state(nid, NR_MLOCK)));
410 :
411 : #ifdef CONFIG_HIGHMEM
412 : len += sysfs_emit_at(buf, len,
413 : "Node %d HighTotal: %8lu kB\n"
414 : "Node %d HighFree: %8lu kB\n"
415 : "Node %d LowTotal: %8lu kB\n"
416 : "Node %d LowFree: %8lu kB\n",
417 : nid, K(i.totalhigh),
418 : nid, K(i.freehigh),
419 : nid, K(i.totalram - i.totalhigh),
420 : nid, K(i.freeram - i.freehigh));
421 : #endif
/*
 * KernelStack and ShadowCallStack are passed without K(): their counters
 * are named *_KB. NFS_Unstable is always reported as the constant 0.
 */
422 0 : len += sysfs_emit_at(buf, len,
423 : "Node %d Dirty: %8lu kB\n"
424 : "Node %d Writeback: %8lu kB\n"
425 : "Node %d FilePages: %8lu kB\n"
426 : "Node %d Mapped: %8lu kB\n"
427 : "Node %d AnonPages: %8lu kB\n"
428 : "Node %d Shmem: %8lu kB\n"
429 : "Node %d KernelStack: %8lu kB\n"
430 : #ifdef CONFIG_SHADOW_CALL_STACK
431 : "Node %d ShadowCallStack:%8lu kB\n"
432 : #endif
433 : "Node %d PageTables: %8lu kB\n"
434 : "Node %d NFS_Unstable: %8lu kB\n"
435 : "Node %d Bounce: %8lu kB\n"
436 : "Node %d WritebackTmp: %8lu kB\n"
437 : "Node %d KReclaimable: %8lu kB\n"
438 : "Node %d Slab: %8lu kB\n"
439 : "Node %d SReclaimable: %8lu kB\n"
440 : "Node %d SUnreclaim: %8lu kB\n"
441 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
442 : "Node %d AnonHugePages: %8lu kB\n"
443 : "Node %d ShmemHugePages: %8lu kB\n"
444 : "Node %d ShmemPmdMapped: %8lu kB\n"
445 : "Node %d FileHugePages: %8lu kB\n"
446 : "Node %d FilePmdMapped: %8lu kB\n"
447 : #endif
448 : ,
449 0 : nid, K(node_page_state(pgdat, NR_FILE_DIRTY)),
450 0 : nid, K(node_page_state(pgdat, NR_WRITEBACK)),
451 0 : nid, K(node_page_state(pgdat, NR_FILE_PAGES)),
452 0 : nid, K(node_page_state(pgdat, NR_FILE_MAPPED)),
453 0 : nid, K(node_page_state(pgdat, NR_ANON_MAPPED)),
454 0 : nid, K(i.sharedram),
455 : nid, node_page_state(pgdat, NR_KERNEL_STACK_KB),
456 : #ifdef CONFIG_SHADOW_CALL_STACK
457 : nid, node_page_state(pgdat, NR_KERNEL_SCS_KB),
458 : #endif
459 0 : nid, K(node_page_state(pgdat, NR_PAGETABLE)),
460 : nid, 0UL,
461 0 : nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)),
462 0 : nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
463 0 : nid, K(sreclaimable +
464 : node_page_state(pgdat, NR_KERNEL_MISC_RECLAIMABLE)),
465 0 : nid, K(sreclaimable + sunreclaimable),
466 : nid, K(sreclaimable),
467 : nid, K(sunreclaimable)
468 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
469 : ,
470 0 : nid, K(node_page_state(pgdat, NR_ANON_THPS)),
471 0 : nid, K(node_page_state(pgdat, NR_SHMEM_THPS)),
472 0 : nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
473 0 : nid, K(node_page_state(pgdat, NR_FILE_THPS)),
474 0 : nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED))
475 : #endif
476 : );
/* The hugetlb code appends its own per-node lines at offset len. */
477 0 : len += hugetlb_report_node_meminfo(buf, len, nid);
478 0 : return len;
479 : }
480 :
481 : #undef K
482 : static DEVICE_ATTR(meminfo, 0444, node_read_meminfo, NULL);
483 :
484 0 : static ssize_t node_read_numastat(struct device *dev,
485 : struct device_attribute *attr, char *buf)
486 : {
487 0 : return sysfs_emit(buf,
488 : "numa_hit %lu\n"
489 : "numa_miss %lu\n"
490 : "numa_foreign %lu\n"
491 : "interleave_hit %lu\n"
492 : "local_node %lu\n"
493 : "other_node %lu\n",
494 0 : sum_zone_numa_state(dev->id, NUMA_HIT),
495 0 : sum_zone_numa_state(dev->id, NUMA_MISS),
496 0 : sum_zone_numa_state(dev->id, NUMA_FOREIGN),
497 0 : sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT),
498 0 : sum_zone_numa_state(dev->id, NUMA_LOCAL),
499 0 : sum_zone_numa_state(dev->id, NUMA_OTHER));
500 : }
501 : static DEVICE_ATTR(numastat, 0444, node_read_numastat, NULL);
502 :
503 0 : static ssize_t node_read_vmstat(struct device *dev,
504 : struct device_attribute *attr, char *buf)
505 : {
506 0 : int nid = dev->id;
507 0 : struct pglist_data *pgdat = NODE_DATA(nid);
508 0 : int i;
509 0 : int len = 0;
510 :
511 0 : for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
512 0 : len += sysfs_emit_at(buf, len, "%s %lu\n",
513 : zone_stat_name(i),
514 : sum_zone_node_page_state(nid, i));
515 :
516 : #ifdef CONFIG_NUMA
517 0 : for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
518 0 : len += sysfs_emit_at(buf, len, "%s %lu\n",
519 : numa_stat_name(i),
520 : sum_zone_numa_state(nid, i));
521 :
522 : #endif
523 0 : for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
524 0 : unsigned long pages = node_page_state_pages(pgdat, i);
525 :
526 0 : if (vmstat_item_print_in_thp(i))
527 0 : pages /= HPAGE_PMD_NR;
528 0 : len += sysfs_emit_at(buf, len, "%s %lu\n", node_stat_name(i),
529 : pages);
530 : }
531 :
532 0 : return len;
533 : }
534 : static DEVICE_ATTR(vmstat, 0444, node_read_vmstat, NULL);
535 :
536 0 : static ssize_t node_read_distance(struct device *dev,
537 : struct device_attribute *attr, char *buf)
538 : {
539 0 : int nid = dev->id;
540 0 : int len = 0;
541 0 : int i;
542 :
543 : /*
544 : * buf is currently PAGE_SIZE in length and each node needs 4 chars
545 : * at the most (distance + space or newline).
546 : */
547 0 : BUILD_BUG_ON(MAX_NUMNODES * 4 > PAGE_SIZE);
548 :
549 0 : for_each_online_node(i) {
550 0 : len += sysfs_emit_at(buf, len, "%s%d",
551 : i ? " " : "", node_distance(nid, i));
552 : }
553 :
554 0 : len += sysfs_emit_at(buf, len, "\n");
555 0 : return len;
556 : }
557 : static DEVICE_ATTR(distance, 0444, node_read_distance, NULL);
558 :
559 : static struct attribute *node_dev_attrs[] = {
560 : &dev_attr_cpumap.attr,
561 : &dev_attr_cpulist.attr,
562 : &dev_attr_meminfo.attr,
563 : &dev_attr_numastat.attr,
564 : &dev_attr_distance.attr,
565 : &dev_attr_vmstat.attr,
566 : NULL
567 : };
568 : ATTRIBUTE_GROUPS(node_dev);
569 :
570 : #ifdef CONFIG_HUGETLBFS
571 : /*
572 : * hugetlbfs per node attributes registration interface:
573 : * When/if hugetlb[fs] subsystem initializes [sometime after this module],
574 : * it will register its per node attributes for all online nodes with
575 : * memory. It will also call register_hugetlbfs_with_node(), below, to
576 : * register its attribute registration functions with this node driver.
577 : * Once these hooks have been initialized, the node driver will call into
578 : * the hugetlb module to [un]register attributes for hot-plugged nodes.
579 : */
580 : static node_registration_func_t __hugetlb_register_node;
581 : static node_registration_func_t __hugetlb_unregister_node;
582 :
583 : static inline bool hugetlb_register_node(struct node *node)
584 : {
585 : if (__hugetlb_register_node &&
586 : node_state(node->dev.id, N_MEMORY)) {
587 : __hugetlb_register_node(node);
588 : return true;
589 : }
590 : return false;
591 : }
592 :
593 : static inline void hugetlb_unregister_node(struct node *node)
594 : {
595 : if (__hugetlb_unregister_node)
596 : __hugetlb_unregister_node(node);
597 : }
598 :
599 : void register_hugetlbfs_with_node(node_registration_func_t doregister,
600 : node_registration_func_t unregister)
601 : {
602 : __hugetlb_register_node = doregister;
603 : __hugetlb_unregister_node = unregister;
604 : }
605 : #else
/* Without CONFIG_HUGETLBFS there are no per-node hugetlb attributes. */
static inline void hugetlb_register_node(struct node *node)
{
}

static inline void hugetlb_unregister_node(struct node *node)
{
}
609 : #endif
610 :
/* Release callback of a node device: free the struct node embedding @dev. */
static void node_device_release(struct device *dev)
{
	struct node *released = to_node(dev);

#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HUGETLBFS)
	/*
	 * The work is only scheduled when a memory section is
	 * onlined/offlined on this node. By the time we get here all
	 * memory on this node has been offlined, so no new work will be
	 * enqueued.
	 *
	 * The work uses node->node_work, so it has to be flushed before
	 * the memory is freed.
	 */
	flush_work(&released->node_work);
#endif
	kfree(released);
}
629 :
630 : /*
631 : * register_node - Setup a sysfs device for a node.
632 : * @num - Node number to use when creating the device.
633 : *
634 : * Initialize and register the node device.
635 : */
636 1 : static int register_node(struct node *node, int num)
637 : {
638 1 : int error;
639 :
640 1 : node->dev.id = num;
641 1 : node->dev.bus = &node_subsys;
642 1 : node->dev.release = node_device_release;
643 1 : node->dev.groups = node_dev_groups;
644 1 : error = device_register(&node->dev);
645 :
646 1 : if (error)
647 0 : put_device(&node->dev);
648 : else {
649 1 : hugetlb_register_node(node);
650 :
651 1 : compaction_register_node(node);
652 : }
653 1 : return error;
654 : }
655 :
656 : /**
657 : * unregister_node - unregister a node device
658 : * @node: node going away
659 : *
660 : * Unregisters a node device @node. All the devices on the node must be
661 : * unregistered before calling this function.
662 : */
663 0 : void unregister_node(struct node *node)
664 : {
665 0 : hugetlb_unregister_node(node); /* no-op, if memoryless node */
666 0 : node_remove_accesses(node);
667 0 : node_remove_caches(node);
668 0 : device_unregister(&node->dev);
669 0 : }
670 :
671 : struct node *node_devices[MAX_NUMNODES];
672 :
673 : /*
674 : * register cpu under node
675 : */
676 8 : int register_cpu_under_node(unsigned int cpu, unsigned int nid)
677 : {
678 8 : int ret;
679 8 : struct device *obj;
680 :
681 8 : if (!node_online(nid))
682 : return 0;
683 :
684 8 : obj = get_cpu_device(cpu);
685 8 : if (!obj)
686 : return 0;
687 :
688 8 : ret = sysfs_create_link(&node_devices[nid]->dev.kobj,
689 : &obj->kobj,
690 4 : kobject_name(&obj->kobj));
691 4 : if (ret)
692 : return ret;
693 :
694 4 : return sysfs_create_link(&obj->kobj,
695 : &node_devices[nid]->dev.kobj,
696 4 : kobject_name(&node_devices[nid]->dev.kobj));
697 : }
698 :
699 : /**
700 : * register_memory_node_under_compute_node - link memory node to its compute
701 : * node for a given access class.
702 : * @mem_nid: Memory node number
703 : * @cpu_nid: Cpu node number
704 : * @access: Access class to register
705 : *
706 : * Description:
707 : * For use with platforms that may have separate memory and compute nodes.
708 : * This function will export node relationships linking which memory
709 : * initiator nodes can access memory targets at a given ranked access
710 : * class.
711 : */
712 0 : int register_memory_node_under_compute_node(unsigned int mem_nid,
713 : unsigned int cpu_nid,
714 : unsigned access)
715 : {
716 0 : struct node *init_node, *targ_node;
717 0 : struct node_access_nodes *initiator, *target;
718 0 : int ret;
719 :
720 0 : if (!node_online(cpu_nid) || !node_online(mem_nid))
721 0 : return -ENODEV;
722 :
723 0 : init_node = node_devices[cpu_nid];
724 0 : targ_node = node_devices[mem_nid];
725 0 : initiator = node_init_node_access(init_node, access);
726 0 : target = node_init_node_access(targ_node, access);
727 0 : if (!initiator || !target)
728 : return -ENOMEM;
729 :
730 0 : ret = sysfs_add_link_to_group(&initiator->dev.kobj, "targets",
731 : &targ_node->dev.kobj,
732 0 : dev_name(&targ_node->dev));
733 0 : if (ret)
734 : return ret;
735 :
736 0 : ret = sysfs_add_link_to_group(&target->dev.kobj, "initiators",
737 : &init_node->dev.kobj,
738 0 : dev_name(&init_node->dev));
739 0 : if (ret)
740 0 : goto err;
741 :
742 : return 0;
743 0 : err:
744 0 : sysfs_remove_link_from_group(&initiator->dev.kobj, "targets",
745 0 : dev_name(&targ_node->dev));
746 0 : return ret;
747 : }
748 :
749 0 : int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
750 : {
751 0 : struct device *obj;
752 :
753 0 : if (!node_online(nid))
754 : return 0;
755 :
756 0 : obj = get_cpu_device(cpu);
757 0 : if (!obj)
758 : return 0;
759 :
760 0 : sysfs_remove_link(&node_devices[nid]->dev.kobj,
761 0 : kobject_name(&obj->kobj));
762 0 : sysfs_remove_link(&obj->kobj,
763 0 : kobject_name(&node_devices[nid]->dev.kobj));
764 :
765 0 : return 0;
766 : }
767 :
768 : #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
769 : static int __ref get_nid_for_pfn(unsigned long pfn)
770 : {
771 : if (!pfn_valid_within(pfn))
772 : return -1;
773 : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
774 : if (system_state < SYSTEM_RUNNING)
775 : return early_pfn_to_nid(pfn);
776 : #endif
777 : return pfn_to_nid(pfn);
778 : }
779 :
780 : static void do_register_memory_block_under_node(int nid,
781 : struct memory_block *mem_blk)
782 : {
783 : int ret;
784 :
785 : /*
786 : * If this memory block spans multiple nodes, we only indicate
787 : * the last processed node.
788 : */
789 : mem_blk->nid = nid;
790 :
791 : ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
792 : &mem_blk->dev.kobj,
793 : kobject_name(&mem_blk->dev.kobj));
794 : if (ret && ret != -EEXIST)
795 : dev_err_ratelimited(&node_devices[nid]->dev,
796 : "can't create link to %s in sysfs (%d)\n",
797 : kobject_name(&mem_blk->dev.kobj), ret);
798 :
799 : ret = sysfs_create_link_nowarn(&mem_blk->dev.kobj,
800 : &node_devices[nid]->dev.kobj,
801 : kobject_name(&node_devices[nid]->dev.kobj));
802 : if (ret && ret != -EEXIST)
803 : dev_err_ratelimited(&mem_blk->dev,
804 : "can't create link to %s in sysfs (%d)\n",
805 : kobject_name(&node_devices[nid]->dev.kobj),
806 : ret);
807 : }
808 :
809 : /* register memory section under specified node if it spans that node */
810 : static int register_mem_block_under_node_early(struct memory_block *mem_blk,
811 : void *arg)
812 : {
813 : unsigned long memory_block_pfns = memory_block_size_bytes() / PAGE_SIZE;
814 : unsigned long start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
815 : unsigned long end_pfn = start_pfn + memory_block_pfns - 1;
816 : int nid = *(int *)arg;
817 : unsigned long pfn;
818 :
819 : for (pfn = start_pfn; pfn <= end_pfn; pfn++) {
820 : int page_nid;
821 :
822 : /*
823 : * memory block could have several absent sections from start.
824 : * skip pfn range from absent section
825 : */
826 : if (!pfn_in_present_section(pfn)) {
827 : pfn = round_down(pfn + PAGES_PER_SECTION,
828 : PAGES_PER_SECTION) - 1;
829 : continue;
830 : }
831 :
832 : /*
833 : * We need to check if page belongs to nid only at the boot
834 : * case because node's ranges can be interleaved.
835 : */
836 : page_nid = get_nid_for_pfn(pfn);
837 : if (page_nid < 0)
838 : continue;
839 : if (page_nid != nid)
840 : continue;
841 :
842 : do_register_memory_block_under_node(nid, mem_blk);
843 : return 0;
844 : }
845 : /* mem section does not span the specified node */
846 : return 0;
847 : }
848 :
/*
 * During hotplug we know that all pages in the memory block belong to the
 * same node, so the block is linked without scanning its pfns.
 * walk_memory_blocks() callback; @arg points to the node id. Returns 0.
 */
static int register_mem_block_under_node_hotplug(struct memory_block *mem_blk,
						 void *arg)
{
	const int *nid = arg;

	do_register_memory_block_under_node(*nid, mem_blk);
	return 0;
}
861 :
862 : /*
863 : * Unregister a memory block device under the node it spans. Memory blocks
864 : * with multiple nodes cannot be offlined and therefore also never be removed.
865 : */
866 : void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
867 : {
868 : if (mem_blk->nid == NUMA_NO_NODE)
869 : return;
870 :
871 : sysfs_remove_link(&node_devices[mem_blk->nid]->dev.kobj,
872 : kobject_name(&mem_blk->dev.kobj));
873 : sysfs_remove_link(&mem_blk->dev.kobj,
874 : kobject_name(&node_devices[mem_blk->nid]->dev.kobj));
875 : }
876 :
877 : void link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn,
878 : enum meminit_context context)
879 : {
880 : walk_memory_blocks_func_t func;
881 :
882 : if (context == MEMINIT_HOTPLUG)
883 : func = register_mem_block_under_node_hotplug;
884 : else
885 : func = register_mem_block_under_node_early;
886 :
887 : walk_memory_blocks(PFN_PHYS(start_pfn), PFN_PHYS(end_pfn - start_pfn),
888 : (void *)&nid, func);
889 : return;
890 : }
891 :
892 : #ifdef CONFIG_HUGETLBFS
893 : /*
894 : * Handle per node hstate attribute [un]registration on transistions
895 : * to/from memoryless state.
896 : */
897 : static void node_hugetlb_work(struct work_struct *work)
898 : {
899 : struct node *node = container_of(work, struct node, node_work);
900 :
901 : /*
902 : * We only get here when a node transitions to/from memoryless state.
903 : * We can detect which transition occurred by examining whether the
904 : * node has memory now. hugetlb_register_node() already check this
905 : * so we try to register the attributes. If that fails, then the
906 : * node has transitioned to memoryless, try to unregister the
907 : * attributes.
908 : */
909 : if (!hugetlb_register_node(node))
910 : hugetlb_unregister_node(node);
911 : }
912 :
913 : static void init_node_hugetlb_work(int nid)
914 : {
915 : INIT_WORK(&node_devices[nid]->node_work, node_hugetlb_work);
916 : }
917 :
918 : static int node_memory_callback(struct notifier_block *self,
919 : unsigned long action, void *arg)
920 : {
921 : struct memory_notify *mnb = arg;
922 : int nid = mnb->status_change_nid;
923 :
924 : switch (action) {
925 : case MEM_ONLINE:
926 : case MEM_OFFLINE:
927 : /*
928 : * offload per node hstate [un]registration to a work thread
929 : * when transitioning to/from memoryless state.
930 : */
931 : if (nid != NUMA_NO_NODE)
932 : schedule_work(&node_devices[nid]->node_work);
933 : break;
934 :
935 : case MEM_GOING_ONLINE:
936 : case MEM_GOING_OFFLINE:
937 : case MEM_CANCEL_ONLINE:
938 : case MEM_CANCEL_OFFLINE:
939 : default:
940 : break;
941 : }
942 :
943 : return NOTIFY_OK;
944 : }
945 : #endif /* CONFIG_HUGETLBFS */
946 : #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
947 :
948 : #if !defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || \
949 : !defined(CONFIG_HUGETLBFS)
950 : static inline int node_memory_callback(struct notifier_block *self,
951 : unsigned long action, void *arg)
952 : {
953 : return NOTIFY_OK;
954 : }
955 :
956 1 : static void init_node_hugetlb_work(int nid) { }
957 :
958 : #endif
959 :
960 1 : int __register_one_node(int nid)
961 : {
962 1 : int error;
963 1 : int cpu;
964 :
965 1 : node_devices[nid] = kzalloc(sizeof(struct node), GFP_KERNEL);
966 1 : if (!node_devices[nid])
967 : return -ENOMEM;
968 :
969 1 : error = register_node(node_devices[nid], nid);
970 :
971 : /* link cpu under this node */
972 6 : for_each_present_cpu(cpu) {
973 4 : if (cpu_to_node(cpu) == nid)
974 4 : register_cpu_under_node(cpu, nid);
975 : }
976 :
977 1 : INIT_LIST_HEAD(&node_devices[nid]->access_list);
978 : /* initialize work queue for memory hot plug */
979 1 : init_node_hugetlb_work(nid);
980 1 : node_init_caches(nid);
981 :
982 1 : return error;
983 : }
984 :
985 0 : void unregister_one_node(int nid)
986 : {
987 0 : if (!node_devices[nid])
988 : return;
989 :
990 0 : unregister_node(node_devices[nid]);
991 0 : node_devices[nid] = NULL;
992 : }
993 :
994 : /*
995 : * node states attributes
996 : */
997 :
998 : struct node_attr {
999 : struct device_attribute attr;
1000 : enum node_states state;
1001 : };
1002 :
1003 0 : static ssize_t show_node_state(struct device *dev,
1004 : struct device_attribute *attr, char *buf)
1005 : {
1006 0 : struct node_attr *na = container_of(attr, struct node_attr, attr);
1007 :
1008 0 : return sysfs_emit(buf, "%*pbl\n",
1009 0 : nodemask_pr_args(&node_states[na->state]));
1010 : }
1011 :
1012 : #define _NODE_ATTR(name, state) \
1013 : { __ATTR(name, 0444, show_node_state, NULL), state }
1014 :
1015 : static struct node_attr node_state_attr[] = {
1016 : [N_POSSIBLE] = _NODE_ATTR(possible, N_POSSIBLE),
1017 : [N_ONLINE] = _NODE_ATTR(online, N_ONLINE),
1018 : [N_NORMAL_MEMORY] = _NODE_ATTR(has_normal_memory, N_NORMAL_MEMORY),
1019 : #ifdef CONFIG_HIGHMEM
1020 : [N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY),
1021 : #endif
1022 : [N_MEMORY] = _NODE_ATTR(has_memory, N_MEMORY),
1023 : [N_CPU] = _NODE_ATTR(has_cpu, N_CPU),
1024 : [N_GENERIC_INITIATOR] = _NODE_ATTR(has_generic_initiator,
1025 : N_GENERIC_INITIATOR),
1026 : };
1027 :
1028 : static struct attribute *node_state_attrs[] = {
1029 : &node_state_attr[N_POSSIBLE].attr.attr,
1030 : &node_state_attr[N_ONLINE].attr.attr,
1031 : &node_state_attr[N_NORMAL_MEMORY].attr.attr,
1032 : #ifdef CONFIG_HIGHMEM
1033 : &node_state_attr[N_HIGH_MEMORY].attr.attr,
1034 : #endif
1035 : &node_state_attr[N_MEMORY].attr.attr,
1036 : &node_state_attr[N_CPU].attr.attr,
1037 : &node_state_attr[N_GENERIC_INITIATOR].attr.attr,
1038 : NULL
1039 : };
1040 :
1041 : static struct attribute_group memory_root_attr_group = {
1042 : .attrs = node_state_attrs,
1043 : };
1044 :
1045 : static const struct attribute_group *cpu_root_attr_groups[] = {
1046 : &memory_root_attr_group,
1047 : NULL,
1048 : };
1049 :
1050 : #define NODE_CALLBACK_PRI 2 /* lower than SLAB */
1051 1 : static int __init register_node_type(void)
1052 : {
1053 1 : int ret;
1054 :
1055 1 : BUILD_BUG_ON(ARRAY_SIZE(node_state_attr) != NR_NODE_STATES);
1056 1 : BUILD_BUG_ON(ARRAY_SIZE(node_state_attrs)-1 != NR_NODE_STATES);
1057 :
1058 1 : ret = subsys_system_register(&node_subsys, cpu_root_attr_groups);
1059 1 : if (!ret) {
1060 : static struct notifier_block node_memory_callback_nb = {
1061 : .notifier_call = node_memory_callback,
1062 : .priority = NODE_CALLBACK_PRI,
1063 : };
1064 : register_hotmemory_notifier(&node_memory_callback_nb);
1065 : }
1066 :
1067 : /*
1068 : * Note: we're not going to unregister the node class if we fail
1069 : * to register the node state class attribute files.
1070 : */
1071 1 : return ret;
1072 : }
1073 : postcore_initcall(register_node_type);
|