Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * Simple NUMA memory policy for the Linux kernel.
4 : *
5 : * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6 : * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7 : *
8 : * NUMA policy allows the user to give hints in which node(s) memory should
9 : * be allocated.
10 : *
11 : * Support four policies per VMA and per process:
12 : *
13 : * The VMA policy has priority over the process policy for a page fault.
14 : *
15 : * interleave Allocate memory interleaved over a set of nodes,
16 : * with normal fallback if it fails.
17 : * For VMA based allocations this interleaves based on the
18 : * offset into the backing object or offset into the mapping
19 : * for anonymous memory. For process policy a process counter
20 : * is used.
21 : *
22 : * bind Only allocate memory on a specific set of nodes,
23 : * no fallback.
24 : * FIXME: memory is allocated starting with the first node
25 : * to the last. It would be better if bind would truly restrict
26 : * the allocation to memory nodes instead
27 : *
28 : * preferred Try a specific node first before normal fallback.
29 : * As a special case NUMA_NO_NODE here means do the allocation
30 : * on the local CPU. This is normally identical to default,
31 : * but useful to set in a VMA when you have a non default
32 : * process policy.
33 : *
34 : * default Allocate on the local node first, or when on a VMA
35 : * use the process policy. This is what Linux always did
36 : * in a NUMA aware kernel and still does by, ahem, default.
37 : *
38 : * The process policy is applied for most non interrupt memory allocations
39 : * in that process' context. Interrupts ignore the policies and always
40 : * try to allocate on the local CPU. The VMA policy is only applied for memory
41 : * allocations for a VMA in the VM.
42 : *
43 : * Currently there are a few corner cases in swapping where the policy
44 : * is not applied, but the majority should be handled. When process policy
45 : * is used it is not remembered over swap outs/swap ins.
46 : *
47 : * Only the highest zone in the zone hierarchy gets policied. Allocations
48 : * requesting a lower zone just use the default policy. This implies that
49 : * on systems with highmem, kernel lowmem allocations don't get policied.
50 : * Same with GFP_DMA allocations.
51 : *
52 : * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 : * all users and remembered even when nobody has memory mapped.
54 : */
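
As an illustration of the policies described above: a minimal user-space sketch (assuming libnuma's <numaif.h> wrappers, linked with -lnuma, and a machine where nodes 0 and 1 both have memory) that interleaves all future allocations of the calling task.

	#include <numaif.h>	/* set_mempolicy(), MPOL_INTERLEAVE; link with -lnuma */
	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		/* Bit n of the mask selects node n; assumes nodes 0 and 1 have memory. */
		unsigned long nodemask = (1UL << 0) | (1UL << 1);

		/* The last argument is the number of bits the kernel reads from the mask. */
		if (set_mempolicy(MPOL_INTERLEAVE, &nodemask, 8 * sizeof(nodemask)) != 0) {
			perror("set_mempolicy");
			return 1;
		}

		/* New anonymous pages of this task now interleave across nodes 0 and 1. */
		size_t len = 64UL << 20;
		char *buf = malloc(len);
		if (buf) {
			for (size_t i = 0; i < len; i += 4096)
				buf[i] = 0;	/* touch each page so it is actually allocated */
			free(buf);
		}
		return 0;
	}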
55 :
56 : /* Notebook:
57 : fix mmap readahead to honour policy and enable policy for any page cache
58 : object
59 : statistics for bigpages
60 : global policy for page cache? currently it uses process policy. Requires
61 : first item above.
62 : handle mremap for shared memory (currently ignored for the policy)
63 : grows down?
64 : make bind policy root only? It can trigger oom much faster and the
65 : kernel is not always grateful with that.
66 : */
67 :
68 : #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69 :
70 : #include <linux/mempolicy.h>
71 : #include <linux/pagewalk.h>
72 : #include <linux/highmem.h>
73 : #include <linux/hugetlb.h>
74 : #include <linux/kernel.h>
75 : #include <linux/sched.h>
76 : #include <linux/sched/mm.h>
77 : #include <linux/sched/numa_balancing.h>
78 : #include <linux/sched/task.h>
79 : #include <linux/nodemask.h>
80 : #include <linux/cpuset.h>
81 : #include <linux/slab.h>
82 : #include <linux/string.h>
83 : #include <linux/export.h>
84 : #include <linux/nsproxy.h>
85 : #include <linux/interrupt.h>
86 : #include <linux/init.h>
87 : #include <linux/compat.h>
88 : #include <linux/ptrace.h>
89 : #include <linux/swap.h>
90 : #include <linux/seq_file.h>
91 : #include <linux/proc_fs.h>
92 : #include <linux/migrate.h>
93 : #include <linux/ksm.h>
94 : #include <linux/rmap.h>
95 : #include <linux/security.h>
96 : #include <linux/syscalls.h>
97 : #include <linux/ctype.h>
98 : #include <linux/mm_inline.h>
99 : #include <linux/mmu_notifier.h>
100 : #include <linux/printk.h>
101 : #include <linux/swapops.h>
102 :
103 : #include <asm/tlbflush.h>
104 : #include <linux/uaccess.h>
105 :
106 : #include "internal.h"
107 :
108 : /* Internal flags */
109 : #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
110 : #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
111 :
112 : static struct kmem_cache *policy_cache;
113 : static struct kmem_cache *sn_cache;
114 :
115 : /* Highest zone. A specific allocation for a zone below that is not
116 : policied. */
117 : enum zone_type policy_zone = 0;
118 :
119 : /*
120 : * run-time system-wide default policy => local allocation
121 : */
122 : static struct mempolicy default_policy = {
123 : .refcnt = ATOMIC_INIT(1), /* never free it */
124 : .mode = MPOL_PREFERRED,
125 : .flags = MPOL_F_LOCAL,
126 : };
127 :
128 : static struct mempolicy preferred_node_policy[MAX_NUMNODES];
129 :
130 : /**
131 : * numa_map_to_online_node - Find closest online node
132 : * @node: Node id to start the search
133 : *
134 : * Look up the next closest node by distance if @node is not online.
135 : */
136 0 : int numa_map_to_online_node(int node)
137 : {
138 0 : int min_dist = INT_MAX, dist, n, min_node;
139 :
140 0 : if (node == NUMA_NO_NODE || node_online(node))
141 0 : return node;
142 :
143 0 : min_node = node;
144 0 : for_each_online_node(n) {
145 0 : dist = node_distance(node, n);
146 0 : if (dist < min_dist) {
147 0 : min_dist = dist;
148 0 : min_node = n;
149 : }
150 : }
151 :
152 : return min_node;
153 : }
154 : EXPORT_SYMBOL_GPL(numa_map_to_online_node);
155 :
156 192491 : struct mempolicy *get_task_policy(struct task_struct *p)
157 : {
158 192491 : struct mempolicy *pol = p->mempolicy;
159 192491 : int node;
160 :
161 192491 : if (pol)
162 : return pol;
163 :
164 188781 : node = numa_node_id();
165 188781 : if (node != NUMA_NO_NODE) {
166 188781 : pol = &preferred_node_policy[node];
167 : /* preferred_node_policy is not initialised early in boot */
168 188781 : if (pol->mode)
169 188499 : return pol;
170 : }
171 :
172 : return &default_policy;
173 : }
174 :
175 : static const struct mempolicy_operations {
176 : int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
177 : void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
178 : } mpol_ops[MPOL_MAX];
179 :
180 1 : static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
181 : {
182 1 : return pol->flags & MPOL_MODE_FLAGS;
183 : }
184 :
185 0 : static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
186 : const nodemask_t *rel)
187 : {
188 0 : nodemask_t tmp;
189 0 : nodes_fold(tmp, *orig, nodes_weight(*rel));
190 0 : nodes_onto(*ret, tmp, *rel);
191 0 : }
192 :
193 1 : static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
194 : {
195 1 : if (nodes_empty(*nodes))
196 : return -EINVAL;
197 1 : pol->v.nodes = *nodes;
198 1 : return 0;
199 : }
200 :
201 0 : static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
202 : {
203 0 : if (!nodes)
204 0 : pol->flags |= MPOL_F_LOCAL; /* local allocation */
205 0 : else if (nodes_empty(*nodes))
206 : return -EINVAL; /* no allowed nodes */
207 : else
208 0 : pol->v.preferred_node = first_node(*nodes);
209 : return 0;
210 : }
211 :
212 0 : static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
213 : {
214 0 : if (nodes_empty(*nodes))
215 : return -EINVAL;
216 0 : pol->v.nodes = *nodes;
217 0 : return 0;
218 : }
219 :
220 : /*
221 : * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
222 : * any, for the new policy. mpol_new() has already validated the nodes
223 : * parameter with respect to the policy mode and flags. But, we need to
224 : * handle an empty nodemask with MPOL_PREFERRED here.
225 : *
226 : * Must be called holding task's alloc_lock to protect task's mems_allowed
227 : * and mempolicy. May also be called holding the mmap_lock for write.
228 : */
229 3 : static int mpol_set_nodemask(struct mempolicy *pol,
230 : const nodemask_t *nodes, struct nodemask_scratch *nsc)
231 : {
232 3 : int ret;
233 :
234 : /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
235 3 : if (pol == NULL)
236 : return 0;
237 : /* Check N_MEMORY */
238 1 : nodes_and(nsc->mask1,
239 : cpuset_current_mems_allowed, node_states[N_MEMORY]);
240 :
241 1 : VM_BUG_ON(!nodes);
242 1 : if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
243 : nodes = NULL; /* explicit local allocation */
244 : else {
245 1 : if (pol->flags & MPOL_F_RELATIVE_NODES)
246 0 : mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
247 : else
248 1 : nodes_and(nsc->mask2, *nodes, nsc->mask1);
249 :
250 1 : if (mpol_store_user_nodemask(pol))
251 0 : pol->w.user_nodemask = *nodes;
252 : else
253 1 : pol->w.cpuset_mems_allowed =
254 : cpuset_current_mems_allowed;
255 : }
256 :
257 1 : if (nodes)
258 1 : ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
259 : else
260 0 : ret = mpol_ops[pol->mode].create(pol, NULL);
261 : return ret;
262 : }
263 :
264 : /*
265 : * This function just creates a new policy, does some checks and simple
266 : * initialization. You must invoke mpol_set_nodemask() to set nodes.
267 : */
268 3 : static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
269 : nodemask_t *nodes)
270 : {
271 3 : struct mempolicy *policy;
272 :
273 3 : pr_debug("setting mode %d flags %d nodes[0] %lx\n",
274 : mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
275 :
276 3 : if (mode == MPOL_DEFAULT) {
277 2 : if (nodes && !nodes_empty(*nodes))
278 3 : return ERR_PTR(-EINVAL);
279 2 : return NULL;
280 : }
281 1 : VM_BUG_ON(!nodes);
282 :
283 : /*
284 : * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
285 : * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
286 : * All other modes require a valid pointer to a non-empty nodemask.
287 : */
288 1 : if (mode == MPOL_PREFERRED) {
289 0 : if (nodes_empty(*nodes)) {
290 0 : if (((flags & MPOL_F_STATIC_NODES) ||
291 : (flags & MPOL_F_RELATIVE_NODES)))
292 3 : return ERR_PTR(-EINVAL);
293 : }
294 1 : } else if (mode == MPOL_LOCAL) {
295 0 : if (!nodes_empty(*nodes) ||
296 0 : (flags & MPOL_F_STATIC_NODES) ||
297 : (flags & MPOL_F_RELATIVE_NODES))
298 3 : return ERR_PTR(-EINVAL);
299 : mode = MPOL_PREFERRED;
300 1 : } else if (nodes_empty(*nodes))
301 3 : return ERR_PTR(-EINVAL);
302 1 : policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
303 1 : if (!policy)
304 3 : return ERR_PTR(-ENOMEM);
305 1 : atomic_set(&policy->refcnt, 1);
306 1 : policy->mode = mode;
307 1 : policy->flags = flags;
308 :
309 1 : return policy;
310 : }
311 :
312 : /* Slow path of a mpol destructor. */
313 2 : void __mpol_put(struct mempolicy *p)
314 : {
315 4 : if (!atomic_dec_and_test(&p->refcnt))
316 : return;
317 2 : kmem_cache_free(policy_cache, p);
318 : }
319 :
320 0 : static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
321 : {
322 0 : }
323 :
324 0 : static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
325 : {
326 0 : nodemask_t tmp;
327 :
328 0 : if (pol->flags & MPOL_F_STATIC_NODES)
329 0 : nodes_and(tmp, pol->w.user_nodemask, *nodes);
330 0 : else if (pol->flags & MPOL_F_RELATIVE_NODES)
331 0 : mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
332 : else {
333 0 : nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
334 : *nodes);
335 0 : pol->w.cpuset_mems_allowed = *nodes;
336 : }
337 :
338 0 : if (nodes_empty(tmp))
339 0 : tmp = *nodes;
340 :
341 0 : pol->v.nodes = tmp;
342 0 : }
343 :
344 0 : static void mpol_rebind_preferred(struct mempolicy *pol,
345 : const nodemask_t *nodes)
346 : {
347 0 : nodemask_t tmp;
348 :
349 0 : if (pol->flags & MPOL_F_STATIC_NODES) {
350 0 : int node = first_node(pol->w.user_nodemask);
351 :
352 0 : if (node_isset(node, *nodes)) {
353 0 : pol->v.preferred_node = node;
354 0 : pol->flags &= ~MPOL_F_LOCAL;
355 : } else
356 0 : pol->flags |= MPOL_F_LOCAL;
357 0 : } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
358 0 : mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
359 0 : pol->v.preferred_node = first_node(tmp);
360 0 : } else if (!(pol->flags & MPOL_F_LOCAL)) {
361 0 : pol->v.preferred_node = node_remap(pol->v.preferred_node,
362 : pol->w.cpuset_mems_allowed,
363 : *nodes);
364 0 : pol->w.cpuset_mems_allowed = *nodes;
365 : }
366 0 : }
367 :
368 : /*
369 : * mpol_rebind_policy - Migrate a policy to a different set of nodes
370 : *
371 : * Per-vma policies are protected by mmap_lock. Allocations using per-task
372 : * policies are protected by task->mems_allowed_seq to prevent a premature
373 : * OOM/allocation failure due to parallel nodemask modification.
374 : */
375 0 : static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
376 : {
377 0 : if (!pol)
378 : return;
379 0 : if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
380 0 : nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
381 : return;
382 :
383 0 : mpol_ops[pol->mode].rebind(pol, newmask);
384 : }
385 :
386 : /*
387 : * Wrapper for mpol_rebind_policy() that just requires task
388 : * pointer, and updates task mempolicy.
389 : *
390 : * Called with task's alloc_lock held.
391 : */
392 :
393 0 : void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
394 : {
395 0 : mpol_rebind_policy(tsk->mempolicy, new);
396 0 : }
397 :
398 : /*
399 : * Rebind each vma in mm to new nodemask.
400 : *
401 : * Call holding a reference to mm. Takes mm->mmap_lock during call.
402 : */
403 :
404 0 : void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
405 : {
406 0 : struct vm_area_struct *vma;
407 :
408 0 : mmap_write_lock(mm);
409 0 : for (vma = mm->mmap; vma; vma = vma->vm_next)
410 0 : mpol_rebind_policy(vma->vm_policy, new);
411 0 : mmap_write_unlock(mm);
412 0 : }
413 :
414 : static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
415 : [MPOL_DEFAULT] = {
416 : .rebind = mpol_rebind_default,
417 : },
418 : [MPOL_INTERLEAVE] = {
419 : .create = mpol_new_interleave,
420 : .rebind = mpol_rebind_nodemask,
421 : },
422 : [MPOL_PREFERRED] = {
423 : .create = mpol_new_preferred,
424 : .rebind = mpol_rebind_preferred,
425 : },
426 : [MPOL_BIND] = {
427 : .create = mpol_new_bind,
428 : .rebind = mpol_rebind_nodemask,
429 : },
430 : };
431 :
432 : static int migrate_page_add(struct page *page, struct list_head *pagelist,
433 : unsigned long flags);
434 :
435 : struct queue_pages {
436 : struct list_head *pagelist;
437 : unsigned long flags;
438 : nodemask_t *nmask;
439 : unsigned long start;
440 : unsigned long end;
441 : struct vm_area_struct *first;
442 : };
443 :
444 : /*
445 : * Check if the page's nid is in qp->nmask.
446 : *
447 : * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
448 : * in the complement of qp->nmask instead.
449 : */
450 0 : static inline bool queue_pages_required(struct page *page,
451 : struct queue_pages *qp)
452 : {
453 0 : int nid = page_to_nid(page);
454 0 : unsigned long flags = qp->flags;
455 :
456 0 : return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
457 : }
458 :
459 : /*
460 : * queue_pages_pmd() has four possible return values:
461 : * 0 - pages are placed on the right node or queued successfully.
462 : * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
463 : * specified.
464 : * 2 - THP was split.
465 : * -EIO - the PMD is a migration entry, or only MPOL_MF_STRICT was specified
466 : * and an existing page was already on a node that does not follow the
467 : * policy.
468 : */
469 0 : static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
470 : unsigned long end, struct mm_walk *walk)
471 : __releases(ptl)
472 : {
473 0 : int ret = 0;
474 0 : struct page *page;
475 0 : struct queue_pages *qp = walk->private;
476 0 : unsigned long flags;
477 :
478 0 : if (unlikely(is_pmd_migration_entry(*pmd))) {
479 0 : ret = -EIO;
480 0 : goto unlock;
481 : }
482 0 : page = pmd_page(*pmd);
483 0 : if (is_huge_zero_page(page)) {
484 0 : spin_unlock(ptl);
485 0 : __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
486 0 : ret = 2;
487 0 : goto out;
488 : }
489 0 : if (!queue_pages_required(page, qp))
490 0 : goto unlock;
491 :
492 0 : flags = qp->flags;
493 : /* go to thp migration */
494 0 : if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
495 0 : if (!vma_migratable(walk->vma) ||
496 0 : migrate_page_add(page, qp->pagelist, flags)) {
497 0 : ret = 1;
498 0 : goto unlock;
499 : }
500 : } else
501 : ret = -EIO;
502 0 : unlock:
503 0 : spin_unlock(ptl);
504 0 : out:
505 0 : return ret;
506 : }
507 :
508 : /*
509 : * Scan through pages checking if pages follow certain conditions,
510 : * and move them to the pagelist if they do.
511 : *
512 : * queue_pages_pte_range() has three possible return values:
513 : * 0 - pages are placed on the right node or queued successfully.
514 : * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
515 : * specified.
516 : * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
517 : * on a node that does not follow the policy.
518 : */
519 0 : static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
520 : unsigned long end, struct mm_walk *walk)
521 : {
522 0 : struct vm_area_struct *vma = walk->vma;
523 0 : struct page *page;
524 0 : struct queue_pages *qp = walk->private;
525 0 : unsigned long flags = qp->flags;
526 0 : int ret;
527 0 : bool has_unmovable = false;
528 0 : pte_t *pte, *mapped_pte;
529 0 : spinlock_t *ptl;
530 :
531 0 : ptl = pmd_trans_huge_lock(pmd, vma);
532 0 : if (ptl) {
533 0 : ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
534 0 : if (ret != 2)
535 : return ret;
536 : }
537 : /* THP was split, fall through to pte walk */
538 :
539 0 : if (pmd_trans_unstable(pmd))
540 : return 0;
541 :
542 0 : mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
543 0 : for (; addr != end; pte++, addr += PAGE_SIZE) {
544 0 : if (!pte_present(*pte))
545 0 : continue;
546 0 : page = vm_normal_page(vma, addr, *pte);
547 0 : if (!page)
548 0 : continue;
549 : /*
550 : * vm_normal_page() filters out zero pages, but there might
551 : * still be PageReserved pages to skip, perhaps in a VDSO.
552 : */
553 0 : if (PageReserved(page))
554 0 : continue;
555 0 : if (!queue_pages_required(page, qp))
556 0 : continue;
557 0 : if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
558 : /* MPOL_MF_STRICT must be specified if we get here */
559 0 : if (!vma_migratable(vma)) {
560 : has_unmovable = true;
561 : break;
562 : }
563 :
564 : /*
565 : * Do not abort immediately since there may be
566 : * temporary off-LRU pages in the range. Still
567 : * need to migrate other LRU pages.
568 : */
569 0 : if (migrate_page_add(page, qp->pagelist, flags))
570 0 : has_unmovable = true;
571 : } else
572 : break;
573 : }
574 0 : pte_unmap_unlock(mapped_pte, ptl);
575 0 : cond_resched();
576 :
577 0 : if (has_unmovable)
578 : return 1;
579 :
580 0 : return addr != end ? -EIO : 0;
581 : }
582 :
583 0 : static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
584 : unsigned long addr, unsigned long end,
585 : struct mm_walk *walk)
586 : {
587 0 : int ret = 0;
588 : #ifdef CONFIG_HUGETLB_PAGE
589 : struct queue_pages *qp = walk->private;
590 : unsigned long flags = (qp->flags & MPOL_MF_VALID);
591 : struct page *page;
592 : spinlock_t *ptl;
593 : pte_t entry;
594 :
595 : ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
596 : entry = huge_ptep_get(pte);
597 : if (!pte_present(entry))
598 : goto unlock;
599 : page = pte_page(entry);
600 : if (!queue_pages_required(page, qp))
601 : goto unlock;
602 :
603 : if (flags == MPOL_MF_STRICT) {
604 : /*
605 : * STRICT alone means only detecting misplaced pages and no
606 : * need to check other vmas further.
607 : */
608 : ret = -EIO;
609 : goto unlock;
610 : }
611 :
612 : if (!vma_migratable(walk->vma)) {
613 : /*
614 : * Must be STRICT with MOVE*, otherwise .test_walk() would have
615 : * stopped walking the current vma.
616 : * Detect the misplaced page, but allow migrating pages which
617 : * have already been queued.
618 : */
619 : ret = 1;
620 : goto unlock;
621 : }
622 :
623 : /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
624 : if (flags & (MPOL_MF_MOVE_ALL) ||
625 : (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
626 : if (!isolate_huge_page(page, qp->pagelist) &&
627 : (flags & MPOL_MF_STRICT))
628 : /*
629 : * Failed to isolate page but allow migrating pages
630 : * which have been queued.
631 : */
632 : ret = 1;
633 : }
634 : unlock:
635 : spin_unlock(ptl);
636 : #else
637 0 : BUG();
638 : #endif
639 : return ret;
640 : }
641 :
642 : #ifdef CONFIG_NUMA_BALANCING
643 : /*
644 : * This is used to mark a range of virtual addresses to be inaccessible.
645 : * These are later cleared by a NUMA hinting fault. Depending on these
646 : * faults, pages may be migrated for better NUMA placement.
647 : *
648 : * This is assuming that NUMA faults are handled using PROT_NONE. If
649 : * an architecture makes a different choice, it will need further
650 : * changes to the core.
651 : */
652 : unsigned long change_prot_numa(struct vm_area_struct *vma,
653 : unsigned long addr, unsigned long end)
654 : {
655 : int nr_updated;
656 :
657 : nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
658 : if (nr_updated)
659 : count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
660 :
661 : return nr_updated;
662 : }
663 : #else
664 : static unsigned long change_prot_numa(struct vm_area_struct *vma,
665 : unsigned long addr, unsigned long end)
666 : {
667 : return 0;
668 : }
669 : #endif /* CONFIG_NUMA_BALANCING */
670 :
671 0 : static int queue_pages_test_walk(unsigned long start, unsigned long end,
672 : struct mm_walk *walk)
673 : {
674 0 : struct vm_area_struct *vma = walk->vma;
675 0 : struct queue_pages *qp = walk->private;
676 0 : unsigned long endvma = vma->vm_end;
677 0 : unsigned long flags = qp->flags;
678 :
679 : /* range check first */
680 0 : VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
681 :
682 0 : if (!qp->first) {
683 0 : qp->first = vma;
684 0 : if (!(flags & MPOL_MF_DISCONTIG_OK) &&
685 0 : (qp->start < vma->vm_start))
686 : /* hole at head side of range */
687 : return -EFAULT;
688 : }
689 0 : if (!(flags & MPOL_MF_DISCONTIG_OK) &&
690 0 : ((vma->vm_end < qp->end) &&
691 0 : (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
692 : /* hole at middle or tail of range */
693 : return -EFAULT;
694 :
695 : /*
696 : * Need to check MPOL_MF_STRICT to return -EIO if possible,
697 : * regardless of vma_migratable().
698 : */
699 0 : if (!vma_migratable(vma) &&
700 0 : !(flags & MPOL_MF_STRICT))
701 : return 1;
702 :
703 0 : if (endvma > end)
704 : endvma = end;
705 :
706 0 : if (flags & MPOL_MF_LAZY) {
707 : /* Similar to task_numa_work, skip inaccessible VMAs */
708 0 : if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
709 : !(vma->vm_flags & VM_MIXEDMAP))
710 0 : change_prot_numa(vma, start, endvma);
711 : return 1;
712 : }
713 :
714 : /* queue pages from current vma */
715 0 : if (flags & MPOL_MF_VALID)
716 0 : return 0;
717 : return 1;
718 : }
719 :
720 : static const struct mm_walk_ops queue_pages_walk_ops = {
721 : .hugetlb_entry = queue_pages_hugetlb,
722 : .pmd_entry = queue_pages_pte_range,
723 : .test_walk = queue_pages_test_walk,
724 : };
725 :
726 : /*
727 : * Walk through page tables and collect pages to be migrated.
728 : *
729 : * If pages found in a given range are on a set of nodes (determined by
730 : * @nodes and @flags), they are isolated and queued to the pagelist which
731 : * is passed via @private.
732 : *
733 : * queue_pages_range() has three possible return values:
734 : * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
735 : * specified.
736 : * 0 - pages queued successfully or no misplaced page.
737 : * errno - e.g. misplaced pages with MPOL_MF_STRICT specified (-EIO), or
738 : * the memory range specified by nodemask and maxnode points outside
739 : * the accessible address space (-EFAULT)
740 : */
741 : static int
742 0 : queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
743 : nodemask_t *nodes, unsigned long flags,
744 : struct list_head *pagelist)
745 : {
746 0 : int err;
747 0 : struct queue_pages qp = {
748 : .pagelist = pagelist,
749 : .flags = flags,
750 : .nmask = nodes,
751 : .start = start,
752 : .end = end,
753 : .first = NULL,
754 : };
755 :
756 0 : err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
757 :
758 0 : if (!qp.first)
759 : /* whole range in hole */
760 0 : err = -EFAULT;
761 :
762 0 : return err;
763 : }
764 :
765 : /*
766 : * Apply policy to a single VMA
767 : * This must be called with the mmap_lock held for writing.
768 : */
769 0 : static int vma_replace_policy(struct vm_area_struct *vma,
770 : struct mempolicy *pol)
771 : {
772 0 : int err;
773 0 : struct mempolicy *old;
774 0 : struct mempolicy *new;
775 :
776 0 : pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
777 : vma->vm_start, vma->vm_end, vma->vm_pgoff,
778 : vma->vm_ops, vma->vm_file,
779 : vma->vm_ops ? vma->vm_ops->set_policy : NULL);
780 :
781 0 : new = mpol_dup(pol);
782 0 : if (IS_ERR(new))
783 0 : return PTR_ERR(new);
784 :
785 0 : if (vma->vm_ops && vma->vm_ops->set_policy) {
786 0 : err = vma->vm_ops->set_policy(vma, new);
787 0 : if (err)
788 0 : goto err_out;
789 : }
790 :
791 0 : old = vma->vm_policy;
792 0 : vma->vm_policy = new; /* protected by mmap_lock */
793 0 : mpol_put(old);
794 :
795 : return 0;
796 0 : err_out:
797 0 : mpol_put(new);
798 : return err;
799 : }
800 :
801 : /* Step 2: apply policy to a range and do splits. */
802 0 : static int mbind_range(struct mm_struct *mm, unsigned long start,
803 : unsigned long end, struct mempolicy *new_pol)
804 : {
805 0 : struct vm_area_struct *next;
806 0 : struct vm_area_struct *prev;
807 0 : struct vm_area_struct *vma;
808 0 : int err = 0;
809 0 : pgoff_t pgoff;
810 0 : unsigned long vmstart;
811 0 : unsigned long vmend;
812 :
813 0 : vma = find_vma(mm, start);
814 0 : VM_BUG_ON(!vma);
815 :
816 0 : prev = vma->vm_prev;
817 0 : if (start > vma->vm_start)
818 0 : prev = vma;
819 :
820 0 : for (; vma && vma->vm_start < end; prev = vma, vma = next) {
821 0 : next = vma->vm_next;
822 0 : vmstart = max(start, vma->vm_start);
823 0 : vmend = min(end, vma->vm_end);
824 :
825 0 : if (mpol_equal(vma_policy(vma), new_pol))
826 0 : continue;
827 :
828 0 : pgoff = vma->vm_pgoff +
829 0 : ((vmstart - vma->vm_start) >> PAGE_SHIFT);
830 0 : prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
831 : vma->anon_vma, vma->vm_file, pgoff,
832 : new_pol, vma->vm_userfaultfd_ctx);
833 0 : if (prev) {
834 0 : vma = prev;
835 0 : next = vma->vm_next;
836 0 : if (mpol_equal(vma_policy(vma), new_pol))
837 0 : continue;
838 : /* vma_merge() joined vma && vma->next, case 8 */
839 0 : goto replace;
840 : }
841 0 : if (vma->vm_start != vmstart) {
842 0 : err = split_vma(vma->vm_mm, vma, vmstart, 1);
843 0 : if (err)
844 0 : goto out;
845 : }
846 0 : if (vma->vm_end != vmend) {
847 0 : err = split_vma(vma->vm_mm, vma, vmend, 0);
848 0 : if (err)
849 0 : goto out;
850 : }
851 0 : replace:
852 0 : err = vma_replace_policy(vma, new_pol);
853 0 : if (err)
854 0 : goto out;
855 : }
856 :
857 0 : out:
858 0 : return err;
859 : }
860 :
861 : /* Set the process memory policy */
862 3 : static long do_set_mempolicy(unsigned short mode, unsigned short flags,
863 : nodemask_t *nodes)
864 : {
865 3 : struct mempolicy *new, *old;
866 3 : NODEMASK_SCRATCH(scratch);
867 3 : int ret;
868 :
869 3 : if (!scratch)
870 : return -ENOMEM;
871 :
872 3 : new = mpol_new(mode, flags, nodes);
873 3 : if (IS_ERR(new)) {
874 0 : ret = PTR_ERR(new);
875 0 : goto out;
876 : }
877 :
878 3 : if (flags & MPOL_F_NUMA_BALANCING) {
879 0 : if (new && new->mode == MPOL_BIND) {
880 0 : new->flags |= (MPOL_F_MOF | MPOL_F_MORON);
881 : } else {
882 0 : ret = -EINVAL;
883 0 : mpol_put(new);
884 0 : goto out;
885 : }
886 : }
887 :
888 3 : ret = mpol_set_nodemask(new, nodes, scratch);
889 3 : if (ret) {
890 0 : mpol_put(new);
891 0 : goto out;
892 : }
893 3 : task_lock(current);
894 3 : old = current->mempolicy;
895 3 : current->mempolicy = new;
896 3 : if (new && new->mode == MPOL_INTERLEAVE)
897 1 : current->il_prev = MAX_NUMNODES-1;
898 3 : task_unlock(current);
899 3 : mpol_put(old);
900 : ret = 0;
901 3 : out:
902 3 : NODEMASK_SCRATCH_FREE(scratch);
903 3 : return ret;
904 : }
905 :
906 : /*
907 : * Return nodemask for policy for get_mempolicy() query
908 : *
909 : * Called with task's alloc_lock held
910 : */
911 0 : static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
912 : {
913 0 : nodes_clear(*nodes);
914 0 : if (p == &default_policy)
915 : return;
916 :
917 0 : switch (p->mode) {
918 0 : case MPOL_BIND:
919 : case MPOL_INTERLEAVE:
920 0 : *nodes = p->v.nodes;
921 0 : break;
922 0 : case MPOL_PREFERRED:
923 0 : if (!(p->flags & MPOL_F_LOCAL))
924 0 : node_set(p->v.preferred_node, *nodes);
925 : /* else return empty node mask for local allocation */
926 : break;
927 0 : default:
928 0 : BUG();
929 : }
930 : }
931 :
932 0 : static int lookup_node(struct mm_struct *mm, unsigned long addr)
933 : {
934 0 : struct page *p = NULL;
935 0 : int err;
936 :
937 0 : int locked = 1;
938 0 : err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
939 0 : if (err > 0) {
940 0 : err = page_to_nid(p);
941 0 : put_page(p);
942 : }
943 0 : if (locked)
944 0 : mmap_read_unlock(mm);
945 0 : return err;
946 : }
947 :
948 : /* Retrieve NUMA policy */
949 0 : static long do_get_mempolicy(int *policy, nodemask_t *nmask,
950 : unsigned long addr, unsigned long flags)
951 : {
952 0 : int err;
953 0 : struct mm_struct *mm = current->mm;
954 0 : struct vm_area_struct *vma = NULL;
955 0 : struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
956 :
957 0 : if (flags &
958 : ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
959 : return -EINVAL;
960 :
961 0 : if (flags & MPOL_F_MEMS_ALLOWED) {
962 0 : if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
963 : return -EINVAL;
964 0 : *policy = 0; /* just so it's initialized */
965 0 : task_lock(current);
966 0 : *nmask = cpuset_current_mems_allowed;
967 0 : task_unlock(current);
968 0 : return 0;
969 : }
970 :
971 0 : if (flags & MPOL_F_ADDR) {
972 : /*
973 : * Do NOT fall back to task policy if the
974 : * vma/shared policy at addr is NULL. We
975 : * want to return MPOL_DEFAULT in this case.
976 : */
977 0 : mmap_read_lock(mm);
978 0 : vma = find_vma_intersection(mm, addr, addr+1);
979 0 : if (!vma) {
980 0 : mmap_read_unlock(mm);
981 0 : return -EFAULT;
982 : }
983 0 : if (vma->vm_ops && vma->vm_ops->get_policy)
984 0 : pol = vma->vm_ops->get_policy(vma, addr);
985 : else
986 0 : pol = vma->vm_policy;
987 0 : } else if (addr)
988 : return -EINVAL;
989 :
990 0 : if (!pol)
991 0 : pol = &default_policy; /* indicates default behavior */
992 :
993 0 : if (flags & MPOL_F_NODE) {
994 0 : if (flags & MPOL_F_ADDR) {
995 : /*
996 : * Take a refcount on the mpol, lookup_node()
997 : * will drop the mmap_lock, so after calling
998 : * lookup_node() only "pol" remains valid, "vma"
999 : * is stale.
1000 : */
1001 0 : pol_refcount = pol;
1002 0 : vma = NULL;
1003 0 : mpol_get(pol);
1004 0 : err = lookup_node(mm, addr);
1005 0 : if (err < 0)
1006 0 : goto out;
1007 0 : *policy = err;
1008 0 : } else if (pol == current->mempolicy &&
1009 0 : pol->mode == MPOL_INTERLEAVE) {
1010 0 : *policy = next_node_in(current->il_prev, pol->v.nodes);
1011 : } else {
1012 0 : err = -EINVAL;
1013 0 : goto out;
1014 : }
1015 : } else {
1016 0 : *policy = pol == &default_policy ? MPOL_DEFAULT :
1017 0 : pol->mode;
1018 : /*
1019 : * Internal mempolicy flags must be masked off before exposing
1020 : * the policy to userspace.
1021 : */
1022 0 : *policy |= (pol->flags & MPOL_MODE_FLAGS);
1023 : }
1024 :
1025 0 : err = 0;
1026 0 : if (nmask) {
1027 0 : if (mpol_store_user_nodemask(pol)) {
1028 0 : *nmask = pol->w.user_nodemask;
1029 : } else {
1030 0 : task_lock(current);
1031 0 : get_policy_nodemask(pol, nmask);
1032 0 : task_unlock(current);
1033 : }
1034 : }
1035 :
1036 0 : out:
1037 0 : mpol_cond_put(pol);
1038 0 : if (vma)
1039 0 : mmap_read_unlock(mm);
1040 0 : if (pol_refcount)
1041 0 : mpol_put(pol_refcount);
1042 0 : return err;
1043 : }
1044 :
1045 : #ifdef CONFIG_MIGRATION
1046 : /*
1047 : * page migration, thp tail pages can be passed.
1048 : */
1049 0 : static int migrate_page_add(struct page *page, struct list_head *pagelist,
1050 : unsigned long flags)
1051 : {
1052 0 : struct page *head = compound_head(page);
1053 : /*
1054 : * Avoid migrating a page that is shared with others.
1055 : */
1056 0 : if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
1057 0 : if (!isolate_lru_page(head)) {
1058 0 : list_add_tail(&head->lru, pagelist);
1059 0 : mod_node_page_state(page_pgdat(head),
1060 0 : NR_ISOLATED_ANON + page_is_file_lru(head),
1061 0 : thp_nr_pages(head));
1062 0 : } else if (flags & MPOL_MF_STRICT) {
1063 : /*
1064 : * A non-movable page may reach here. And, there may be
1065 : * temporary off-LRU pages or non-LRU movable pages.
1066 : * Treat them as unmovable pages since they can't be
1067 : * isolated, so they can't be moved at the moment. It
1068 : * should return -EIO for this case too.
1069 : */
1070 0 : return -EIO;
1071 : }
1072 : }
1073 :
1074 : return 0;
1075 : }
1076 :
1077 : /*
1078 : * Migrate pages from one node to a target node.
1079 : * Returns error or the number of pages not migrated.
1080 : */
1081 0 : static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1082 : int flags)
1083 : {
1084 0 : nodemask_t nmask;
1085 0 : LIST_HEAD(pagelist);
1086 0 : int err = 0;
1087 0 : struct migration_target_control mtc = {
1088 : .nid = dest,
1089 : .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1090 : };
1091 :
1092 0 : nodes_clear(nmask);
1093 0 : node_set(source, nmask);
1094 :
1095 : /*
1096 : * This does not "check" the range but isolates all pages that
1097 : * need migration. Between passing in the full user address
1098 : * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
1099 : */
1100 0 : VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1101 0 : queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1102 0 : flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1103 :
1104 0 : if (!list_empty(&pagelist)) {
1105 0 : err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1106 : (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
1107 0 : if (err)
1108 0 : putback_movable_pages(&pagelist);
1109 : }
1110 :
1111 0 : return err;
1112 : }
1113 :
1114 : /*
1115 : * Move pages between the two nodesets so as to preserve the physical
1116 : * layout as much as possible.
1117 : *
1118 : * Returns the number of pages that could not be moved.
1119 : */
1120 0 : int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1121 : const nodemask_t *to, int flags)
1122 : {
1123 0 : int busy = 0;
1124 0 : int err = 0;
1125 0 : nodemask_t tmp;
1126 :
1127 0 : migrate_prep();
1128 :
1129 0 : mmap_read_lock(mm);
1130 :
1131 : /*
1132 : * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1133 : * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
1134 : * bit in 'tmp', and return that <source, dest> pair for migration.
1135 : * The pair of nodemasks 'to' and 'from' define the map.
1136 : *
1137 : * If no pair of bits is found that way, fall back to picking some
1138 : * pair of 'source' and 'dest' bits that are not the same. If the
1139 : * 'source' and 'dest' bits are the same, this represents a node
1140 : * that will be migrating to itself, so no pages need move.
1141 : *
1142 : * If no bits are left in 'tmp', or if all remaining bits left
1143 : * in 'tmp' correspond to the same bit in 'to', return false
1144 : * (nothing left to migrate).
1145 : *
1146 : * This lets us pick a pair of nodes to migrate between, such that
1147 : * if possible the dest node is not already occupied by some other
1148 : * source node, minimizing the risk of overloading the memory on a
1149 : * node that would happen if we migrated incoming memory to a node
1150 : * before migrating outgoing memory sourced from that same node.
1151 : *
1152 : * A single scan of tmp is sufficient. As we go, we remember the
1153 : * most recent <s, d> pair that moved (s != d). If we find a pair
1154 : * that not only moved, but what's better, moved to an empty slot
1155 : * (d is not set in tmp), then we break out then, with that pair.
1156 : * Otherwise when we finish scanning tmp, we at least have the
1157 : * most recent <s, d> pair that moved. If we get all the way through
1158 : * the scan of tmp without finding any node that moved, much less
1159 : * moved to an empty node, then there is nothing left worth migrating.
1160 : */
1161 :
1162 0 : tmp = *from;
1163 0 : while (!nodes_empty(tmp)) {
1164 0 : int s,d;
1165 0 : int source = NUMA_NO_NODE;
1166 0 : int dest = 0;
1167 :
1168 0 : for_each_node_mask(s, tmp) {
1169 :
1170 : /*
1171 : * do_migrate_pages() tries to maintain the relative
1172 : * node relationship of the pages established between
1173 : * threads and memory areas.
1174 : *
1175 : * However, if the number of source nodes is not equal to
1176 : * the number of destination nodes we cannot preserve
1177 : * this relative node relationship. In that case, skip
1178 : * copying memory from a node that is in the destination
1179 : * mask.
1180 : *
1181 : * Example: [2,3,4] -> [3,4,5] moves everything.
1182 : * [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1183 : */
1184 :
1185 0 : if ((nodes_weight(*from) != nodes_weight(*to)) &&
1186 0 : (node_isset(s, *to)))
1187 0 : continue;
1188 :
1189 0 : d = node_remap(s, *from, *to);
1190 0 : if (s == d)
1191 0 : continue;
1192 :
1193 0 : source = s; /* Node moved. Memorize */
1194 0 : dest = d;
1195 :
1196 : /* dest not in remaining from nodes? */
1197 0 : if (!node_isset(dest, tmp))
1198 : break;
1199 : }
1200 0 : if (source == NUMA_NO_NODE)
1201 : break;
1202 :
1203 0 : node_clear(source, tmp);
1204 0 : err = migrate_to_node(mm, source, dest, flags);
1205 0 : if (err > 0)
1206 0 : busy += err;
1207 0 : if (err < 0)
1208 : break;
1209 : }
1210 0 : mmap_read_unlock(mm);
1211 0 : if (err < 0)
1212 0 : return err;
1213 : return busy;
1214 :
1215 : }
1216 :
1217 : /*
1218 : * Allocate a new page for page migration based on vma policy.
1219 : * Start by assuming the page is mapped by the same vma as contains @start.
1220 : * Search forward from there, if not. N.B., this assumes that the
1221 : * list of pages handed to migrate_pages()--which is how we get here--
1222 : * is in virtual address order.
1223 : */
1224 0 : static struct page *new_page(struct page *page, unsigned long start)
1225 : {
1226 0 : struct vm_area_struct *vma;
1227 0 : unsigned long address;
1228 :
1229 0 : vma = find_vma(current->mm, start);
1230 0 : while (vma) {
1231 0 : address = page_address_in_vma(page, vma);
1232 0 : if (address != -EFAULT)
1233 : break;
1234 0 : vma = vma->vm_next;
1235 : }
1236 :
1237 0 : if (PageHuge(page)) {
1238 : return alloc_huge_page_vma(page_hstate(compound_head(page)),
1239 : vma, address);
1240 0 : } else if (PageTransHuge(page)) {
1241 0 : struct page *thp;
1242 :
1243 0 : thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1244 : HPAGE_PMD_ORDER);
1245 0 : if (!thp)
1246 : return NULL;
1247 0 : prep_transhuge_page(thp);
1248 0 : return thp;
1249 : }
1250 : /*
1251 : * if !vma, alloc_page_vma() will use task or system default policy
1252 : */
1253 0 : return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1254 : vma, address);
1255 : }
1256 : #else
1257 :
1258 : static int migrate_page_add(struct page *page, struct list_head *pagelist,
1259 : unsigned long flags)
1260 : {
1261 : return -EIO;
1262 : }
1263 :
1264 : int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1265 : const nodemask_t *to, int flags)
1266 : {
1267 : return -ENOSYS;
1268 : }
1269 :
1270 : static struct page *new_page(struct page *page, unsigned long start)
1271 : {
1272 : return NULL;
1273 : }
1274 : #endif
1275 :
1276 0 : static long do_mbind(unsigned long start, unsigned long len,
1277 : unsigned short mode, unsigned short mode_flags,
1278 : nodemask_t *nmask, unsigned long flags)
1279 : {
1280 0 : struct mm_struct *mm = current->mm;
1281 0 : struct mempolicy *new;
1282 0 : unsigned long end;
1283 0 : int err;
1284 0 : int ret;
1285 0 : LIST_HEAD(pagelist);
1286 :
1287 0 : if (flags & ~(unsigned long)MPOL_MF_VALID)
1288 : return -EINVAL;
1289 0 : if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1290 : return -EPERM;
1291 :
1292 0 : if (start & ~PAGE_MASK)
1293 : return -EINVAL;
1294 :
1295 0 : if (mode == MPOL_DEFAULT)
1296 0 : flags &= ~MPOL_MF_STRICT;
1297 :
1298 0 : len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1299 0 : end = start + len;
1300 :
1301 0 : if (end < start)
1302 : return -EINVAL;
1303 0 : if (end == start)
1304 : return 0;
1305 :
1306 0 : new = mpol_new(mode, mode_flags, nmask);
1307 0 : if (IS_ERR(new))
1308 0 : return PTR_ERR(new);
1309 :
1310 0 : if (flags & MPOL_MF_LAZY)
1311 : new->flags |= MPOL_F_MOF;
1312 :
1313 : /*
1314 : * If we are using the default policy then operation
1315 : * on discontinuous address spaces is okay after all
1316 : */
1317 0 : if (!new)
1318 0 : flags |= MPOL_MF_DISCONTIG_OK;
1319 :
1320 0 : pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1321 : start, start + len, mode, mode_flags,
1322 : nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1323 :
1324 0 : if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1325 :
1326 0 : migrate_prep();
1327 : }
1328 : {
1329 0 : NODEMASK_SCRATCH(scratch);
1330 0 : if (scratch) {
1331 0 : mmap_write_lock(mm);
1332 0 : err = mpol_set_nodemask(new, nmask, scratch);
1333 0 : if (err)
1334 0 : mmap_write_unlock(mm);
1335 : } else
1336 : err = -ENOMEM;
1337 0 : NODEMASK_SCRATCH_FREE(scratch);
1338 : }
1339 0 : if (err)
1340 0 : goto mpol_out;
1341 :
1342 0 : ret = queue_pages_range(mm, start, end, nmask,
1343 : flags | MPOL_MF_INVERT, &pagelist);
1344 :
1345 0 : if (ret < 0) {
1346 0 : err = ret;
1347 0 : goto up_out;
1348 : }
1349 :
1350 0 : err = mbind_range(mm, start, end, new);
1351 :
1352 0 : if (!err) {
1353 0 : int nr_failed = 0;
1354 :
1355 0 : if (!list_empty(&pagelist)) {
1356 0 : WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1357 0 : nr_failed = migrate_pages(&pagelist, new_page, NULL,
1358 : start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1359 0 : if (nr_failed)
1360 0 : putback_movable_pages(&pagelist);
1361 : }
1362 :
1363 0 : if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
1364 0 : err = -EIO;
1365 : } else {
1366 0 : up_out:
1367 0 : if (!list_empty(&pagelist))
1368 0 : putback_movable_pages(&pagelist);
1369 : }
1370 :
1371 0 : mmap_write_unlock(mm);
1372 0 : mpol_out:
1373 0 : mpol_put(new);
1374 0 : return err;
1375 : }
1376 :
1377 : /*
1378 : * User space interface with variable sized bitmaps for nodelists.
1379 : */
1380 :
1381 : /* Copy a node mask from user space. */
1382 0 : static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1383 : unsigned long maxnode)
1384 : {
1385 0 : unsigned long k;
1386 0 : unsigned long t;
1387 0 : unsigned long nlongs;
1388 0 : unsigned long endmask;
1389 :
1390 0 : --maxnode;
1391 0 : nodes_clear(*nodes);
1392 0 : if (maxnode == 0 || !nmask)
1393 : return 0;
1394 0 : if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1395 : return -EINVAL;
1396 :
1397 0 : nlongs = BITS_TO_LONGS(maxnode);
1398 0 : if ((maxnode % BITS_PER_LONG) == 0)
1399 : endmask = ~0UL;
1400 : else
1401 0 : endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1402 :
1403 : /*
1404 : * When the user specified more nodes than supported, just check
1405 : * that the non-supported part is all zero.
1406 : *
1407 : * If maxnode has more longs than MAX_NUMNODES, check
1408 : * the bits in that area first. And then go on to check
1409 : * the remaining bits at positions >= MAX_NUMNODES.
1410 : * Otherwise, just check bits [MAX_NUMNODES, maxnode).
1411 : */
1412 0 : if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1413 0 : for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1414 0 : if (get_user(t, nmask + k))
1415 : return -EFAULT;
1416 0 : if (k == nlongs - 1) {
1417 0 : if (t & endmask)
1418 : return -EINVAL;
1419 0 : } else if (t)
1420 : return -EINVAL;
1421 : }
1422 : nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1423 : endmask = ~0UL;
1424 : }
1425 :
1426 0 : if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1427 : unsigned long valid_mask = endmask;
1428 :
1429 : valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1430 : if (get_user(t, nmask + nlongs - 1))
1431 : return -EFAULT;
1432 : if (t & valid_mask)
1433 : return -EINVAL;
1434 : }
1435 :
1436 0 : if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1437 : return -EFAULT;
1438 0 : nodes_addr(*nodes)[nlongs-1] &= endmask;
1439 0 : return 0;
1440 : }
1441 :
1442 : /* Copy a kernel node mask to user space */
1443 0 : static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1444 : nodemask_t *nodes)
1445 : {
1446 0 : unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1447 0 : unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1448 :
1449 0 : if (copy > nbytes) {
1450 0 : if (copy > PAGE_SIZE)
1451 : return -EINVAL;
1452 0 : if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1453 : return -EFAULT;
1454 : copy = nbytes;
1455 : }
1456 0 : return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1457 : }
1458 :
1459 0 : static long kernel_mbind(unsigned long start, unsigned long len,
1460 : unsigned long mode, const unsigned long __user *nmask,
1461 : unsigned long maxnode, unsigned int flags)
1462 : {
1463 0 : nodemask_t nodes;
1464 0 : int err;
1465 0 : unsigned short mode_flags;
1466 :
1467 0 : start = untagged_addr(start);
1468 0 : mode_flags = mode & MPOL_MODE_FLAGS;
1469 0 : mode &= ~MPOL_MODE_FLAGS;
1470 0 : if (mode >= MPOL_MAX)
1471 : return -EINVAL;
1472 0 : if ((mode_flags & MPOL_F_STATIC_NODES) &&
1473 : (mode_flags & MPOL_F_RELATIVE_NODES))
1474 : return -EINVAL;
1475 0 : err = get_nodes(&nodes, nmask, maxnode);
1476 0 : if (err)
1477 0 : return err;
1478 0 : return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1479 : }
1480 :
1481 0 : SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1482 : unsigned long, mode, const unsigned long __user *, nmask,
1483 : unsigned long, maxnode, unsigned int, flags)
1484 : {
1485 0 : return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1486 : }
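
A minimal user-space sketch of calling this entry point (assuming libnuma's <numaif.h> wrapper for mbind(2), linked with -lnuma, and that node 0 has memory): bind a fresh anonymous mapping to node 0 before it is first touched, so the faults taken below obey the VMA policy.

	#include <numaif.h>	/* mbind(), MPOL_BIND, MPOL_MF_STRICT; link with -lnuma */
	#include <sys/mman.h>
	#include <string.h>
	#include <stdio.h>

	int main(void)
	{
		size_t len = 16UL << 20;
		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (p == MAP_FAILED) {
			perror("mmap");
			return 1;
		}

		unsigned long nodemask = 1UL << 0;	/* node 0 only (assumed to have memory) */
		if (mbind(p, len, MPOL_BIND, &nodemask, 8 * sizeof(nodemask),
			  MPOL_MF_STRICT) != 0)
			perror("mbind");

		memset(p, 0, len);	/* faults taken now follow the VMA policy set above */
		munmap(p, len);
		return 0;
	}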
1487 :
1488 : /* Set the process memory policy */
1489 0 : static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1490 : unsigned long maxnode)
1491 : {
1492 0 : int err;
1493 0 : nodemask_t nodes;
1494 0 : unsigned short flags;
1495 :
1496 0 : flags = mode & MPOL_MODE_FLAGS;
1497 0 : mode &= ~MPOL_MODE_FLAGS;
1498 0 : if ((unsigned int)mode >= MPOL_MAX)
1499 : return -EINVAL;
1500 0 : if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1501 : return -EINVAL;
1502 0 : err = get_nodes(&nodes, nmask, maxnode);
1503 0 : if (err)
1504 0 : return err;
1505 0 : return do_set_mempolicy(mode, flags, &nodes);
1506 : }
1507 :
1508 0 : SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1509 : unsigned long, maxnode)
1510 : {
1511 0 : return kernel_set_mempolicy(mode, nmask, maxnode);
1512 : }
1513 :
1514 0 : static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1515 : const unsigned long __user *old_nodes,
1516 : const unsigned long __user *new_nodes)
1517 : {
1518 0 : struct mm_struct *mm = NULL;
1519 0 : struct task_struct *task;
1520 0 : nodemask_t task_nodes;
1521 0 : int err;
1522 0 : nodemask_t *old;
1523 0 : nodemask_t *new;
1524 0 : NODEMASK_SCRATCH(scratch);
1525 :
1526 0 : if (!scratch)
1527 : return -ENOMEM;
1528 :
1529 0 : old = &scratch->mask1;
1530 0 : new = &scratch->mask2;
1531 :
1532 0 : err = get_nodes(old, old_nodes, maxnode);
1533 0 : if (err)
1534 0 : goto out;
1535 :
1536 0 : err = get_nodes(new, new_nodes, maxnode);
1537 0 : if (err)
1538 0 : goto out;
1539 :
1540 : /* Find the mm_struct */
1541 0 : rcu_read_lock();
1542 0 : task = pid ? find_task_by_vpid(pid) : current;
1543 0 : if (!task) {
1544 0 : rcu_read_unlock();
1545 0 : err = -ESRCH;
1546 0 : goto out;
1547 : }
1548 0 : get_task_struct(task);
1549 :
1550 0 : err = -EINVAL;
1551 :
1552 : /*
1553 : * Check if this process has the right to modify the specified process.
1554 : * Use the regular "ptrace_may_access()" checks.
1555 : */
1556 0 : if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1557 0 : rcu_read_unlock();
1558 0 : err = -EPERM;
1559 0 : goto out_put;
1560 : }
1561 0 : rcu_read_unlock();
1562 :
1563 0 : task_nodes = cpuset_mems_allowed(task);
1564 : /* Is the user allowed to access the target nodes? */
1565 0 : if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1566 0 : err = -EPERM;
1567 0 : goto out_put;
1568 : }
1569 :
1570 0 : task_nodes = cpuset_mems_allowed(current);
1571 0 : nodes_and(*new, *new, task_nodes);
1572 0 : if (nodes_empty(*new))
1573 0 : goto out_put;
1574 :
1575 0 : err = security_task_movememory(task);
1576 0 : if (err)
1577 0 : goto out_put;
1578 :
1579 0 : mm = get_task_mm(task);
1580 0 : put_task_struct(task);
1581 :
1582 0 : if (!mm) {
1583 0 : err = -EINVAL;
1584 0 : goto out;
1585 : }
1586 :
1587 0 : err = do_migrate_pages(mm, old, new,
1588 0 : capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1589 :
1590 0 : mmput(mm);
1591 0 : out:
1592 0 : NODEMASK_SCRATCH_FREE(scratch);
1593 :
1594 0 : return err;
1595 :
1596 0 : out_put:
1597 0 : put_task_struct(task);
1598 0 : goto out;
1599 :
1600 : }
1601 :
1602 0 : SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1603 : const unsigned long __user *, old_nodes,
1604 : const unsigned long __user *, new_nodes)
1605 : {
1606 0 : return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1607 : }
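
A minimal sketch of the user-space side of this syscall (assuming libnuma's <numaif.h> wrapper and that nodes 0 and 1 both exist): move the calling process's pages from node 0 to node 1.

	#include <numaif.h>	/* migrate_pages(); link with -lnuma */
	#include <stdio.h>

	int main(void)
	{
		unsigned long from = 1UL << 0;	/* source: node 0 */
		unsigned long to   = 1UL << 1;	/* destination: node 1 (assumed to exist) */

		/* pid 0 means the calling process; on success the return value is
		 * the number of pages that could not be moved. */
		long left = migrate_pages(0, 8 * sizeof(unsigned long), &from, &to);
		if (left < 0)
			perror("migrate_pages");
		else
			printf("pages not moved: %ld\n", left);
		return 0;
	}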
1608 :
1609 :
1610 : /* Retrieve NUMA policy */
1611 0 : static int kernel_get_mempolicy(int __user *policy,
1612 : unsigned long __user *nmask,
1613 : unsigned long maxnode,
1614 : unsigned long addr,
1615 : unsigned long flags)
1616 : {
1617 0 : int err;
1618 0 : int pval;
1619 0 : nodemask_t nodes;
1620 :
1621 0 : if (nmask != NULL && maxnode < nr_node_ids)
1622 : return -EINVAL;
1623 :
1624 0 : addr = untagged_addr(addr);
1625 :
1626 0 : err = do_get_mempolicy(&pval, &nodes, addr, flags);
1627 :
1628 0 : if (err)
1629 : return err;
1630 :
1631 0 : if (policy && put_user(pval, policy))
1632 : return -EFAULT;
1633 :
1634 0 : if (nmask)
1635 0 : err = copy_nodes_to_user(nmask, maxnode, &nodes);
1636 :
1637 : return err;
1638 : }
1639 :
1640 0 : SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1641 : unsigned long __user *, nmask, unsigned long, maxnode,
1642 : unsigned long, addr, unsigned long, flags)
1643 : {
1644 0 : return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1645 : }
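
A minimal sketch of querying placement from user space with this syscall (assuming libnuma's <numaif.h> wrapper): MPOL_F_NODE | MPOL_F_ADDR reports the node currently backing a given address.

	#include <numaif.h>	/* get_mempolicy(), MPOL_F_NODE, MPOL_F_ADDR; link with -lnuma */
	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		int node = -1;
		char *p = malloc(4096);

		if (!p)
			return 1;
		p[0] = 1;	/* fault the page in so it has a backing node */

		/* MPOL_F_NODE | MPOL_F_ADDR: return the node holding the page at p. */
		if (get_mempolicy(&node, NULL, 0, p, MPOL_F_NODE | MPOL_F_ADDR) != 0)
			perror("get_mempolicy");
		else
			printf("page at %p is on node %d\n", (void *)p, node);

		free(p);
		return 0;
	}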
1646 :
1647 : #ifdef CONFIG_COMPAT
1648 :
1649 0 : COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1650 : compat_ulong_t __user *, nmask,
1651 : compat_ulong_t, maxnode,
1652 : compat_ulong_t, addr, compat_ulong_t, flags)
1653 : {
1654 0 : long err;
1655 0 : unsigned long __user *nm = NULL;
1656 0 : unsigned long nr_bits, alloc_size;
1657 0 : DECLARE_BITMAP(bm, MAX_NUMNODES);
1658 :
1659 0 : nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
1660 0 : alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1661 :
1662 0 : if (nmask)
1663 0 : nm = compat_alloc_user_space(alloc_size);
1664 :
1665 0 : err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1666 :
1667 0 : if (!err && nmask) {
1668 0 : unsigned long copy_size;
1669 0 : copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1670 0 : err = copy_from_user(bm, nm, copy_size);
1671 : /* ensure entire bitmap is zeroed */
1672 0 : err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1673 0 : err |= compat_put_bitmap(nmask, bm, nr_bits);
1674 : }
1675 :
1676 0 : return err;
1677 : }
1678 :
1679 0 : COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1680 : compat_ulong_t, maxnode)
1681 : {
1682 0 : unsigned long __user *nm = NULL;
1683 0 : unsigned long nr_bits, alloc_size;
1684 0 : DECLARE_BITMAP(bm, MAX_NUMNODES);
1685 :
1686 0 : nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1687 0 : alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1688 :
1689 0 : if (nmask) {
1690 0 : if (compat_get_bitmap(bm, nmask, nr_bits))
1691 : return -EFAULT;
1692 0 : nm = compat_alloc_user_space(alloc_size);
1693 0 : if (copy_to_user(nm, bm, alloc_size))
1694 : return -EFAULT;
1695 : }
1696 :
1697 0 : return kernel_set_mempolicy(mode, nm, nr_bits+1);
1698 : }
1699 :
1700 0 : COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1701 : compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1702 : compat_ulong_t, maxnode, compat_ulong_t, flags)
1703 : {
1704 0 : unsigned long __user *nm = NULL;
1705 0 : unsigned long nr_bits, alloc_size;
1706 0 : nodemask_t bm;
1707 :
1708 0 : nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1709 0 : alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1710 :
1711 0 : if (nmask) {
1712 0 : if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1713 : return -EFAULT;
1714 0 : nm = compat_alloc_user_space(alloc_size);
1715 0 : if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1716 : return -EFAULT;
1717 : }
1718 :
1719 0 : return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
1720 : }
1721 :
1722 0 : COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1723 : compat_ulong_t, maxnode,
1724 : const compat_ulong_t __user *, old_nodes,
1725 : const compat_ulong_t __user *, new_nodes)
1726 : {
1727 0 : unsigned long __user *old = NULL;
1728 0 : unsigned long __user *new = NULL;
1729 0 : nodemask_t tmp_mask;
1730 0 : unsigned long nr_bits;
1731 0 : unsigned long size;
1732 :
1733 0 : nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
1734 0 : size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1735 0 : if (old_nodes) {
1736 0 : if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1737 : return -EFAULT;
1738 0 : old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1739 0 : if (new_nodes)
1740 0 : new = old + size / sizeof(unsigned long);
1741 0 : if (copy_to_user(old, nodes_addr(tmp_mask), size))
1742 : return -EFAULT;
1743 : }
1744 0 : if (new_nodes) {
1745 0 : if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1746 : return -EFAULT;
1747 0 : if (new == NULL)
1748 0 : new = compat_alloc_user_space(size);
1749 0 : if (copy_to_user(new, nodes_addr(tmp_mask), size))
1750 : return -EFAULT;
1751 : }
1752 0 : return kernel_migrate_pages(pid, nr_bits + 1, old, new);
1753 : }
1754 :
1755 : #endif /* CONFIG_COMPAT */
1756 :
1757 0 : bool vma_migratable(struct vm_area_struct *vma)
1758 : {
1759 0 : if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1760 : return false;
1761 :
1762 : /*
1763 : * DAX device mappings require predictable access latency, so avoid
1764 : * incurring periodic faults.
1765 : */
1766 0 : if (vma_is_dax(vma))
1767 : return false;
1768 :
1769 0 : if (is_vm_hugetlb_page(vma) &&
1770 : !hugepage_migration_supported(hstate_vma(vma)))
1771 : return false;
1772 :
1773 : /*
1774 : * Migration allocates pages in the highest zone. If we cannot
1775 : * do so then migration (at least from node to node) is not
1776 : * possible.
1777 : */
1778 0 : if (vma->vm_file &&
1779 0 : gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1780 0 : < policy_zone)
1781 0 : return false;
1782 : return true;
1783 : }
1784 :
1785 72054 : struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1786 : unsigned long addr)
1787 : {
1788 72054 : struct mempolicy *pol = NULL;
1789 :
1790 72054 : if (vma) {
1791 72054 : if (vma->vm_ops && vma->vm_ops->get_policy) {
1792 8 : pol = vma->vm_ops->get_policy(vma, addr);
1793 72046 : } else if (vma->vm_policy) {
1794 0 : pol = vma->vm_policy;
1795 :
1796 : /*
1797 : * shmem_alloc_page() passes MPOL_F_SHARED policy with
1798 : * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1799 : * count on these policies which will be dropped by
1800 : * mpol_cond_put() later
1801 : */
1802 0 : if (mpol_needs_cond_ref(pol))
1803 0 : mpol_get(pol);
1804 : }
1805 : }
1806 :
1807 72054 : return pol;
1808 : }
1809 :
1810 : /*
1811 : * get_vma_policy(@vma, @addr)
1812 : * @vma: virtual memory area whose policy is sought
1813 : * @addr: address in @vma for shared policy lookup
1814 : *
1815 : * Returns effective policy for a VMA at specified address.
1816 : * Falls back to current->mempolicy or system default policy, as necessary.
1817 : * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1818 : * count--added by the get_policy() vm_op, as appropriate--to protect against
1819 : * freeing by another task. It is the caller's responsibility to free the
1820 : * extra reference for shared policies.
1821 : */
1822 72059 : static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1823 : unsigned long addr)
1824 : {
1825 72059 : struct mempolicy *pol = __get_vma_policy(vma, addr);
1826 :
1827 72057 : if (!pol)
1828 72063 : pol = get_task_policy(current);
1829 :
1830 72057 : return pol;
1831 : }
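
A minimal caller sketch of the lookup contract documented above (illustrative only, not part of mempolicy.c; the function name example_policy_mode() is invented). mpol_cond_put() only drops the reference when MPOL_F_SHARED is set, so calling it unconditionally matches how alloc_pages_vma() below uses the same helpers.

    /* Hypothetical caller: fetch the effective policy for a fault address
     * and release the conditional reference that shared policies carry. */
    static int example_policy_mode(struct vm_area_struct *vma, unsigned long addr)
    {
    	struct mempolicy *pol = __get_vma_policy(vma, addr);
    	int mode;

    	if (!pol)
    		pol = get_task_policy(current);
    	mode = pol->mode;
    	mpol_cond_put(pol);	/* no-op unless MPOL_F_SHARED */
    	return mode;
    }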
1832 :
1833 0 : bool vma_policy_mof(struct vm_area_struct *vma)
1834 : {
1835 0 : struct mempolicy *pol;
1836 :
1837 0 : if (vma->vm_ops && vma->vm_ops->get_policy) {
1838 0 : bool ret = false;
1839 :
1840 0 : pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1841 0 : if (pol && (pol->flags & MPOL_F_MOF))
1842 0 : ret = true;
1843 0 : mpol_cond_put(pol);
1844 :
1845 0 : return ret;
1846 : }
1847 :
1848 0 : pol = vma->vm_policy;
1849 0 : if (!pol)
1850 0 : pol = get_task_policy(current);
1851 :
1852 0 : return pol->flags & MPOL_F_MOF;
1853 : }
1854 :
1855 0 : static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1856 : {
1857 0 : enum zone_type dynamic_policy_zone = policy_zone;
1858 :
1859 0 : BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1860 :
1861 : /*
1862 :  * If policy->v.nodes contains movable memory only, we apply the
1863 :  * policy only when gfp_zone(gfp) == ZONE_MOVABLE.
1864 :  *
1865 :  * policy->v.nodes is intersected with node_states[N_MEMORY], so
1866 :  * if the following test fails it implies that policy->v.nodes
1867 :  * contains movable memory only.
1868 : */
1869 0 : if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1870 0 : dynamic_policy_zone = ZONE_MOVABLE;
1871 :
1872 0 : return zone >= dynamic_policy_zone;
1873 : }
1874 :
1875 : /*
1876 : * Return a nodemask representing a mempolicy for filtering nodes for
1877 : * page allocation
1878 : */
1879 188776 : nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1880 : {
1881 : /* Lower zones don't get a nodemask applied for MPOL_BIND */
1882 188776 : if (unlikely(policy->mode == MPOL_BIND) &&
1883 0 : apply_policy_zone(policy, gfp_zone(gfp)) &&
1884 0 : cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1885 0 : return &policy->v.nodes;
1886 :
1887 : return NULL;
1888 : }
1889 :
1890 : /* Return the node id preferred by the given mempolicy, or the given id */
1891 188763 : static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
1892 : {
1893 188763 : if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1894 188405 : nd = policy->v.preferred_node;
1895 : else {
1896 : /*
1897 : * __GFP_THISNODE shouldn't even be used with the bind policy
1898 : * because we might easily break the expectation to stay on the
1899 : * requested node and not break the policy.
1900 : */
1901 716 : WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1902 : }
1903 :
1904 188763 : return nd;
1905 : }
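
A minimal sketch (with a hypothetical helper name, not part of this file) of how policy_nodemask() and policy_node() are meant to be combined; it mirrors the pattern used by alloc_pages_current() and the tail of alloc_pages_vma() further below.

    static struct page *example_policy_alloc(gfp_t gfp, unsigned int order,
    					 struct mempolicy *pol)
    {
    	/* MPOL_BIND may supply a nodemask; MPOL_PREFERRED supplies a node id. */
    	nodemask_t *nmask = policy_nodemask(gfp, pol);
    	int nid = policy_node(gfp, pol, numa_node_id());

    	return __alloc_pages_nodemask(gfp, order, nid, nmask);
    }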
1906 :
1907 : /* Do dynamic interleaving for a process */
1908 7170 : static unsigned interleave_nodes(struct mempolicy *policy)
1909 : {
1910 7170 : unsigned next;
1911 7170 : struct task_struct *me = current;
1912 :
1913 7170 : next = next_node_in(me->il_prev, policy->v.nodes);
1914 7170 : if (next < MAX_NUMNODES)
1915 7170 : me->il_prev = next;
1916 7170 : return next;
1917 : }
1918 :
1919 : /*
1920 : * Depending on the memory policy provide a node from which to allocate the
1921 : * next slab entry.
1922 : */
1923 27483 : unsigned int mempolicy_slab_node(void)
1924 : {
1925 27483 : struct mempolicy *policy;
1926 27483 : int node = numa_mem_id();
1927 :
1928 27483 : if (in_interrupt())
1929 : return node;
1930 :
1931 27449 : policy = current->mempolicy;
1932 27449 : if (!policy || policy->flags & MPOL_F_LOCAL)
1933 : return node;
1934 :
1935 3450 : switch (policy->mode) {
1936 0 : case MPOL_PREFERRED:
1937 : /*
1938 : * handled MPOL_F_LOCAL above
1939 : */
1940 0 : return policy->v.preferred_node;
1941 :
1942 3450 : case MPOL_INTERLEAVE:
1943 3450 : return interleave_nodes(policy);
1944 :
1945 : case MPOL_BIND: {
1946 0 : struct zoneref *z;
1947 :
1948 : /*
1949 : * Follow bind policy behavior and start allocation at the
1950 : * first node.
1951 : */
1952 0 : struct zonelist *zonelist;
1953 0 : enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1954 0 : zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1955 0 : z = first_zones_zonelist(zonelist, highest_zoneidx,
1956 : &policy->v.nodes);
1957 0 : return z->zone ? zone_to_nid(z->zone) : node;
1958 : }
1959 :
1960 0 : default:
1961 0 : BUG();
1962 : }
1963 : }
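
A hypothetical sketch of the slab-side usage the helper above is written for: when the caller did not request a specific node, defer to the current task's policy. Only the wrapper name is invented; NUMA_NO_NODE and mempolicy_slab_node() are as used elsewhere in this file.

    static int example_slab_node(int requested_node)
    {
    	if (requested_node != NUMA_NO_NODE)
    		return requested_node;
    	/* Let the task's mempolicy (interleave/bind/preferred) decide. */
    	return mempolicy_slab_node();
    }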
1964 :
1965 : /*
1966 : * Do static interleaving for a VMA with known offset @n. Returns the n'th
1967 : * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1968 : * number of present nodes.
1969 : */
1970 0 : static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1971 : {
1972 0 : unsigned nnodes = nodes_weight(pol->v.nodes);
1973 0 : unsigned target;
1974 0 : int i;
1975 0 : int nid;
1976 :
1977 0 : if (!nnodes)
1978 0 : return numa_node_id();
1979 0 : target = (unsigned int)n % nnodes;
1980 0 : nid = first_node(pol->v.nodes);
1981 0 : for (i = 0; i < target; i++)
1982 0 : nid = next_node(nid, pol->v.nodes);
1983 0 : return nid;
1984 : }
1985 :
1986 : /* Determine a node number for interleave */
1987 0 : static inline unsigned interleave_nid(struct mempolicy *pol,
1988 : struct vm_area_struct *vma, unsigned long addr, int shift)
1989 : {
1990 0 : if (vma) {
1991 0 : unsigned long off;
1992 :
1993 : /*
1994 : * for small pages, there is no difference between
1995 : * shift and PAGE_SHIFT, so the bit-shift is safe.
1996 : * for huge pages, since vm_pgoff is in units of small
1997 : * pages, we need to shift off the always 0 bits to get
1998 : * a useful offset.
1999 : */
2000 0 : BUG_ON(shift < PAGE_SHIFT);
2001 0 : off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
2002 0 : off += (addr - vma->vm_start) >> shift;
2003 0 : return offset_il_node(pol, off);
2004 : } else
2005 0 : return interleave_nodes(pol);
2006 : }
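
A worked example of the offset computation above, assuming 4KB base pages (PAGE_SHIFT = 12) and a 2MB huge page size (shift = 21): for a VMA with vm_pgoff = 1024 small pages and a fault at vma->vm_start + 4MB,

	off  = 1024 >> (21 - 12) = 2	(file offset in huge pages)
	off += (4MB >> 21)       = 2 + 2 = 4

so offset_il_node() returns the (4 % nnodes)'th node of pol->v.nodes, counting from first_node().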
2007 :
2008 : #ifdef CONFIG_HUGETLBFS
2009 : /*
2010 : * huge_node(@vma, @addr, @gfp_flags, @mpol)
2011 : * @vma: virtual memory area whose policy is sought
2012 : * @addr: address in @vma for shared policy lookup and interleave policy
2013 : * @gfp_flags: for requested zone
2014 : * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2015 : * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
2016 : *
2017 : * Returns a nid suitable for a huge page allocation and a pointer
2018 : * to the struct mempolicy for conditional unref after allocation.
2019 :  * If the effective policy is 'BIND', returns a pointer to the mempolicy's
2020 : * @nodemask for filtering the zonelist.
2021 : *
2022 : * Must be protected by read_mems_allowed_begin()
2023 : */
2024 : int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2025 : struct mempolicy **mpol, nodemask_t **nodemask)
2026 : {
2027 : int nid;
2028 :
2029 : *mpol = get_vma_policy(vma, addr);
2030 : *nodemask = NULL; /* assume !MPOL_BIND */
2031 :
2032 : if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
2033 : nid = interleave_nid(*mpol, vma, addr,
2034 : huge_page_shift(hstate_vma(vma)));
2035 : } else {
2036 : nid = policy_node(gfp_flags, *mpol, numa_node_id());
2037 : if ((*mpol)->mode == MPOL_BIND)
2038 : *nodemask = &(*mpol)->v.nodes;
2039 : }
2040 : return nid;
2041 : }
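
A hypothetical sketch (names are illustrative) of the calling convention documented above: take the mems_allowed cookie, pick the nid/nodemask pair, and drop the conditional policy reference once the decision is made. read_mems_allowed_begin()/read_mems_allowed_retry() come from <linux/cpuset.h>.

    static int example_pick_huge_node(struct vm_area_struct *vma,
    				  unsigned long addr, gfp_t gfp)
    {
    	struct mempolicy *mpol;
    	nodemask_t *nodemask;
    	unsigned int cookie;
    	int nid;

    	do {
    		cookie = read_mems_allowed_begin();
    		nid = huge_node(vma, addr, gfp, &mpol, &nodemask);
    		/* drop the ref taken by get_vma_policy() for shared policies */
    		mpol_cond_put(mpol);
    	} while (read_mems_allowed_retry(cookie));

    	return nid;
    }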
2042 :
2043 : /*
2044 : * init_nodemask_of_mempolicy
2045 : *
2046 : * If the current task's mempolicy is "default" [NULL], return 'false'
2047 : * to indicate default policy. Otherwise, extract the policy nodemask
2048 : * for 'bind' or 'interleave' policy into the argument nodemask, or
2049 : * initialize the argument nodemask to contain the single node for
2050 : * 'preferred' or 'local' policy and return 'true' to indicate presence
2051 : * of non-default mempolicy.
2052 : *
2053 : * We don't bother with reference counting the mempolicy [mpol_get/put]
2054 :  * because the current task is examining its own mempolicy and a task's
2055 : * mempolicy is only ever changed by the task itself.
2056 : *
2057 : * N.B., it is the caller's responsibility to free a returned nodemask.
2058 : */
2059 : bool init_nodemask_of_mempolicy(nodemask_t *mask)
2060 : {
2061 : struct mempolicy *mempolicy;
2062 : int nid;
2063 :
2064 : if (!(mask && current->mempolicy))
2065 : return false;
2066 :
2067 : task_lock(current);
2068 : mempolicy = current->mempolicy;
2069 : switch (mempolicy->mode) {
2070 : case MPOL_PREFERRED:
2071 : if (mempolicy->flags & MPOL_F_LOCAL)
2072 : nid = numa_node_id();
2073 : else
2074 : nid = mempolicy->v.preferred_node;
2075 : init_nodemask_of_node(mask, nid);
2076 : break;
2077 :
2078 : case MPOL_BIND:
2079 : case MPOL_INTERLEAVE:
2080 : *mask = mempolicy->v.nodes;
2081 : break;
2082 :
2083 : default:
2084 : BUG();
2085 : }
2086 : task_unlock(current);
2087 :
2088 : return true;
2089 : }
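
A hypothetical caller sketch for init_nodemask_of_mempolicy(): derive the set of nodes the current task's policy spans, falling back to all memory nodes for the "default" policy. Only the example function name is invented.

    static unsigned int example_policy_node_count(void)
    {
    	nodemask_t mask;

    	if (!init_nodemask_of_mempolicy(&mask))
    		mask = node_states[N_MEMORY];	/* default policy: all memory nodes */
    	return nodes_weight(mask);
    }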
2090 : #endif
2091 :
2092 : /*
2093 : * mempolicy_nodemask_intersects
2094 : *
2095 : * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
2096 : * policy. Otherwise, check for intersection between mask and the policy
2097 :  * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
2098 : * policy, always return true since it may allocate elsewhere on fallback.
2099 : *
2100 : * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2101 : */
2102 0 : bool mempolicy_nodemask_intersects(struct task_struct *tsk,
2103 : const nodemask_t *mask)
2104 : {
2105 0 : struct mempolicy *mempolicy;
2106 0 : bool ret = true;
2107 :
2108 0 : if (!mask)
2109 : return ret;
2110 0 : task_lock(tsk);
2111 0 : mempolicy = tsk->mempolicy;
2112 0 : if (!mempolicy)
2113 0 : goto out;
2114 :
2115 0 : switch (mempolicy->mode) {
2116 : case MPOL_PREFERRED:
2117 : /*
2118 :          * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes to
2119 :          * allocate from; the task may fall back to other nodes when OOM.
2120 : * Thus, it's possible for tsk to have allocated memory from
2121 : * nodes in mask.
2122 : */
2123 : break;
2124 0 : case MPOL_BIND:
2125 : case MPOL_INTERLEAVE:
2126 0 : ret = nodes_intersects(mempolicy->v.nodes, *mask);
2127 0 : break;
2128 0 : default:
2129 0 : BUG();
2130 : }
2131 0 : out:
2132 0 : task_unlock(tsk);
2133 0 : return ret;
2134 : }
2135 :
2136 : /* Allocate a page in interleave policy.
2137 :    Separate path because it needs to do special accounting. */
2138 3720 : static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2139 : unsigned nid)
2140 : {
2141 3720 : struct page *page;
2142 :
2143 3720 : page = __alloc_pages(gfp, order, nid);
2144 : /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2145 3720 : if (!static_branch_likely(&vm_numa_stat_key))
2146 : return page;
2147 3720 : if (page && page_to_nid(page) == nid) {
2148 3720 : preempt_disable();
2149 3720 : __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
2150 3720 : preempt_enable();
2151 : }
2152 : return page;
2153 : }
2154 :
2155 : /**
2156 : * alloc_pages_vma - Allocate a page for a VMA.
2157 : *
2158 : * @gfp:
2159 : * %GFP_USER user allocation.
2160 : * %GFP_KERNEL kernel allocations,
2161 : * %GFP_HIGHMEM highmem/user allocations,
2162 : * %GFP_FS allocation should not call back into a file system.
2163 : * %GFP_ATOMIC don't sleep.
2164 : *
2165 :  * @order: Order of the GFP allocation.
2166 : * @vma: Pointer to VMA or NULL if not available.
2167 : * @addr: Virtual Address of the allocation. Must be inside the VMA.
2168 : * @node: Which node to prefer for allocation (modulo policy).
2169 : * @hugepage: for hugepages try only the preferred node if possible
2170 : *
2171 : * This function allocates a page from the kernel page pool and applies
2172 : * a NUMA policy associated with the VMA or the current process.
2173 : * When VMA is not NULL caller must read-lock the mmap_lock of the
2174 : * mm_struct of the VMA to prevent it from going away. Should be used for
2175 : * all allocations for pages that will be mapped into user space. Returns
2176 : * NULL when no page can be allocated.
2177 : */
2178 : struct page *
2179 72061 : alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2180 : unsigned long addr, int node, bool hugepage)
2181 : {
2182 72061 : struct mempolicy *pol;
2183 72061 : struct page *page;
2184 72061 : int preferred_nid;
2185 72061 : nodemask_t *nmask;
2186 :
2187 72061 : pol = get_vma_policy(vma, addr);
2188 :
2189 72064 : if (pol->mode == MPOL_INTERLEAVE) {
2190 0 : unsigned nid;
2191 :
2192 0 : nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2193 0 : mpol_cond_put(pol);
2194 0 : page = alloc_page_interleave(gfp, order, nid);
2195 0 : goto out;
2196 : }
2197 :
2198 72064 : if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2199 17 : int hpage_node = node;
2200 :
2201 : /*
2202 : * For hugepage allocation and non-interleave policy which
2203 : * allows the current node (or other explicitly preferred
2204 : * node) we only try to allocate from the current/preferred
2205 : * node and don't fall back to other nodes, as the cost of
2206 : * remote accesses would likely offset THP benefits.
2207 : *
2208 : * If the policy is interleave, or does not allow the current
2209 : * node in its nodemask, we allocate the standard way.
2210 : */
2211 17 : if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
2212 17 : hpage_node = pol->v.preferred_node;
2213 :
2214 17 : nmask = policy_nodemask(gfp, pol);
2215 17 : if (!nmask || node_isset(hpage_node, *nmask)) {
2216 17 : mpol_cond_put(pol);
2217 : /*
2218 : * First, try to allocate THP only on local node, but
2219 : * don't reclaim unnecessarily, just compact.
2220 : */
2221 17 : page = __alloc_pages_node(hpage_node,
2222 : gfp | __GFP_THISNODE | __GFP_NORETRY, order);
2223 :
2224 : /*
2225 : * If hugepage allocations are configured to always
2226 : * synchronous compact or the vma has been madvised
2227 : * to prefer hugepage backing, retry allowing remote
2228 : * memory with both reclaim and compact as well.
2229 : */
2230 17 : if (!page && (gfp & __GFP_DIRECT_RECLAIM))
2231 0 : page = __alloc_pages_node(hpage_node,
2232 : gfp, order);
2233 :
2234 17 : goto out;
2235 : }
2236 : }
2237 :
2238 72047 : nmask = policy_nodemask(gfp, pol);
2239 72044 : preferred_nid = policy_node(gfp, pol, node);
2240 72045 : page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
2241 72037 : mpol_cond_put(pol);
2242 72056 : out:
2243 72056 : return page;
2244 : }
2245 : EXPORT_SYMBOL(alloc_pages_vma);
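
A hypothetical usage sketch for the helper above: allocate a 0-order page to back a user fault, with the fault path already holding the mmap_lock for reading. GFP_HIGHUSER_MOVABLE and numa_node_id() are standard kernel symbols; the wrapper name is invented for illustration.

    static struct page *example_fault_page(struct vm_area_struct *vma,
    				       unsigned long addr)
    {
    	/* order 0, prefer the local node, not a THP allocation */
    	return alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, addr,
    			       numa_node_id(), false);
    }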
2246 :
2247 : /**
2248 : * alloc_pages_current - Allocate pages.
2249 : *
2250 : * @gfp:
2251 : * %GFP_USER user allocation,
2252 : * %GFP_KERNEL kernel allocation,
2253 : * %GFP_HIGHMEM highmem allocation,
2254 : * %GFP_FS don't call back into a file system.
2255 : * %GFP_ATOMIC don't sleep.
2256 :  * @order: Allocation order; the allocation is 2^@order pages. 0 is a single page.
2257 : *
2258 :  * Allocate a page from the kernel page pool. When not in
2259 :  * interrupt context, apply the current process' NUMA policy.
2260 : * Returns NULL when no page can be allocated.
2261 : */
2262 120428 : struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2263 : {
2264 120428 : struct mempolicy *pol = &default_policy;
2265 120428 : struct page *page;
2266 :
2267 120428 : if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2268 120354 : pol = get_task_policy(current);
2269 :
2270 : /*
2271 : * No reference counting needed for current->mempolicy
2272 : * nor system default_policy
2273 : */
2274 120428 : if (pol->mode == MPOL_INTERLEAVE)
2275 3720 : page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2276 : else
2277 116708 : page = __alloc_pages_nodemask(gfp, order,
2278 : policy_node(gfp, pol, numa_node_id()),
2279 : policy_nodemask(gfp, pol));
2280 :
2281 120429 : return page;
2282 : }
2283 : EXPORT_SYMBOL(alloc_pages_current);
2284 :
2285 86786 : int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2286 : {
2287 86786 : struct mempolicy *pol = mpol_dup(vma_policy(src));
2288 :
2289 86786 : if (IS_ERR(pol))
2290 0 : return PTR_ERR(pol);
2291 86786 : dst->vm_policy = pol;
2292 86786 : return 0;
2293 : }
2294 :
2295 : /*
2296 : * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2297 :  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2298 :  * with the mems_allowed returned by cpuset_mems_allowed(). This
2299 :  * keeps mempolicies cpuset-relative after its cpuset moves. See
2300 :  * also kernel/cpuset.c update_nodemask().
2301 :  *
2302 :  * current's mempolicy may be rebound by another task (the task that changes
2303 :  * the cpuset's mems), so we needn't do rebind work for the current task.
2304 : */
2305 :
2306 : /* Slow path of a mempolicy duplicate */
2307 4 : struct mempolicy *__mpol_dup(struct mempolicy *old)
2308 : {
2309 4 : struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2310 :
2311 4 : if (!new)
2312 4 : return ERR_PTR(-ENOMEM);
2313 :
2314 : /* task's mempolicy is protected by alloc_lock */
2315 4 : if (old == current->mempolicy) {
2316 4 : task_lock(current);
2317 4 : *new = *old;
2318 4 : task_unlock(current);
2319 : } else
2320 0 : *new = *old;
2321 :
2322 4 : if (current_cpuset_is_being_rebound()) {
2323 : nodemask_t mems = cpuset_mems_allowed(current);
2324 : mpol_rebind_policy(new, &mems);
2325 : }
2326 4 : atomic_set(&new->refcnt, 1);
2327 4 : return new;
2328 : }
2329 :
2330 : /* Slow path of a mempolicy comparison */
2331 0 : bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2332 : {
2333 0 : if (!a || !b)
2334 : return false;
2335 0 : if (a->mode != b->mode)
2336 : return false;
2337 0 : if (a->flags != b->flags)
2338 : return false;
2339 0 : if (mpol_store_user_nodemask(a))
2340 0 : if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2341 : return false;
2342 :
2343 0 : switch (a->mode) {
2344 0 : case MPOL_BIND:
2345 : case MPOL_INTERLEAVE:
2346 0 : return !!nodes_equal(a->v.nodes, b->v.nodes);
2347 0 : case MPOL_PREFERRED:
2348 : /* a's ->flags is the same as b's */
2349 0 : if (a->flags & MPOL_F_LOCAL)
2350 : return true;
2351 0 : return a->v.preferred_node == b->v.preferred_node;
2352 0 : default:
2353 0 : BUG();
2354 : return false;
2355 : }
2356 : }
2357 :
2358 : /*
2359 : * Shared memory backing store policy support.
2360 : *
2361 : * Remember policies even when nobody has shared memory mapped.
2362 :  * The policies are kept in a red-black tree linked from the inode.
2363 : * They are protected by the sp->lock rwlock, which should be held
2364 : * for any accesses to the tree.
2365 : */
2366 :
2367 : /*
2368 :  * Lookup the first element intersecting start-end. Caller holds sp->lock
2369 :  * for reading or for writing.
2370 : */
2371 : static struct sp_node *
2372 0 : sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2373 : {
2374 0 : struct rb_node *n = sp->root.rb_node;
2375 :
2376 0 : while (n) {
2377 0 : struct sp_node *p = rb_entry(n, struct sp_node, nd);
2378 :
2379 0 : if (start >= p->end)
2380 0 : n = n->rb_right;
2381 0 : else if (end <= p->start)
2382 0 : n = n->rb_left;
2383 : else
2384 : break;
2385 : }
2386 0 : if (!n)
2387 : return NULL;
2388 0 : for (;;) {
2389 0 : struct sp_node *w = NULL;
2390 0 : struct rb_node *prev = rb_prev(n);
2391 0 : if (!prev)
2392 : break;
2393 0 : w = rb_entry(prev, struct sp_node, nd);
2394 0 : if (w->end <= start)
2395 : break;
2396 : n = prev;
2397 : }
2398 0 : return rb_entry(n, struct sp_node, nd);
2399 : }
2400 :
2401 : /*
2402 : * Insert a new shared policy into the list. Caller holds sp->lock for
2403 : * writing.
2404 : */
2405 0 : static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2406 : {
2407 0 : struct rb_node **p = &sp->root.rb_node;
2408 0 : struct rb_node *parent = NULL;
2409 0 : struct sp_node *nd;
2410 :
2411 0 : while (*p) {
2412 0 : parent = *p;
2413 0 : nd = rb_entry(parent, struct sp_node, nd);
2414 0 : if (new->start < nd->start)
2415 0 : p = &(*p)->rb_left;
2416 0 : else if (new->end > nd->end)
2417 0 : p = &(*p)->rb_right;
2418 : else
2419 0 : BUG();
2420 : }
2421 0 : rb_link_node(&new->nd, parent, p);
2422 0 : rb_insert_color(&new->nd, &sp->root);
2423 0 : pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2424 : new->policy ? new->policy->mode : 0);
2425 0 : }
2426 :
2427 : /* Find shared policy intersecting idx */
2428 : struct mempolicy *
2429 1994 : mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2430 : {
2431 1994 : struct mempolicy *pol = NULL;
2432 1994 : struct sp_node *sn;
2433 :
2434 1994 : if (!sp->root.rb_node)
2435 : return NULL;
2436 0 : read_lock(&sp->lock);
2437 0 : sn = sp_lookup(sp, idx, idx+1);
2438 0 : if (sn) {
2439 0 : mpol_get(sn->policy);
2440 0 : pol = sn->policy;
2441 : }
2442 0 : read_unlock(&sp->lock);
2443 0 : return pol;
2444 : }
2445 :
2446 0 : static void sp_free(struct sp_node *n)
2447 : {
2448 0 : mpol_put(n->policy);
2449 0 : kmem_cache_free(sn_cache, n);
2450 0 : }
2451 :
2452 : /**
2453 : * mpol_misplaced - check whether current page node is valid in policy
2454 : *
2455 : * @page: page to be checked
2456 : * @vma: vm area where page mapped
2457 : * @addr: virtual address where page mapped
2458 : *
2459 : * Lookup current policy node id for vma,addr and "compare to" page's
2460 : * node id.
2461 : *
2462 : * Returns:
2463 : * -1 - not misplaced, page is in the right node
2464 : * node - node id where the page should be
2465 : *
2466 : * Policy determination "mimics" alloc_page_vma().
2467 : * Called from fault path where we know the vma and faulting address.
2468 : */
2469 0 : int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2470 : {
2471 0 : struct mempolicy *pol;
2472 0 : struct zoneref *z;
2473 0 : int curnid = page_to_nid(page);
2474 0 : unsigned long pgoff;
2475 0 : int thiscpu = raw_smp_processor_id();
2476 0 : int thisnid = cpu_to_node(thiscpu);
2477 0 : int polnid = NUMA_NO_NODE;
2478 0 : int ret = -1;
2479 :
2480 0 : pol = get_vma_policy(vma, addr);
2481 0 : if (!(pol->flags & MPOL_F_MOF))
2482 0 : goto out;
2483 :
2484 0 : switch (pol->mode) {
2485 0 : case MPOL_INTERLEAVE:
2486 0 : pgoff = vma->vm_pgoff;
2487 0 : pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2488 0 : polnid = offset_il_node(pol, pgoff);
2489 0 : break;
2490 :
2491 0 : case MPOL_PREFERRED:
2492 0 : if (pol->flags & MPOL_F_LOCAL)
2493 0 : polnid = numa_node_id();
2494 : else
2495 0 : polnid = pol->v.preferred_node;
2496 : break;
2497 :
2498 0 : case MPOL_BIND:
2499 : /* Optimize placement among multiple nodes via NUMA balancing */
2500 0 : if (pol->flags & MPOL_F_MORON) {
2501 0 : if (node_isset(thisnid, pol->v.nodes))
2502 : break;
2503 0 : goto out;
2504 : }
2505 :
2506 : /*
2507 :                  * MPOL_BIND allows binding to multiple nodes.
2508 :                  * Use the current page's node if it is in the policy nodemask,
2509 :                  * else select the nearest allowed node, if any.
2510 :                  * If there are no allowed nodes, use the current node [!misplaced].
2511 : */
2512 0 : if (node_isset(curnid, pol->v.nodes))
2513 0 : goto out;
2514 0 : z = first_zones_zonelist(
2515 : node_zonelist(numa_node_id(), GFP_HIGHUSER),
2516 : gfp_zone(GFP_HIGHUSER),
2517 : &pol->v.nodes);
2518 0 : polnid = zone_to_nid(z->zone);
2519 0 : break;
2520 :
2521 0 : default:
2522 0 : BUG();
2523 : }
2524 :
2525 : /* Migrate the page towards the node whose CPU is referencing it */
2526 0 : if (pol->flags & MPOL_F_MORON) {
2527 0 : polnid = thisnid;
2528 :
2529 0 : if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2530 : goto out;
2531 : }
2532 :
2533 0 : if (curnid != polnid)
2534 0 : ret = polnid;
2535 0 : out:
2536 0 : mpol_cond_put(pol);
2537 :
2538 0 : return ret;
2539 : }
2540 :
2541 : /*
2542 : * Drop the (possibly final) reference to task->mempolicy. It needs to be
2543 : * dropped after task->mempolicy is set to NULL so that any allocation done as
2544 : * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2545 : * policy.
2546 : */
2547 1257 : void mpol_put_task_policy(struct task_struct *task)
2548 : {
2549 1257 : struct mempolicy *pol;
2550 :
2551 1257 : task_lock(task);
2552 1257 : pol = task->mempolicy;
2553 1257 : task->mempolicy = NULL;
2554 1257 : task_unlock(task);
2555 1257 : mpol_put(pol);
2556 1257 : }
2557 :
2558 0 : static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2559 : {
2560            0 :         pr_debug("deleting %lx-%lx\n", n->start, n->end);
2561 0 : rb_erase(&n->nd, &sp->root);
2562 0 : sp_free(n);
2563 0 : }
2564 :
2565 0 : static void sp_node_init(struct sp_node *node, unsigned long start,
2566 : unsigned long end, struct mempolicy *pol)
2567 : {
2568 0 : node->start = start;
2569 0 : node->end = end;
2570 0 : node->policy = pol;
2571 : }
2572 :
2573 0 : static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2574 : struct mempolicy *pol)
2575 : {
2576 0 : struct sp_node *n;
2577 0 : struct mempolicy *newpol;
2578 :
2579 0 : n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2580 0 : if (!n)
2581 : return NULL;
2582 :
2583 0 : newpol = mpol_dup(pol);
2584 0 : if (IS_ERR(newpol)) {
2585 0 : kmem_cache_free(sn_cache, n);
2586 0 : return NULL;
2587 : }
2588 0 : newpol->flags |= MPOL_F_SHARED;
2589 0 : sp_node_init(n, start, end, newpol);
2590 :
2591 0 : return n;
2592 : }
2593 :
2594 : /* Replace a policy range. */
2595 0 : static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2596 : unsigned long end, struct sp_node *new)
2597 : {
2598 0 : struct sp_node *n;
2599 0 : struct sp_node *n_new = NULL;
2600 0 : struct mempolicy *mpol_new = NULL;
2601 0 : int ret = 0;
2602 :
2603 0 : restart:
2604 0 : write_lock(&sp->lock);
2605 0 : n = sp_lookup(sp, start, end);
2606 : /* Take care of old policies in the same range. */
2607 0 : while (n && n->start < end) {
2608 0 : struct rb_node *next = rb_next(&n->nd);
2609 0 : if (n->start >= start) {
2610 0 : if (n->end <= end)
2611 0 : sp_delete(sp, n);
2612 : else
2613 0 : n->start = end;
2614 : } else {
2615 : /* Old policy spanning whole new range. */
2616 0 : if (n->end > end) {
2617 0 : if (!n_new)
2618 0 : goto alloc_new;
2619 :
2620 0 : *mpol_new = *n->policy;
2621 0 : atomic_set(&mpol_new->refcnt, 1);
2622 0 : sp_node_init(n_new, end, n->end, mpol_new);
2623 0 : n->end = start;
2624 0 : sp_insert(sp, n_new);
2625 0 : n_new = NULL;
2626 0 : mpol_new = NULL;
2627 0 : break;
2628 : } else
2629 0 : n->end = start;
2630 : }
2631 0 : if (!next)
2632 : break;
2633 0 : n = rb_entry(next, struct sp_node, nd);
2634 : }
2635 0 : if (new)
2636 0 : sp_insert(sp, new);
2637 0 : write_unlock(&sp->lock);
2638 0 : ret = 0;
2639 :
2640 0 : err_out:
2641 0 : if (mpol_new)
2642 0 : mpol_put(mpol_new);
2643 0 : if (n_new)
2644 0 : kmem_cache_free(sn_cache, n_new);
2645 :
2646 0 : return ret;
2647 :
2648 0 : alloc_new:
2649 0 : write_unlock(&sp->lock);
2650 0 : ret = -ENOMEM;
2651 0 : n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2652 0 : if (!n_new)
2653 0 : goto err_out;
2654 0 : mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2655 0 : if (!mpol_new)
2656 0 : goto err_out;
2657 0 : goto restart;
2658 : }
2659 :
2660 : /**
2661 : * mpol_shared_policy_init - initialize shared policy for inode
2662 : * @sp: pointer to inode shared policy
2663 : * @mpol: struct mempolicy to install
2664 : *
2665 : * Install non-NULL @mpol in inode's shared policy rb-tree.
2666 : * On entry, the current task has a reference on a non-NULL @mpol.
2667 : * This must be released on exit.
2668 :  * This is called at get_inode() time, so we can use GFP_KERNEL.
2669 : */
2670 967 : void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2671 : {
2672 967 : int ret;
2673 :
2674 967 : sp->root = RB_ROOT; /* empty tree == default mempolicy */
2675 967 : rwlock_init(&sp->lock);
2676 :
2677 967 : if (mpol) {
2678 0 : struct vm_area_struct pvma;
2679 0 : struct mempolicy *new;
2680 0 : NODEMASK_SCRATCH(scratch);
2681 :
2682 0 : if (!scratch)
2683 : goto put_mpol;
2684 : /* contextualize the tmpfs mount point mempolicy */
2685 0 : new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2686 0 : if (IS_ERR(new))
2687 0 : goto free_scratch; /* no valid nodemask intersection */
2688 :
2689 0 : task_lock(current);
2690 0 : ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2691 0 : task_unlock(current);
2692 0 : if (ret)
2693 0 : goto put_new;
2694 :
2695 : /* Create pseudo-vma that contains just the policy */
2696 0 : vma_init(&pvma, NULL);
2697 0 : pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2698 0 : mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2699 :
2700 0 : put_new:
2701 0 : mpol_put(new); /* drop initial ref */
2702 0 : free_scratch:
2703 0 : NODEMASK_SCRATCH_FREE(scratch);
2704 0 : put_mpol:
2705 0 : mpol_put(mpol); /* drop our incoming ref on sb mpol */
2706 : }
2707 967 : }
2708 :
2709 0 : int mpol_set_shared_policy(struct shared_policy *info,
2710 : struct vm_area_struct *vma, struct mempolicy *npol)
2711 : {
2712 0 : int err;
2713 0 : struct sp_node *new = NULL;
2714 0 : unsigned long sz = vma_pages(vma);
2715 :
2716 0 : pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2717 : vma->vm_pgoff,
2718 : sz, npol ? npol->mode : -1,
2719 : npol ? npol->flags : -1,
2720 : npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2721 :
2722 0 : if (npol) {
2723 0 : new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2724 0 : if (!new)
2725 : return -ENOMEM;
2726 : }
2727 0 : err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2728 0 : if (err && new)
2729 0 : sp_free(new);
2730 : return err;
2731 : }
2732 :
2733 : /* Free a backing policy store on inode delete. */
2734 588 : void mpol_free_shared_policy(struct shared_policy *p)
2735 : {
2736 588 : struct sp_node *n;
2737 588 : struct rb_node *next;
2738 :
2739 588 : if (!p->root.rb_node)
2740 : return;
2741 0 : write_lock(&p->lock);
2742 0 : next = rb_first(&p->root);
2743 0 : while (next) {
2744 0 : n = rb_entry(next, struct sp_node, nd);
2745 0 : next = rb_next(&n->nd);
2746 0 : sp_delete(p, n);
2747 : }
2748 0 : write_unlock(&p->lock);
2749 : }
2750 :
2751 : #ifdef CONFIG_NUMA_BALANCING
2752 : static int __initdata numabalancing_override;
2753 :
2754 : static void __init check_numabalancing_enable(void)
2755 : {
2756 : bool numabalancing_default = false;
2757 :
2758 : if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2759 : numabalancing_default = true;
2760 :
2761 : /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2762 : if (numabalancing_override)
2763 : set_numabalancing_state(numabalancing_override == 1);
2764 :
2765 : if (num_online_nodes() > 1 && !numabalancing_override) {
2766 : pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2767 : numabalancing_default ? "Enabling" : "Disabling");
2768 : set_numabalancing_state(numabalancing_default);
2769 : }
2770 : }
2771 :
2772 : static int __init setup_numabalancing(char *str)
2773 : {
2774 : int ret = 0;
2775 : if (!str)
2776 : goto out;
2777 :
2778 : if (!strcmp(str, "enable")) {
2779 : numabalancing_override = 1;
2780 : ret = 1;
2781 : } else if (!strcmp(str, "disable")) {
2782 : numabalancing_override = -1;
2783 : ret = 1;
2784 : }
2785 : out:
2786 : if (!ret)
2787 : pr_warn("Unable to parse numa_balancing=\n");
2788 :
2789 : return ret;
2790 : }
2791 : __setup("numa_balancing=", setup_numabalancing);
2792 : #else
2793 1 : static inline void __init check_numabalancing_enable(void)
2794 : {
2795 1 : }
2796 : #endif /* CONFIG_NUMA_BALANCING */
2797 :
2798 : /* assumes fs == KERNEL_DS */
2799 1 : void __init numa_policy_init(void)
2800 : {
2801 1 : nodemask_t interleave_nodes;
2802 1 : unsigned long largest = 0;
2803 1 : int nid, prefer = 0;
2804 :
2805 1 : policy_cache = kmem_cache_create("numa_policy",
2806 : sizeof(struct mempolicy),
2807 : 0, SLAB_PANIC, NULL);
2808 :
2809 1 : sn_cache = kmem_cache_create("shared_policy_node",
2810 : sizeof(struct sp_node),
2811 : 0, SLAB_PANIC, NULL);
2812 :
2813 2 : for_each_node(nid) {
2814 1 : preferred_node_policy[nid] = (struct mempolicy) {
2815 : .refcnt = ATOMIC_INIT(1),
2816 : .mode = MPOL_PREFERRED,
2817 : .flags = MPOL_F_MOF | MPOL_F_MORON,
2818 : .v = { .preferred_node = nid, },
2819 : };
2820 : }
2821 :
2822 : /*
2823 : * Set interleaving policy for system init. Interleaving is only
2824 :          * enabled across suitably sized nodes (default is >= 16MB), falling
2825 :          * back to the largest node if they're all smaller.
2826 : */
2827 1 : nodes_clear(interleave_nodes);
2828 2 : for_each_node_state(nid, N_MEMORY) {
2829 1 : unsigned long total_pages = node_present_pages(nid);
2830 :
2831 : /* Preserve the largest node */
2832 1 : if (largest < total_pages) {
2833 1 : largest = total_pages;
2834 1 : prefer = nid;
2835 : }
2836 :
2837 : /* Interleave this node? */
2838 1 : if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2839 2 : node_set(nid, interleave_nodes);
2840 : }
2841 :
2842 : /* All too small, use the largest */
2843 1 : if (unlikely(nodes_empty(interleave_nodes)))
2844 0 : node_set(prefer, interleave_nodes);
2845 :
2846 1 : if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2847 0 : pr_err("%s: interleaving failed\n", __func__);
2848 :
2849 1 : check_numabalancing_enable();
2850 1 : }
2851 :
2852 : /* Reset policy of current process to default */
2853 2 : void numa_default_policy(void)
2854 : {
2855 2 : do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2856 2 : }
2857 :
2858 : /*
2859 : * Parse and format mempolicy from/to strings
2860 : */
2861 :
2862 : /*
2863 : * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2864 : */
2865 : static const char * const policy_modes[] =
2866 : {
2867 : [MPOL_DEFAULT] = "default",
2868 : [MPOL_PREFERRED] = "prefer",
2869 : [MPOL_BIND] = "bind",
2870 : [MPOL_INTERLEAVE] = "interleave",
2871 : [MPOL_LOCAL] = "local",
2872 : };
2873 :
2874 :
2875 : #ifdef CONFIG_TMPFS
2876 : /**
2877 : * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2878 : * @str: string containing mempolicy to parse
2879 : * @mpol: pointer to struct mempolicy pointer, returned on success.
2880 : *
2881 : * Format of input:
2882 : * <mode>[=<flags>][:<nodelist>]
2883 : *
2884 : * On success, returns 0, else 1
2885 : */
2886 0 : int mpol_parse_str(char *str, struct mempolicy **mpol)
2887 : {
2888 0 : struct mempolicy *new = NULL;
2889 0 : unsigned short mode_flags;
2890 0 : nodemask_t nodes;
2891 0 : char *nodelist = strchr(str, ':');
2892 0 : char *flags = strchr(str, '=');
2893 0 : int err = 1, mode;
2894 :
2895 0 : if (flags)
2896 0 : *flags++ = '\0'; /* terminate mode string */
2897 :
2898 0 : if (nodelist) {
2899 : /* NUL-terminate mode or flags string */
2900 0 : *nodelist++ = '\0';
2901 0 : if (nodelist_parse(nodelist, nodes))
2902 0 : goto out;
2903 0 : if (!nodes_subset(nodes, node_states[N_MEMORY]))
2904 0 : goto out;
2905 : } else
2906 0 : nodes_clear(nodes);
2907 :
2908 0 : mode = match_string(policy_modes, MPOL_MAX, str);
2909 0 : if (mode < 0)
2910 0 : goto out;
2911 :
2912 0 : switch (mode) {
2913 0 : case MPOL_PREFERRED:
2914 : /*
2915 : * Insist on a nodelist of one node only, although later
2916 : * we use first_node(nodes) to grab a single node, so here
2917 : * nodelist (or nodes) cannot be empty.
2918 : */
2919 0 : if (nodelist) {
2920 : char *rest = nodelist;
2921 0 : while (isdigit(*rest))
2922 0 : rest++;
2923 0 : if (*rest)
2924 0 : goto out;
2925 0 : if (nodes_empty(nodes))
2926 0 : goto out;
2927 : }
2928 : break;
2929 0 : case MPOL_INTERLEAVE:
2930 : /*
2931 : * Default to online nodes with memory if no nodelist
2932 : */
2933 0 : if (!nodelist)
2934 0 : nodes = node_states[N_MEMORY];
2935 : break;
2936 0 : case MPOL_LOCAL:
2937 : /*
2938 : * Don't allow a nodelist; mpol_new() checks flags
2939 : */
2940 0 : if (nodelist)
2941 0 : goto out;
2942 : mode = MPOL_PREFERRED;
2943 : break;
2944 0 : case MPOL_DEFAULT:
2945 : /*
2946 :                  * Insist on an empty nodelist
2947 : */
2948 0 : if (!nodelist)
2949 0 : err = 0;
2950 0 : goto out;
2951 0 : case MPOL_BIND:
2952 : /*
2953 : * Insist on a nodelist
2954 : */
2955 0 : if (!nodelist)
2956 0 : goto out;
2957 : }
2958 :
2959 0 : mode_flags = 0;
2960 0 : if (flags) {
2961 : /*
2962 : * Currently, we only support two mutually exclusive
2963 : * mode flags.
2964 : */
2965 0 : if (!strcmp(flags, "static"))
2966 : mode_flags |= MPOL_F_STATIC_NODES;
2967 0 : else if (!strcmp(flags, "relative"))
2968 : mode_flags |= MPOL_F_RELATIVE_NODES;
2969 : else
2970 0 : goto out;
2971 : }
2972 :
2973 0 : new = mpol_new(mode, mode_flags, &nodes);
2974 0 : if (IS_ERR(new))
2975 0 : goto out;
2976 :
2977 : /*
2978 : * Save nodes for mpol_to_str() to show the tmpfs mount options
2979 : * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2980 : */
2981 0 : if (mode != MPOL_PREFERRED)
2982 0 : new->v.nodes = nodes;
2983 0 : else if (nodelist)
2984 0 : new->v.preferred_node = first_node(nodes);
2985 : else
2986 0 : new->flags |= MPOL_F_LOCAL;
2987 :
2988 : /*
2989 : * Save nodes for contextualization: this will be used to "clone"
2990 : * the mempolicy in a specific context [cpuset] at a later time.
2991 : */
2992 0 : new->w.user_nodemask = nodes;
2993 :
2994 0 : err = 0;
2995 :
2996 0 : out:
2997 : /* Restore string for error message */
2998 0 : if (nodelist)
2999 0 : *--nodelist = ':';
3000 0 : if (flags)
3001 0 : *--flags = '=';
3002 0 : if (!err)
3003 0 : *mpol = new;
3004 0 : return err;
3005 : }
3006 : #endif /* CONFIG_TMPFS */
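
Hypothetical examples of strings accepted by the parser above, following the <mode>[=<flags>][:<nodelist>] format (the node numbers are illustrative and must name nodes with memory):

	default
	prefer:2
	bind:0,2
	bind=static:0-3
	interleave=relative:0,2
	local

On success the caller receives a freshly allocated mempolicy in *mpol.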
3007 :
3008 : /**
3009 : * mpol_to_str - format a mempolicy structure for printing
3010 : * @buffer: to contain formatted mempolicy string
3011 : * @maxlen: length of @buffer
3012 : * @pol: pointer to mempolicy to be formatted
3013 : *
3014 : * Convert @pol into a string. If @buffer is too short, truncate the string.
3015 : * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
3016 : * longest flag, "relative", and to display at least a few node ids.
3017 : */
3018 0 : void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
3019 : {
3020 0 : char *p = buffer;
3021 0 : nodemask_t nodes = NODE_MASK_NONE;
3022 0 : unsigned short mode = MPOL_DEFAULT;
3023 0 : unsigned short flags = 0;
3024 :
3025 0 : if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
3026 0 : mode = pol->mode;
3027 0 : flags = pol->flags;
3028 : }
3029 :
3030 0 : switch (mode) {
3031 : case MPOL_DEFAULT:
3032 : break;
3033 0 : case MPOL_PREFERRED:
3034 0 : if (flags & MPOL_F_LOCAL)
3035 : mode = MPOL_LOCAL;
3036 : else
3037 0 : node_set(pol->v.preferred_node, nodes);
3038 : break;
3039 0 : case MPOL_BIND:
3040 : case MPOL_INTERLEAVE:
3041 0 : nodes = pol->v.nodes;
3042 0 : break;
3043 : default:
3044 0 : WARN_ON_ONCE(1);
3045 0 : snprintf(p, maxlen, "unknown");
3046 0 : return;
3047 : }
3048 :
3049 0 : p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3050 :
3051 0 : if (flags & MPOL_MODE_FLAGS) {
3052 0 : p += snprintf(p, buffer + maxlen - p, "=");
3053 :
3054 : /*
3055 : * Currently, the only defined flags are mutually exclusive
3056 : */
3057 0 : if (flags & MPOL_F_STATIC_NODES)
3058 0 : p += snprintf(p, buffer + maxlen - p, "static");
3059 0 : else if (flags & MPOL_F_RELATIVE_NODES)
3060 0 : p += snprintf(p, buffer + maxlen - p, "relative");
3061 : }
3062 :
3063 0 : if (!nodes_empty(nodes))
3064 0 : p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3065 : nodemask_pr_args(&nodes));
3066 : }
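
A hypothetical sketch showing how the formatter above is typically driven (the buffer size and function name are illustrative); output looks like "interleave:0-3", "prefer=static:1", "bind:0,2", "local" or "default", mirroring the format handled by mpol_parse_str().

    static void example_show_policy(struct mempolicy *pol)
    {
    	char buf[64];

    	mpol_to_str(buf, sizeof(buf), pol);
    	pr_info("mempolicy: %s\n", buf);
    }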
|