LCOV - code coverage report
Current view: top level - mm - mempolicy.c (source / functions)
Test:         landlock.info
Date:         2021-04-22 12:43:58
                  Hit    Total    Coverage
Lines:            209     1225      17.1 %
Functions:         23       87      26.4 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : /*
       3             :  * Simple NUMA memory policy for the Linux kernel.
       4             :  *
       5             :  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
       6             :  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
       7             :  *
       8             :  * NUMA policy allows the user to give hints in which node(s) memory should
       9             :  * be allocated.
      10             :  *
      11             :  * Support four policies per VMA and per process:
      12             :  *
      13             :  * The VMA policy has priority over the process policy for a page fault.
      14             :  *
      15             :  * interleave     Allocate memory interleaved over a set of nodes,
      16             :  *                with normal fallback if it fails.
      17             :  *                For VMA based allocations this interleaves based on the
      18             :  *                offset into the backing object or offset into the mapping
      19             :  *                for anonymous memory. For process policy a process counter
      20             :  *                is used.
      21             :  *
      22             :  * bind           Only allocate memory on a specific set of nodes,
      23             :  *                no fallback.
      24             :  *                FIXME: memory is allocated starting with the first node
      25             :  *                to the last. It would be better if bind would truly restrict
      26             :  *                the allocation to memory nodes instead
      27             :  *
      28             :  * preferred       Try a specific node first before normal fallback.
      29             :  *                As a special case NUMA_NO_NODE here means do the allocation
      30             :  *                on the local CPU. This is normally identical to default,
      31             :  *                but useful to set in a VMA when you have a non-default
      32             :  *                process policy.
      33             :  *
      34             :  * default        Allocate on the local node first, or when on a VMA
      35             :  *                use the process policy. This is what Linux always did
      36             :  *                in a NUMA aware kernel and still does by, ahem, default.
      37             :  *
      38             :  * The process policy is applied for most non-interrupt memory allocations
      39             :  * in that process' context. Interrupts ignore the policies and always
      40             :  * try to allocate on the local CPU. The VMA policy is only applied for memory
      41             :  * allocations for a VMA in the VM.
      42             :  *
      43             :  * Currently there are a few corner cases in swapping where the policy
      44             :  * is not applied, but the majority should be handled. When process policy
      45             :  * is used it is not remembered over swap outs/swap ins.
      46             :  *
      47             :  * Only the highest zone in the zone hierarchy gets policied. Allocations
      48             :  * requesting a lower zone just use default policy. This implies that
      49             :  * on systems with highmem, kernel lowmem allocations don't get policied.
      50             :  * Same with GFP_DMA allocations.
      51             :  *
      52             :  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
      53             :  * all users and remembered even when nobody has memory mapped.
      54             :  */
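For reference, the modes described above map onto the set_mempolicy(2) and mbind(2) system calls. Below is a minimal userspace sketch, assuming libnuma's <numaif.h> wrappers, at least two online nodes, and illustrative variable names (build with -lnuma; error handling kept short):

/* Illustrative usage only -- not part of mempolicy.c. */
#include <numaif.h>          /* set_mempolicy(), mbind(), MPOL_* */
#include <sys/mman.h>        /* mmap() */
#include <stdio.h>           /* perror() */

int main(void)
{
        /* Process policy: interleave future allocations over nodes 0 and 1. */
        unsigned long interleave_nodes = (1UL << 0) | (1UL << 1);

        if (set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes,
                          8 * sizeof(interleave_nodes)))
                perror("set_mempolicy(MPOL_INTERLEAVE)");

        /* VMA policy: bind one mapping to node 0 only; for a page fault in
         * this range the VMA policy has priority over the process policy. */
        unsigned long node0 = 1UL << 0;
        size_t len = 1UL << 20;
        void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED ||
            mbind(buf, len, MPOL_BIND, &node0, 8 * sizeof(node0), 0))
                perror("mmap/mbind(MPOL_BIND)");

        /* Back to the default policy: allocate on the local node. */
        set_mempolicy(MPOL_DEFAULT, NULL, 0);
        return 0;
}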
      55             : 
      56             : /* Notebook:
      57             :    fix mmap readahead to honour policy and enable policy for any page cache
      58             :    object
      59             :    statistics for bigpages
      60             :    global policy for page cache? currently it uses process policy. Requires
      61             :    first item above.
      62             :    handle mremap for shared memory (currently ignored for the policy)
      63             :    grows down?
      64             :    make bind policy root only? It can trigger oom much faster and the
      65             :    kernel is not always graceful about that.
      66             : */
      67             : 
      68             : #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
      69             : 
      70             : #include <linux/mempolicy.h>
      71             : #include <linux/pagewalk.h>
      72             : #include <linux/highmem.h>
      73             : #include <linux/hugetlb.h>
      74             : #include <linux/kernel.h>
      75             : #include <linux/sched.h>
      76             : #include <linux/sched/mm.h>
      77             : #include <linux/sched/numa_balancing.h>
      78             : #include <linux/sched/task.h>
      79             : #include <linux/nodemask.h>
      80             : #include <linux/cpuset.h>
      81             : #include <linux/slab.h>
      82             : #include <linux/string.h>
      83             : #include <linux/export.h>
      84             : #include <linux/nsproxy.h>
      85             : #include <linux/interrupt.h>
      86             : #include <linux/init.h>
      87             : #include <linux/compat.h>
      88             : #include <linux/ptrace.h>
      89             : #include <linux/swap.h>
      90             : #include <linux/seq_file.h>
      91             : #include <linux/proc_fs.h>
      92             : #include <linux/migrate.h>
      93             : #include <linux/ksm.h>
      94             : #include <linux/rmap.h>
      95             : #include <linux/security.h>
      96             : #include <linux/syscalls.h>
      97             : #include <linux/ctype.h>
      98             : #include <linux/mm_inline.h>
      99             : #include <linux/mmu_notifier.h>
     100             : #include <linux/printk.h>
     101             : #include <linux/swapops.h>
     102             : 
     103             : #include <asm/tlbflush.h>
     104             : #include <linux/uaccess.h>
     105             : 
     106             : #include "internal.h"
     107             : 
     108             : /* Internal flags */
     109             : #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)      /* Skip checks for contiguous vmas */
     110             : #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)            /* Invert check for nodemask */
     111             : 
     112             : static struct kmem_cache *policy_cache;
     113             : static struct kmem_cache *sn_cache;
     114             : 
     115             : /* Highest zone. A specific allocation for a zone below that is not
     116             :    policied. */
     117             : enum zone_type policy_zone = 0;
     118             : 
     119             : /*
     120             :  * run-time system-wide default policy => local allocation
     121             :  */
     122             : static struct mempolicy default_policy = {
     123             :         .refcnt = ATOMIC_INIT(1), /* never free it */
     124             :         .mode = MPOL_PREFERRED,
     125             :         .flags = MPOL_F_LOCAL,
     126             : };
     127             : 
     128             : static struct mempolicy preferred_node_policy[MAX_NUMNODES];
     129             : 
     130             : /**
     131             :  * numa_map_to_online_node - Find closest online node
     132             :  * @node: Node id to start the search
     133             :  *
     134             :  * Look up the next closest node by distance if @node is not online.
     135             :  */
     136           0 : int numa_map_to_online_node(int node)
     137             : {
     138           0 :         int min_dist = INT_MAX, dist, n, min_node;
     139             : 
     140           0 :         if (node == NUMA_NO_NODE || node_online(node))
     141           0 :                 return node;
     142             : 
     143           0 :         min_node = node;
     144           0 :         for_each_online_node(n) {
     145           0 :                 dist = node_distance(node, n);
     146           0 :                 if (dist < min_dist) {
     147           0 :                         min_dist = dist;
     148           0 :                         min_node = n;
     149             :                 }
     150             :         }
     151             : 
     152             :         return min_node;
     153             : }
     154             : EXPORT_SYMBOL_GPL(numa_map_to_online_node);
     155             : 
     156      192491 : struct mempolicy *get_task_policy(struct task_struct *p)
     157             : {
     158      192491 :         struct mempolicy *pol = p->mempolicy;
     159      192491 :         int node;
     160             : 
     161      192491 :         if (pol)
     162             :                 return pol;
     163             : 
     164      188781 :         node = numa_node_id();
     165      188781 :         if (node != NUMA_NO_NODE) {
     166      188781 :                 pol = &preferred_node_policy[node];
     167             :                 /* preferred_node_policy is not initialised early in boot */
     168      188781 :                 if (pol->mode)
     169      188499 :                         return pol;
     170             :         }
     171             : 
     172             :         return &default_policy;
     173             : }
     174             : 
     175             : static const struct mempolicy_operations {
     176             :         int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
     177             :         void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
     178             : } mpol_ops[MPOL_MAX];
     179             : 
     180           1 : static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
     181             : {
     182           1 :         return pol->flags & MPOL_MODE_FLAGS;
     183             : }
     184             : 
     185           0 : static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
     186             :                                    const nodemask_t *rel)
     187             : {
     188           0 :         nodemask_t tmp;
     189           0 :         nodes_fold(tmp, *orig, nodes_weight(*rel));
     190           0 :         nodes_onto(*ret, tmp, *rel);
     191           0 : }
     192             : 
     193           1 : static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
     194             : {
     195           1 :         if (nodes_empty(*nodes))
     196             :                 return -EINVAL;
     197           1 :         pol->v.nodes = *nodes;
     198           1 :         return 0;
     199             : }
     200             : 
     201           0 : static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
     202             : {
     203           0 :         if (!nodes)
     204           0 :                 pol->flags |= MPOL_F_LOCAL;  /* local allocation */
     205           0 :         else if (nodes_empty(*nodes))
     206             :                 return -EINVAL;                 /*  no allowed nodes */
     207             :         else
     208           0 :                 pol->v.preferred_node = first_node(*nodes);
     209             :         return 0;
     210             : }
     211             : 
     212           0 : static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
     213             : {
     214           0 :         if (nodes_empty(*nodes))
     215             :                 return -EINVAL;
     216           0 :         pol->v.nodes = *nodes;
     217           0 :         return 0;
     218             : }
     219             : 
     220             : /*
     221             :  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
     222             :  * any, for the new policy.  mpol_new() has already validated the nodes
     223             :  * parameter with respect to the policy mode and flags.  But, we need to
     224             :  * handle an empty nodemask with MPOL_PREFERRED here.
     225             :  *
     226             :  * Must be called holding task's alloc_lock to protect task's mems_allowed
     227             :  * and mempolicy.  May also be called holding the mmap_lock for write.
     228             :  */
     229           3 : static int mpol_set_nodemask(struct mempolicy *pol,
     230             :                      const nodemask_t *nodes, struct nodemask_scratch *nsc)
     231             : {
     232           3 :         int ret;
     233             : 
     234             :         /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
     235           3 :         if (pol == NULL)
     236             :                 return 0;
     237             :         /* Check N_MEMORY */
     238           1 :         nodes_and(nsc->mask1,
     239             :                   cpuset_current_mems_allowed, node_states[N_MEMORY]);
     240             : 
     241           1 :         VM_BUG_ON(!nodes);
     242           1 :         if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
     243             :                 nodes = NULL;   /* explicit local allocation */
     244             :         else {
     245           1 :                 if (pol->flags & MPOL_F_RELATIVE_NODES)
     246           0 :                         mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
     247             :                 else
     248           1 :                         nodes_and(nsc->mask2, *nodes, nsc->mask1);
     249             : 
     250           1 :                 if (mpol_store_user_nodemask(pol))
     251           0 :                         pol->w.user_nodemask = *nodes;
     252             :                 else
     253           1 :                         pol->w.cpuset_mems_allowed =
     254             :                                                 cpuset_current_mems_allowed;
     255             :         }
     256             : 
     257           1 :         if (nodes)
     258           1 :                 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
     259             :         else
     260           0 :                 ret = mpol_ops[pol->mode].create(pol, NULL);
     261             :         return ret;
     262             : }
     263             : 
     264             : /*
     265             :  * This function just creates a new policy, does some checks and simple
     266             :  * initialization. You must invoke mpol_set_nodemask() to set nodes.
     267             :  */
     268           3 : static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
     269             :                                   nodemask_t *nodes)
     270             : {
     271           3 :         struct mempolicy *policy;
     272             : 
     273           3 :         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
     274             :                  mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
     275             : 
     276           3 :         if (mode == MPOL_DEFAULT) {
     277           2 :                 if (nodes && !nodes_empty(*nodes))
     278           3 :                         return ERR_PTR(-EINVAL);
     279           2 :                 return NULL;
     280             :         }
     281           1 :         VM_BUG_ON(!nodes);
     282             : 
     283             :         /*
     284             :          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
     285             :          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
     286             :          * All other modes require a valid pointer to a non-empty nodemask.
     287             :          */
     288           1 :         if (mode == MPOL_PREFERRED) {
     289           0 :                 if (nodes_empty(*nodes)) {
     290           0 :                         if (((flags & MPOL_F_STATIC_NODES) ||
     291             :                              (flags & MPOL_F_RELATIVE_NODES)))
     292           3 :                                 return ERR_PTR(-EINVAL);
     293             :                 }
     294           1 :         } else if (mode == MPOL_LOCAL) {
     295           0 :                 if (!nodes_empty(*nodes) ||
     296           0 :                     (flags & MPOL_F_STATIC_NODES) ||
     297             :                     (flags & MPOL_F_RELATIVE_NODES))
     298           3 :                         return ERR_PTR(-EINVAL);
     299             :                 mode = MPOL_PREFERRED;
     300           1 :         } else if (nodes_empty(*nodes))
     301           3 :                 return ERR_PTR(-EINVAL);
     302           1 :         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
     303           1 :         if (!policy)
     304           3 :                 return ERR_PTR(-ENOMEM);
     305           1 :         atomic_set(&policy->refcnt, 1);
     306           1 :         policy->mode = mode;
     307           1 :         policy->flags = flags;
     308             : 
     309           1 :         return policy;
     310             : }
     311             : 
     312             : /* Slow path of a mpol destructor. */
     313           2 : void __mpol_put(struct mempolicy *p)
     314             : {
     315           4 :         if (!atomic_dec_and_test(&p->refcnt))
     316             :                 return;
     317           2 :         kmem_cache_free(policy_cache, p);
     318             : }
     319             : 
     320           0 : static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
     321             : {
     322           0 : }
     323             : 
     324           0 : static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
     325             : {
     326           0 :         nodemask_t tmp;
     327             : 
     328           0 :         if (pol->flags & MPOL_F_STATIC_NODES)
     329           0 :                 nodes_and(tmp, pol->w.user_nodemask, *nodes);
     330           0 :         else if (pol->flags & MPOL_F_RELATIVE_NODES)
     331           0 :                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
     332             :         else {
     333           0 :                 nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
     334             :                                                                 *nodes);
     335           0 :                 pol->w.cpuset_mems_allowed = *nodes;
     336             :         }
     337             : 
     338           0 :         if (nodes_empty(tmp))
     339           0 :                 tmp = *nodes;
     340             : 
     341           0 :         pol->v.nodes = tmp;
     342           0 : }
     343             : 
     344           0 : static void mpol_rebind_preferred(struct mempolicy *pol,
     345             :                                                 const nodemask_t *nodes)
     346             : {
     347           0 :         nodemask_t tmp;
     348             : 
     349           0 :         if (pol->flags & MPOL_F_STATIC_NODES) {
     350           0 :                 int node = first_node(pol->w.user_nodemask);
     351             : 
     352           0 :                 if (node_isset(node, *nodes)) {
     353           0 :                         pol->v.preferred_node = node;
     354           0 :                         pol->flags &= ~MPOL_F_LOCAL;
     355             :                 } else
     356           0 :                         pol->flags |= MPOL_F_LOCAL;
     357           0 :         } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
     358           0 :                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
     359           0 :                 pol->v.preferred_node = first_node(tmp);
     360           0 :         } else if (!(pol->flags & MPOL_F_LOCAL)) {
     361           0 :                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
     362             :                                                    pol->w.cpuset_mems_allowed,
     363             :                                                    *nodes);
     364           0 :                 pol->w.cpuset_mems_allowed = *nodes;
     365             :         }
     366           0 : }
     367             : 
     368             : /*
     369             :  * mpol_rebind_policy - Migrate a policy to a different set of nodes
     370             :  *
     371             :  * Per-vma policies are protected by mmap_lock. Allocations using per-task
     372             :  * policies are protected by task->mems_allowed_seq to prevent a premature
     373             :  * OOM/allocation failure due to parallel nodemask modification.
     374             :  */
     375           0 : static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
     376             : {
     377           0 :         if (!pol)
     378             :                 return;
     379           0 :         if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
     380           0 :             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
     381             :                 return;
     382             : 
     383           0 :         mpol_ops[pol->mode].rebind(pol, newmask);
     384             : }
     385             : 
     386             : /*
     387             :  * Wrapper for mpol_rebind_policy() that just requires task
     388             :  * pointer, and updates task mempolicy.
     389             :  *
     390             :  * Called with task's alloc_lock held.
     391             :  */
     392             : 
     393           0 : void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
     394             : {
     395           0 :         mpol_rebind_policy(tsk->mempolicy, new);
     396           0 : }
     397             : 
     398             : /*
     399             :  * Rebind each vma in mm to new nodemask.
     400             :  *
     401             :  * Call holding a reference to mm.  Takes mm->mmap_lock during call.
     402             :  */
     403             : 
     404           0 : void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
     405             : {
     406           0 :         struct vm_area_struct *vma;
     407             : 
     408           0 :         mmap_write_lock(mm);
     409           0 :         for (vma = mm->mmap; vma; vma = vma->vm_next)
     410           0 :                 mpol_rebind_policy(vma->vm_policy, new);
     411           0 :         mmap_write_unlock(mm);
     412           0 : }
     413             : 
     414             : static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
     415             :         [MPOL_DEFAULT] = {
     416             :                 .rebind = mpol_rebind_default,
     417             :         },
     418             :         [MPOL_INTERLEAVE] = {
     419             :                 .create = mpol_new_interleave,
     420             :                 .rebind = mpol_rebind_nodemask,
     421             :         },
     422             :         [MPOL_PREFERRED] = {
     423             :                 .create = mpol_new_preferred,
     424             :                 .rebind = mpol_rebind_preferred,
     425             :         },
     426             :         [MPOL_BIND] = {
     427             :                 .create = mpol_new_bind,
     428             :                 .rebind = mpol_rebind_nodemask,
     429             :         },
     430             : };
     431             : 
     432             : static int migrate_page_add(struct page *page, struct list_head *pagelist,
     433             :                                 unsigned long flags);
     434             : 
     435             : struct queue_pages {
     436             :         struct list_head *pagelist;
     437             :         unsigned long flags;
     438             :         nodemask_t *nmask;
     439             :         unsigned long start;
     440             :         unsigned long end;
     441             :         struct vm_area_struct *first;
     442             : };
     443             : 
     444             : /*
     445             :  * Check if the page's nid is in qp->nmask.
     446             :  *
     447             :  * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
     448             :  * in the invert of qp->nmask.
     449             :  */
     450           0 : static inline bool queue_pages_required(struct page *page,
     451             :                                         struct queue_pages *qp)
     452             : {
     453           0 :         int nid = page_to_nid(page);
     454           0 :         unsigned long flags = qp->flags;
     455             : 
     456           0 :         return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
     457             : }
     458             : 
     459             : /*
     460             :  * queue_pages_pmd() has four possible return values:
     461             :  * 0 - pages are placed on the right node or queued successfully.
     462             :  * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
     463             :  *     specified.
     464             :  * 2 - THP was split.
     465             :  * -EIO - the pmd is a migration entry, or only MPOL_MF_STRICT was specified
     466             :  *        and an existing page was already on a node that does not follow
     467             :  *        the policy.
     468             :  */
     469           0 : static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
     470             :                                 unsigned long end, struct mm_walk *walk)
     471             :         __releases(ptl)
     472             : {
     473           0 :         int ret = 0;
     474           0 :         struct page *page;
     475           0 :         struct queue_pages *qp = walk->private;
     476           0 :         unsigned long flags;
     477             : 
     478           0 :         if (unlikely(is_pmd_migration_entry(*pmd))) {
     479           0 :                 ret = -EIO;
     480           0 :                 goto unlock;
     481             :         }
     482           0 :         page = pmd_page(*pmd);
     483           0 :         if (is_huge_zero_page(page)) {
     484           0 :                 spin_unlock(ptl);
     485           0 :                 __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
     486           0 :                 ret = 2;
     487           0 :                 goto out;
     488             :         }
     489           0 :         if (!queue_pages_required(page, qp))
     490           0 :                 goto unlock;
     491             : 
     492           0 :         flags = qp->flags;
     493             :         /* go to thp migration */
     494           0 :         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
     495           0 :                 if (!vma_migratable(walk->vma) ||
     496           0 :                     migrate_page_add(page, qp->pagelist, flags)) {
     497           0 :                         ret = 1;
     498           0 :                         goto unlock;
     499             :                 }
     500             :         } else
     501             :                 ret = -EIO;
     502           0 : unlock:
     503           0 :         spin_unlock(ptl);
     504           0 : out:
     505           0 :         return ret;
     506             : }
     507             : 
     508             : /*
     509             :  * Scan through pages checking if pages follow certain conditions,
     510             :  * and move them to the pagelist if they do.
     511             :  *
     512             :  * queue_pages_pte_range() has three possible return values:
     513             :  * 0 - pages are placed on the right node or queued successfully.
     514             :  * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
     515             :  *     specified.
     516             :  * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
     517             :  *        on a node that does not follow the policy.
     518             :  */
     519           0 : static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
     520             :                         unsigned long end, struct mm_walk *walk)
     521             : {
     522           0 :         struct vm_area_struct *vma = walk->vma;
     523           0 :         struct page *page;
     524           0 :         struct queue_pages *qp = walk->private;
     525           0 :         unsigned long flags = qp->flags;
     526           0 :         int ret;
     527           0 :         bool has_unmovable = false;
     528           0 :         pte_t *pte, *mapped_pte;
     529           0 :         spinlock_t *ptl;
     530             : 
     531           0 :         ptl = pmd_trans_huge_lock(pmd, vma);
     532           0 :         if (ptl) {
     533           0 :                 ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
     534           0 :                 if (ret != 2)
     535             :                         return ret;
     536             :         }
     537             :         /* THP was split, fall through to pte walk */
     538             : 
     539           0 :         if (pmd_trans_unstable(pmd))
     540             :                 return 0;
     541             : 
     542           0 :         mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
     543           0 :         for (; addr != end; pte++, addr += PAGE_SIZE) {
     544           0 :                 if (!pte_present(*pte))
     545           0 :                         continue;
     546           0 :                 page = vm_normal_page(vma, addr, *pte);
     547           0 :                 if (!page)
     548           0 :                         continue;
     549             :                 /*
     550             :                  * vm_normal_page() filters out zero pages, but there might
     551             :                  * still be PageReserved pages to skip, perhaps in a VDSO.
     552             :                  */
     553           0 :                 if (PageReserved(page))
     554           0 :                         continue;
     555           0 :                 if (!queue_pages_required(page, qp))
     556           0 :                         continue;
     557           0 :                 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
     558             :                         /* MPOL_MF_STRICT must be specified if we get here */
     559           0 :                         if (!vma_migratable(vma)) {
     560             :                                 has_unmovable = true;
     561             :                                 break;
     562             :                         }
     563             : 
     564             :                         /*
     565             :                          * Do not abort immediately since there may be
     566             :                          * temporarily off-LRU pages in the range.  Still
     567             :                          * need to migrate other LRU pages.
     568             :                          */
     569           0 :                         if (migrate_page_add(page, qp->pagelist, flags))
     570           0 :                                 has_unmovable = true;
     571             :                 } else
     572             :                         break;
     573             :         }
     574           0 :         pte_unmap_unlock(mapped_pte, ptl);
     575           0 :         cond_resched();
     576             : 
     577           0 :         if (has_unmovable)
     578             :                 return 1;
     579             : 
     580           0 :         return addr != end ? -EIO : 0;
     581             : }
     582             : 
     583           0 : static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
     584             :                                unsigned long addr, unsigned long end,
     585             :                                struct mm_walk *walk)
     586             : {
     587           0 :         int ret = 0;
     588             : #ifdef CONFIG_HUGETLB_PAGE
     589             :         struct queue_pages *qp = walk->private;
     590             :         unsigned long flags = (qp->flags & MPOL_MF_VALID);
     591             :         struct page *page;
     592             :         spinlock_t *ptl;
     593             :         pte_t entry;
     594             : 
     595             :         ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
     596             :         entry = huge_ptep_get(pte);
     597             :         if (!pte_present(entry))
     598             :                 goto unlock;
     599             :         page = pte_page(entry);
     600             :         if (!queue_pages_required(page, qp))
     601             :                 goto unlock;
     602             : 
     603             :         if (flags == MPOL_MF_STRICT) {
     604             :                 /*
     605             :                  * STRICT alone means only detecting misplaced pages and no
     606             :                  * need to further check other vmas.
     607             :                  */
     608             :                 ret = -EIO;
     609             :                 goto unlock;
     610             :         }
     611             : 
     612             :         if (!vma_migratable(walk->vma)) {
     613             :                 /*
     614             :                  * Must be STRICT with MOVE*, otherwise .test_walk() would
     615             :                  * have stopped walking the current vma.
     616             :                  * Detect the misplaced page but allow migrating pages which
     617             :                  * have been queued.
     618             :                  */
     619             :                 ret = 1;
     620             :                 goto unlock;
     621             :         }
     622             : 
     623             :         /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
     624             :         if (flags & (MPOL_MF_MOVE_ALL) ||
     625             :             (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
     626             :                 if (!isolate_huge_page(page, qp->pagelist) &&
     627             :                         (flags & MPOL_MF_STRICT))
     628             :                         /*
     629             :                          * Failed to isolate page but allow migrating pages
     630             :                          * which have been queued.
     631             :                          */
     632             :                         ret = 1;
     633             :         }
     634             : unlock:
     635             :         spin_unlock(ptl);
     636             : #else
     637           0 :         BUG();
     638             : #endif
     639             :         return ret;
     640             : }
     641             : 
     642             : #ifdef CONFIG_NUMA_BALANCING
     643             : /*
     644             :  * This is used to mark a range of virtual addresses to be inaccessible.
     645             :  * These are later cleared by a NUMA hinting fault. Depending on these
     646             :  * faults, pages may be migrated for better NUMA placement.
     647             :  *
     648             :  * This is assuming that NUMA faults are handled using PROT_NONE. If
     649             :  * an architecture makes a different choice, it will need further
     650             :  * changes to the core.
     651             :  */
     652             : unsigned long change_prot_numa(struct vm_area_struct *vma,
     653             :                         unsigned long addr, unsigned long end)
     654             : {
     655             :         int nr_updated;
     656             : 
     657             :         nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
     658             :         if (nr_updated)
     659             :                 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
     660             : 
     661             :         return nr_updated;
     662             : }
     663             : #else
     664             : static unsigned long change_prot_numa(struct vm_area_struct *vma,
     665             :                         unsigned long addr, unsigned long end)
     666             : {
     667             :         return 0;
     668             : }
     669             : #endif /* CONFIG_NUMA_BALANCING */
     670             : 
     671           0 : static int queue_pages_test_walk(unsigned long start, unsigned long end,
     672             :                                 struct mm_walk *walk)
     673             : {
     674           0 :         struct vm_area_struct *vma = walk->vma;
     675           0 :         struct queue_pages *qp = walk->private;
     676           0 :         unsigned long endvma = vma->vm_end;
     677           0 :         unsigned long flags = qp->flags;
     678             : 
     679             :         /* range check first */
     680           0 :         VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
     681             : 
     682           0 :         if (!qp->first) {
     683           0 :                 qp->first = vma;
     684           0 :                 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
     685           0 :                         (qp->start < vma->vm_start))
     686             :                         /* hole at head side of range */
     687             :                         return -EFAULT;
     688             :         }
     689           0 :         if (!(flags & MPOL_MF_DISCONTIG_OK) &&
     690           0 :                 ((vma->vm_end < qp->end) &&
     691           0 :                 (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
     692             :                 /* hole at middle or tail of range */
     693             :                 return -EFAULT;
     694             : 
     695             :         /*
     696             :          * Need to check MPOL_MF_STRICT to return -EIO if possible,
     697             :          * regardless of vma_migratable
     698             :          */
     699           0 :         if (!vma_migratable(vma) &&
     700           0 :             !(flags & MPOL_MF_STRICT))
     701             :                 return 1;
     702             : 
     703           0 :         if (endvma > end)
     704             :                 endvma = end;
     705             : 
     706           0 :         if (flags & MPOL_MF_LAZY) {
     707             :                 /* Similar to task_numa_work, skip inaccessible VMAs */
     708           0 :                 if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
     709             :                         !(vma->vm_flags & VM_MIXEDMAP))
     710           0 :                         change_prot_numa(vma, start, endvma);
     711             :                 return 1;
     712             :         }
     713             : 
     714             :         /* queue pages from current vma */
     715           0 :         if (flags & MPOL_MF_VALID)
     716           0 :                 return 0;
     717             :         return 1;
     718             : }
     719             : 
     720             : static const struct mm_walk_ops queue_pages_walk_ops = {
     721             :         .hugetlb_entry          = queue_pages_hugetlb,
     722             :         .pmd_entry              = queue_pages_pte_range,
     723             :         .test_walk              = queue_pages_test_walk,
     724             : };
     725             : 
     726             : /*
     727             :  * Walk through page tables and collect pages to be migrated.
     728             :  *
     729             :  * If pages found in a given range are on a set of nodes (determined by
     730             :  * @nodes and @flags), they are isolated and queued on the pagelist, which
     731             :  * is passed via @private.
     732             :  *
     733             :  * queue_pages_range() has three possible return values:
     734             :  * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
     735             :  *     specified.
     736             :  * 0 - queue pages successfully or no misplaced page.
     737             :  * errno - e.g. misplaced pages with only MPOL_MF_STRICT specified (-EIO), or
     738             :  *         the memory range specified by nodemask and maxnode points outside
     739             :  *         your accessible address space (-EFAULT)
     740             :  */
     741             : static int
     742           0 : queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
     743             :                 nodemask_t *nodes, unsigned long flags,
     744             :                 struct list_head *pagelist)
     745             : {
     746           0 :         int err;
     747           0 :         struct queue_pages qp = {
     748             :                 .pagelist = pagelist,
     749             :                 .flags = flags,
     750             :                 .nmask = nodes,
     751             :                 .start = start,
     752             :                 .end = end,
     753             :                 .first = NULL,
     754             :         };
     755             : 
     756           0 :         err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
     757             : 
     758           0 :         if (!qp.first)
     759             :                 /* whole range in hole */
     760           0 :                 err = -EFAULT;
     761             : 
     762           0 :         return err;
     763             : }
     764             : 
     765             : /*
     766             :  * Apply policy to a single VMA
     767             :  * This must be called with the mmap_lock held for writing.
     768             :  */
     769           0 : static int vma_replace_policy(struct vm_area_struct *vma,
     770             :                                                 struct mempolicy *pol)
     771             : {
     772           0 :         int err;
     773           0 :         struct mempolicy *old;
     774           0 :         struct mempolicy *new;
     775             : 
     776           0 :         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
     777             :                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
     778             :                  vma->vm_ops, vma->vm_file,
     779             :                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
     780             : 
     781           0 :         new = mpol_dup(pol);
     782           0 :         if (IS_ERR(new))
     783           0 :                 return PTR_ERR(new);
     784             : 
     785           0 :         if (vma->vm_ops && vma->vm_ops->set_policy) {
     786           0 :                 err = vma->vm_ops->set_policy(vma, new);
     787           0 :                 if (err)
     788           0 :                         goto err_out;
     789             :         }
     790             : 
     791           0 :         old = vma->vm_policy;
     792           0 :         vma->vm_policy = new; /* protected by mmap_lock */
     793           0 :         mpol_put(old);
     794             : 
     795             :         return 0;
     796           0 :  err_out:
     797           0 :         mpol_put(new);
     798             :         return err;
     799             : }
     800             : 
     801             : /* Step 2: apply policy to a range and do splits. */
     802           0 : static int mbind_range(struct mm_struct *mm, unsigned long start,
     803             :                        unsigned long end, struct mempolicy *new_pol)
     804             : {
     805           0 :         struct vm_area_struct *next;
     806           0 :         struct vm_area_struct *prev;
     807           0 :         struct vm_area_struct *vma;
     808           0 :         int err = 0;
     809           0 :         pgoff_t pgoff;
     810           0 :         unsigned long vmstart;
     811           0 :         unsigned long vmend;
     812             : 
     813           0 :         vma = find_vma(mm, start);
     814           0 :         VM_BUG_ON(!vma);
     815             : 
     816           0 :         prev = vma->vm_prev;
     817           0 :         if (start > vma->vm_start)
     818           0 :                 prev = vma;
     819             : 
     820           0 :         for (; vma && vma->vm_start < end; prev = vma, vma = next) {
     821           0 :                 next = vma->vm_next;
     822           0 :                 vmstart = max(start, vma->vm_start);
     823           0 :                 vmend   = min(end, vma->vm_end);
     824             : 
     825           0 :                 if (mpol_equal(vma_policy(vma), new_pol))
     826           0 :                         continue;
     827             : 
     828           0 :                 pgoff = vma->vm_pgoff +
     829           0 :                         ((vmstart - vma->vm_start) >> PAGE_SHIFT);
     830           0 :                 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
     831             :                                  vma->anon_vma, vma->vm_file, pgoff,
     832             :                                  new_pol, vma->vm_userfaultfd_ctx);
     833           0 :                 if (prev) {
     834           0 :                         vma = prev;
     835           0 :                         next = vma->vm_next;
     836           0 :                         if (mpol_equal(vma_policy(vma), new_pol))
     837           0 :                                 continue;
     838             :                         /* vma_merge() joined vma && vma->next, case 8 */
     839           0 :                         goto replace;
     840             :                 }
     841           0 :                 if (vma->vm_start != vmstart) {
     842           0 :                         err = split_vma(vma->vm_mm, vma, vmstart, 1);
     843           0 :                         if (err)
     844           0 :                                 goto out;
     845             :                 }
     846           0 :                 if (vma->vm_end != vmend) {
     847           0 :                         err = split_vma(vma->vm_mm, vma, vmend, 0);
     848           0 :                         if (err)
     849           0 :                                 goto out;
     850             :                 }
     851           0 :  replace:
     852           0 :                 err = vma_replace_policy(vma, new_pol);
     853           0 :                 if (err)
     854           0 :                         goto out;
     855             :         }
     856             : 
     857           0 :  out:
     858           0 :         return err;
     859             : }
     860             : 
     861             : /* Set the process memory policy */
     862           3 : static long do_set_mempolicy(unsigned short mode, unsigned short flags,
     863             :                              nodemask_t *nodes)
     864             : {
     865           3 :         struct mempolicy *new, *old;
     866           3 :         NODEMASK_SCRATCH(scratch);
     867           3 :         int ret;
     868             : 
     869           3 :         if (!scratch)
     870             :                 return -ENOMEM;
     871             : 
     872           3 :         new = mpol_new(mode, flags, nodes);
     873           3 :         if (IS_ERR(new)) {
     874           0 :                 ret = PTR_ERR(new);
     875           0 :                 goto out;
     876             :         }
     877             : 
     878           3 :         if (flags & MPOL_F_NUMA_BALANCING) {
     879           0 :                 if (new && new->mode == MPOL_BIND) {
     880           0 :                         new->flags |= (MPOL_F_MOF | MPOL_F_MORON);
     881             :                 } else {
     882           0 :                         ret = -EINVAL;
     883           0 :                         mpol_put(new);
     884           0 :                         goto out;
     885             :                 }
     886             :         }
     887             : 
     888           3 :         ret = mpol_set_nodemask(new, nodes, scratch);
     889           3 :         if (ret) {
     890           0 :                 mpol_put(new);
     891           0 :                 goto out;
     892             :         }
     893           3 :         task_lock(current);
     894           3 :         old = current->mempolicy;
     895           3 :         current->mempolicy = new;
     896           3 :         if (new && new->mode == MPOL_INTERLEAVE)
     897           1 :                 current->il_prev = MAX_NUMNODES-1;
     898           3 :         task_unlock(current);
     899           3 :         mpol_put(old);
     900             :         ret = 0;
     901           3 : out:
     902           3 :         NODEMASK_SCRATCH_FREE(scratch);
     903           3 :         return ret;
     904             : }
     905             : 
     906             : /*
     907             :  * Return nodemask for policy for get_mempolicy() query
     908             :  *
     909             :  * Called with task's alloc_lock held
     910             :  */
     911           0 : static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
     912             : {
     913           0 :         nodes_clear(*nodes);
     914           0 :         if (p == &default_policy)
     915             :                 return;
     916             : 
     917           0 :         switch (p->mode) {
     918           0 :         case MPOL_BIND:
     919             :         case MPOL_INTERLEAVE:
     920           0 :                 *nodes = p->v.nodes;
     921           0 :                 break;
     922           0 :         case MPOL_PREFERRED:
     923           0 :                 if (!(p->flags & MPOL_F_LOCAL))
     924           0 :                         node_set(p->v.preferred_node, *nodes);
     925             :                 /* else return empty node mask for local allocation */
     926             :                 break;
     927           0 :         default:
     928           0 :                 BUG();
     929             :         }
     930             : }
     931             : 
     932           0 : static int lookup_node(struct mm_struct *mm, unsigned long addr)
     933             : {
     934           0 :         struct page *p = NULL;
     935           0 :         int err;
     936             : 
     937           0 :         int locked = 1;
     938           0 :         err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
     939           0 :         if (err > 0) {
     940           0 :                 err = page_to_nid(p);
     941           0 :                 put_page(p);
     942             :         }
     943           0 :         if (locked)
     944           0 :                 mmap_read_unlock(mm);
     945           0 :         return err;
     946             : }
     947             : 
     948             : /* Retrieve NUMA policy */
     949           0 : static long do_get_mempolicy(int *policy, nodemask_t *nmask,
     950             :                              unsigned long addr, unsigned long flags)
     951             : {
     952           0 :         int err;
     953           0 :         struct mm_struct *mm = current->mm;
     954           0 :         struct vm_area_struct *vma = NULL;
     955           0 :         struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
     956             : 
     957           0 :         if (flags &
     958             :                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
     959             :                 return -EINVAL;
     960             : 
     961           0 :         if (flags & MPOL_F_MEMS_ALLOWED) {
     962           0 :                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
     963             :                         return -EINVAL;
     964           0 :                 *policy = 0;    /* just so it's initialized */
     965           0 :                 task_lock(current);
     966           0 :                 *nmask  = cpuset_current_mems_allowed;
     967           0 :                 task_unlock(current);
     968           0 :                 return 0;
     969             :         }
     970             : 
     971           0 :         if (flags & MPOL_F_ADDR) {
     972             :                 /*
     973             :                  * Do NOT fall back to task policy if the
     974             :                  * vma/shared policy at addr is NULL.  We
     975             :                  * want to return MPOL_DEFAULT in this case.
     976             :                  */
     977           0 :                 mmap_read_lock(mm);
     978           0 :                 vma = find_vma_intersection(mm, addr, addr+1);
     979           0 :                 if (!vma) {
     980           0 :                         mmap_read_unlock(mm);
     981           0 :                         return -EFAULT;
     982             :                 }
     983           0 :                 if (vma->vm_ops && vma->vm_ops->get_policy)
     984           0 :                         pol = vma->vm_ops->get_policy(vma, addr);
     985             :                 else
     986           0 :                         pol = vma->vm_policy;
     987           0 :         } else if (addr)
     988             :                 return -EINVAL;
     989             : 
     990           0 :         if (!pol)
     991           0 :                 pol = &default_policy;      /* indicates default behavior */
     992             : 
     993           0 :         if (flags & MPOL_F_NODE) {
     994           0 :                 if (flags & MPOL_F_ADDR) {
     995             :                         /*
     996             :                          * Take a refcount on the mpol, lookup_node()
     997             :                          * will drop the mmap_lock, so after calling
     998             :                          * lookup_node() only "pol" remains valid, "vma"
     999             :                          * is stale.
    1000             :                          */
    1001           0 :                         pol_refcount = pol;
    1002           0 :                         vma = NULL;
    1003           0 :                         mpol_get(pol);
    1004           0 :                         err = lookup_node(mm, addr);
    1005           0 :                         if (err < 0)
    1006           0 :                                 goto out;
    1007           0 :                         *policy = err;
    1008           0 :                 } else if (pol == current->mempolicy &&
    1009           0 :                                 pol->mode == MPOL_INTERLEAVE) {
    1010           0 :                         *policy = next_node_in(current->il_prev, pol->v.nodes);
    1011             :                 } else {
    1012           0 :                         err = -EINVAL;
    1013           0 :                         goto out;
    1014             :                 }
    1015             :         } else {
    1016           0 :                 *policy = pol == &default_policy ? MPOL_DEFAULT :
    1017           0 :                                                 pol->mode;
    1018             :                 /*
    1019             :                  * Internal mempolicy flags must be masked off before exposing
    1020             :                  * the policy to userspace.
    1021             :                  */
    1022           0 :                 *policy |= (pol->flags & MPOL_MODE_FLAGS);
    1023             :         }
    1024             : 
    1025           0 :         err = 0;
    1026           0 :         if (nmask) {
    1027           0 :                 if (mpol_store_user_nodemask(pol)) {
    1028           0 :                         *nmask = pol->w.user_nodemask;
    1029             :                 } else {
    1030           0 :                         task_lock(current);
    1031           0 :                         get_policy_nodemask(pol, nmask);
    1032           0 :                         task_unlock(current);
    1033             :                 }
    1034             :         }
    1035             : 
    1036           0 :  out:
    1037           0 :         mpol_cond_put(pol);
    1038           0 :         if (vma)
    1039           0 :                 mmap_read_unlock(mm);
    1040           0 :         if (pol_refcount)
    1041           0 :                 mpol_put(pol_refcount);
    1042           0 :         return err;
    1043             : }
    1044             : 
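The MPOL_F_ADDR | MPOL_F_NODE path above is what userspace uses to ask which NUMA
node backs a given address.  A minimal, hypothetical sketch (not part of
mempolicy.c), assuming <numaif.h> from libnuma (link with -lnuma) and a
NUMA-enabled kernel:

    #include <numaif.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
            int node = -1;
            char *buf = malloc(4096);

            if (!buf)
                    return 1;
            buf[0] = 1;     /* touch the page so it is actually allocated */

            /* MPOL_F_ADDR | MPOL_F_NODE returns the node of the page at buf */
            if (get_mempolicy(&node, NULL, 0, buf, MPOL_F_ADDR | MPOL_F_NODE))
                    perror("get_mempolicy");
            else
                    printf("page at %p is on node %d\n", (void *)buf, node);

            free(buf);
            return 0;
    }
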
    1045             : #ifdef CONFIG_MIGRATION
    1046             : /*
     1047             :  * Page migration: THP tail pages can be passed.
    1048             :  */
    1049           0 : static int migrate_page_add(struct page *page, struct list_head *pagelist,
    1050             :                                 unsigned long flags)
    1051             : {
    1052           0 :         struct page *head = compound_head(page);
    1053             :         /*
    1054             :          * Avoid migrating a page that is shared with others.
    1055             :          */
    1056           0 :         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
    1057           0 :                 if (!isolate_lru_page(head)) {
    1058           0 :                         list_add_tail(&head->lru, pagelist);
    1059           0 :                         mod_node_page_state(page_pgdat(head),
    1060           0 :                                 NR_ISOLATED_ANON + page_is_file_lru(head),
    1061           0 :                                 thp_nr_pages(head));
    1062           0 :                 } else if (flags & MPOL_MF_STRICT) {
    1063             :                         /*
     1064             :                          * A non-movable page may reach here, and there may be
     1065             :                          * temporarily off-LRU pages or non-LRU movable pages.
    1066             :                          * Treat them as unmovable pages since they can't be
    1067             :                          * isolated, so they can't be moved at the moment.  It
    1068             :                          * should return -EIO for this case too.
    1069             :                          */
    1070           0 :                         return -EIO;
    1071             :                 }
    1072             :         }
    1073             : 
    1074             :         return 0;
    1075             : }
    1076             : 
    1077             : /*
    1078             :  * Migrate pages from one node to a target node.
    1079             :  * Returns error or the number of pages not migrated.
    1080             :  */
    1081           0 : static int migrate_to_node(struct mm_struct *mm, int source, int dest,
    1082             :                            int flags)
    1083             : {
    1084           0 :         nodemask_t nmask;
    1085           0 :         LIST_HEAD(pagelist);
    1086           0 :         int err = 0;
    1087           0 :         struct migration_target_control mtc = {
    1088             :                 .nid = dest,
    1089             :                 .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
    1090             :         };
    1091             : 
    1092           0 :         nodes_clear(nmask);
    1093           0 :         node_set(source, nmask);
    1094             : 
    1095             :         /*
    1096             :          * This does not "check" the range but isolates all pages that
    1097             :          * need migration.  Between passing in the full user address
     1098             :  * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
    1099             :          */
    1100           0 :         VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
    1101           0 :         queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
    1102           0 :                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
    1103             : 
    1104           0 :         if (!list_empty(&pagelist)) {
    1105           0 :                 err = migrate_pages(&pagelist, alloc_migration_target, NULL,
    1106             :                                 (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
    1107           0 :                 if (err)
    1108           0 :                         putback_movable_pages(&pagelist);
    1109             :         }
    1110             : 
    1111           0 :         return err;
    1112             : }
    1113             : 
    1114             : /*
    1115             :  * Move pages between the two nodesets so as to preserve the physical
    1116             :  * layout as much as possible.
    1117             :  *
     1118             :  * Returns the number of pages that could not be moved.
    1119             :  */
    1120           0 : int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
    1121             :                      const nodemask_t *to, int flags)
    1122             : {
    1123           0 :         int busy = 0;
    1124           0 :         int err = 0;
    1125           0 :         nodemask_t tmp;
    1126             : 
    1127           0 :         migrate_prep();
    1128             : 
    1129           0 :         mmap_read_lock(mm);
    1130             : 
    1131             :         /*
    1132             :          * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
    1133             :          * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
    1134             :          * bit in 'tmp', and return that <source, dest> pair for migration.
    1135             :          * The pair of nodemasks 'to' and 'from' define the map.
    1136             :          *
     1137             :          * If no pair of bits is found that way, fall back to picking some
     1138             :          * pair of 'source' and 'dest' bits that are not the same.  If the
     1139             :          * 'source' and 'dest' bits are the same, this represents a node
     1140             :          * that will be migrating to itself, so no pages need to move.
    1141             :          *
    1142             :          * If no bits are left in 'tmp', or if all remaining bits left
    1143             :          * in 'tmp' correspond to the same bit in 'to', return false
    1144             :          * (nothing left to migrate).
    1145             :          *
    1146             :          * This lets us pick a pair of nodes to migrate between, such that
    1147             :          * if possible the dest node is not already occupied by some other
    1148             :          * source node, minimizing the risk of overloading the memory on a
    1149             :          * node that would happen if we migrated incoming memory to a node
     1150             :          * before migrating outgoing memory off that same node.
    1151             :          *
    1152             :          * A single scan of tmp is sufficient.  As we go, we remember the
    1153             :          * most recent <s, d> pair that moved (s != d).  If we find a pair
    1154             :          * that not only moved, but what's better, moved to an empty slot
     1155             :          * (d is not set in tmp), then we break out immediately with that pair.
     1156             :          * Otherwise, when we finish scanning tmp, we at least have the
    1157             :          * most recent <s, d> pair that moved.  If we get all the way through
    1158             :          * the scan of tmp without finding any node that moved, much less
    1159             :          * moved to an empty node, then there is nothing left worth migrating.
    1160             :          */
    1161             : 
    1162           0 :         tmp = *from;
    1163           0 :         while (!nodes_empty(tmp)) {
    1164           0 :                 int s,d;
    1165           0 :                 int source = NUMA_NO_NODE;
    1166           0 :                 int dest = 0;
    1167             : 
    1168           0 :                 for_each_node_mask(s, tmp) {
    1169             : 
    1170             :                         /*
    1171             :                          * do_migrate_pages() tries to maintain the relative
    1172             :                          * node relationship of the pages established between
    1173             :                          * threads and memory areas.
    1174             :                          *
     1175             :                          * However, if the number of source nodes is not equal to
     1176             :                          * the number of destination nodes, we cannot preserve
     1177             :                          * this node-relative relationship.  In that case, skip
    1178             :                          * copying memory from a node that is in the destination
    1179             :                          * mask.
    1180             :                          *
    1181             :                          * Example: [2,3,4] -> [3,4,5] moves everything.
     1182             :                          *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
    1183             :                          */
    1184             : 
    1185           0 :                         if ((nodes_weight(*from) != nodes_weight(*to)) &&
    1186           0 :                                                 (node_isset(s, *to)))
    1187           0 :                                 continue;
    1188             : 
    1189           0 :                         d = node_remap(s, *from, *to);
    1190           0 :                         if (s == d)
    1191           0 :                                 continue;
    1192             : 
    1193           0 :                         source = s;     /* Node moved. Memorize */
    1194           0 :                         dest = d;
    1195             : 
    1196             :                         /* dest not in remaining from nodes? */
    1197           0 :                         if (!node_isset(dest, tmp))
    1198             :                                 break;
    1199             :                 }
    1200           0 :                 if (source == NUMA_NO_NODE)
    1201             :                         break;
    1202             : 
    1203           0 :                 node_clear(source, tmp);
    1204           0 :                 err = migrate_to_node(mm, source, dest, flags);
    1205           0 :                 if (err > 0)
    1206           0 :                         busy += err;
    1207           0 :                 if (err < 0)
    1208             :                         break;
    1209             :         }
    1210           0 :         mmap_read_unlock(mm);
    1211           0 :         if (err < 0)
    1212           0 :                 return err;
    1213             :         return busy;
    1214             : 
    1215             : }
    1216             : 
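To make the pairing rule described above concrete, here is a small toy program
(not kernel code) that reproduces the [0-7] -> [3,4,5] example from the comment
using plain unsigned long bitmasks.  remap_ordinal() only mimics the
modulo-ordinal behaviour of node_remap() for this illustration, and
__builtin_popcountl is the GCC/Clang builtin:

    #include <stdio.h>

    /* n-th set bit of mask (0-based), or -1 */
    static int nth_set_bit(unsigned long mask, int n)
    {
            for (int bit = 0; bit < 64; bit++)
                    if ((mask & (1UL << bit)) && n-- == 0)
                            return bit;
            return -1;
    }

    /* how many set bits of mask lie below position bit */
    static int ordinal_of(unsigned long mask, int bit)
    {
            int ord = 0;

            for (int b = 0; b < bit; b++)
                    if (mask & (1UL << b))
                            ord++;
            return ord;
    }

    /* map source node s to its modulo-ordinal counterpart in 'to' */
    static int remap_ordinal(int s, unsigned long from, unsigned long to)
    {
            return nth_set_bit(to, ordinal_of(from, s) % __builtin_popcountl(to));
    }

    int main(void)
    {
            unsigned long from = 0xffUL;                         /* nodes 0-7 */
            unsigned long to = (1UL << 3) | (1UL << 4) | (1UL << 5);

            for (int s = 0; s < 8; s++) {
                    /* weights differ: skip sources already in the destination set */
                    if (__builtin_popcountl(from) != __builtin_popcountl(to) &&
                        (to & (1UL << s)))
                            continue;
                    printf("node %d -> node %d\n", s, remap_ordinal(s, from, to));
            }
            return 0;       /* prints pairs only for sources 0,1,2,6,7 */
    }
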
    1217             : /*
    1218             :  * Allocate a new page for page migration based on vma policy.
     1219             :  * Start by assuming the page is mapped by the same vma that contains @start.
    1220             :  * Search forward from there, if not.  N.B., this assumes that the
    1221             :  * list of pages handed to migrate_pages()--which is how we get here--
    1222             :  * is in virtual address order.
    1223             :  */
    1224           0 : static struct page *new_page(struct page *page, unsigned long start)
    1225             : {
    1226           0 :         struct vm_area_struct *vma;
    1227           0 :         unsigned long address;
    1228             : 
    1229           0 :         vma = find_vma(current->mm, start);
    1230           0 :         while (vma) {
    1231           0 :                 address = page_address_in_vma(page, vma);
    1232           0 :                 if (address != -EFAULT)
    1233             :                         break;
    1234           0 :                 vma = vma->vm_next;
    1235             :         }
    1236             : 
    1237           0 :         if (PageHuge(page)) {
    1238             :                 return alloc_huge_page_vma(page_hstate(compound_head(page)),
    1239             :                                 vma, address);
    1240           0 :         } else if (PageTransHuge(page)) {
    1241           0 :                 struct page *thp;
    1242             : 
    1243           0 :                 thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
    1244             :                                          HPAGE_PMD_ORDER);
    1245           0 :                 if (!thp)
    1246             :                         return NULL;
    1247           0 :                 prep_transhuge_page(thp);
    1248           0 :                 return thp;
    1249             :         }
    1250             :         /*
    1251             :          * if !vma, alloc_page_vma() will use task or system default policy
    1252             :          */
    1253           0 :         return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
    1254             :                         vma, address);
    1255             : }
    1256             : #else
    1257             : 
    1258             : static int migrate_page_add(struct page *page, struct list_head *pagelist,
    1259             :                                 unsigned long flags)
    1260             : {
    1261             :         return -EIO;
    1262             : }
    1263             : 
    1264             : int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
    1265             :                      const nodemask_t *to, int flags)
    1266             : {
    1267             :         return -ENOSYS;
    1268             : }
    1269             : 
    1270             : static struct page *new_page(struct page *page, unsigned long start)
    1271             : {
    1272             :         return NULL;
    1273             : }
    1274             : #endif
    1275             : 
    1276           0 : static long do_mbind(unsigned long start, unsigned long len,
    1277             :                      unsigned short mode, unsigned short mode_flags,
    1278             :                      nodemask_t *nmask, unsigned long flags)
    1279             : {
    1280           0 :         struct mm_struct *mm = current->mm;
    1281           0 :         struct mempolicy *new;
    1282           0 :         unsigned long end;
    1283           0 :         int err;
    1284           0 :         int ret;
    1285           0 :         LIST_HEAD(pagelist);
    1286             : 
    1287           0 :         if (flags & ~(unsigned long)MPOL_MF_VALID)
    1288             :                 return -EINVAL;
    1289           0 :         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
    1290             :                 return -EPERM;
    1291             : 
    1292           0 :         if (start & ~PAGE_MASK)
    1293             :                 return -EINVAL;
    1294             : 
    1295           0 :         if (mode == MPOL_DEFAULT)
    1296           0 :                 flags &= ~MPOL_MF_STRICT;
    1297             : 
    1298           0 :         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
    1299           0 :         end = start + len;
    1300             : 
    1301           0 :         if (end < start)
    1302             :                 return -EINVAL;
    1303           0 :         if (end == start)
    1304             :                 return 0;
    1305             : 
    1306           0 :         new = mpol_new(mode, mode_flags, nmask);
    1307           0 :         if (IS_ERR(new))
    1308           0 :                 return PTR_ERR(new);
    1309             : 
    1310           0 :         if (flags & MPOL_MF_LAZY)
    1311             :                 new->flags |= MPOL_F_MOF;
    1312             : 
    1313             :         /*
     1314             :          * If we are using the default policy, then operation
     1315             :          * on discontinuous address spaces is okay after all.
    1316             :          */
    1317           0 :         if (!new)
    1318           0 :                 flags |= MPOL_MF_DISCONTIG_OK;
    1319             : 
    1320           0 :         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
    1321             :                  start, start + len, mode, mode_flags,
    1322             :                  nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
    1323             : 
    1324           0 :         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
    1325             : 
    1326           0 :                 migrate_prep();
    1327             :         }
    1328             :         {
    1329           0 :                 NODEMASK_SCRATCH(scratch);
    1330           0 :                 if (scratch) {
    1331           0 :                         mmap_write_lock(mm);
    1332           0 :                         err = mpol_set_nodemask(new, nmask, scratch);
    1333           0 :                         if (err)
    1334           0 :                                 mmap_write_unlock(mm);
    1335             :                 } else
    1336             :                         err = -ENOMEM;
    1337           0 :                 NODEMASK_SCRATCH_FREE(scratch);
    1338             :         }
    1339           0 :         if (err)
    1340           0 :                 goto mpol_out;
    1341             : 
    1342           0 :         ret = queue_pages_range(mm, start, end, nmask,
    1343             :                           flags | MPOL_MF_INVERT, &pagelist);
    1344             : 
    1345           0 :         if (ret < 0) {
    1346           0 :                 err = ret;
    1347           0 :                 goto up_out;
    1348             :         }
    1349             : 
    1350           0 :         err = mbind_range(mm, start, end, new);
    1351             : 
    1352           0 :         if (!err) {
    1353           0 :                 int nr_failed = 0;
    1354             : 
    1355           0 :                 if (!list_empty(&pagelist)) {
    1356           0 :                         WARN_ON_ONCE(flags & MPOL_MF_LAZY);
    1357           0 :                         nr_failed = migrate_pages(&pagelist, new_page, NULL,
    1358             :                                 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
    1359           0 :                         if (nr_failed)
    1360           0 :                                 putback_movable_pages(&pagelist);
    1361             :                 }
    1362             : 
    1363           0 :                 if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
    1364           0 :                         err = -EIO;
    1365             :         } else {
    1366           0 : up_out:
    1367           0 :                 if (!list_empty(&pagelist))
    1368           0 :                         putback_movable_pages(&pagelist);
    1369             :         }
    1370             : 
    1371           0 :         mmap_write_unlock(mm);
    1372           0 : mpol_out:
    1373           0 :         mpol_put(new);
    1374           0 :         return err;
    1375             : }
    1376             : 
    1377             : /*
    1378             :  * User space interface with variable sized bitmaps for nodelists.
    1379             :  */
    1380             : 
    1381             : /* Copy a node mask from user space. */
    1382           0 : static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
    1383             :                      unsigned long maxnode)
    1384             : {
    1385           0 :         unsigned long k;
    1386           0 :         unsigned long t;
    1387           0 :         unsigned long nlongs;
    1388           0 :         unsigned long endmask;
    1389             : 
    1390           0 :         --maxnode;
    1391           0 :         nodes_clear(*nodes);
    1392           0 :         if (maxnode == 0 || !nmask)
    1393             :                 return 0;
    1394           0 :         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
    1395             :                 return -EINVAL;
    1396             : 
    1397           0 :         nlongs = BITS_TO_LONGS(maxnode);
    1398           0 :         if ((maxnode % BITS_PER_LONG) == 0)
    1399             :                 endmask = ~0UL;
    1400             :         else
    1401           0 :                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
    1402             : 
    1403             :         /*
     1404             :          * When the user specified more nodes than supported, just check
     1405             :          * if the unsupported part is all zero.
     1406             :          *
     1407             :          * If maxnode has more longs than MAX_NUMNODES, check
     1408             :          * the bits in that area first, and then go through to
     1409             :          * check the remaining bits, which are equal to or bigger than MAX_NUMNODES.
    1410             :          * Otherwise, just check bits [MAX_NUMNODES, maxnode).
    1411             :          */
    1412           0 :         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
    1413           0 :                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
    1414           0 :                         if (get_user(t, nmask + k))
    1415             :                                 return -EFAULT;
    1416           0 :                         if (k == nlongs - 1) {
    1417           0 :                                 if (t & endmask)
    1418             :                                         return -EINVAL;
    1419           0 :                         } else if (t)
    1420             :                                 return -EINVAL;
    1421             :                 }
    1422             :                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
    1423             :                 endmask = ~0UL;
    1424             :         }
    1425             : 
    1426           0 :         if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
    1427             :                 unsigned long valid_mask = endmask;
    1428             : 
    1429             :                 valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
    1430             :                 if (get_user(t, nmask + nlongs - 1))
    1431             :                         return -EFAULT;
    1432             :                 if (t & valid_mask)
    1433             :                         return -EINVAL;
    1434             :         }
    1435             : 
    1436           0 :         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
    1437             :                 return -EFAULT;
    1438           0 :         nodes_addr(*nodes)[nlongs-1] &= endmask;
    1439           0 :         return 0;
    1440             : }
    1441             : 
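A quick worked example of the endmask arithmetic above, with 64-bit longs and
purely illustrative values of maxnode (already decremented):

    maxnode = 128:  128 % 64 == 0   =>  endmask = ~0UL
    maxnode =  10:   10 % 64 == 10  =>  endmask = (1UL << 10) - 1 = 0x3ff

In the second case only node bits 0..9 of the final word survive the
"nodes_addr(*nodes)[nlongs-1] &= endmask" at the end of get_nodes().
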
    1442             : /* Copy a kernel node mask to user space */
    1443           0 : static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
    1444             :                               nodemask_t *nodes)
    1445             : {
    1446           0 :         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
    1447           0 :         unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
    1448             : 
    1449           0 :         if (copy > nbytes) {
    1450           0 :                 if (copy > PAGE_SIZE)
    1451             :                         return -EINVAL;
    1452           0 :                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
    1453             :                         return -EFAULT;
    1454             :                 copy = nbytes;
    1455             :         }
    1456           0 :         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
    1457             : }
    1458             : 
    1459           0 : static long kernel_mbind(unsigned long start, unsigned long len,
    1460             :                          unsigned long mode, const unsigned long __user *nmask,
    1461             :                          unsigned long maxnode, unsigned int flags)
    1462             : {
    1463           0 :         nodemask_t nodes;
    1464           0 :         int err;
    1465           0 :         unsigned short mode_flags;
    1466             : 
    1467           0 :         start = untagged_addr(start);
    1468           0 :         mode_flags = mode & MPOL_MODE_FLAGS;
    1469           0 :         mode &= ~MPOL_MODE_FLAGS;
    1470           0 :         if (mode >= MPOL_MAX)
    1471             :                 return -EINVAL;
    1472           0 :         if ((mode_flags & MPOL_F_STATIC_NODES) &&
    1473             :             (mode_flags & MPOL_F_RELATIVE_NODES))
    1474             :                 return -EINVAL;
    1475           0 :         err = get_nodes(&nodes, nmask, maxnode);
    1476           0 :         if (err)
    1477           0 :                 return err;
    1478           0 :         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
    1479             : }
    1480             : 
    1481           0 : SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
    1482             :                 unsigned long, mode, const unsigned long __user *, nmask,
    1483             :                 unsigned long, maxnode, unsigned int, flags)
    1484             : {
    1485           0 :         return kernel_mbind(start, len, mode, nmask, maxnode, flags);
    1486             : }
    1487             : 
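As a usage illustration of the syscall above, a minimal, hypothetical userspace
sketch (not part of mempolicy.c) that binds a fresh anonymous mapping to node 0.
It assumes <numaif.h> from libnuma (link with -lnuma), that node 0 exists, and
passes maxnode as the bitmap width plus one to cover the --maxnode adjustment
in get_nodes():

    #include <numaif.h>
    #include <sys/mman.h>
    #include <stdio.h>

    int main(void)
    {
            size_t len = 64UL << 20;                     /* 64 MiB      */
            unsigned long nodemask = 1UL << 0;           /* node 0 only */
            void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (p == MAP_FAILED)
                    return 1;

            /* MPOL_MF_MOVE migrates any pages that were already faulted in */
            if (mbind(p, len, MPOL_BIND, &nodemask,
                      sizeof(nodemask) * 8 + 1,
                      MPOL_MF_MOVE | MPOL_MF_STRICT))
                    perror("mbind");

            munmap(p, len);
            return 0;
    }
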
    1488             : /* Set the process memory policy */
    1489           0 : static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
    1490             :                                  unsigned long maxnode)
    1491             : {
    1492           0 :         int err;
    1493           0 :         nodemask_t nodes;
    1494           0 :         unsigned short flags;
    1495             : 
    1496           0 :         flags = mode & MPOL_MODE_FLAGS;
    1497           0 :         mode &= ~MPOL_MODE_FLAGS;
    1498           0 :         if ((unsigned int)mode >= MPOL_MAX)
    1499             :                 return -EINVAL;
    1500           0 :         if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
    1501             :                 return -EINVAL;
    1502           0 :         err = get_nodes(&nodes, nmask, maxnode);
    1503           0 :         if (err)
    1504           0 :                 return err;
    1505           0 :         return do_set_mempolicy(mode, flags, &nodes);
    1506             : }
    1507             : 
    1508           0 : SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
    1509             :                 unsigned long, maxnode)
    1510             : {
    1511           0 :         return kernel_set_mempolicy(mode, nmask, maxnode);
    1512             : }
    1513             : 
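A companion sketch for set_mempolicy(2) (hypothetical, not from this file):
interleave all future allocations of the calling task across nodes 0 and 1,
then restore the default policy.  Assumes <numaif.h> from libnuma and that
both nodes exist:

    #include <numaif.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned long nodemask = (1UL << 0) | (1UL << 1);

            if (set_mempolicy(MPOL_INTERLEAVE, &nodemask,
                              sizeof(nodemask) * 8 + 1)) {
                    perror("set_mempolicy");
                    return 1;
            }

            /* ... allocations made from here on are interleaved ... */

            return set_mempolicy(MPOL_DEFAULT, NULL, 0);  /* back to default */
    }
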
    1514           0 : static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
    1515             :                                 const unsigned long __user *old_nodes,
    1516             :                                 const unsigned long __user *new_nodes)
    1517             : {
    1518           0 :         struct mm_struct *mm = NULL;
    1519           0 :         struct task_struct *task;
    1520           0 :         nodemask_t task_nodes;
    1521           0 :         int err;
    1522           0 :         nodemask_t *old;
    1523           0 :         nodemask_t *new;
    1524           0 :         NODEMASK_SCRATCH(scratch);
    1525             : 
    1526           0 :         if (!scratch)
    1527             :                 return -ENOMEM;
    1528             : 
    1529           0 :         old = &scratch->mask1;
    1530           0 :         new = &scratch->mask2;
    1531             : 
    1532           0 :         err = get_nodes(old, old_nodes, maxnode);
    1533           0 :         if (err)
    1534           0 :                 goto out;
    1535             : 
    1536           0 :         err = get_nodes(new, new_nodes, maxnode);
    1537           0 :         if (err)
    1538           0 :                 goto out;
    1539             : 
    1540             :         /* Find the mm_struct */
    1541           0 :         rcu_read_lock();
    1542           0 :         task = pid ? find_task_by_vpid(pid) : current;
    1543           0 :         if (!task) {
    1544           0 :                 rcu_read_unlock();
    1545           0 :                 err = -ESRCH;
    1546           0 :                 goto out;
    1547             :         }
    1548           0 :         get_task_struct(task);
    1549             : 
    1550           0 :         err = -EINVAL;
    1551             : 
    1552             :         /*
    1553             :          * Check if this process has the right to modify the specified process.
    1554             :          * Use the regular "ptrace_may_access()" checks.
    1555             :          */
    1556           0 :         if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
    1557           0 :                 rcu_read_unlock();
    1558           0 :                 err = -EPERM;
    1559           0 :                 goto out_put;
    1560             :         }
    1561           0 :         rcu_read_unlock();
    1562             : 
    1563           0 :         task_nodes = cpuset_mems_allowed(task);
    1564             :         /* Is the user allowed to access the target nodes? */
    1565           0 :         if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
    1566           0 :                 err = -EPERM;
    1567           0 :                 goto out_put;
    1568             :         }
    1569             : 
    1570           0 :         task_nodes = cpuset_mems_allowed(current);
    1571           0 :         nodes_and(*new, *new, task_nodes);
    1572           0 :         if (nodes_empty(*new))
    1573           0 :                 goto out_put;
    1574             : 
    1575           0 :         err = security_task_movememory(task);
    1576           0 :         if (err)
    1577           0 :                 goto out_put;
    1578             : 
    1579           0 :         mm = get_task_mm(task);
    1580           0 :         put_task_struct(task);
    1581             : 
    1582           0 :         if (!mm) {
    1583           0 :                 err = -EINVAL;
    1584           0 :                 goto out;
    1585             :         }
    1586             : 
    1587           0 :         err = do_migrate_pages(mm, old, new,
    1588           0 :                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
    1589             : 
    1590           0 :         mmput(mm);
    1591           0 : out:
    1592           0 :         NODEMASK_SCRATCH_FREE(scratch);
    1593             : 
    1594           0 :         return err;
    1595             : 
    1596           0 : out_put:
    1597           0 :         put_task_struct(task);
    1598           0 :         goto out;
    1599             : 
    1600             : }
    1601             : 
    1602           0 : SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
    1603             :                 const unsigned long __user *, old_nodes,
    1604             :                 const unsigned long __user *, new_nodes)
    1605             : {
    1606           0 :         return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
    1607             : }
    1608             : 
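A hypothetical userspace sketch for the syscall above (not part of mempolicy.c):
move a target process's pages from node 0 to node 1.  Assumes <numaif.h> from
libnuma; the caller must pass the ptrace and cpuset checks enforced in
kernel_migrate_pages():

    #include <numaif.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(int argc, char **argv)
    {
            int pid = argc > 1 ? atoi(argv[1]) : 0;       /* 0 == calling task */
            unsigned long old_nodes = 1UL << 0;
            unsigned long new_nodes = 1UL << 1;
            long left;

            left = migrate_pages(pid, sizeof(unsigned long) * 8 + 1,
                                 &old_nodes, &new_nodes);
            if (left < 0) {
                    perror("migrate_pages");
                    return 1;
            }
            printf("%ld page(s) could not be moved\n", left);
            return 0;
    }
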
    1609             : 
    1610             : /* Retrieve NUMA policy */
    1611           0 : static int kernel_get_mempolicy(int __user *policy,
    1612             :                                 unsigned long __user *nmask,
    1613             :                                 unsigned long maxnode,
    1614             :                                 unsigned long addr,
    1615             :                                 unsigned long flags)
    1616             : {
    1617           0 :         int err;
    1618           0 :         int pval;
    1619           0 :         nodemask_t nodes;
    1620             : 
    1621           0 :         if (nmask != NULL && maxnode < nr_node_ids)
    1622             :                 return -EINVAL;
    1623             : 
    1624           0 :         addr = untagged_addr(addr);
    1625             : 
    1626           0 :         err = do_get_mempolicy(&pval, &nodes, addr, flags);
    1627             : 
    1628           0 :         if (err)
    1629             :                 return err;
    1630             : 
    1631           0 :         if (policy && put_user(pval, policy))
    1632             :                 return -EFAULT;
    1633             : 
    1634           0 :         if (nmask)
    1635           0 :                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
    1636             : 
    1637             :         return err;
    1638             : }
    1639             : 
    1640           0 : SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
    1641             :                 unsigned long __user *, nmask, unsigned long, maxnode,
    1642             :                 unsigned long, addr, unsigned long, flags)
    1643             : {
    1644           0 :         return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
    1645             : }
    1646             : 
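One more hypothetical sketch (not from this file), this time for the
MPOL_F_MEMS_ALLOWED path handled in do_get_mempolicy(): report the nodes the
calling task may allocate from.  Assumes <numaif.h> from libnuma and, for
brevity, at most 64 possible nodes so a single unsigned long can hold the mask:

    #include <numaif.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned long mask = 0;
            int mode = 0;

            if (get_mempolicy(&mode, &mask, sizeof(mask) * 8, NULL,
                              MPOL_F_MEMS_ALLOWED)) {
                    perror("get_mempolicy");
                    return 1;
            }
            printf("mems_allowed: 0x%lx\n", mask);
            return 0;
    }
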
    1647             : #ifdef CONFIG_COMPAT
    1648             : 
    1649           0 : COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
    1650             :                        compat_ulong_t __user *, nmask,
    1651             :                        compat_ulong_t, maxnode,
    1652             :                        compat_ulong_t, addr, compat_ulong_t, flags)
    1653             : {
    1654           0 :         long err;
    1655           0 :         unsigned long __user *nm = NULL;
    1656           0 :         unsigned long nr_bits, alloc_size;
    1657           0 :         DECLARE_BITMAP(bm, MAX_NUMNODES);
    1658             : 
    1659           0 :         nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
    1660           0 :         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
    1661             : 
    1662           0 :         if (nmask)
    1663           0 :                 nm = compat_alloc_user_space(alloc_size);
    1664             : 
    1665           0 :         err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
    1666             : 
    1667           0 :         if (!err && nmask) {
    1668           0 :                 unsigned long copy_size;
    1669           0 :                 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
    1670           0 :                 err = copy_from_user(bm, nm, copy_size);
    1671             :                 /* ensure entire bitmap is zeroed */
    1672           0 :                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
    1673           0 :                 err |= compat_put_bitmap(nmask, bm, nr_bits);
    1674             :         }
    1675             : 
    1676           0 :         return err;
    1677             : }
    1678             : 
    1679           0 : COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
    1680             :                        compat_ulong_t, maxnode)
    1681             : {
    1682           0 :         unsigned long __user *nm = NULL;
    1683           0 :         unsigned long nr_bits, alloc_size;
    1684           0 :         DECLARE_BITMAP(bm, MAX_NUMNODES);
    1685             : 
    1686           0 :         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
    1687           0 :         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
    1688             : 
    1689           0 :         if (nmask) {
    1690           0 :                 if (compat_get_bitmap(bm, nmask, nr_bits))
    1691             :                         return -EFAULT;
    1692           0 :                 nm = compat_alloc_user_space(alloc_size);
    1693           0 :                 if (copy_to_user(nm, bm, alloc_size))
    1694             :                         return -EFAULT;
    1695             :         }
    1696             : 
    1697           0 :         return kernel_set_mempolicy(mode, nm, nr_bits+1);
    1698             : }
    1699             : 
    1700           0 : COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
    1701             :                        compat_ulong_t, mode, compat_ulong_t __user *, nmask,
    1702             :                        compat_ulong_t, maxnode, compat_ulong_t, flags)
    1703             : {
    1704           0 :         unsigned long __user *nm = NULL;
    1705           0 :         unsigned long nr_bits, alloc_size;
    1706           0 :         nodemask_t bm;
    1707             : 
    1708           0 :         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
    1709           0 :         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
    1710             : 
    1711           0 :         if (nmask) {
    1712           0 :                 if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
    1713             :                         return -EFAULT;
    1714           0 :                 nm = compat_alloc_user_space(alloc_size);
    1715           0 :                 if (copy_to_user(nm, nodes_addr(bm), alloc_size))
    1716             :                         return -EFAULT;
    1717             :         }
    1718             : 
    1719           0 :         return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
    1720             : }
    1721             : 
    1722           0 : COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
    1723             :                        compat_ulong_t, maxnode,
    1724             :                        const compat_ulong_t __user *, old_nodes,
    1725             :                        const compat_ulong_t __user *, new_nodes)
    1726             : {
    1727           0 :         unsigned long __user *old = NULL;
    1728           0 :         unsigned long __user *new = NULL;
    1729           0 :         nodemask_t tmp_mask;
    1730           0 :         unsigned long nr_bits;
    1731           0 :         unsigned long size;
    1732             : 
    1733           0 :         nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
    1734           0 :         size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
    1735           0 :         if (old_nodes) {
    1736           0 :                 if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
    1737             :                         return -EFAULT;
    1738           0 :                 old = compat_alloc_user_space(new_nodes ? size * 2 : size);
    1739           0 :                 if (new_nodes)
    1740           0 :                         new = old + size / sizeof(unsigned long);
    1741           0 :                 if (copy_to_user(old, nodes_addr(tmp_mask), size))
    1742             :                         return -EFAULT;
    1743             :         }
    1744           0 :         if (new_nodes) {
    1745           0 :                 if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
    1746             :                         return -EFAULT;
    1747           0 :                 if (new == NULL)
    1748           0 :                         new = compat_alloc_user_space(size);
    1749           0 :                 if (copy_to_user(new, nodes_addr(tmp_mask), size))
    1750             :                         return -EFAULT;
    1751             :         }
    1752           0 :         return kernel_migrate_pages(pid, nr_bits + 1, old, new);
    1753             : }
    1754             : 
    1755             : #endif /* CONFIG_COMPAT */
    1756             : 
    1757           0 : bool vma_migratable(struct vm_area_struct *vma)
    1758             : {
    1759           0 :         if (vma->vm_flags & (VM_IO | VM_PFNMAP))
    1760             :                 return false;
    1761             : 
    1762             :         /*
    1763             :          * DAX device mappings require predictable access latency, so avoid
    1764             :          * incurring periodic faults.
    1765             :          */
    1766           0 :         if (vma_is_dax(vma))
    1767             :                 return false;
    1768             : 
    1769           0 :         if (is_vm_hugetlb_page(vma) &&
    1770             :                 !hugepage_migration_supported(hstate_vma(vma)))
    1771             :                 return false;
    1772             : 
    1773             :         /*
    1774             :          * Migration allocates pages in the highest zone. If we cannot
    1775             :          * do so then migration (at least from node to node) is not
    1776             :          * possible.
    1777             :          */
    1778           0 :         if (vma->vm_file &&
    1779           0 :                 gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
    1780           0 :                         < policy_zone)
    1781           0 :                 return false;
    1782             :         return true;
    1783             : }
    1784             : 
    1785       72054 : struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
    1786             :                                                 unsigned long addr)
    1787             : {
    1788       72054 :         struct mempolicy *pol = NULL;
    1789             : 
    1790       72054 :         if (vma) {
    1791       72054 :                 if (vma->vm_ops && vma->vm_ops->get_policy) {
    1792           8 :                         pol = vma->vm_ops->get_policy(vma, addr);
    1793       72046 :                 } else if (vma->vm_policy) {
    1794           0 :                         pol = vma->vm_policy;
    1795             : 
    1796             :                         /*
    1797             :                          * shmem_alloc_page() passes MPOL_F_SHARED policy with
    1798             :                          * a pseudo vma whose vma->vm_ops=NULL. Take a reference
    1799             :                          * count on these policies which will be dropped by
    1800             :                          * mpol_cond_put() later
    1801             :                          */
    1802           0 :                         if (mpol_needs_cond_ref(pol))
    1803           0 :                                 mpol_get(pol);
    1804             :                 }
    1805             :         }
    1806             : 
    1807       72054 :         return pol;
    1808             : }
    1809             : 
    1810             : /*
    1811             :  * get_vma_policy(@vma, @addr)
    1812             :  * @vma: virtual memory area whose policy is sought
    1813             :  * @addr: address in @vma for shared policy lookup
    1814             :  *
    1815             :  * Returns effective policy for a VMA at specified address.
    1816             :  * Falls back to current->mempolicy or system default policy, as necessary.
    1817             :  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
    1818             :  * count--added by the get_policy() vm_op, as appropriate--to protect against
    1819             :  * freeing by another task.  It is the caller's responsibility to free the
    1820             :  * extra reference for shared policies.
    1821             :  */
    1822       72059 : static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
    1823             :                                                 unsigned long addr)
    1824             : {
    1825       72059 :         struct mempolicy *pol = __get_vma_policy(vma, addr);
    1826             : 
    1827       72057 :         if (!pol)
    1828       72063 :                 pol = get_task_policy(current);
    1829             : 
    1830       72057 :         return pol;
    1831             : }
    1832             : 
    1833           0 : bool vma_policy_mof(struct vm_area_struct *vma)
    1834             : {
    1835           0 :         struct mempolicy *pol;
    1836             : 
    1837           0 :         if (vma->vm_ops && vma->vm_ops->get_policy) {
    1838           0 :                 bool ret = false;
    1839             : 
    1840           0 :                 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
    1841           0 :                 if (pol && (pol->flags & MPOL_F_MOF))
    1842           0 :                         ret = true;
    1843           0 :                 mpol_cond_put(pol);
    1844             : 
    1845           0 :                 return ret;
    1846             :         }
    1847             : 
    1848           0 :         pol = vma->vm_policy;
    1849           0 :         if (!pol)
    1850           0 :                 pol = get_task_policy(current);
    1851             : 
    1852           0 :         return pol->flags & MPOL_F_MOF;
    1853             : }
    1854             : 
    1855           0 : static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
    1856             : {
    1857           0 :         enum zone_type dynamic_policy_zone = policy_zone;
    1858             : 
    1859           0 :         BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
    1860             : 
    1861             :         /*
     1862             :          * If policy->v.nodes has movable memory only,
     1863             :          * we apply the policy only when gfp_zone(gfp) == ZONE_MOVABLE.
     1864             :          *
     1865             :          * policy->v.nodes is intersected with node_states[N_MEMORY],
     1866             :          * so if the following test fails, it implies
     1867             :          * policy->v.nodes has movable memory only.
    1868             :          */
    1869           0 :         if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
    1870           0 :                 dynamic_policy_zone = ZONE_MOVABLE;
    1871             : 
    1872           0 :         return zone >= dynamic_policy_zone;
    1873             : }
    1874             : 
    1875             : /*
    1876             :  * Return a nodemask representing a mempolicy for filtering nodes for
    1877             :  * page allocation
    1878             :  */
    1879      188776 : nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
    1880             : {
    1881             :         /* Lower zones don't get a nodemask applied for MPOL_BIND */
    1882      188776 :         if (unlikely(policy->mode == MPOL_BIND) &&
    1883           0 :                         apply_policy_zone(policy, gfp_zone(gfp)) &&
    1884           0 :                         cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
    1885           0 :                 return &policy->v.nodes;
    1886             : 
    1887             :         return NULL;
    1888             : }
    1889             : 
    1890             : /* Return the node id preferred by the given mempolicy, or the given id */
    1891      188763 : static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
    1892             : {
    1893      188763 :         if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
    1894      188405 :                 nd = policy->v.preferred_node;
    1895             :         else {
    1896             :                 /*
    1897             :                  * __GFP_THISNODE shouldn't even be used with the bind policy
    1898             :                  * because we might easily break the expectation to stay on the
    1899             :                  * requested node and not break the policy.
    1900             :                  */
    1901         716 :                 WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
    1902             :         }
    1903             : 
    1904      188763 :         return nd;
    1905             : }
    1906             : 
    1907             : /* Do dynamic interleaving for a process */
    1908        7170 : static unsigned interleave_nodes(struct mempolicy *policy)
    1909             : {
    1910        7170 :         unsigned next;
    1911        7170 :         struct task_struct *me = current;
    1912             : 
    1913        7170 :         next = next_node_in(me->il_prev, policy->v.nodes);
    1914        7170 :         if (next < MAX_NUMNODES)
    1915        7170 :                 me->il_prev = next;
    1916        7170 :         return next;
    1917             : }
    1918             : 
    1919             : /*
    1920             :  * Depending on the memory policy provide a node from which to allocate the
    1921             :  * next slab entry.
    1922             :  */
    1923       27483 : unsigned int mempolicy_slab_node(void)
    1924             : {
    1925       27483 :         struct mempolicy *policy;
    1926       27483 :         int node = numa_mem_id();
    1927             : 
    1928       27483 :         if (in_interrupt())
    1929             :                 return node;
    1930             : 
    1931       27449 :         policy = current->mempolicy;
    1932       27449 :         if (!policy || policy->flags & MPOL_F_LOCAL)
    1933             :                 return node;
    1934             : 
    1935        3450 :         switch (policy->mode) {
    1936           0 :         case MPOL_PREFERRED:
    1937             :                 /*
    1938             :                  * handled MPOL_F_LOCAL above
    1939             :                  */
    1940           0 :                 return policy->v.preferred_node;
    1941             : 
    1942        3450 :         case MPOL_INTERLEAVE:
    1943        3450 :                 return interleave_nodes(policy);
    1944             : 
    1945             :         case MPOL_BIND: {
    1946           0 :                 struct zoneref *z;
    1947             : 
    1948             :                 /*
    1949             :                  * Follow bind policy behavior and start allocation at the
    1950             :                  * first node.
    1951             :                  */
    1952           0 :                 struct zonelist *zonelist;
    1953           0 :                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
    1954           0 :                 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
    1955           0 :                 z = first_zones_zonelist(zonelist, highest_zoneidx,
    1956             :                                                         &policy->v.nodes);
    1957           0 :                 return z->zone ? zone_to_nid(z->zone) : node;
    1958             :         }
    1959             : 
    1960           0 :         default:
    1961           0 :                 BUG();
    1962             :         }
    1963             : }
    1964             : 
    1965             : /*
    1966             :  * Do static interleaving for a VMA with known offset @n.  Returns the n'th
    1967             :  * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
    1968             :  * number of present nodes.
    1969             :  */
    1970           0 : static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
    1971             : {
    1972           0 :         unsigned nnodes = nodes_weight(pol->v.nodes);
    1973           0 :         unsigned target;
    1974           0 :         int i;
    1975           0 :         int nid;
    1976             : 
    1977           0 :         if (!nnodes)
    1978           0 :                 return numa_node_id();
    1979           0 :         target = (unsigned int)n % nnodes;
    1980           0 :         nid = first_node(pol->v.nodes);
    1981           0 :         for (i = 0; i < target; i++)
    1982           0 :                 nid = next_node(nid, pol->v.nodes);
    1983           0 :         return nid;
    1984             : }
    1985             : 
    1986             : /* Determine a node number for interleave */
    1987           0 : static inline unsigned interleave_nid(struct mempolicy *pol,
    1988             :                  struct vm_area_struct *vma, unsigned long addr, int shift)
    1989             : {
    1990           0 :         if (vma) {
    1991           0 :                 unsigned long off;
    1992             : 
    1993             :                 /*
    1994             :                  * for small pages, there is no difference between
    1995             :                  * shift and PAGE_SHIFT, so the bit-shift is safe.
    1996             :                  * for huge pages, since vm_pgoff is in units of small
    1997             :                  * pages, we need to shift off the always 0 bits to get
    1998             :                  * a useful offset.
    1999             :                  */
    2000           0 :                 BUG_ON(shift < PAGE_SHIFT);
    2001           0 :                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
    2002           0 :                 off += (addr - vma->vm_start) >> shift;
    2003           0 :                 return offset_il_node(pol, off);
    2004             :         } else
    2005           0 :                 return interleave_nodes(pol);
    2006             : }
    2007             : 
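A quick worked example of the interleave arithmetic above (illustrative values):
for an interleave policy over nodes {0, 2, 3} (nnodes = 3) and a fault at the
sixth small page of a VMA with vm_pgoff = 0, interleave_nid() computes
off = 0 + 5 = 5, offset_il_node() computes target = 5 % 3 = 2, and two
next_node() steps from first_node() land on node 3.
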
    2008             : #ifdef CONFIG_HUGETLBFS
    2009             : /*
     2010             :  * huge_node(@vma, @addr, @gfp_flags, @mpol, @nodemask)
    2011             :  * @vma: virtual memory area whose policy is sought
    2012             :  * @addr: address in @vma for shared policy lookup and interleave policy
    2013             :  * @gfp_flags: for requested zone
    2014             :  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
    2015             :  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
    2016             :  *
    2017             :  * Returns a nid suitable for a huge page allocation and a pointer
    2018             :  * to the struct mempolicy for conditional unref after allocation.
    2019             :  * If the effective policy is 'BIND', returns a pointer to the mempolicy's
    2020             :  * @nodemask for filtering the zonelist.
    2021             :  *
    2022             :  * Must be protected by read_mems_allowed_begin()
    2023             :  */
    2024             : int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
    2025             :                                 struct mempolicy **mpol, nodemask_t **nodemask)
    2026             : {
    2027             :         int nid;
    2028             : 
    2029             :         *mpol = get_vma_policy(vma, addr);
    2030             :         *nodemask = NULL;       /* assume !MPOL_BIND */
    2031             : 
    2032             :         if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
    2033             :                 nid = interleave_nid(*mpol, vma, addr,
    2034             :                                         huge_page_shift(hstate_vma(vma)));
    2035             :         } else {
    2036             :                 nid = policy_node(gfp_flags, *mpol, numa_node_id());
    2037             :                 if ((*mpol)->mode == MPOL_BIND)
    2038             :                         *nodemask = &(*mpol)->v.nodes;
    2039             :         }
    2040             :         return nid;
    2041             : }
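From userspace this path is exercised when a hugetlb mapping carries a policy. A hedged example that interleaves huge pages across nodes 0 and 1 (assumes 2 MiB hugetlb pages are reserved and that nodes 0 and 1 exist; uses the <numaif.h> wrappers from libnuma, so build with -lnuma):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <numaif.h>

int main(void)
{
        size_t len = 8UL << 20;                 /* four 2 MiB huge pages */
        void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
        if (buf == MAP_FAILED) {
                perror("mmap(MAP_HUGETLB)");
                return 1;
        }

        unsigned long nodemask = (1UL << 0) | (1UL << 1);
        if (mbind(buf, len, MPOL_INTERLEAVE, &nodemask, 8 * sizeof(nodemask), 0)) {
                perror("mbind");
                return 1;
        }

        memset(buf, 0, len);    /* each huge-page fault picks its node via the interleave policy */
        return 0;
}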
    2042             : 
    2043             : /*
    2044             :  * init_nodemask_of_mempolicy
    2045             :  *
    2046             :  * If the current task's mempolicy is "default" [NULL], return 'false'
    2047             :  * to indicate default policy.  Otherwise, extract the policy nodemask
    2048             :  * for 'bind' or 'interleave' policy into the argument nodemask, or
    2049             :  * initialize the argument nodemask to contain the single node for
    2050             :  * 'preferred' or 'local' policy and return 'true' to indicate presence
    2051             :  * of non-default mempolicy.
    2052             :  *
    2053             :  * We don't bother with reference counting the mempolicy [mpol_get/put]
    2054             :  * because the current task is examining its own mempolicy and a task's
    2055             :  * mempolicy is only ever changed by the task itself.
    2056             :  *
    2057             :  * N.B., it is the caller's responsibility to free a returned nodemask.
    2058             :  */
    2059             : bool init_nodemask_of_mempolicy(nodemask_t *mask)
    2060             : {
    2061             :         struct mempolicy *mempolicy;
    2062             :         int nid;
    2063             : 
    2064             :         if (!(mask && current->mempolicy))
    2065             :                 return false;
    2066             : 
    2067             :         task_lock(current);
    2068             :         mempolicy = current->mempolicy;
    2069             :         switch (mempolicy->mode) {
    2070             :         case MPOL_PREFERRED:
    2071             :                 if (mempolicy->flags & MPOL_F_LOCAL)
    2072             :                         nid = numa_node_id();
    2073             :                 else
    2074             :                         nid = mempolicy->v.preferred_node;
    2075             :                 init_nodemask_of_node(mask, nid);
    2076             :                 break;
    2077             : 
    2078             :         case MPOL_BIND:
    2079             :         case MPOL_INTERLEAVE:
    2080             :                 *mask = mempolicy->v.nodes;
    2081             :                 break;
    2082             : 
    2083             :         default:
    2084             :                 BUG();
    2085             :         }
    2086             :         task_unlock(current);
    2087             : 
    2088             :         return true;
    2089             : }
    2090             : #endif
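The userspace counterpart of reading back the current task's policy and nodemask is get_mempolicy(2). A minimal sketch (uses the <numaif.h> wrapper from libnuma, so build with -lnuma; the single-word nodemask is an assumption that only covers the first 64 nodes):

#include <stdio.h>
#include <numaif.h>

int main(void)
{
        int mode;
        unsigned long nodemask = 0;

        /* addr == NULL and flags == 0 query the calling thread's policy */
        if (get_mempolicy(&mode, &nodemask, 8 * sizeof(nodemask), NULL, 0)) {
                perror("get_mempolicy");
                return 1;
        }
        printf("mode=%d nodemask=0x%lx\n", mode, nodemask);
        return 0;
}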
    2091             : 
    2092             : /*
    2093             :  * mempolicy_nodemask_intersects
    2094             :  *
    2095             :  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
    2096             :  * policy.  Otherwise, check for intersection between mask and the policy
    2097             :  * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
    2098             :  * policy, always return true since it may allocate elsewhere on fallback.
    2099             :  *
    2100             :  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
    2101             :  */
    2102           0 : bool mempolicy_nodemask_intersects(struct task_struct *tsk,
    2103             :                                         const nodemask_t *mask)
    2104             : {
    2105           0 :         struct mempolicy *mempolicy;
    2106           0 :         bool ret = true;
    2107             : 
    2108           0 :         if (!mask)
    2109             :                 return ret;
    2110           0 :         task_lock(tsk);
    2111           0 :         mempolicy = tsk->mempolicy;
    2112           0 :         if (!mempolicy)
    2113           0 :                 goto out;
    2114             : 
    2115           0 :         switch (mempolicy->mode) {
    2116             :         case MPOL_PREFERRED:
    2117             :                 /*
    2118             :                  * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes to
    2119             :                  * allocate from; they may fall back to other nodes under OOM.
    2120             :                  * Thus, it's possible for tsk to have allocated memory from
    2121             :                  * nodes in mask.
    2122             :                  */
    2123             :                 break;
    2124           0 :         case MPOL_BIND:
    2125             :         case MPOL_INTERLEAVE:
    2126           0 :                 ret = nodes_intersects(mempolicy->v.nodes, *mask);
    2127           0 :                 break;
    2128           0 :         default:
    2129           0 :                 BUG();
    2130             :         }
    2131           0 : out:
    2132           0 :         task_unlock(tsk);
    2133           0 :         return ret;
    2134             : }
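For the nodemask-carrying modes the decision above boils down to a bitwise AND. A standalone sketch with plain unsigned long masks (the enum and helper are illustrative stand-ins, not kernel types):

#include <stdbool.h>
#include <stdio.h>

enum mode { MODE_DEFAULT, MODE_PREFERRED, MODE_BIND, MODE_INTERLEAVE };

/* Mirror the switch above: preferred/local (and no policy at all) may fall
 * back anywhere, so they always count as intersecting; bind and interleave
 * test the policy nodemask against the supplied mask. */
static bool may_intersect(enum mode mode, unsigned long policy_nodes,
                          unsigned long mask)
{
        switch (mode) {
        case MODE_BIND:
        case MODE_INTERLEAVE:
                return (policy_nodes & mask) != 0;
        default:
                return true;
        }
}

int main(void)
{
        printf("%d\n", may_intersect(MODE_BIND, 0x3, 0x4));      /* 0: disjoint */
        printf("%d\n", may_intersect(MODE_PREFERRED, 0x3, 0x4)); /* 1: may fall back */
        return 0;
}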
    2135             : 
    2136             : /* Allocate a page under the interleave policy.
    2137             :    Separate path because it needs to do special accounting. */
    2138        3720 : static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
    2139             :                                         unsigned nid)
    2140             : {
    2141        3720 :         struct page *page;
    2142             : 
    2143        3720 :         page = __alloc_pages(gfp, order, nid);
    2144             :         /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
    2145        3720 :         if (!static_branch_likely(&vm_numa_stat_key))
    2146             :                 return page;
    2147        3720 :         if (page && page_to_nid(page) == nid) {
    2148        3720 :                 preempt_disable();
    2149        3720 :                 __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
    2150        3720 :                 preempt_enable();
    2151             :         }
    2152             :         return page;
    2153             : }
    2154             : 
    2155             : /**
    2156             :  *      alloc_pages_vma - Allocate a page for a VMA.
    2157             :  *
    2158             :  *      @gfp:
    2159             :  *      %GFP_USER    user allocation.
    2160             :  *      %GFP_KERNEL  kernel allocations,
    2161             :  *      %GFP_HIGHMEM highmem/user allocations,
    2162             :  *      %GFP_FS      allocation should not call back into a file system.
    2163             :  *      %GFP_ATOMIC  don't sleep.
    2164             :  *
    2165             :  *      @order: Order of the GFP allocation.
    2166             :  *      @vma:  Pointer to VMA or NULL if not available.
    2167             :  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
    2168             :  *      @node: Which node to prefer for allocation (modulo policy).
    2169             :  *      @hugepage: for hugepages try only the preferred node if possible
    2170             :  *
    2171             :  *      This function allocates a page from the kernel page pool and applies
    2172             :  *      a NUMA policy associated with the VMA or the current process.
    2173             :  *      When VMA is not NULL caller must read-lock the mmap_lock of the
    2174             :  *      mm_struct of the VMA to prevent it from going away. Should be used for
    2175             :  *      all allocations for pages that will be mapped into user space. Returns
    2176             :  *      NULL when no page can be allocated.
    2177             :  */
    2178             : struct page *
    2179       72061 : alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
    2180             :                 unsigned long addr, int node, bool hugepage)
    2181             : {
    2182       72061 :         struct mempolicy *pol;
    2183       72061 :         struct page *page;
    2184       72061 :         int preferred_nid;
    2185       72061 :         nodemask_t *nmask;
    2186             : 
    2187       72061 :         pol = get_vma_policy(vma, addr);
    2188             : 
    2189       72064 :         if (pol->mode == MPOL_INTERLEAVE) {
    2190           0 :                 unsigned nid;
    2191             : 
    2192           0 :                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
    2193           0 :                 mpol_cond_put(pol);
    2194           0 :                 page = alloc_page_interleave(gfp, order, nid);
    2195           0 :                 goto out;
    2196             :         }
    2197             : 
    2198       72064 :         if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
    2199          17 :                 int hpage_node = node;
    2200             : 
    2201             :                 /*
    2202             :                  * For hugepage allocation and non-interleave policy which
    2203             :                  * allows the current node (or other explicitly preferred
    2204             :                  * node) we only try to allocate from the current/preferred
    2205             :                  * node and don't fall back to other nodes, as the cost of
    2206             :                  * remote accesses would likely offset THP benefits.
    2207             :                  *
    2208             :                  * If the policy is interleave, or does not allow the current
    2209             :                  * node in its nodemask, we allocate the standard way.
    2210             :                  */
    2211          17 :                 if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
    2212          17 :                         hpage_node = pol->v.preferred_node;
    2213             : 
    2214          17 :                 nmask = policy_nodemask(gfp, pol);
    2215          17 :                 if (!nmask || node_isset(hpage_node, *nmask)) {
    2216          17 :                         mpol_cond_put(pol);
    2217             :                         /*
    2218             :                          * First, try to allocate THP only on local node, but
    2219             :                          * don't reclaim unnecessarily, just compact.
    2220             :                          */
    2221          17 :                         page = __alloc_pages_node(hpage_node,
    2222             :                                 gfp | __GFP_THISNODE | __GFP_NORETRY, order);
    2223             : 
    2224             :                         /*
    2225             :                          * If hugepage allocations are configured to always
    2226             :                          * synchronous compact or the vma has been madvised
    2227             :                          * to prefer hugepage backing, retry allowing remote
    2228             :                          * memory with both reclaim and compact as well.
    2229             :                          */
    2230          17 :                         if (!page && (gfp & __GFP_DIRECT_RECLAIM))
    2231           0 :                                 page = __alloc_pages_node(hpage_node,
    2232             :                                                                 gfp, order);
    2233             : 
    2234          17 :                         goto out;
    2235             :                 }
    2236             :         }
    2237             : 
    2238       72047 :         nmask = policy_nodemask(gfp, pol);
    2239       72044 :         preferred_nid = policy_node(gfp, pol, node);
    2240       72045 :         page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
    2241       72037 :         mpol_cond_put(pol);
    2242       72056 : out:
    2243       72056 :         return page;
    2244             : }
    2245             : EXPORT_SYMBOL(alloc_pages_vma);
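The per-VMA policies consumed here are the ones userspace installs with mbind(2). A minimal example that binds an anonymous mapping to node 0 (the node number is arbitrary; needs a NUMA-enabled kernel and the <numaif.h> wrappers from libnuma, built with -lnuma):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <numaif.h>

int main(void)
{
        size_t len = 4UL << 20;
        void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (buf == MAP_FAILED) {
                perror("mmap");
                return 1;
        }

        /* Restrict this VMA to node 0; later faults in the range are served
         * through the VMA policy path. */
        unsigned long nodemask = 1UL << 0;
        if (mbind(buf, len, MPOL_BIND, &nodemask, 8 * sizeof(nodemask), 0)) {
                perror("mbind");
                return 1;
        }

        memset(buf, 0, len);    /* fault the pages in under the policy */
        return 0;
}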
    2246             : 
    2247             : /**
    2248             :  *      alloc_pages_current - Allocate pages.
    2249             :  *
    2250             :  *      @gfp:
    2251             :  *              %GFP_USER   user allocation,
    2252             :  *              %GFP_KERNEL kernel allocation,
    2253             :  *              %GFP_HIGHMEM highmem allocation,
    2254             :  *              %GFP_FS     don't call back into a file system.
    2255             :  *              %GFP_ATOMIC don't sleep.
    2256             :  *      @order: Power of two of allocation size in pages. 0 is a single page.
    2257             :  *
    2258             :  *      Allocate a page from the kernel page pool.  When not in
    2259             :  *      interrupt context, apply the current process' NUMA policy.
    2260             :  *      Returns NULL when no page can be allocated.
    2261             :  */
    2262      120428 : struct page *alloc_pages_current(gfp_t gfp, unsigned order)
    2263             : {
    2264      120428 :         struct mempolicy *pol = &default_policy;
    2265      120428 :         struct page *page;
    2266             : 
    2267      120428 :         if (!in_interrupt() && !(gfp & __GFP_THISNODE))
    2268      120354 :                 pol = get_task_policy(current);
    2269             : 
    2270             :         /*
    2271             :          * No reference counting needed for current->mempolicy
    2272             :          * nor system default_policy
    2273             :          */
    2274      120428 :         if (pol->mode == MPOL_INTERLEAVE)
    2275        3720 :                 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
    2276             :         else
    2277      116708 :                 page = __alloc_pages_nodemask(gfp, order,
    2278             :                                 policy_node(gfp, pol, numa_node_id()),
    2279             :                                 policy_nodemask(gfp, pol));
    2280             : 
    2281      120429 :         return page;
    2282             : }
    2283             : EXPORT_SYMBOL(alloc_pages_current);
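The task policy consulted here is what userspace sets with set_mempolicy(2). For example, interleaving a task's future page allocations across nodes 0 and 1 (node numbers are illustrative; <numaif.h> from libnuma, built with -lnuma):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <numaif.h>

int main(void)
{
        unsigned long nodemask = (1UL << 0) | (1UL << 1);

        if (set_mempolicy(MPOL_INTERLEAVE, &nodemask, 8 * sizeof(nodemask))) {
                perror("set_mempolicy");
                return 1;
        }

        /* Pages faulted in from now on are spread across nodes 0 and 1. */
        char *p = malloc(8UL << 20);
        if (p)
                memset(p, 0, 8UL << 20);
        free(p);
        return 0;
}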
    2284             : 
    2285       86786 : int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
    2286             : {
    2287       86786 :         struct mempolicy *pol = mpol_dup(vma_policy(src));
    2288             : 
    2289       86786 :         if (IS_ERR(pol))
    2290           0 :                 return PTR_ERR(pol);
    2291       86786 :         dst->vm_policy = pol;
    2292       86786 :         return 0;
    2293             : }
    2294             : 
    2295             : /*
    2296             :  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
    2297             :  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
    2298             :  * with the mems_allowed returned by cpuset_mems_allowed().  This
    2299             :  * keeps mempolicies cpuset relative after its cpuset moves.  See
    2300             :  * further kernel/cpuset.c update_nodemask().
    2301             :  *
    2302             :  * current's mempolicy may be rebound by another task (the task that changes
    2303             :  * the cpuset's mems), so we needn't do the rebind work for the current task.
    2304             :  */
    2305             : 
    2306             : /* Slow path of a mempolicy duplicate */
    2307           4 : struct mempolicy *__mpol_dup(struct mempolicy *old)
    2308             : {
    2309           4 :         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
    2310             : 
    2311           4 :         if (!new)
    2312           4 :                 return ERR_PTR(-ENOMEM);
    2313             : 
    2314             :         /* task's mempolicy is protected by alloc_lock */
    2315           4 :         if (old == current->mempolicy) {
    2316           4 :                 task_lock(current);
    2317           4 :                 *new = *old;
    2318           4 :                 task_unlock(current);
    2319             :         } else
    2320           0 :                 *new = *old;
    2321             : 
    2322           4 :         if (current_cpuset_is_being_rebound()) {
    2323             :                 nodemask_t mems = cpuset_mems_allowed(current);
    2324             :                 mpol_rebind_policy(new, &mems);
    2325             :         }
    2326           4 :         atomic_set(&new->refcnt, 1);
    2327           4 :         return new;
    2328             : }
    2329             : 
    2330             : /* Slow path of a mempolicy comparison */
    2331           0 : bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
    2332             : {
    2333           0 :         if (!a || !b)
    2334             :                 return false;
    2335           0 :         if (a->mode != b->mode)
    2336             :                 return false;
    2337           0 :         if (a->flags != b->flags)
    2338             :                 return false;
    2339           0 :         if (mpol_store_user_nodemask(a))
    2340           0 :                 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
    2341             :                         return false;
    2342             : 
    2343           0 :         switch (a->mode) {
    2344           0 :         case MPOL_BIND:
    2345             :         case MPOL_INTERLEAVE:
    2346           0 :                 return !!nodes_equal(a->v.nodes, b->v.nodes);
    2347           0 :         case MPOL_PREFERRED:
    2348             :                 /* a's ->flags is the same as b's */
    2349           0 :                 if (a->flags & MPOL_F_LOCAL)
    2350             :                         return true;
    2351           0 :                 return a->v.preferred_node == b->v.preferred_node;
    2352           0 :         default:
    2353           0 :                 BUG();
    2354             :                 return false;
    2355             :         }
    2356             : }
    2357             : 
    2358             : /*
    2359             :  * Shared memory backing store policy support.
    2360             :  *
    2361             :  * Remember policies even when nobody has shared memory mapped.
    2362             :  * The policies are kept in Red-Black tree linked from the inode.
    2363             :  * They are protected by the sp->lock rwlock, which should be held
    2364             :  * for any accesses to the tree.
    2365             :  */
    2366             : 
    2367             : /*
    2368             :  * lookup first element intersecting start-end.  Caller holds sp->lock for
    2369             :  * reading or for writing
    2370             :  */
    2371             : static struct sp_node *
    2372           0 : sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
    2373             : {
    2374           0 :         struct rb_node *n = sp->root.rb_node;
    2375             : 
    2376           0 :         while (n) {
    2377           0 :                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
    2378             : 
    2379           0 :                 if (start >= p->end)
    2380           0 :                         n = n->rb_right;
    2381           0 :                 else if (end <= p->start)
    2382           0 :                         n = n->rb_left;
    2383             :                 else
    2384             :                         break;
    2385             :         }
    2386           0 :         if (!n)
    2387             :                 return NULL;
    2388           0 :         for (;;) {
    2389           0 :                 struct sp_node *w = NULL;
    2390           0 :                 struct rb_node *prev = rb_prev(n);
    2391           0 :                 if (!prev)
    2392             :                         break;
    2393           0 :                 w = rb_entry(prev, struct sp_node, nd);
    2394           0 :                 if (w->end <= start)
    2395             :                         break;
    2396             :                 n = prev;
    2397             :         }
    2398           0 :         return rb_entry(n, struct sp_node, nd);
    2399             : }
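The overlap rule sp_lookup() implements (return the lowest-starting policy range that intersects [start, end)) can be sketched over a plain sorted array; the kernel keeps an rbtree, but the comparison logic is the same (illustrative code, not kernel API):

#include <stdio.h>

struct range { unsigned long start, end; };     /* [start, end) */

/* Return the index of the first range overlapping [start, end), or -1.
 * Assumes @ranges is sorted by start and non-overlapping, as the shared
 * policy tree is. */
static int first_overlap(const struct range *ranges, int n,
                         unsigned long start, unsigned long end)
{
        for (int i = 0; i < n; i++) {
                if (ranges[i].start >= end)
                        break;
                if (ranges[i].end > start)
                        return i;
        }
        return -1;
}

int main(void)
{
        struct range sp[] = { {0, 4}, {8, 16}, {32, 64} };

        printf("%d\n", first_overlap(sp, 3, 10, 12));   /*  1 */
        printf("%d\n", first_overlap(sp, 3, 4, 8));     /* -1: falls in a gap */
        return 0;
}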
    2400             : 
    2401             : /*
    2402             :  * Insert a new shared policy into the list.  Caller holds sp->lock for
    2403             :  * writing.
    2404             :  */
    2405           0 : static void sp_insert(struct shared_policy *sp, struct sp_node *new)
    2406             : {
    2407           0 :         struct rb_node **p = &sp->root.rb_node;
    2408           0 :         struct rb_node *parent = NULL;
    2409           0 :         struct sp_node *nd;
    2410             : 
    2411           0 :         while (*p) {
    2412           0 :                 parent = *p;
    2413           0 :                 nd = rb_entry(parent, struct sp_node, nd);
    2414           0 :                 if (new->start < nd->start)
    2415           0 :                         p = &(*p)->rb_left;
    2416           0 :                 else if (new->end > nd->end)
    2417           0 :                         p = &(*p)->rb_right;
    2418             :                 else
    2419           0 :                         BUG();
    2420             :         }
    2421           0 :         rb_link_node(&new->nd, parent, p);
    2422           0 :         rb_insert_color(&new->nd, &sp->root);
    2423           0 :         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
    2424             :                  new->policy ? new->policy->mode : 0);
    2425           0 : }
    2426             : 
    2427             : /* Find shared policy intersecting idx */
    2428             : struct mempolicy *
    2429        1994 : mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
    2430             : {
    2431        1994 :         struct mempolicy *pol = NULL;
    2432        1994 :         struct sp_node *sn;
    2433             : 
    2434        1994 :         if (!sp->root.rb_node)
    2435             :                 return NULL;
    2436           0 :         read_lock(&sp->lock);
    2437           0 :         sn = sp_lookup(sp, idx, idx+1);
    2438           0 :         if (sn) {
    2439           0 :                 mpol_get(sn->policy);
    2440           0 :                 pol = sn->policy;
    2441             :         }
    2442           0 :         read_unlock(&sp->lock);
    2443           0 :         return pol;
    2444             : }
    2445             : 
    2446           0 : static void sp_free(struct sp_node *n)
    2447             : {
    2448           0 :         mpol_put(n->policy);
    2449           0 :         kmem_cache_free(sn_cache, n);
    2450           0 : }
    2451             : 
    2452             : /**
    2453             :  * mpol_misplaced - check whether current page node is valid in policy
    2454             :  *
    2455             :  * @page: page to be checked
    2456             :  * @vma: vm area where page mapped
    2457             :  * @addr: virtual address where page mapped
    2458             :  *
    2459             :  * Look up the current policy node id for vma,addr and "compare" it to the page's
    2460             :  * node id.
    2461             :  *
    2462             :  * Returns:
    2463             :  *      -1      - not misplaced, page is in the right node
    2464             :  *      node    - node id where the page should be
    2465             :  *
    2466             :  * Policy determination "mimics" alloc_page_vma().
    2467             :  * Called from fault path where we know the vma and faulting address.
    2468             :  */
    2469           0 : int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
    2470             : {
    2471           0 :         struct mempolicy *pol;
    2472           0 :         struct zoneref *z;
    2473           0 :         int curnid = page_to_nid(page);
    2474           0 :         unsigned long pgoff;
    2475           0 :         int thiscpu = raw_smp_processor_id();
    2476           0 :         int thisnid = cpu_to_node(thiscpu);
    2477           0 :         int polnid = NUMA_NO_NODE;
    2478           0 :         int ret = -1;
    2479             : 
    2480           0 :         pol = get_vma_policy(vma, addr);
    2481           0 :         if (!(pol->flags & MPOL_F_MOF))
    2482           0 :                 goto out;
    2483             : 
    2484           0 :         switch (pol->mode) {
    2485           0 :         case MPOL_INTERLEAVE:
    2486           0 :                 pgoff = vma->vm_pgoff;
    2487           0 :                 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
    2488           0 :                 polnid = offset_il_node(pol, pgoff);
    2489           0 :                 break;
    2490             : 
    2491           0 :         case MPOL_PREFERRED:
    2492           0 :                 if (pol->flags & MPOL_F_LOCAL)
    2493           0 :                         polnid = numa_node_id();
    2494             :                 else
    2495           0 :                         polnid = pol->v.preferred_node;
    2496             :                 break;
    2497             : 
    2498           0 :         case MPOL_BIND:
    2499             :                 /* Optimize placement among multiple nodes via NUMA balancing */
    2500           0 :                 if (pol->flags & MPOL_F_MORON) {
    2501           0 :                         if (node_isset(thisnid, pol->v.nodes))
    2502             :                                 break;
    2503           0 :                         goto out;
    2504             :                 }
    2505             : 
    2506             :                 /*
    2507             :                  * allows binding to multiple nodes.
    2508             :                  * use current page if in policy nodemask,
    2509             :                  * else select nearest allowed node, if any.
    2510             :                  * If no allowed nodes, use current [!misplaced].
    2511             :                  */
    2512           0 :                 if (node_isset(curnid, pol->v.nodes))
    2513           0 :                         goto out;
    2514           0 :                 z = first_zones_zonelist(
    2515             :                                 node_zonelist(numa_node_id(), GFP_HIGHUSER),
    2516             :                                 gfp_zone(GFP_HIGHUSER),
    2517             :                                 &pol->v.nodes);
    2518           0 :                 polnid = zone_to_nid(z->zone);
    2519           0 :                 break;
    2520             : 
    2521           0 :         default:
    2522           0 :                 BUG();
    2523             :         }
    2524             : 
    2525             :         /* Migrate the page towards the node whose CPU is referencing it */
    2526           0 :         if (pol->flags & MPOL_F_MORON) {
    2527           0 :                 polnid = thisnid;
    2528             : 
    2529           0 :                 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
    2530             :                         goto out;
    2531             :         }
    2532             : 
    2533           0 :         if (curnid != polnid)
    2534           0 :                 ret = polnid;
    2535           0 : out:
    2536           0 :         mpol_cond_put(pol);
    2537             : 
    2538           0 :         return ret;
    2539             : }
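Stripped of the policy lookup, the return convention documented above is simply "report the target node only when it differs from where the page currently sits"; a trivial standalone sketch:

#include <stdio.h>

/* -1 means "page is fine where it is", otherwise the node it should move to,
 * matching mpol_misplaced()'s return convention. */
static int misplaced(int curnid, int polnid)
{
        return curnid != polnid ? polnid : -1;
}

int main(void)
{
        printf("%d\n", misplaced(0, 0));        /* -1: correctly placed */
        printf("%d\n", misplaced(0, 1));        /*  1: should live on node 1 */
        return 0;
}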
    2540             : 
    2541             : /*
    2542             :  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
    2543             :  * dropped after task->mempolicy is set to NULL so that any allocation done as
    2544             :  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
    2545             :  * policy.
    2546             :  */
    2547        1257 : void mpol_put_task_policy(struct task_struct *task)
    2548             : {
    2549        1257 :         struct mempolicy *pol;
    2550             : 
    2551        1257 :         task_lock(task);
    2552        1257 :         pol = task->mempolicy;
    2553        1257 :         task->mempolicy = NULL;
    2554        1257 :         task_unlock(task);
    2555        1257 :         mpol_put(pol);
    2556        1257 : }
    2557             : 
    2558           0 : static void sp_delete(struct shared_policy *sp, struct sp_node *n)
    2559             : {
    2560           0 :         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
    2561           0 :         rb_erase(&n->nd, &sp->root);
    2562           0 :         sp_free(n);
    2563           0 : }
    2564             : 
    2565           0 : static void sp_node_init(struct sp_node *node, unsigned long start,
    2566             :                         unsigned long end, struct mempolicy *pol)
    2567             : {
    2568           0 :         node->start = start;
    2569           0 :         node->end = end;
    2570           0 :         node->policy = pol;
    2571             : }
    2572             : 
    2573           0 : static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
    2574             :                                 struct mempolicy *pol)
    2575             : {
    2576           0 :         struct sp_node *n;
    2577           0 :         struct mempolicy *newpol;
    2578             : 
    2579           0 :         n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
    2580           0 :         if (!n)
    2581             :                 return NULL;
    2582             : 
    2583           0 :         newpol = mpol_dup(pol);
    2584           0 :         if (IS_ERR(newpol)) {
    2585           0 :                 kmem_cache_free(sn_cache, n);
    2586           0 :                 return NULL;
    2587             :         }
    2588           0 :         newpol->flags |= MPOL_F_SHARED;
    2589           0 :         sp_node_init(n, start, end, newpol);
    2590             : 
    2591           0 :         return n;
    2592             : }
    2593             : 
    2594             : /* Replace a policy range. */
    2595           0 : static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
    2596             :                                  unsigned long end, struct sp_node *new)
    2597             : {
    2598           0 :         struct sp_node *n;
    2599           0 :         struct sp_node *n_new = NULL;
    2600           0 :         struct mempolicy *mpol_new = NULL;
    2601           0 :         int ret = 0;
    2602             : 
    2603           0 : restart:
    2604           0 :         write_lock(&sp->lock);
    2605           0 :         n = sp_lookup(sp, start, end);
    2606             :         /* Take care of old policies in the same range. */
    2607           0 :         while (n && n->start < end) {
    2608           0 :                 struct rb_node *next = rb_next(&n->nd);
    2609           0 :                 if (n->start >= start) {
    2610           0 :                         if (n->end <= end)
    2611           0 :                                 sp_delete(sp, n);
    2612             :                         else
    2613           0 :                                 n->start = end;
    2614             :                 } else {
    2615             :                         /* Old policy spanning whole new range. */
    2616           0 :                         if (n->end > end) {
    2617           0 :                                 if (!n_new)
    2618           0 :                                         goto alloc_new;
    2619             : 
    2620           0 :                                 *mpol_new = *n->policy;
    2621           0 :                                 atomic_set(&mpol_new->refcnt, 1);
    2622           0 :                                 sp_node_init(n_new, end, n->end, mpol_new);
    2623           0 :                                 n->end = start;
    2624           0 :                                 sp_insert(sp, n_new);
    2625           0 :                                 n_new = NULL;
    2626           0 :                                 mpol_new = NULL;
    2627           0 :                                 break;
    2628             :                         } else
    2629           0 :                                 n->end = start;
    2630             :                 }
    2631           0 :                 if (!next)
    2632             :                         break;
    2633           0 :                 n = rb_entry(next, struct sp_node, nd);
    2634             :         }
    2635           0 :         if (new)
    2636           0 :                 sp_insert(sp, new);
    2637           0 :         write_unlock(&sp->lock);
    2638           0 :         ret = 0;
    2639             : 
    2640           0 : err_out:
    2641           0 :         if (mpol_new)
    2642           0 :                 mpol_put(mpol_new);
    2643           0 :         if (n_new)
    2644           0 :                 kmem_cache_free(sn_cache, n_new);
    2645             : 
    2646           0 :         return ret;
    2647             : 
    2648           0 : alloc_new:
    2649           0 :         write_unlock(&sp->lock);
    2650           0 :         ret = -ENOMEM;
    2651           0 :         n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
    2652           0 :         if (!n_new)
    2653           0 :                 goto err_out;
    2654           0 :         mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
    2655           0 :         if (!mpol_new)
    2656           0 :                 goto err_out;
    2657           0 :         goto restart;
    2658             : }
    2659             : 
    2660             : /**
    2661             :  * mpol_shared_policy_init - initialize shared policy for inode
    2662             :  * @sp: pointer to inode shared policy
    2663             :  * @mpol:  struct mempolicy to install
    2664             :  *
    2665             :  * Install non-NULL @mpol in inode's shared policy rb-tree.
    2666             :  * On entry, the current task has a reference on a non-NULL @mpol.
    2667             :  * This must be released on exit.
    2668             :  * This is called at get_inode() time, so we can use GFP_KERNEL.
    2669             :  */
    2670         967 : void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
    2671             : {
    2672         967 :         int ret;
    2673             : 
    2674         967 :         sp->root = RB_ROOT;          /* empty tree == default mempolicy */
    2675         967 :         rwlock_init(&sp->lock);
    2676             : 
    2677         967 :         if (mpol) {
    2678           0 :                 struct vm_area_struct pvma;
    2679           0 :                 struct mempolicy *new;
    2680           0 :                 NODEMASK_SCRATCH(scratch);
    2681             : 
    2682           0 :                 if (!scratch)
    2683             :                         goto put_mpol;
    2684             :                 /* contextualize the tmpfs mount point mempolicy */
    2685           0 :                 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
    2686           0 :                 if (IS_ERR(new))
    2687           0 :                         goto free_scratch; /* no valid nodemask intersection */
    2688             : 
    2689           0 :                 task_lock(current);
    2690           0 :                 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
    2691           0 :                 task_unlock(current);
    2692           0 :                 if (ret)
    2693           0 :                         goto put_new;
    2694             : 
    2695             :                 /* Create pseudo-vma that contains just the policy */
    2696           0 :                 vma_init(&pvma, NULL);
    2697           0 :                 pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
    2698           0 :                 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
    2699             : 
    2700           0 : put_new:
    2701           0 :                 mpol_put(new);                  /* drop initial ref */
    2702           0 : free_scratch:
    2703           0 :                 NODEMASK_SCRATCH_FREE(scratch);
    2704           0 : put_mpol:
    2705           0 :                 mpol_put(mpol); /* drop our incoming ref on sb mpol */
    2706             :         }
    2707         967 : }
    2708             : 
    2709           0 : int mpol_set_shared_policy(struct shared_policy *info,
    2710             :                         struct vm_area_struct *vma, struct mempolicy *npol)
    2711             : {
    2712           0 :         int err;
    2713           0 :         struct sp_node *new = NULL;
    2714           0 :         unsigned long sz = vma_pages(vma);
    2715             : 
    2716           0 :         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
    2717             :                  vma->vm_pgoff,
    2718             :                  sz, npol ? npol->mode : -1,
    2719             :                  npol ? npol->flags : -1,
    2720             :                  npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
    2721             : 
    2722           0 :         if (npol) {
    2723           0 :                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
    2724           0 :                 if (!new)
    2725             :                         return -ENOMEM;
    2726             :         }
    2727           0 :         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
    2728           0 :         if (err && new)
    2729           0 :                 sp_free(new);
    2730             :         return err;
    2731             : }
    2732             : 
    2733             : /* Free a backing policy store on inode delete. */
    2734         588 : void mpol_free_shared_policy(struct shared_policy *p)
    2735             : {
    2736         588 :         struct sp_node *n;
    2737         588 :         struct rb_node *next;
    2738             : 
    2739         588 :         if (!p->root.rb_node)
    2740             :                 return;
    2741           0 :         write_lock(&p->lock);
    2742           0 :         next = rb_first(&p->root);
    2743           0 :         while (next) {
    2744           0 :                 n = rb_entry(next, struct sp_node, nd);
    2745           0 :                 next = rb_next(&n->nd);
    2746           0 :                 sp_delete(p, n);
    2747             :         }
    2748           0 :         write_unlock(&p->lock);
    2749             : }
    2750             : 
    2751             : #ifdef CONFIG_NUMA_BALANCING
    2752             : static int __initdata numabalancing_override;
    2753             : 
    2754             : static void __init check_numabalancing_enable(void)
    2755             : {
    2756             :         bool numabalancing_default = false;
    2757             : 
    2758             :         if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
    2759             :                 numabalancing_default = true;
    2760             : 
    2761             :         /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
    2762             :         if (numabalancing_override)
    2763             :                 set_numabalancing_state(numabalancing_override == 1);
    2764             : 
    2765             :         if (num_online_nodes() > 1 && !numabalancing_override) {
    2766             :                 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
    2767             :                         numabalancing_default ? "Enabling" : "Disabling");
    2768             :                 set_numabalancing_state(numabalancing_default);
    2769             :         }
    2770             : }
    2771             : 
    2772             : static int __init setup_numabalancing(char *str)
    2773             : {
    2774             :         int ret = 0;
    2775             :         if (!str)
    2776             :                 goto out;
    2777             : 
    2778             :         if (!strcmp(str, "enable")) {
    2779             :                 numabalancing_override = 1;
    2780             :                 ret = 1;
    2781             :         } else if (!strcmp(str, "disable")) {
    2782             :                 numabalancing_override = -1;
    2783             :                 ret = 1;
    2784             :         }
    2785             : out:
    2786             :         if (!ret)
    2787             :                 pr_warn("Unable to parse numa_balancing=\n");
    2788             : 
    2789             :         return ret;
    2790             : }
    2791             : __setup("numa_balancing=", setup_numabalancing);
    2792             : #else
    2793           1 : static inline void __init check_numabalancing_enable(void)
    2794             : {
    2795           1 : }
    2796             : #endif /* CONFIG_NUMA_BALANCING */
    2797             : 
    2798             : /* assumes fs == KERNEL_DS */
    2799           1 : void __init numa_policy_init(void)
    2800             : {
    2801           1 :         nodemask_t interleave_nodes;
    2802           1 :         unsigned long largest = 0;
    2803           1 :         int nid, prefer = 0;
    2804             : 
    2805           1 :         policy_cache = kmem_cache_create("numa_policy",
    2806             :                                          sizeof(struct mempolicy),
    2807             :                                          0, SLAB_PANIC, NULL);
    2808             : 
    2809           1 :         sn_cache = kmem_cache_create("shared_policy_node",
    2810             :                                      sizeof(struct sp_node),
    2811             :                                      0, SLAB_PANIC, NULL);
    2812             : 
    2813           2 :         for_each_node(nid) {
    2814           1 :                 preferred_node_policy[nid] = (struct mempolicy) {
    2815             :                         .refcnt = ATOMIC_INIT(1),
    2816             :                         .mode = MPOL_PREFERRED,
    2817             :                         .flags = MPOL_F_MOF | MPOL_F_MORON,
    2818             :                         .v = { .preferred_node = nid, },
    2819             :                 };
    2820             :         }
    2821             : 
    2822             :         /*
    2823             :          * Set interleaving policy for system init. Interleaving is only
    2824             :          * enabled across suitably sized nodes (default is >= 16MB), or
    2825             :          * we fall back to the largest node if they're all smaller.
    2826             :          */
    2827           1 :         nodes_clear(interleave_nodes);
    2828           2 :         for_each_node_state(nid, N_MEMORY) {
    2829           1 :                 unsigned long total_pages = node_present_pages(nid);
    2830             : 
    2831             :                 /* Preserve the largest node */
    2832           1 :                 if (largest < total_pages) {
    2833           1 :                         largest = total_pages;
    2834           1 :                         prefer = nid;
    2835             :                 }
    2836             : 
    2837             :                 /* Interleave this node? */
    2838           1 :                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
    2839           2 :                         node_set(nid, interleave_nodes);
    2840             :         }
    2841             : 
    2842             :         /* All too small, use the largest */
    2843           1 :         if (unlikely(nodes_empty(interleave_nodes)))
    2844           0 :                 node_set(prefer, interleave_nodes);
    2845             : 
    2846           1 :         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
    2847           0 :                 pr_err("%s: interleaving failed\n", __func__);
    2848             : 
    2849           1 :         check_numabalancing_enable();
    2850           1 : }
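The ">= 16MB" interleave cut-off used in the loop above is a plain byte-size comparison; for example, assuming 4 KiB pages and made-up node sizes:

#include <stdio.h>

int main(void)
{
        const unsigned page_shift = 12;         /* 4 KiB pages */
        unsigned long small_node = 2048;        /* 8 MiB of present pages */
        unsigned long big_node = 262144;        /* 1 GiB of present pages */

        printf("%d\n", (small_node << page_shift) >= (16UL << 20)); /* 0: too small */
        printf("%d\n", (big_node << page_shift) >= (16UL << 20));   /* 1: interleaved */
        return 0;
}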
    2851             : 
    2852             : /* Reset policy of current process to default */
    2853           2 : void numa_default_policy(void)
    2854             : {
    2855           2 :         do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
    2856           2 : }
    2857             : 
    2858             : /*
    2859             :  * Parse and format mempolicy from/to strings
    2860             :  */
    2861             : 
    2862             : /*
    2863             :  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
    2864             :  */
    2865             : static const char * const policy_modes[] =
    2866             : {
    2867             :         [MPOL_DEFAULT]    = "default",
    2868             :         [MPOL_PREFERRED]  = "prefer",
    2869             :         [MPOL_BIND]       = "bind",
    2870             :         [MPOL_INTERLEAVE] = "interleave",
    2871             :         [MPOL_LOCAL]      = "local",
    2872             : };
    2873             : 
    2874             : 
    2875             : #ifdef CONFIG_TMPFS
    2876             : /**
    2877             :  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
    2878             :  * @str:  string containing mempolicy to parse
    2879             :  * @mpol:  pointer to struct mempolicy pointer, returned on success.
    2880             :  *
    2881             :  * Format of input:
    2882             :  *      <mode>[=<flags>][:<nodelist>]
    2883             :  *
    2884             :  * On success, returns 0, else 1
    2885             :  */
    2886           0 : int mpol_parse_str(char *str, struct mempolicy **mpol)
    2887             : {
    2888           0 :         struct mempolicy *new = NULL;
    2889           0 :         unsigned short mode_flags;
    2890           0 :         nodemask_t nodes;
    2891           0 :         char *nodelist = strchr(str, ':');
    2892           0 :         char *flags = strchr(str, '=');
    2893           0 :         int err = 1, mode;
    2894             : 
    2895           0 :         if (flags)
    2896           0 :                 *flags++ = '\0';        /* terminate mode string */
    2897             : 
    2898           0 :         if (nodelist) {
    2899             :                 /* NUL-terminate mode or flags string */
    2900           0 :                 *nodelist++ = '\0';
    2901           0 :                 if (nodelist_parse(nodelist, nodes))
    2902           0 :                         goto out;
    2903           0 :                 if (!nodes_subset(nodes, node_states[N_MEMORY]))
    2904           0 :                         goto out;
    2905             :         } else
    2906           0 :                 nodes_clear(nodes);
    2907             : 
    2908           0 :         mode = match_string(policy_modes, MPOL_MAX, str);
    2909           0 :         if (mode < 0)
    2910           0 :                 goto out;
    2911             : 
    2912           0 :         switch (mode) {
    2913           0 :         case MPOL_PREFERRED:
    2914             :                 /*
    2915             :                  * Insist on a nodelist of one node only, although later
    2916             :                  * we use first_node(nodes) to grab a single node, so here
    2917             :                  * nodelist (or nodes) cannot be empty.
    2918             :                  */
    2919           0 :                 if (nodelist) {
    2920             :                         char *rest = nodelist;
    2921           0 :                         while (isdigit(*rest))
    2922           0 :                                 rest++;
    2923           0 :                         if (*rest)
    2924           0 :                                 goto out;
    2925           0 :                         if (nodes_empty(nodes))
    2926           0 :                                 goto out;
    2927             :                 }
    2928             :                 break;
    2929           0 :         case MPOL_INTERLEAVE:
    2930             :                 /*
    2931             :                  * Default to online nodes with memory if no nodelist
    2932             :                  */
    2933           0 :                 if (!nodelist)
    2934           0 :                         nodes = node_states[N_MEMORY];
    2935             :                 break;
    2936           0 :         case MPOL_LOCAL:
    2937             :                 /*
    2938             :                  * Don't allow a nodelist;  mpol_new() checks flags
    2939             :                  */
    2940           0 :                 if (nodelist)
    2941           0 :                         goto out;
    2942             :                 mode = MPOL_PREFERRED;
    2943             :                 break;
    2944           0 :         case MPOL_DEFAULT:
    2945             :                 /*
    2946             :                  * Insist on an empty nodelist
    2947             :                  */
    2948           0 :                 if (!nodelist)
    2949           0 :                         err = 0;
    2950           0 :                 goto out;
    2951           0 :         case MPOL_BIND:
    2952             :                 /*
    2953             :                  * Insist on a nodelist
    2954             :                  */
    2955           0 :                 if (!nodelist)
    2956           0 :                         goto out;
    2957             :         }
    2958             : 
    2959           0 :         mode_flags = 0;
    2960           0 :         if (flags) {
    2961             :                 /*
    2962             :                  * Currently, we only support two mutually exclusive
    2963             :                  * mode flags.
    2964             :                  */
    2965           0 :                 if (!strcmp(flags, "static"))
    2966             :                         mode_flags |= MPOL_F_STATIC_NODES;
    2967           0 :                 else if (!strcmp(flags, "relative"))
    2968             :                         mode_flags |= MPOL_F_RELATIVE_NODES;
    2969             :                 else
    2970           0 :                         goto out;
    2971             :         }
    2972             : 
    2973           0 :         new = mpol_new(mode, mode_flags, &nodes);
    2974           0 :         if (IS_ERR(new))
    2975           0 :                 goto out;
    2976             : 
    2977             :         /*
    2978             :          * Save nodes for mpol_to_str() to show the tmpfs mount options
    2979             :          * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
    2980             :          */
    2981           0 :         if (mode != MPOL_PREFERRED)
    2982           0 :                 new->v.nodes = nodes;
    2983           0 :         else if (nodelist)
    2984           0 :                 new->v.preferred_node = first_node(nodes);
    2985             :         else
    2986           0 :                 new->flags |= MPOL_F_LOCAL;
    2987             : 
    2988             :         /*
    2989             :          * Save nodes for contextualization: this will be used to "clone"
    2990             :          * the mempolicy in a specific context [cpuset] at a later time.
    2991             :          */
    2992           0 :         new->w.user_nodemask = nodes;
    2993             : 
    2994           0 :         err = 0;
    2995             : 
    2996           0 : out:
    2997             :         /* Restore string for error message */
    2998           0 :         if (nodelist)
    2999           0 :                 *--nodelist = ':';
    3000           0 :         if (flags)
    3001           0 :                 *--flags = '=';
    3002           0 :         if (!err)
    3003           0 :                 *mpol = new;
    3004           0 :         return err;
    3005             : }
    3006             : #endif /* CONFIG_TMPFS */
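The same <mode>[=<flags>][:<nodelist>] strings are what tmpfs accepts for its mpol= mount option (e.g. "mount -t tmpfs -o mpol=interleave=static:0-3 none /mnt", given here only as an illustration). A minimal userspace split of the three fields, not the kernel parser:

#include <stdio.h>
#include <string.h>

int main(void)
{
        char str[] = "interleave=static:0-3";
        char *nodelist = strchr(str, ':');
        char *flags = strchr(str, '=');

        if (nodelist)
                *nodelist++ = '\0';     /* terminate the mode/flags part */
        if (flags)
                *flags++ = '\0';        /* terminate the mode part */

        printf("mode=%s flags=%s nodelist=%s\n",
               str, flags ? flags : "", nodelist ? nodelist : "");
        /* -> mode=interleave flags=static nodelist=0-3 */
        return 0;
}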
    3007             : 
    3008             : /**
    3009             :  * mpol_to_str - format a mempolicy structure for printing
    3010             :  * @buffer:  to contain formatted mempolicy string
    3011             :  * @maxlen:  length of @buffer
    3012             :  * @pol:  pointer to mempolicy to be formatted
    3013             :  *
    3014             :  * Convert @pol into a string.  If @buffer is too short, truncate the string.
    3015             :  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
    3016             :  * longest flag, "relative", and to display at least a few node ids.
    3017             :  */
    3018           0 : void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
    3019             : {
    3020           0 :         char *p = buffer;
    3021           0 :         nodemask_t nodes = NODE_MASK_NONE;
    3022           0 :         unsigned short mode = MPOL_DEFAULT;
    3023           0 :         unsigned short flags = 0;
    3024             : 
    3025           0 :         if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
    3026           0 :                 mode = pol->mode;
    3027           0 :                 flags = pol->flags;
    3028             :         }
    3029             : 
    3030           0 :         switch (mode) {
    3031             :         case MPOL_DEFAULT:
    3032             :                 break;
    3033           0 :         case MPOL_PREFERRED:
    3034           0 :                 if (flags & MPOL_F_LOCAL)
    3035             :                         mode = MPOL_LOCAL;
    3036             :                 else
    3037           0 :                         node_set(pol->v.preferred_node, nodes);
    3038             :                 break;
    3039           0 :         case MPOL_BIND:
    3040             :         case MPOL_INTERLEAVE:
    3041           0 :                 nodes = pol->v.nodes;
    3042           0 :                 break;
    3043             :         default:
    3044           0 :                 WARN_ON_ONCE(1);
    3045           0 :                 snprintf(p, maxlen, "unknown");
    3046           0 :                 return;
    3047             :         }
    3048             : 
    3049           0 :         p += snprintf(p, maxlen, "%s", policy_modes[mode]);
    3050             : 
    3051           0 :         if (flags & MPOL_MODE_FLAGS) {
    3052           0 :                 p += snprintf(p, buffer + maxlen - p, "=");
    3053             : 
    3054             :                 /*
    3055             :                  * Currently, the only defined flags are mutually exclusive
    3056             :                  */
    3057           0 :                 if (flags & MPOL_F_STATIC_NODES)
    3058           0 :                         p += snprintf(p, buffer + maxlen - p, "static");
    3059           0 :                 else if (flags & MPOL_F_RELATIVE_NODES)
    3060           0 :                         p += snprintf(p, buffer + maxlen - p, "relative");
    3061             :         }
    3062             : 
    3063           0 :         if (!nodes_empty(nodes))
    3064           0 :                 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
    3065             :                                nodemask_pr_args(&nodes));
    3066             : }
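Resulting strings look like "interleave:0-3", "prefer=static:1", "bind:0,2-3" or plain "local". A small userspace sketch producing the same <mode>[=<flags>][:<nodelist>] shape over an ordinary bitmask (nodelist range compression is omitted; the names are illustrative, not kernel API):

#include <stdio.h>

/* Emit "<mode>[=<flags>][:<nodelist>]" for a 64-bit node bitmask.  The buffer
 * is assumed large enough for this sketch. */
static void policy_to_str(char *buf, size_t maxlen, const char *mode,
                          const char *flags, unsigned long nodes)
{
        size_t off = snprintf(buf, maxlen, "%s", mode);

        if (flags && *flags)
                off += snprintf(buf + off, maxlen - off, "=%s", flags);
        for (int nid = 0, first = 1; nid < 64 && off < maxlen; nid++) {
                if (!(nodes & (1UL << nid)))
                        continue;
                off += snprintf(buf + off, maxlen - off, "%c%d",
                                first ? ':' : ',', nid);
                first = 0;
        }
}

int main(void)
{
        char buf[64];

        policy_to_str(buf, sizeof(buf), "interleave", "static", 0xfUL);
        printf("%s\n", buf);            /* interleave=static:0,1,2,3 */
        policy_to_str(buf, sizeof(buf), "local", NULL, 0);
        printf("%s\n", buf);            /* local */
        return 0;
}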

Generated by: LCOV version 1.14