LCOV - code coverage report
Current view: top level - kernel/sched - topology.c (source / functions)
Test: landlock.info
Date: 2021-04-22 12:43:58
                    Hit    Total    Coverage
Lines:              444      738      60.2 %
Functions:           30       51      58.8 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  * Scheduler topology setup/handling methods
       4             :  */
       5             : #include "sched.h"
       6             : 
       7             : DEFINE_MUTEX(sched_domains_mutex);
       8             : 
       9             : /* Protected by sched_domains_mutex: */
      10             : static cpumask_var_t sched_domains_tmpmask;
      11             : static cpumask_var_t sched_domains_tmpmask2;
      12             : 
      13             : #ifdef CONFIG_SCHED_DEBUG
      14             : 
      15             : static int __init sched_debug_setup(char *str)
      16             : {
      17             :         sched_debug_enabled = true;
      18             : 
      19             :         return 0;
      20             : }
      21             : early_param("sched_debug", sched_debug_setup);
      22             : 
      23             : static inline bool sched_debug(void)
      24             : {
      25             :         return sched_debug_enabled;
      26             : }
      27             : 
      28             : #define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
      29             : const struct sd_flag_debug sd_flag_debug[] = {
      30             : #include <linux/sched/sd_flags.h>
      31             : };
      32             : #undef SD_FLAG
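                     : 
                     : /*
                     :  * Illustrative expansion (shown here as an example only; the real entries
                     :  * come from <linux/sched/sd_flags.h>): for an entry such as
                     :  * SD_FLAG(SD_BALANCE_NEWIDLE, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS),
                     :  * the construct above expands to roughly:
                     :  *
                     :  *   const struct sd_flag_debug sd_flag_debug[] = {
                     :  *           [__SD_BALANCE_NEWIDLE] = {
                     :  *                   .meta_flags = SDF_SHARED_CHILD | SDF_NEEDS_GROUPS,
                     :  *                   .name       = "SD_BALANCE_NEWIDLE",
                     :  *           },
                     :  *           ...
                     :  *   };
                     :  *
                     :  * i.e. a name/meta-flag table indexed by the __SD_* flag bit number.
                     :  */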
      33             : 
      34             : static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
      35             :                                   struct cpumask *groupmask)
      36             : {
      37             :         struct sched_group *group = sd->groups;
      38             :         unsigned long flags = sd->flags;
      39             :         unsigned int idx;
      40             : 
      41             :         cpumask_clear(groupmask);
      42             : 
      43             :         printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
      44             :         printk(KERN_CONT "span=%*pbl level=%s\n",
      45             :                cpumask_pr_args(sched_domain_span(sd)), sd->name);
      46             : 
      47             :         if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
      48             :                 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
      49             :         }
      50             :         if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) {
      51             :                 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
      52             :         }
      53             : 
      54             :         for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
      55             :                 unsigned int flag = BIT(idx);
      56             :                 unsigned int meta_flags = sd_flag_debug[idx].meta_flags;
      57             : 
      58             :                 if ((meta_flags & SDF_SHARED_CHILD) && sd->child &&
      59             :                     !(sd->child->flags & flag))
      60             :                         printk(KERN_ERR "ERROR: flag %s set here but not in child\n",
      61             :                                sd_flag_debug[idx].name);
      62             : 
      63             :                 if ((meta_flags & SDF_SHARED_PARENT) && sd->parent &&
      64             :                     !(sd->parent->flags & flag))
      65             :                         printk(KERN_ERR "ERROR: flag %s set here but not in parent\n",
      66             :                                sd_flag_debug[idx].name);
      67             :         }
      68             : 
      69             :         printk(KERN_DEBUG "%*s groups:", level + 1, "");
      70             :         do {
      71             :                 if (!group) {
      72             :                         printk("\n");
      73             :                         printk(KERN_ERR "ERROR: group is NULL\n");
      74             :                         break;
      75             :                 }
      76             : 
      77             :                 if (!cpumask_weight(sched_group_span(group))) {
      78             :                         printk(KERN_CONT "\n");
      79             :                         printk(KERN_ERR "ERROR: empty group\n");
      80             :                         break;
      81             :                 }
      82             : 
      83             :                 if (!(sd->flags & SD_OVERLAP) &&
      84             :                     cpumask_intersects(groupmask, sched_group_span(group))) {
      85             :                         printk(KERN_CONT "\n");
      86             :                         printk(KERN_ERR "ERROR: repeated CPUs\n");
      87             :                         break;
      88             :                 }
      89             : 
      90             :                 cpumask_or(groupmask, groupmask, sched_group_span(group));
      91             : 
      92             :                 printk(KERN_CONT " %d:{ span=%*pbl",
      93             :                                 group->sgc->id,
      94             :                                 cpumask_pr_args(sched_group_span(group)));
      95             : 
      96             :                 if ((sd->flags & SD_OVERLAP) &&
      97             :                     !cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
      98             :                         printk(KERN_CONT " mask=%*pbl",
      99             :                                 cpumask_pr_args(group_balance_mask(group)));
     100             :                 }
     101             : 
     102             :                 if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
     103             :                         printk(KERN_CONT " cap=%lu", group->sgc->capacity);
     104             : 
     105             :                 if (group == sd->groups && sd->child &&
     106             :                     !cpumask_equal(sched_domain_span(sd->child),
     107             :                                    sched_group_span(group))) {
     108             :                         printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
     109             :                 }
     110             : 
     111             :                 printk(KERN_CONT " }");
     112             : 
     113             :                 group = group->next;
     114             : 
     115             :                 if (group != sd->groups)
     116             :                         printk(KERN_CONT ",");
     117             : 
     118             :         } while (group != sd->groups);
     119             :         printk(KERN_CONT "\n");
     120             : 
     121             :         if (!cpumask_equal(sched_domain_span(sd), groupmask))
     122             :                 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
     123             : 
     124             :         if (sd->parent &&
     125             :             !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
     126             :                 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
     127             :         return 0;
     128             : }
     129             : 
     130             : static void sched_domain_debug(struct sched_domain *sd, int cpu)
     131             : {
     132             :         int level = 0;
     133             : 
     134             :         if (!sched_debug_enabled)
     135             :                 return;
     136             : 
     137             :         if (!sd) {
     138             :                 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
     139             :                 return;
     140             :         }
     141             : 
     142             :         printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);
     143             : 
     144             :         for (;;) {
     145             :                 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
     146             :                         break;
     147             :                 level++;
     148             :                 sd = sd->parent;
     149             :                 if (!sd)
     150             :                         break;
     151             :         }
     152             : }
     153             : #else /* !CONFIG_SCHED_DEBUG */
     154             : 
     155             : # define sched_debug_enabled 0
     156             : # define sched_domain_debug(sd, cpu) do { } while (0)
     157           1 : static inline bool sched_debug(void)
     158             : {
     159           1 :         return false;
     160             : }
     161             : #endif /* CONFIG_SCHED_DEBUG */
     162             : 
     163             : /* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
     164             : #define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
     165             : static const unsigned int SD_DEGENERATE_GROUPS_MASK =
     166             : #include <linux/sched/sd_flags.h>
     167             : 0;
     168             : #undef SD_FLAG
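                     : 
                     : /*
                     :  * Illustrative expansion (example flags only; the authoritative set is in
                     :  * <linux/sched/sd_flags.h>): each SD_FLAG(name, mflags) line contributes
                     :  * "(name * 1) |" when SDF_NEEDS_GROUPS is set in mflags and "(name * 0) |"
                     :  * otherwise, so the include above reduces to something like:
                     :  *
                     :  *   static const unsigned int SD_DEGENERATE_GROUPS_MASK =
                     :  *           (SD_BALANCE_NEWIDLE * 1) | (SD_WAKE_AFFINE * 0) | ... | 0;
                     :  *
                     :  * i.e. the OR of exactly those SD_* bits whose behaviour needs two or
                     :  * more groups, which is what sd_degenerate() below tests against.
                     :  */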
     169             : 
     170          12 : static int sd_degenerate(struct sched_domain *sd)
     171             : {
     172          12 :         if (cpumask_weight(sched_domain_span(sd)) == 1)
     173             :                 return 1;
     174             : 
     175             :         /* Following flags need at least 2 groups */
     176           4 :         if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) &&
     177           4 :             (sd->groups != sd->groups->next))
     178             :                 return 0;
     179             : 
     180             :         /* Following flags don't use groups */
     181           0 :         if (sd->flags & (SD_WAKE_AFFINE))
     182           0 :                 return 0;
     183             : 
     184             :         return 1;
     185             : }
     186             : 
     187             : static int
     188           8 : sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
     189             : {
     190           8 :         unsigned long cflags = sd->flags, pflags = parent->flags;
     191             : 
     192           8 :         if (sd_degenerate(parent))
     193             :                 return 1;
     194             : 
     195           4 :         if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
     196             :                 return 0;
     197             : 
     198             :         /* Flags needing groups don't count if only 1 group in parent */
     199           0 :         if (parent->groups == parent->groups->next)
     200           0 :                 pflags &= ~SD_DEGENERATE_GROUPS_MASK;
     201             : 
     202           0 :         if (~cflags & pflags)
     203           0 :                 return 0;
     204             : 
     205             :         return 1;
     206             : }
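                     : 
                     : /*
                     :  * Worked example for the "~cflags & pflags" test above (flag values are
                     :  * illustrative, not taken from a real topology): the parent survives
                     :  * whenever it carries any flag the child lacks. With
                     :  *
                     :  *   cflags = SD_WAKE_AFFINE | SD_BALANCE_NEWIDLE
                     :  *   pflags = SD_WAKE_AFFINE | SD_SERIALIZE
                     :  *
                     :  * ~cflags & pflags == SD_SERIALIZE != 0, so the parent is not considered
                     :  * degenerate; were pflags a subset of cflags, the result would be 0 and
                     :  * the parent could be collapsed into the child.
                     :  */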
     207             : 
     208             : #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
     209             : DEFINE_STATIC_KEY_FALSE(sched_energy_present);
     210             : unsigned int sysctl_sched_energy_aware = 1;
     211             : DEFINE_MUTEX(sched_energy_mutex);
     212             : bool sched_energy_update;
     213             : 
     214             : void rebuild_sched_domains_energy(void)
     215             : {
     216             :         mutex_lock(&sched_energy_mutex);
     217             :         sched_energy_update = true;
     218             :         rebuild_sched_domains();
     219             :         sched_energy_update = false;
     220             :         mutex_unlock(&sched_energy_mutex);
     221             : }
     222             : 
     223             : #ifdef CONFIG_PROC_SYSCTL
     224             : int sched_energy_aware_handler(struct ctl_table *table, int write,
     225             :                 void *buffer, size_t *lenp, loff_t *ppos)
     226             : {
     227             :         int ret, state;
     228             : 
     229             :         if (write && !capable(CAP_SYS_ADMIN))
     230             :                 return -EPERM;
     231             : 
     232             :         ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
     233             :         if (!ret && write) {
     234             :                 state = static_branch_unlikely(&sched_energy_present);
     235             :                 if (state != sysctl_sched_energy_aware)
     236             :                         rebuild_sched_domains_energy();
     237             :         }
     238             : 
     239             :         return ret;
     240             : }
     241             : #endif
     242             : 
     243             : static void free_pd(struct perf_domain *pd)
     244             : {
     245             :         struct perf_domain *tmp;
     246             : 
     247             :         while (pd) {
     248             :                 tmp = pd->next;
     249             :                 kfree(pd);
     250             :                 pd = tmp;
     251             :         }
     252             : }
     253             : 
     254             : static struct perf_domain *find_pd(struct perf_domain *pd, int cpu)
     255             : {
     256             :         while (pd) {
     257             :                 if (cpumask_test_cpu(cpu, perf_domain_span(pd)))
     258             :                         return pd;
     259             :                 pd = pd->next;
     260             :         }
     261             : 
     262             :         return NULL;
     263             : }
     264             : 
     265             : static struct perf_domain *pd_init(int cpu)
     266             : {
     267             :         struct em_perf_domain *obj = em_cpu_get(cpu);
     268             :         struct perf_domain *pd;
     269             : 
     270             :         if (!obj) {
     271             :                 if (sched_debug())
     272             :                         pr_info("%s: no EM found for CPU%d\n", __func__, cpu);
     273             :                 return NULL;
     274             :         }
     275             : 
     276             :         pd = kzalloc(sizeof(*pd), GFP_KERNEL);
     277             :         if (!pd)
     278             :                 return NULL;
     279             :         pd->em_pd = obj;
     280             : 
     281             :         return pd;
     282             : }
     283             : 
     284             : static void perf_domain_debug(const struct cpumask *cpu_map,
     285             :                                                 struct perf_domain *pd)
     286             : {
     287             :         if (!sched_debug() || !pd)
     288             :                 return;
     289             : 
     290             :         printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));
     291             : 
     292             :         while (pd) {
     293             :                 printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }",
     294             :                                 cpumask_first(perf_domain_span(pd)),
     295             :                                 cpumask_pr_args(perf_domain_span(pd)),
     296             :                                 em_pd_nr_perf_states(pd->em_pd));
     297             :                 pd = pd->next;
     298             :         }
     299             : 
     300             :         printk(KERN_CONT "\n");
     301             : }
     302             : 
     303             : static void destroy_perf_domain_rcu(struct rcu_head *rp)
     304             : {
     305             :         struct perf_domain *pd;
     306             : 
     307             :         pd = container_of(rp, struct perf_domain, rcu);
     308             :         free_pd(pd);
     309             : }
     310             : 
     311             : static void sched_energy_set(bool has_eas)
     312             : {
     313             :         if (!has_eas && static_branch_unlikely(&sched_energy_present)) {
     314             :                 if (sched_debug())
     315             :                         pr_info("%s: stopping EAS\n", __func__);
     316             :                 static_branch_disable_cpuslocked(&sched_energy_present);
     317             :         } else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
     318             :                 if (sched_debug())
     319             :                         pr_info("%s: starting EAS\n", __func__);
     320             :                 static_branch_enable_cpuslocked(&sched_energy_present);
     321             :         }
     322             : }
     323             : 
     324             : /*
     325             :  * EAS can be used on a root domain if it meets all the following conditions:
      326             :  *    1. an Energy Model (EM) is available;
      327             :  *    2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy;
      328             :  *    3. no SMT is detected;
      329             :  *    4. the EM complexity is low enough to keep scheduling overheads low;
      330             :  *    5. schedutil is driving the frequency of all CPUs of the rd;
      331             :  *    6. frequency invariance support is present.
     332             :  *
     333             :  * The complexity of the Energy Model is defined as:
     334             :  *
     335             :  *              C = nr_pd * (nr_cpus + nr_ps)
     336             :  *
     337             :  * with parameters defined as:
     338             :  *  - nr_pd:    the number of performance domains
     339             :  *  - nr_cpus:  the number of CPUs
     340             :  *  - nr_ps:    the sum of the number of performance states of all performance
     341             :  *              domains (for example, on a system with 2 performance domains,
     342             :  *              with 10 performance states each, nr_ps = 2 * 10 = 20).
     343             :  *
     344             :  * It is generally not a good idea to use such a model in the wake-up path on
     345             :  * very complex platforms because of the associated scheduling overheads. The
     346             :  * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
     347             :  * with per-CPU DVFS and less than 8 performance states each, for example.
     348             :  */
     349             : #define EM_MAX_COMPLEXITY 2048
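                     : 
                     : /*
                     :  * Worked example for the bound above (numbers are illustrative): 16 CPUs
                     :  * with per-CPU DVFS gives nr_pd = 16 and nr_cpus = 16; at 7 performance
                     :  * states per domain, nr_ps = 16 * 7 = 112, so
                     :  *
                     :  *   C = nr_pd * (nr_cpus + nr_ps) = 16 * (16 + 112) = 2048
                     :  *
                     :  * which still passes the "> EM_MAX_COMPLEXITY" check in
                     :  * build_perf_domains() below; an 8th state per domain would give
                     :  * 16 * (16 + 128) = 2304 and EAS would stay disabled.
                     :  */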
     350             : 
     351             : extern struct cpufreq_governor schedutil_gov;
     352             : static bool build_perf_domains(const struct cpumask *cpu_map)
     353             : {
     354             :         int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map);
     355             :         struct perf_domain *pd = NULL, *tmp;
     356             :         int cpu = cpumask_first(cpu_map);
     357             :         struct root_domain *rd = cpu_rq(cpu)->rd;
     358             :         struct cpufreq_policy *policy;
     359             :         struct cpufreq_governor *gov;
     360             : 
     361             :         if (!sysctl_sched_energy_aware)
     362             :                 goto free;
     363             : 
     364             :         /* EAS is enabled for asymmetric CPU capacity topologies. */
     365             :         if (!per_cpu(sd_asym_cpucapacity, cpu)) {
     366             :                 if (sched_debug()) {
     367             :                         pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
     368             :                                         cpumask_pr_args(cpu_map));
     369             :                 }
     370             :                 goto free;
     371             :         }
     372             : 
     373             :         /* EAS definitely does *not* handle SMT */
     374             :         if (sched_smt_active()) {
     375             :                 pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n",
     376             :                         cpumask_pr_args(cpu_map));
     377             :                 goto free;
     378             :         }
     379             : 
     380             :         if (!arch_scale_freq_invariant()) {
     381             :                 if (sched_debug()) {
     382             :                         pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported",
     383             :                                 cpumask_pr_args(cpu_map));
     384             :                 }
     385             :                 goto free;
     386             :         }
     387             : 
     388             :         for_each_cpu(i, cpu_map) {
     389             :                 /* Skip already covered CPUs. */
     390             :                 if (find_pd(pd, i))
     391             :                         continue;
     392             : 
     393             :                 /* Do not attempt EAS if schedutil is not being used. */
     394             :                 policy = cpufreq_cpu_get(i);
     395             :                 if (!policy)
     396             :                         goto free;
     397             :                 gov = policy->governor;
     398             :                 cpufreq_cpu_put(policy);
     399             :                 if (gov != &schedutil_gov) {
     400             :                         if (rd->pd)
     401             :                                 pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n",
     402             :                                                 cpumask_pr_args(cpu_map));
     403             :                         goto free;
     404             :                 }
     405             : 
     406             :                 /* Create the new pd and add it to the local list. */
     407             :                 tmp = pd_init(i);
     408             :                 if (!tmp)
     409             :                         goto free;
     410             :                 tmp->next = pd;
     411             :                 pd = tmp;
     412             : 
     413             :                 /*
     414             :                  * Count performance domains and performance states for the
     415             :                  * complexity check.
     416             :                  */
     417             :                 nr_pd++;
     418             :                 nr_ps += em_pd_nr_perf_states(pd->em_pd);
     419             :         }
     420             : 
     421             :         /* Bail out if the Energy Model complexity is too high. */
     422             :         if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) {
     423             :                 WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
     424             :                                                 cpumask_pr_args(cpu_map));
     425             :                 goto free;
     426             :         }
     427             : 
     428             :         perf_domain_debug(cpu_map, pd);
     429             : 
     430             :         /* Attach the new list of performance domains to the root domain. */
     431             :         tmp = rd->pd;
     432             :         rcu_assign_pointer(rd->pd, pd);
     433             :         if (tmp)
     434             :                 call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
     435             : 
     436             :         return !!pd;
     437             : 
     438             : free:
     439             :         free_pd(pd);
     440             :         tmp = rd->pd;
     441             :         rcu_assign_pointer(rd->pd, NULL);
     442             :         if (tmp)
     443             :                 call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
     444             : 
     445             :         return false;
     446             : }
     447             : #else
     448           0 : static void free_pd(struct perf_domain *pd) { }
      449             : #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
     450             : 
     451           0 : static void free_rootdomain(struct rcu_head *rcu)
     452             : {
     453           0 :         struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
     454             : 
     455           0 :         cpupri_cleanup(&rd->cpupri);
     456           0 :         cpudl_cleanup(&rd->cpudl);
     457           0 :         free_cpumask_var(rd->dlo_mask);
     458           0 :         free_cpumask_var(rd->rto_mask);
     459           0 :         free_cpumask_var(rd->online);
     460           0 :         free_cpumask_var(rd->span);
     461           0 :         free_pd(rd->pd);
     462           0 :         kfree(rd);
     463           0 : }
     464             : 
     465           8 : void rq_attach_root(struct rq *rq, struct root_domain *rd)
     466             : {
     467           8 :         struct root_domain *old_rd = NULL;
     468           8 :         unsigned long flags;
     469             : 
     470           8 :         raw_spin_lock_irqsave(&rq->lock, flags);
     471             : 
     472           8 :         if (rq->rd) {
     473           4 :                 old_rd = rq->rd;
     474             : 
     475           4 :                 if (cpumask_test_cpu(rq->cpu, old_rd->online))
     476           4 :                         set_rq_offline(rq);
     477             : 
     478           4 :                 cpumask_clear_cpu(rq->cpu, old_rd->span);
     479             : 
     480             :                 /*
      481             :                  * If we don't want to free the old_rd yet then
     482             :                  * set old_rd to NULL to skip the freeing later
     483             :                  * in this function:
     484             :                  */
     485           8 :                 if (!atomic_dec_and_test(&old_rd->refcount))
     486           4 :                         old_rd = NULL;
     487             :         }
     488             : 
     489           8 :         atomic_inc(&rd->refcount);
     490           8 :         rq->rd = rd;
     491             : 
     492           8 :         cpumask_set_cpu(rq->cpu, rd->span);
     493           8 :         if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
     494           5 :                 set_rq_online(rq);
     495             : 
     496           8 :         raw_spin_unlock_irqrestore(&rq->lock, flags);
     497             : 
     498           8 :         if (old_rd)
     499           0 :                 call_rcu(&old_rd->rcu, free_rootdomain);
     500           8 : }
     501             : 
     502           0 : void sched_get_rd(struct root_domain *rd)
     503             : {
     504           0 :         atomic_inc(&rd->refcount);
     505           0 : }
     506             : 
     507           0 : void sched_put_rd(struct root_domain *rd)
     508             : {
     509           0 :         if (!atomic_dec_and_test(&rd->refcount))
     510             :                 return;
     511             : 
     512           0 :         call_rcu(&rd->rcu, free_rootdomain);
     513             : }
     514             : 
     515           2 : static int init_rootdomain(struct root_domain *rd)
     516             : {
     517           2 :         if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
     518             :                 goto out;
     519           2 :         if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
     520             :                 goto free_span;
     521           2 :         if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
     522             :                 goto free_online;
     523           2 :         if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
     524             :                 goto free_dlo_mask;
     525             : 
     526             : #ifdef HAVE_RT_PUSH_IPI
     527           2 :         rd->rto_cpu = -1;
     528           2 :         raw_spin_lock_init(&rd->rto_lock);
     529           2 :         init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
     530             : #endif
     531             : 
     532           2 :         rd->visit_gen = 0;
     533           2 :         init_dl_bw(&rd->dl_bw);
     534           2 :         if (cpudl_init(&rd->cpudl) != 0)
     535           0 :                 goto free_rto_mask;
     536             : 
     537           2 :         if (cpupri_init(&rd->cpupri) != 0)
     538           0 :                 goto free_cpudl;
     539             :         return 0;
     540             : 
     541           0 : free_cpudl:
     542           0 :         cpudl_cleanup(&rd->cpudl);
     543           0 : free_rto_mask:
     544           0 :         free_cpumask_var(rd->rto_mask);
     545           0 : free_dlo_mask:
     546           0 :         free_cpumask_var(rd->dlo_mask);
     547           0 : free_online:
     548           0 :         free_cpumask_var(rd->online);
     549           0 : free_span:
     550           0 :         free_cpumask_var(rd->span);
     551           0 : out:
     552           0 :         return -ENOMEM;
     553             : }
     554             : 
     555             : /*
     556             :  * By default the system creates a single root-domain with all CPUs as
     557             :  * members (mimicking the global state we have today).
     558             :  */
     559             : struct root_domain def_root_domain;
     560             : 
     561           1 : void init_defrootdomain(void)
     562             : {
     563           1 :         init_rootdomain(&def_root_domain);
     564             : 
     565           1 :         atomic_set(&def_root_domain.refcount, 1);
     566           1 : }
     567             : 
     568           1 : static struct root_domain *alloc_rootdomain(void)
     569             : {
     570           1 :         struct root_domain *rd;
     571             : 
     572           1 :         rd = kzalloc(sizeof(*rd), GFP_KERNEL);
     573           1 :         if (!rd)
     574             :                 return NULL;
     575             : 
     576           1 :         if (init_rootdomain(rd) != 0) {
     577           0 :                 kfree(rd);
     578           0 :                 return NULL;
     579             :         }
     580             : 
     581             :         return rd;
     582             : }
     583             : 
     584           8 : static void free_sched_groups(struct sched_group *sg, int free_sgc)
     585             : {
     586           8 :         struct sched_group *tmp, *first;
     587             : 
     588           8 :         if (!sg)
     589             :                 return;
     590             : 
     591           8 :         first = sg;
     592           8 :         do {
     593           8 :                 tmp = sg->next;
     594             : 
     595          16 :                 if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
     596           8 :                         kfree(sg->sgc);
     597             : 
     598          16 :                 if (atomic_dec_and_test(&sg->ref))
     599           8 :                         kfree(sg);
     600           8 :                 sg = tmp;
     601           8 :         } while (sg != first);
     602             : }
     603             : 
     604           8 : static void destroy_sched_domain(struct sched_domain *sd)
     605             : {
     606             :         /*
      607             :          * A normal sched domain may have multiple group references; an
      608             :          * overlapping domain, having private groups, has only one. Iterate,
      609             :          * dropping group/capacity references and freeing where none remain.
     610             :          */
     611           8 :         free_sched_groups(sd->groups, 1);
     612             : 
     613          16 :         if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
     614           8 :                 kfree(sd->shared);
     615           8 :         kfree(sd);
     616           8 : }
     617             : 
     618           0 : static void destroy_sched_domains_rcu(struct rcu_head *rcu)
     619             : {
     620           0 :         struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
     621             : 
     622           0 :         while (sd) {
     623           0 :                 struct sched_domain *parent = sd->parent;
     624           0 :                 destroy_sched_domain(sd);
     625           0 :                 sd = parent;
     626             :         }
     627           0 : }
     628             : 
     629           4 : static void destroy_sched_domains(struct sched_domain *sd)
     630             : {
     631           4 :         if (sd)
     632           0 :                 call_rcu(&sd->rcu, destroy_sched_domains_rcu);
     633             : }
     634             : 
     635             : /*
     636             :  * Keep a special pointer to the highest sched_domain that has
      637             :  * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this
      638             :  * allows us to avoid some pointer chasing in select_idle_sibling().
     639             :  *
     640             :  * Also keep a unique ID per domain (we use the first CPU number in
     641             :  * the cpumask of the domain), this allows us to quickly tell if
     642             :  * two CPUs are in the same cache domain, see cpus_share_cache().
     643             :  */
     644             : DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
     645             : DEFINE_PER_CPU(int, sd_llc_size);
     646             : DEFINE_PER_CPU(int, sd_llc_id);
     647             : DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
     648             : DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
     649             : DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
     650             : DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
     651             : DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
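                     : 
                     : /*
                     :  * For illustration only -- roughly how the per-CPU sd_llc_id written by
                     :  * update_top_cache_domain() below is consumed (paraphrasing
                     :  * cpus_share_cache() in kernel/sched/core.c; see that file for the
                     :  * authoritative version):
                     :  *
                     :  *   bool cpus_share_cache(int this_cpu, int that_cpu)
                     :  *   {
                     :  *           return per_cpu(sd_llc_id, this_cpu) ==
                     :  *                  per_cpu(sd_llc_id, that_cpu);
                     :  *   }
                     :  */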
     652             : 
     653           4 : static void update_top_cache_domain(int cpu)
     654             : {
     655           4 :         struct sched_domain_shared *sds = NULL;
     656           4 :         struct sched_domain *sd;
     657           4 :         int id = cpu;
     658           4 :         int size = 1;
     659             : 
     660           4 :         sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
     661           4 :         if (sd) {
     662           0 :                 id = cpumask_first(sched_domain_span(sd));
     663           0 :                 size = cpumask_weight(sched_domain_span(sd));
     664           0 :                 sds = sd->shared;
     665             :         }
     666             : 
     667           4 :         rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
     668           4 :         per_cpu(sd_llc_size, cpu) = size;
     669           4 :         per_cpu(sd_llc_id, cpu) = id;
     670           4 :         rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
     671             : 
     672           4 :         sd = lowest_flag_domain(cpu, SD_NUMA);
     673           4 :         rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
     674             : 
     675           4 :         sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
     676           4 :         rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
     677             : 
     678           4 :         sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY);
     679           4 :         rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
     680           4 : }
     681             : 
     682             : /*
     683             :  * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
     684             :  * hold the hotplug lock.
     685             :  */
     686             : static void
     687           4 : cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
     688             : {
     689           4 :         struct rq *rq = cpu_rq(cpu);
     690           4 :         struct sched_domain *tmp;
     691           4 :         int numa_distance = 0;
     692             : 
     693             :         /* Remove the sched domains which do not contribute to scheduling. */
     694          16 :         for (tmp = sd; tmp; ) {
     695          12 :                 struct sched_domain *parent = tmp->parent;
     696          12 :                 if (!parent)
     697             :                         break;
     698             : 
     699           8 :                 if (sd_parent_degenerate(tmp, parent)) {
     700           4 :                         tmp->parent = parent->parent;
     701           4 :                         if (parent->parent)
     702           4 :                                 parent->parent->child = tmp;
     703             :                         /*
     704             :                          * Transfer SD_PREFER_SIBLING down in case of a
     705             :                          * degenerate parent; the spans match for this
     706             :                          * so the property transfers.
     707             :                          */
     708           4 :                         if (parent->flags & SD_PREFER_SIBLING)
     709           4 :                                 tmp->flags |= SD_PREFER_SIBLING;
     710           4 :                         destroy_sched_domain(parent);
     711             :                 } else
     712             :                         tmp = tmp->parent;
     713             :         }
     714             : 
     715           4 :         if (sd && sd_degenerate(sd)) {
     716           4 :                 tmp = sd;
     717           4 :                 sd = sd->parent;
     718           4 :                 destroy_sched_domain(tmp);
     719           4 :                 if (sd)
     720           4 :                         sd->child = NULL;
     721             :         }
     722             : 
     723           8 :         for (tmp = sd; tmp; tmp = tmp->parent)
     724           4 :                 numa_distance += !!(tmp->flags & SD_NUMA);
     725             : 
     726             :         /*
     727             :          * FIXME: Diameter >=3 is misrepresented.
     728             :          *
     729             :          * Smallest diameter=3 topology is:
     730             :          *
     731             :          *   node   0   1   2   3
     732             :          *     0:  10  20  30  40
     733             :          *     1:  20  10  20  30
     734             :          *     2:  30  20  10  20
     735             :          *     3:  40  30  20  10
     736             :          *
     737             :          *   0 --- 1 --- 2 --- 3
     738             :          *
     739             :          * NUMA-3       0-3             N/A             N/A             0-3
     740             :          *  groups:     {0-2},{1-3}                                     {1-3},{0-2}
     741             :          *
     742             :          * NUMA-2       0-2             0-3             0-3             1-3
     743             :          *  groups:     {0-1},{1-3}     {0-2},{2-3}     {1-3},{0-1}     {2-3},{0-2}
     744             :          *
     745             :          * NUMA-1       0-1             0-2             1-3             2-3
     746             :          *  groups:     {0},{1}         {1},{2},{0}     {2},{3},{1}     {3},{2}
     747             :          *
     748             :          * NUMA-0       0               1               2               3
     749             :          *
     750             :          * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the
     751             :          * group span isn't a subset of the domain span.
     752             :          */
     753           4 :         WARN_ONCE(numa_distance > 2, "Shortest NUMA path spans too many nodes\n");
     754             : 
     755           4 :         sched_domain_debug(sd, cpu);
     756             : 
     757           4 :         rq_attach_root(rq, rd);
     758           4 :         tmp = rq->sd;
     759           4 :         rcu_assign_pointer(rq->sd, sd);
     760           4 :         dirty_sched_domain_sysctl(cpu);
     761           4 :         destroy_sched_domains(tmp);
     762             : 
     763           4 :         update_top_cache_domain(cpu);
     764           4 : }
     765             : 
     766             : struct s_data {
     767             :         struct sched_domain * __percpu *sd;
     768             :         struct root_domain      *rd;
     769             : };
     770             : 
     771             : enum s_alloc {
     772             :         sa_rootdomain,
     773             :         sa_sd,
     774             :         sa_sd_storage,
     775             :         sa_none,
     776             : };
     777             : 
     778             : /*
      779             :  * Return the canonical balance CPU for this group; this is the first CPU
     780             :  * of this group that's also in the balance mask.
     781             :  *
      782             :  * The balance mask contains all those CPUs that could actually end up at this
     783             :  * group. See build_balance_mask().
     784             :  *
     785             :  * Also see should_we_balance().
     786             :  */
     787        1721 : int group_balance_cpu(struct sched_group *sg)
     788             : {
     789        1709 :         return cpumask_first(group_balance_mask(sg));
     790             : }
     791             : 
     792             : 
     793             : /*
     794             :  * NUMA topology (first read the regular topology blurb below)
     795             :  *
     796             :  * Given a node-distance table, for example:
     797             :  *
     798             :  *   node   0   1   2   3
     799             :  *     0:  10  20  30  20
     800             :  *     1:  20  10  20  30
     801             :  *     2:  30  20  10  20
     802             :  *     3:  20  30  20  10
     803             :  *
     804             :  * which represents a 4 node ring topology like:
     805             :  *
     806             :  *   0 ----- 1
     807             :  *   |       |
     808             :  *   |       |
     809             :  *   |       |
     810             :  *   3 ----- 2
     811             :  *
     812             :  * We want to construct domains and groups to represent this. The way we go
     813             :  * about doing this is to build the domains on 'hops'. For each NUMA level we
     814             :  * construct the mask of all nodes reachable in @level hops.
     815             :  *
     816             :  * For the above NUMA topology that gives 3 levels:
     817             :  *
     818             :  * NUMA-2       0-3             0-3             0-3             0-3
     819             :  *  groups:     {0-1,3},{1-3}   {0-2},{0,2-3}   {1-3},{0-1,3}   {0,2-3},{0-2}
     820             :  *
     821             :  * NUMA-1       0-1,3           0-2             1-3             0,2-3
     822             :  *  groups:     {0},{1},{3}     {0},{1},{2}     {1},{2},{3}     {0},{2},{3}
     823             :  *
     824             :  * NUMA-0       0               1               2               3
     825             :  *
     826             :  *
      827             :  * As can be seen, things don't line up as nicely as with the regular topology.
      828             :  * When we iterate a domain in child domain chunks, some nodes can be
     829             :  * represented multiple times -- hence the "overlap" naming for this part of
     830             :  * the topology.
     831             :  *
     832             :  * In order to minimize this overlap, we only build enough groups to cover the
     833             :  * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3.
     834             :  *
     835             :  * Because:
     836             :  *
     837             :  *  - the first group of each domain is its child domain; this
     838             :  *    gets us the first 0-1,3
      839             :  *  - the only uncovered node is 2, whose child domain is 1-3.
     840             :  *
     841             :  * However, because of the overlap, computing a unique CPU for each group is
      842             :  * more complicated. Consider for instance the groups of NODE-1 NUMA-2: both
     843             :  * groups include the CPUs of Node-0, while those CPUs would not in fact ever
     844             :  * end up at those groups (they would end up in group: 0-1,3).
     845             :  *
     846             :  * To correct this we have to introduce the group balance mask. This mask
     847             :  * will contain those CPUs in the group that can reach this group given the
     848             :  * (child) domain tree.
     849             :  *
     850             :  * With this we can once again compute balance_cpu and sched_group_capacity
     851             :  * relations.
     852             :  *
     853             :  * XXX include words on how balance_cpu is unique and therefore can be
     854             :  * used for sched_group_capacity links.
     855             :  *
     856             :  *
     857             :  * Another 'interesting' topology is:
     858             :  *
     859             :  *   node   0   1   2   3
     860             :  *     0:  10  20  20  30
     861             :  *     1:  20  10  20  20
     862             :  *     2:  20  20  10  20
     863             :  *     3:  30  20  20  10
     864             :  *
     865             :  * Which looks a little like:
     866             :  *
     867             :  *   0 ----- 1
     868             :  *   |     / |
     869             :  *   |   /   |
     870             :  *   | /     |
     871             :  *   2 ----- 3
     872             :  *
     873             :  * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3
     874             :  * are not.
     875             :  *
      876             :  * This leads to a few particularly weird cases where the sched_domains are
     877             :  * not of the same number for each CPU. Consider:
     878             :  *
     879             :  * NUMA-2       0-3                                             0-3
     880             :  *  groups:     {0-2},{1-3}                                     {1-3},{0-2}
     881             :  *
     882             :  * NUMA-1       0-2             0-3             0-3             1-3
     883             :  *
     884             :  * NUMA-0       0               1               2               3
     885             :  *
     886             :  */
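                     : 
                     : /*
                     :  * Hypothetical sketch (not part of this file) of the per-hop node masks
                     :  * described above: for a given NUMA level, a node's mask contains every
                     :  * node whose distance from it does not exceed that level's distance.
                     :  *
                     :  *   static void example_numa_level_mask(int level_dist, int node,
                     :  *                                       struct cpumask *mask)
                     :  *   {
                     :  *           int j;
                     :  *
                     :  *           cpumask_clear(mask);
                     :  *           for_each_node(j) {
                     :  *                   if (node_distance(node, j) <= level_dist)
                     :  *                           cpumask_or(mask, mask, cpumask_of_node(j));
                     :  *           }
                     :  *   }
                     :  *
                     :  * sched_init_numa() builds sched_domains_numa_masks[][] along these lines.
                     :  */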
     887             : 
     888             : 
     889             : /*
     890             :  * Build the balance mask; it contains only those CPUs that can arrive at this
     891             :  * group and should be considered to continue balancing.
     892             :  *
      893             :  * We do this during the group creation pass; therefore the group information
      894             :  * isn't complete yet. However, since each group represents a (child) domain,
      895             :  * we can fully construct this using the sched_domain bits (which are already
      896             :  * complete).
     897             :  */
     898             : static void
     899           0 : build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
     900             : {
     901           0 :         const struct cpumask *sg_span = sched_group_span(sg);
     902           0 :         struct sd_data *sdd = sd->private;
     903           0 :         struct sched_domain *sibling;
     904           0 :         int i;
     905             : 
     906           0 :         cpumask_clear(mask);
     907             : 
     908           0 :         for_each_cpu(i, sg_span) {
     909           0 :                 sibling = *per_cpu_ptr(sdd->sd, i);
     910             : 
     911             :                 /*
     912             :                  * Can happen in the asymmetric case, where these siblings are
     913             :                  * unused. The mask will not be empty because those CPUs that
     914             :                  * do have the top domain _should_ span the domain.
     915             :                  */
     916           0 :                 if (!sibling->child)
     917           0 :                         continue;
     918             : 
     919             :                 /* If we would not end up here, we can't continue from here */
     920           0 :                 if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
     921           0 :                         continue;
     922             : 
     923           0 :                 cpumask_set_cpu(i, mask);
     924             :         }
     925             : 
     926             :         /* We must not have empty masks here */
     927           0 :         WARN_ON_ONCE(cpumask_empty(mask));
     928           0 : }
     929             : 
     930             : /*
     931             :  * XXX: This creates per-node group entries; since the load-balancer will
     932             :  * immediately access remote memory to construct this group's load-balance
      933             :  * statistics, having the groups node-local is of dubious benefit.
     934             :  */
     935             : static struct sched_group *
     936           0 : build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
     937             : {
     938           0 :         struct sched_group *sg;
     939           0 :         struct cpumask *sg_span;
     940             : 
     941           0 :         sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
     942             :                         GFP_KERNEL, cpu_to_node(cpu));
     943             : 
     944           0 :         if (!sg)
     945             :                 return NULL;
     946             : 
     947           0 :         sg_span = sched_group_span(sg);
     948           0 :         if (sd->child)
     949           0 :                 cpumask_copy(sg_span, sched_domain_span(sd->child));
     950             :         else
     951           0 :                 cpumask_copy(sg_span, sched_domain_span(sd));
     952             : 
     953           0 :         atomic_inc(&sg->ref);
     954           0 :         return sg;
     955             : }
     956             : 
     957           0 : static void init_overlap_sched_group(struct sched_domain *sd,
     958             :                                      struct sched_group *sg)
     959             : {
     960           0 :         struct cpumask *mask = sched_domains_tmpmask2;
     961           0 :         struct sd_data *sdd = sd->private;
     962           0 :         struct cpumask *sg_span;
     963           0 :         int cpu;
     964             : 
     965           0 :         build_balance_mask(sd, sg, mask);
     966           0 :         cpu = cpumask_first_and(sched_group_span(sg), mask);
     967             : 
     968           0 :         sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
     969           0 :         if (atomic_inc_return(&sg->sgc->ref) == 1)
     970           0 :                 cpumask_copy(group_balance_mask(sg), mask);
     971             :         else
     972           0 :                 WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask));
     973             : 
     974             :         /*
     975             :          * Initialize sgc->capacity such that even if we mess up the
     976             :          * domains and no possible iteration will get us here, we won't
     977             :          * die on a /0 trap.
     978             :          */
     979           0 :         sg_span = sched_group_span(sg);
     980           0 :         sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
     981           0 :         sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
     982           0 :         sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
     983           0 : }
     984             : 
     985             : static int
     986           0 : build_overlap_sched_groups(struct sched_domain *sd, int cpu)
     987             : {
     988           0 :         struct sched_group *first = NULL, *last = NULL, *sg;
     989           0 :         const struct cpumask *span = sched_domain_span(sd);
     990           0 :         struct cpumask *covered = sched_domains_tmpmask;
     991           0 :         struct sd_data *sdd = sd->private;
     992           0 :         struct sched_domain *sibling;
     993           0 :         int i;
     994             : 
     995           0 :         cpumask_clear(covered);
     996             : 
     997           0 :         for_each_cpu_wrap(i, span, cpu) {
     998           0 :                 struct cpumask *sg_span;
     999             : 
    1000           0 :                 if (cpumask_test_cpu(i, covered))
    1001           0 :                         continue;
    1002             : 
    1003           0 :                 sibling = *per_cpu_ptr(sdd->sd, i);
    1004             : 
    1005             :                 /*
    1006             :                  * Asymmetric node setups can result in situations where the
     1007             :  * domain tree is of unequal depth; make sure to skip domains
    1008             :                  * that already cover the entire range.
    1009             :                  *
    1010             :                  * In that case build_sched_domains() will have terminated the
    1011             :                  * iteration early and our sibling sd spans will be empty.
    1012             :                  * Domains should always include the CPU they're built on, so
    1013             :                  * check that.
    1014             :                  */
    1015           0 :                 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
    1016           0 :                         continue;
    1017             : 
    1018           0 :                 sg = build_group_from_child_sched_domain(sibling, cpu);
    1019           0 :                 if (!sg)
    1020           0 :                         goto fail;
    1021             : 
    1022           0 :                 sg_span = sched_group_span(sg);
    1023           0 :                 cpumask_or(covered, covered, sg_span);
    1024             : 
    1025           0 :                 init_overlap_sched_group(sd, sg);
    1026             : 
    1027           0 :                 if (!first)
    1028           0 :                         first = sg;
    1029           0 :                 if (last)
    1030           0 :                         last->next = sg;
    1031           0 :                 last = sg;
    1032           0 :                 last->next = first;
    1033             :         }
    1034           0 :         sd->groups = first;
    1035             : 
    1036           0 :         return 0;
    1037             : 
    1038           0 : fail:
    1039           0 :         free_sched_groups(first, 0);
    1040             : 
    1041           0 :         return -ENOMEM;
    1042             : }
    1043             : 
    1044             : 
    1045             : /*
    1046             :  * Package topology (also see the load-balance blurb in fair.c)
    1047             :  *
    1048             :  * The scheduler builds a tree structure to represent a number of important
    1049             :  * topology features. By default (default_topology[]) these include:
    1050             :  *
    1051             :  *  - Simultaneous multithreading (SMT)
    1052             :  *  - Multi-Core Cache (MC)
    1053             :  *  - Package (DIE)
    1054             :  *
    1055             :  * Where the last one more or less denotes everything up to a NUMA node.
    1056             :  *
    1057             :  * The tree consists of 3 primary data structures:
    1058             :  *
    1059             :  *      sched_domain -> sched_group -> sched_group_capacity
    1060             :  *          ^ ^             ^ ^
    1061             :  *          `-'             `-'
    1062             :  *
    1063             :  * The sched_domains are per-CPU and have a two-way link (parent & child) and
    1064             :  * denote the ever-growing mask of CPUs belonging to that level of topology.
    1065             :  *
    1066             :  * Each sched_domain has a circular, singly-linked list of sched_groups, each
    1067             :  * denoting the domains of the level below (or individual CPUs in case of the
    1068             :  * first domain level). The sched_group linked by a sched_domain includes the
    1069             :  * CPU of that sched_domain [*].
    1070             :  *
    1071             :  * Take for instance a 2 threaded, 2 core, 2 cache cluster part:
    1072             :  *
    1073             :  * CPU   0   1   2   3   4   5   6   7
    1074             :  *
    1075             :  * DIE  [                             ]
    1076             :  * MC   [             ] [             ]
    1077             :  * SMT  [     ] [     ] [     ] [     ]
    1078             :  *
    1079             :  *  - or -
    1080             :  *
    1081             :  * DIE  0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
    1082             :  * MC   0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
    1083             :  * SMT  0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
    1084             :  *
    1085             :  * CPU   0   1   2   3   4   5   6   7
    1086             :  *
    1087             :  * One way to think about it is: sched_domain moves you up and down among these
    1088             :  * topology levels, while sched_group moves you sideways through it, at child
    1089             :  * domain granularity.
    1090             :  *
    1091             :  * sched_group_capacity ensures each unique sched_group has shared storage.
    1092             :  *
    1093             :  * There are two related construction problems, both of which require a CPU
    1094             :  * that uniquely identifies each group (for a given domain):
    1095             :  *
    1096             :  *  - The first is the balance_cpu (see should_we_balance() and the
    1097             :  *    load-balance blurb in fair.c); for each group we only want 1 CPU to
    1098             :  *    continue balancing at a higher domain.
    1099             :  *
    1100             :  *  - The second is the sched_group_capacity; we want all identical groups
    1101             :  *    to share a single sched_group_capacity.
    1102             :  *
    1103             :  * These topologies are exclusive by construction: it is impossible for an
    1104             :  * SMT thread to belong to multiple cores, or for a core to be part of
    1105             :  * multiple caches. There is a very clear and unique location for each
    1106             :  * CPU in the hierarchy.
    1107             :  *
    1108             :  * Therefore computing a unique CPU for each group is trivial (the iteration
    1109             :  * mask is redundant and set to all 1s; all CPUs in a group will end up at
    1110             :  * _that_ group): we can simply pick the first CPU in each group.
    1111             :  *
    1112             :  *
    1113             :  * [*] in other words, the first group of each domain is its child domain.
    1114             :  */
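
An illustration of the up/down vs. sideways distinction (not part of the covered
file, assuming kernel context): print_topology_of() is a hypothetical debugging
helper built from the same accessors used throughout this file.

        static void print_topology_of(int cpu)
        {
                struct sched_domain *sd;

                rcu_read_lock();
                /* Up/down: follow the per-CPU domain chain, bottom level first. */
                for_each_domain(cpu, sd) {
                        struct sched_group *sg = sd->groups;

                        printk(KERN_DEBUG "level %d:", sd->level);
                        /* Sideways: the circular list of groups at this level. */
                        do {
                                printk(KERN_CONT " %*pbl",
                                       cpumask_pr_args(sched_group_span(sg)));
                                sg = sg->next;
                        } while (sg != sd->groups);
                        printk(KERN_CONT "\n");
                }
                rcu_read_unlock();
        }

On the 8-CPU example above, calling this for CPU0 would print the SMT level
groups (0, 1), the MC level groups (0-1, 2-3) and the DIE level groups
(0-3, 4-7).
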
    1115             : 
    1116          24 : static struct sched_group *get_group(int cpu, struct sd_data *sdd)
    1117             : {
    1118          24 :         struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
    1119          24 :         struct sched_domain *child = sd->child;
    1120          24 :         struct sched_group *sg;
    1121          24 :         bool already_visited;
    1122             : 
    1123          24 :         if (child)
    1124          20 :                 cpu = cpumask_first(sched_domain_span(child));
    1125             : 
    1126          24 :         sg = *per_cpu_ptr(sdd->sg, cpu);
    1127          24 :         sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
    1128             : 
    1129             :         /* Increase refcounts for claim_allocations: */
    1130          24 :         already_visited = atomic_inc_return(&sg->ref) > 1;
    1131             :         /* sgc visits should follow a similar trend as sg */
    1132          48 :         WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));
    1133             : 
    1134             :         /* If we have already visited that group, it's already initialized. */
    1135          24 :         if (already_visited)
    1136             :                 return sg;
    1137             : 
    1138          12 :         if (child) {
    1139           8 :                 cpumask_copy(sched_group_span(sg), sched_domain_span(child));
    1140           8 :                 cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
    1141             :         } else {
    1142           4 :                 cpumask_set_cpu(cpu, sched_group_span(sg));
    1143           4 :                 cpumask_set_cpu(cpu, group_balance_mask(sg));
    1144             :         }
    1145             : 
    1146          12 :         sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
    1147          12 :         sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
    1148          12 :         sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
    1149             : 
    1150          12 :         return sg;
    1151             : }
    1152             : 
    1153             : /*
    1154             :  * build_sched_groups will build a circular linked list of the groups
    1155             :  * covered by the given span, will set each group's ->cpumask correctly,
    1156             :  * and will initialize their ->sgc.
    1157             :  *
    1158             :  * Assumes the sched_domain tree is fully constructed.
    1159             :  */
    1160             : static int
    1161          12 : build_sched_groups(struct sched_domain *sd, int cpu)
    1162             : {
    1163          12 :         struct sched_group *first = NULL, *last = NULL;
    1164          12 :         struct sd_data *sdd = sd->private;
    1165          12 :         const struct cpumask *span = sched_domain_span(sd);
    1166          12 :         struct cpumask *covered;
    1167          12 :         int i;
    1168             : 
    1169          36 :         lockdep_assert_held(&sched_domains_mutex);
    1170          12 :         covered = sched_domains_tmpmask;
    1171             : 
    1172          12 :         cpumask_clear(covered);
    1173             : 
    1174          36 :         for_each_cpu_wrap(i, span, cpu) {
    1175          24 :                 struct sched_group *sg;
    1176             : 
    1177          24 :                 if (cpumask_test_cpu(i, covered))
    1178           0 :                         continue;
    1179             : 
    1180          24 :                 sg = get_group(i, sdd);
    1181             : 
    1182          24 :                 cpumask_or(covered, covered, sched_group_span(sg));
    1183             : 
    1184          24 :                 if (!first)
    1185          12 :                         first = sg;
    1186          24 :                 if (last)
    1187          12 :                         last->next = sg;
    1188             :                 last = sg;
    1189             :         }
    1190          12 :         last->next = first;
    1191          12 :         sd->groups = first;
    1192             : 
    1193          12 :         return 0;
    1194             : }
    1195             : 
    1196             : /*
    1197             :  * Initialize sched groups cpu_capacity.
    1198             :  *
    1199             :  * cpu_capacity indicates the capacity of a sched group, and is used when
    1200             :  * distributing load between the different sched groups of a sched domain.
    1201             :  * Typically cpu_capacity will be the same for all groups in a sched domain
    1202             :  * unless there are asymmetries in the topology. If there are asymmetries,
    1203             :  * the group with more cpu_capacity will pick up more load than the
    1204             :  * group with less cpu_capacity.
    1205             :  */
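
A hypothetical illustration of the asymmetric case: on a 2+2 big.LITTLE part
where the two big CPUs report arch_scale_cpu_capacity() == 1024 and the two
LITTLE CPUs report 512, the DIE-level group covering the big cluster ends up
with cpu_capacity of roughly 2048 while the LITTLE group gets roughly 1024, so
the load balancer aims to place about twice as much load on the big group.
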
    1206          12 : static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
    1207             : {
    1208          12 :         struct sched_group *sg = sd->groups;
    1209             : 
    1210          12 :         WARN_ON(!sg);
    1211             : 
    1212          24 :         do {
    1213          24 :                 int cpu, max_cpu = -1;
    1214             : 
    1215          24 :                 sg->group_weight = cpumask_weight(sched_group_span(sg));
    1216             : 
    1217          24 :                 if (!(sd->flags & SD_ASYM_PACKING))
    1218          24 :                         goto next;
    1219             : 
    1220           0 :                 for_each_cpu(cpu, sched_group_span(sg)) {
    1221           0 :                         if (max_cpu < 0)
    1222             :                                 max_cpu = cpu;
    1223           0 :                         else if (sched_asym_prefer(cpu, max_cpu))
    1224           0 :                                 max_cpu = cpu;
    1225             :                 }
    1226           0 :                 sg->asym_prefer_cpu = max_cpu;
    1227             : 
    1228          24 : next:
    1229          24 :                 sg = sg->next;
    1230          24 :         } while (sg != sd->groups);
    1231             : 
    1232          12 :         if (cpu != group_balance_cpu(sg))
    1233             :                 return;
    1234             : 
    1235          12 :         update_group_capacity(sd, cpu);
    1236             : }
    1237             : 
    1238             : /*
    1239             :  * Initializers for schedule domains
    1240             :  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
    1241             :  */
    1242             : 
    1243             : static int default_relax_domain_level = -1;
    1244             : int sched_domain_level_max;
    1245             : 
    1246           0 : static int __init setup_relax_domain_level(char *str)
    1247             : {
    1248           0 :         if (kstrtoint(str, 0, &default_relax_domain_level))
    1249           0 :                 pr_warn("Unable to set relax_domain_level\n");
    1250             : 
    1251           0 :         return 1;
    1252             : }
    1253             : __setup("relax_domain_level=", setup_relax_domain_level);
    1254             : 
    1255          12 : static void set_domain_attribute(struct sched_domain *sd,
    1256             :                                  struct sched_domain_attr *attr)
    1257             : {
    1258          12 :         int request;
    1259             : 
    1260          12 :         if (!attr || attr->relax_domain_level < 0) {
    1261          12 :                 if (default_relax_domain_level < 0)
    1262             :                         return;
    1263             :                 request = default_relax_domain_level;
    1264             :         } else
    1265             :                 request = attr->relax_domain_level;
    1266             : 
    1267           0 :         if (sd->level > request) {
    1268             :                 /* Turn off idle balance on this domain: */
    1269           0 :                 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
    1270             :         }
    1271             : }
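
As a hypothetical usage example: booting with relax_domain_level=1 on the 8-CPU
topology sketched earlier leaves SMT (level 0) and MC (level 1) untouched, but
clears SD_BALANCE_WAKE and SD_BALANCE_NEWIDLE on DIE (level 2), confining
wake-up and new-idle balancing to the two lower levels. The same relaxation can
also be requested per domain attribute (attr->relax_domain_level) instead of
globally.
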
    1272             : 
    1273             : static void __sdt_free(const struct cpumask *cpu_map);
    1274             : static int __sdt_alloc(const struct cpumask *cpu_map);
    1275             : 
    1276           1 : static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
    1277             :                                  const struct cpumask *cpu_map)
    1278             : {
    1279           1 :         switch (what) {
    1280           1 :         case sa_rootdomain:
    1281           1 :                 if (!atomic_read(&d->rd->refcount))
    1282           0 :                         free_rootdomain(&d->rd->rcu);
    1283           1 :                 fallthrough;
    1284             :         case sa_sd:
    1285           1 :                 free_percpu(d->sd);
    1286           1 :                 fallthrough;
    1287           1 :         case sa_sd_storage:
    1288           1 :                 __sdt_free(cpu_map);
    1289             :                 fallthrough;
    1290             :         case sa_none:
    1291             :                 break;
    1292             :         }
    1293           1 : }
    1294             : 
    1295             : static enum s_alloc
    1296           1 : __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
    1297             : {
    1298           1 :         memset(d, 0, sizeof(*d));
    1299             : 
    1300           1 :         if (__sdt_alloc(cpu_map))
    1301             :                 return sa_sd_storage;
    1302           1 :         d->sd = alloc_percpu(struct sched_domain *);
    1303           1 :         if (!d->sd)
    1304             :                 return sa_sd_storage;
    1305           1 :         d->rd = alloc_rootdomain();
    1306           1 :         if (!d->rd)
    1307           0 :                 return sa_sd;
    1308             : 
    1309             :         return sa_rootdomain;
    1310             : }
    1311             : 
    1312             : /*
    1313             :  * NULL the sd_data elements we've used to build the sched_domain and
    1314             :  * sched_group structure so that the subsequent __free_domain_allocs()
    1315             :  * will not free the data we're using.
    1316             :  */
    1317          12 : static void claim_allocations(int cpu, struct sched_domain *sd)
    1318             : {
    1319          12 :         struct sd_data *sdd = sd->private;
    1320             : 
    1321          12 :         WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
    1322          12 :         *per_cpu_ptr(sdd->sd, cpu) = NULL;
    1323             : 
    1324          12 :         if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
    1325           8 :                 *per_cpu_ptr(sdd->sds, cpu) = NULL;
    1326             : 
    1327          12 :         if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
    1328          12 :                 *per_cpu_ptr(sdd->sg, cpu) = NULL;
    1329             : 
    1330          12 :         if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
    1331          12 :                 *per_cpu_ptr(sdd->sgc, cpu) = NULL;
    1332          12 : }
    1333             : 
    1334             : #ifdef CONFIG_NUMA
    1335             : enum numa_topology_type sched_numa_topology_type;
    1336             : 
    1337             : static int                      sched_domains_numa_levels;
    1338             : static int                      sched_domains_curr_level;
    1339             : 
    1340             : int                             sched_max_numa_distance;
    1341             : static int                      *sched_domains_numa_distance;
    1342             : static struct cpumask           ***sched_domains_numa_masks;
    1343             : int __read_mostly               node_reclaim_distance = RECLAIM_DISTANCE;
    1344             : #endif
    1345             : 
    1346             : /*
    1347             :  * SD_flags allowed in topology descriptions.
    1348             :  *
    1349             :  * These flags are purely descriptive of the topology and do not prescribe
    1350             :  * behaviour. The behaviour that goes with each flag is mapped in the
    1351             :  * sd_init() function below:
    1352             :  *
    1353             :  *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
    1354             :  *   SD_SHARE_PKG_RESOURCES - describes shared caches
    1355             :  *   SD_NUMA                - describes NUMA topologies
    1356             :  *
    1357             :  * The odd one out, which besides describing the topology also
    1358             :  * prescribes the desired behaviour that goes along with it:
    1359             :  *
    1360             :  *   SD_ASYM_PACKING        - describes SMT quirks
    1361             :  */
    1362             : #define TOPOLOGY_SD_FLAGS               \
    1363             :         (SD_SHARE_CPUCAPACITY   |       \
    1364             :          SD_SHARE_PKG_RESOURCES |       \
    1365             :          SD_NUMA                |       \
    1366             :          SD_ASYM_PACKING)
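
For reference, the sd_flags callbacks used by the topology tables in this file
(default_topology[] below and the NUMA levels built by sched_init_numa()) live
in include/linux/sched/topology.h and simply return subsets of these bits;
roughly (paraphrased here, not part of the covered file):

        static inline int cpu_smt_flags(void)
        {
                return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
        }

        static inline int cpu_core_flags(void)
        {
                return SD_SHARE_PKG_RESOURCES;
        }

        static inline int cpu_numa_flags(void)
        {
                return SD_NUMA;
        }
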
    1367             : 
    1368             : static struct sched_domain *
    1369          12 : sd_init(struct sched_domain_topology_level *tl,
    1370             :         const struct cpumask *cpu_map,
    1371             :         struct sched_domain *child, int dflags, int cpu)
    1372             : {
    1373          12 :         struct sd_data *sdd = &tl->data;
    1374          12 :         struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
    1375          12 :         int sd_id, sd_weight, sd_flags = 0;
    1376             : 
    1377             : #ifdef CONFIG_NUMA
    1378             :         /*
    1379             :          * Ugly hack to pass state to sd_numa_mask()...
    1380             :          */
    1381          12 :         sched_domains_curr_level = tl->numa_level;
    1382             : #endif
    1383             : 
    1384          12 :         sd_weight = cpumask_weight(tl->mask(cpu));
    1385             : 
    1386          12 :         if (tl->sd_flags)
    1387           8 :                 sd_flags = (*tl->sd_flags)();
    1388          12 :         if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
    1389             :                         "wrong sd_flags in topology description\n"))
    1390           0 :                 sd_flags &= TOPOLOGY_SD_FLAGS;
    1391             : 
    1392             :         /* Apply detected topology flags */
    1393          12 :         sd_flags |= dflags;
    1394             : 
    1395          12 :         *sd = (struct sched_domain){
    1396             :                 .min_interval           = sd_weight,
    1397          12 :                 .max_interval           = 2*sd_weight,
    1398             :                 .busy_factor            = 16,
    1399             :                 .imbalance_pct          = 117,
    1400             : 
    1401             :                 .cache_nice_tries       = 0,
    1402             : 
    1403             :                 .flags                  = 1*SD_BALANCE_NEWIDLE
    1404             :                                         | 1*SD_BALANCE_EXEC
    1405             :                                         | 1*SD_BALANCE_FORK
    1406             :                                         | 0*SD_BALANCE_WAKE
    1407             :                                         | 1*SD_WAKE_AFFINE
    1408             :                                         | 0*SD_SHARE_CPUCAPACITY
    1409             :                                         | 0*SD_SHARE_PKG_RESOURCES
    1410             :                                         | 0*SD_SERIALIZE
    1411             :                                         | 1*SD_PREFER_SIBLING
    1412             :                                         | 0*SD_NUMA
    1413          12 :                                         | sd_flags
    1414             :                                         ,
    1415             : 
    1416             :                 .last_balance           = jiffies,
    1417             :                 .balance_interval       = sd_weight,
    1418             :                 .max_newidle_lb_cost    = 0,
    1419             :                 .next_decay_max_lb_cost = jiffies,
    1420             :                 .child                  = child,
    1421             : #ifdef CONFIG_SCHED_DEBUG
    1422             :                 .name                   = tl->name,
    1423             : #endif
    1424             :         };
    1425             : 
    1426          12 :         cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
    1427          12 :         sd_id = cpumask_first(sched_domain_span(sd));
    1428             : 
    1429             :         /*
    1430             :          * Convert topological properties into behaviour.
    1431             :          */
    1432             : 
    1433             :         /* Don't attempt to spread across CPUs of different capacities. */
    1434          12 :         if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child)
    1435           0 :                 sd->child->flags &= ~SD_PREFER_SIBLING;
    1436             : 
    1437          12 :         if (sd->flags & SD_SHARE_CPUCAPACITY) {
    1438           4 :                 sd->imbalance_pct = 110;
    1439             : 
    1440           8 :         } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
    1441           4 :                 sd->imbalance_pct = 117;
    1442           4 :                 sd->cache_nice_tries = 1;
    1443             : 
    1444             : #ifdef CONFIG_NUMA
    1445           4 :         } else if (sd->flags & SD_NUMA) {
    1446           0 :                 sd->cache_nice_tries = 2;
    1447             : 
    1448           0 :                 sd->flags &= ~SD_PREFER_SIBLING;
    1449           0 :                 sd->flags |= SD_SERIALIZE;
    1450           0 :                 if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
    1451           0 :                         sd->flags &= ~(SD_BALANCE_EXEC |
    1452             :                                        SD_BALANCE_FORK |
    1453             :                                        SD_WAKE_AFFINE);
    1454             :                 }
    1455             : 
    1456             : #endif
    1457             :         } else {
    1458           4 :                 sd->cache_nice_tries = 1;
    1459             :         }
    1460             : 
    1461             :         /*
    1462             :          * For all levels sharing cache, connect a sched_domain_shared
    1463             :          * instance.
    1464             :          */
    1465          12 :         if (sd->flags & SD_SHARE_PKG_RESOURCES) {
    1466           8 :                 sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
    1467           8 :                 atomic_inc(&sd->shared->ref);
    1468           8 :                 atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
    1469             :         }
    1470             : 
    1471          12 :         sd->private = sdd;
    1472             : 
    1473          12 :         return sd;
    1474             : }
    1475             : 
    1476             : /*
    1477             :  * Topology list, bottom-up.
    1478             :  */
    1479             : static struct sched_domain_topology_level default_topology[] = {
    1480             : #ifdef CONFIG_SCHED_SMT
    1481             :         { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
    1482             : #endif
    1483             : #ifdef CONFIG_SCHED_MC
    1484             :         { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
    1485             : #endif
    1486             :         { cpu_cpu_mask, SD_INIT_NAME(DIE) },
    1487             :         { NULL, },
    1488             : };
    1489             : 
    1490             : static struct sched_domain_topology_level *sched_domain_topology =
    1491             :         default_topology;
    1492             : 
    1493             : #define for_each_sd_topology(tl)                        \
    1494             :         for (tl = sched_domain_topology; tl->mask; tl++)
    1495             : 
    1496           1 : void set_sched_topology(struct sched_domain_topology_level *tl)
    1497             : {
    1498           1 :         if (WARN_ON_ONCE(sched_smp_initialized))
    1499             :                 return;
    1500             : 
    1501           1 :         sched_domain_topology = tl;
    1502             : }
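
A sketch of how an architecture might use this hook, following the shape of
default_topology[] above (my_arch_topology and my_arch_init_topology are
hypothetical names); the call must happen before SMP bring-up, or the
WARN_ON_ONCE() above fires and the override is ignored:

        static struct sched_domain_topology_level my_arch_topology[] = {
                { cpu_smt_mask,       cpu_smt_flags,  SD_INIT_NAME(SMT) },
                { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC)  },
                { cpu_cpu_mask,       SD_INIT_NAME(DIE) },
                { NULL, },
        };

        void __init my_arch_init_topology(void)
        {
                set_sched_topology(my_arch_topology);
        }
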
    1503             : 
    1504             : #ifdef CONFIG_NUMA
    1505             : 
    1506           0 : static const struct cpumask *sd_numa_mask(int cpu)
    1507             : {
    1508           0 :         return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
    1509             : }
    1510             : 
    1511           0 : static void sched_numa_warn(const char *str)
    1512             : {
    1513           0 :         static int done = false;
    1514           0 :         int i,j;
    1515             : 
    1516           0 :         if (done)
    1517             :                 return;
    1518             : 
    1519           0 :         done = true;
    1520             : 
    1521           0 :         printk(KERN_WARNING "ERROR: %s\n\n", str);
    1522             : 
    1523           0 :         for (i = 0; i < nr_node_ids; i++) {
    1524           0 :                 printk(KERN_WARNING "  ");
    1525           0 :                 for (j = 0; j < nr_node_ids; j++)
    1526           0 :                         printk(KERN_CONT "%02d ", node_distance(i,j));
    1527           0 :                 printk(KERN_CONT "\n");
    1528             :         }
    1529           0 :         printk(KERN_WARNING "\n");
    1530             : }
    1531             : 
    1532           0 : bool find_numa_distance(int distance)
    1533             : {
    1534           0 :         int i;
    1535             : 
    1536           0 :         if (distance == node_distance(0, 0))
    1537             :                 return true;
    1538             : 
    1539           0 :         for (i = 0; i < sched_domains_numa_levels; i++) {
    1540           0 :                 if (sched_domains_numa_distance[i] == distance)
    1541             :                         return true;
    1542             :         }
    1543             : 
    1544             :         return false;
    1545             : }
    1546             : 
    1547             : /*
    1548             :  * A system can have three types of NUMA topology:
    1549             :  * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
    1550             :  * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
    1551             :  * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
    1552             :  *
    1553             :  * The difference between a glueless mesh topology and a backplane
    1554             :  * topology lies in whether communication between not directly
    1555             :  * connected nodes goes through intermediary nodes (where programs
    1556             :  * could run), or through backplane controllers. This affects
    1557             :  * placement of programs.
    1558             :  *
    1559             :  * The type of topology can be discerned with the following tests:
    1560             :  * - If the maximum distance between any nodes is 1 hop, the system
    1561             :  *   is directly connected.
    1562             :  * - If for two nodes A and B, located N > 1 hops away from each other,
    1563             :  *   there is an intermediary node C, which is < N hops away from both
    1564             :  *   nodes A and B, the system is a glueless mesh.
    1565             :  */
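
A hypothetical worked example: consider a three-node machine with the following
node_distance() table:

                 0   1   2
            0   10  20  30
            1   20  10  20
            2   30  20  10

The furthest pair is (0, 2) at distance 30, and node 1 is less than 30 away
from both of them, so the classification below is NUMA_GLUELESS_MESH. Without
such an intermediary node it would be NUMA_BACKPLANE, and a table containing
only the values 10 and 20 (two distance levels) would be NUMA_DIRECT.
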
    1566           1 : static void init_numa_topology_type(void)
    1567             : {
    1568           1 :         int a, b, c, n;
    1569             : 
    1570           1 :         n = sched_max_numa_distance;
    1571             : 
    1572           1 :         if (sched_domains_numa_levels <= 2) {
    1573           1 :                 sched_numa_topology_type = NUMA_DIRECT;
    1574           1 :                 return;
    1575             :         }
    1576             : 
    1577           0 :         for_each_online_node(a) {
    1578           0 :                 for_each_online_node(b) {
    1579             :                         /* Find two nodes furthest removed from each other. */
    1580           0 :                         if (node_distance(a, b) < n)
    1581           0 :                                 continue;
    1582             : 
    1583             :                         /* Is there an intermediary node between a and b? */
    1584           0 :                         for_each_online_node(c) {
    1585           0 :                                 if (node_distance(a, c) < n &&
    1586           0 :                                     node_distance(b, c) < n) {
    1587           0 :                                         sched_numa_topology_type =
    1588             :                                                         NUMA_GLUELESS_MESH;
    1589           0 :                                         return;
    1590             :                                 }
    1591             :                         }
    1592             : 
    1593           0 :                         sched_numa_topology_type = NUMA_BACKPLANE;
    1594           0 :                         return;
    1595             :                 }
    1596             :         }
    1597             : }
    1598             : 
    1599             : 
    1600             : #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
    1601             : 
    1602           1 : void sched_init_numa(void)
    1603             : {
    1604           1 :         struct sched_domain_topology_level *tl;
    1605           1 :         unsigned long *distance_map;
    1606           1 :         int nr_levels = 0;
    1607           1 :         int i, j;
    1608             : 
    1609             :         /*
    1610             :          * O(nr_nodes^2) pass over the node_distance() table, recording each
    1611             :          * distance in a bitmap in order to find the set of unique distances.
    1612             :          */
    1613           1 :         distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
    1614           1 :         if (!distance_map)
    1615             :                 return;
    1616             : 
    1617           1 :         bitmap_zero(distance_map, NR_DISTANCE_VALUES);
    1618           2 :         for (i = 0; i < nr_node_ids; i++) {
    1619           2 :                 for (j = 0; j < nr_node_ids; j++) {
    1620           1 :                         int distance = node_distance(i, j);
    1621             : 
    1622           1 :                         if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
    1623           0 :                                 sched_numa_warn("Invalid distance value range");
    1624           0 :                                 return;
    1625             :                         }
    1626             : 
    1627           1 :                         bitmap_set(distance_map, distance, 1);
    1628             :                 }
    1629             :         }
    1630             :         /*
    1631             :          * We can now figure out how many unique distance values there are and
    1632             :          * allocate memory accordingly.
    1633             :          */
    1634           1 :         nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
    1635             : 
    1636           1 :         sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
    1637           1 :         if (!sched_domains_numa_distance) {
    1638           0 :                 bitmap_free(distance_map);
    1639           0 :                 return;
    1640             :         }
    1641             : 
    1642           2 :         for (i = 0, j = 0; i < nr_levels; i++, j++) {
    1643           1 :                 j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
    1644           1 :                 sched_domains_numa_distance[i] = j;
    1645             :         }
    1646             : 
    1647           1 :         bitmap_free(distance_map);
    1648             : 
    1649             :         /*
    1650             :          * 'nr_levels' contains the number of unique distances
    1651             :          *
    1652             :          * The sched_domains_numa_distance[] array includes the actual distance
    1653             :          * numbers.
    1654             :          */
    1655             : 
    1656             :         /*
    1657             :          * Temporarily reset sched_domains_numa_levels to 0 here.
    1658             :          * If allocating memory for the sched_domains_numa_masks[][] array
    1659             :          * fails, the array will contain fewer than 'nr_levels' members. That
    1660             :          * would be dangerous for any other code that iterates
    1661             :          * sched_domains_numa_masks[][] in the meantime.
    1662             :          *
    1663             :          * We reset it to 'nr_levels' at the end of this function.
    1664             :          */
    1665           1 :         sched_domains_numa_levels = 0;
    1666             : 
    1667           1 :         sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
    1668           1 :         if (!sched_domains_numa_masks)
    1669             :                 return;
    1670             : 
    1671             :         /*
    1672             :          * Now for each level, construct a mask per node which contains all
    1673             :          * CPUs of nodes that are that many hops away from us.
    1674             :          */
    1675           2 :         for (i = 0; i < nr_levels; i++) {
    1676           2 :                 sched_domains_numa_masks[i] =
    1677           1 :                         kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
    1678           1 :                 if (!sched_domains_numa_masks[i])
    1679             :                         return;
    1680             : 
    1681           2 :                 for (j = 0; j < nr_node_ids; j++) {
    1682           1 :                         struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
    1683           1 :                         int k;
    1684             : 
    1685           1 :                         if (!mask)
    1686             :                                 return;
    1687             : 
    1688           1 :                         sched_domains_numa_masks[i][j] = mask;
    1689             : 
    1690           2 :                         for_each_node(k) {
    1691           1 :                                 if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
    1692             :                                         sched_numa_warn("Node-distance not symmetric");
    1693             : 
    1694           1 :                                 if (node_distance(j, k) > sched_domains_numa_distance[i])
    1695           0 :                                         continue;
    1696             : 
    1697           1 :                                 cpumask_or(mask, mask, cpumask_of_node(k));
    1698             :                         }
    1699             :                 }
    1700             :         }
    1701             : 
    1702             :         /* Compute default topology size */
    1703           4 :         for (i = 0; sched_domain_topology[i].mask; i++);
    1704             : 
    1705           1 :         tl = kzalloc((i + nr_levels + 1) *
    1706             :                         sizeof(struct sched_domain_topology_level), GFP_KERNEL);
    1707           1 :         if (!tl)
    1708             :                 return;
    1709             : 
    1710             :         /*
    1711             :          * Copy the default topology bits..
    1712             :          */
    1713           4 :         for (i = 0; sched_domain_topology[i].mask; i++)
    1714           3 :                 tl[i] = sched_domain_topology[i];
    1715             : 
    1716             :         /*
    1717             :          * Add the NUMA identity distance, aka single NODE.
    1718             :          */
    1719           1 :         tl[i++] = (struct sched_domain_topology_level){
    1720             :                 .mask = sd_numa_mask,
    1721             :                 .numa_level = 0,
    1722             :                 SD_INIT_NAME(NODE)
    1723             :         };
    1724             : 
    1725             :         /*
    1726             :          * .. and append 'j' levels of NUMA goodness.
    1727             :          */
    1728           1 :         for (j = 1; j < nr_levels; i++, j++) {
    1729           0 :                 tl[i] = (struct sched_domain_topology_level){
    1730             :                         .mask = sd_numa_mask,
    1731             :                         .sd_flags = cpu_numa_flags,
    1732             :                         .flags = SDTL_OVERLAP,
    1733             :                         .numa_level = j,
    1734             :                         SD_INIT_NAME(NUMA)
    1735             :                 };
    1736             :         }
    1737             : 
    1738           1 :         sched_domain_topology = tl;
    1739             : 
    1740           1 :         sched_domains_numa_levels = nr_levels;
    1741           1 :         sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
    1742             : 
    1743           1 :         init_numa_topology_type();
    1744             : }
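
To illustrate the result on a hypothetical two-node machine with distances 10
(local) and 20 (remote): two bits end up set in the distance bitmap, so
nr_levels == 2 and sched_domains_numa_distance[] == {10, 20}. The rebuilt table
then consists of the default SMT/MC/DIE levels, the NODE identity level (whose
per-node masks cover only the node's own CPUs, distance <= 10) and a single
NUMA level whose masks cover both nodes (distance <= 20).
sched_max_numa_distance becomes 20 and, with only two levels,
init_numa_topology_type() picks NUMA_DIRECT.
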
    1745             : 
    1746           0 : void sched_domains_numa_masks_set(unsigned int cpu)
    1747             : {
    1748           0 :         int node = cpu_to_node(cpu);
    1749           0 :         int i, j;
    1750             : 
    1751           0 :         for (i = 0; i < sched_domains_numa_levels; i++) {
    1752           0 :                 for (j = 0; j < nr_node_ids; j++) {
    1753           0 :                         if (node_distance(j, node) <= sched_domains_numa_distance[i])
    1754           0 :                                 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
    1755             :                 }
    1756             :         }
    1757           0 : }
    1758             : 
    1759           0 : void sched_domains_numa_masks_clear(unsigned int cpu)
    1760             : {
    1761           0 :         int i, j;
    1762             : 
    1763           0 :         for (i = 0; i < sched_domains_numa_levels; i++) {
    1764           0 :                 for (j = 0; j < nr_node_ids; j++)
    1765           0 :                         cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
    1766             :         }
    1767           0 : }
    1768             : 
    1769             : /*
    1770             :  * sched_numa_find_closest() - given the NUMA topology, find the cpu
    1771             :  *                             closest to @cpu from @cpus.
    1772             :  * @cpus: cpumask to find a CPU from
    1773             :  * @cpu: CPU to be close to
    1774             :  *
    1775             :  * Return: CPU, or nr_cpu_ids when nothing is found.
    1776             :  */
    1777           0 : int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
    1778             : {
    1779           0 :         int i, j = cpu_to_node(cpu);
    1780             : 
    1781           0 :         for (i = 0; i < sched_domains_numa_levels; i++) {
    1782           0 :                 cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
    1783           0 :                 if (cpu < nr_cpu_ids)
    1784           0 :                         return cpu;
    1785             :         }
    1786           0 :         return nr_cpu_ids;
    1787             : }
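
A minimal usage sketch (hypothetical caller, assuming kernel context): pick a
CPU from a candidate mask that is NUMA-close to a given CPU, falling back to
any candidate when the search comes back empty.

        static int pick_nearby_cpu(const struct cpumask *candidates, int near_cpu)
        {
                int cpu = sched_numa_find_closest(candidates, near_cpu);

                if (cpu >= nr_cpu_ids)  /* nothing close found */
                        cpu = cpumask_any(candidates);

                return cpu;
        }
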
    1788             : 
    1789             : #endif /* CONFIG_NUMA */
    1790             : 
    1791           1 : static int __sdt_alloc(const struct cpumask *cpu_map)
    1792             : {
    1793           1 :         struct sched_domain_topology_level *tl;
    1794           1 :         int j;
    1795             : 
    1796           5 :         for_each_sd_topology(tl) {
    1797           4 :                 struct sd_data *sdd = &tl->data;
    1798             : 
    1799           4 :                 sdd->sd = alloc_percpu(struct sched_domain *);
    1800           4 :                 if (!sdd->sd)
    1801             :                         return -ENOMEM;
    1802             : 
    1803           4 :                 sdd->sds = alloc_percpu(struct sched_domain_shared *);
    1804           4 :                 if (!sdd->sds)
    1805             :                         return -ENOMEM;
    1806             : 
    1807           4 :                 sdd->sg = alloc_percpu(struct sched_group *);
    1808           4 :                 if (!sdd->sg)
    1809             :                         return -ENOMEM;
    1810             : 
    1811           4 :                 sdd->sgc = alloc_percpu(struct sched_group_capacity *);
    1812           4 :                 if (!sdd->sgc)
    1813             :                         return -ENOMEM;
    1814             : 
    1815          20 :                 for_each_cpu(j, cpu_map) {
    1816          16 :                         struct sched_domain *sd;
    1817          16 :                         struct sched_domain_shared *sds;
    1818          16 :                         struct sched_group *sg;
    1819          16 :                         struct sched_group_capacity *sgc;
    1820             : 
    1821          16 :                         sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
    1822             :                                         GFP_KERNEL, cpu_to_node(j));
    1823          16 :                         if (!sd)
    1824             :                                 return -ENOMEM;
    1825             : 
    1826          16 :                         *per_cpu_ptr(sdd->sd, j) = sd;
    1827             : 
    1828          16 :                         sds = kzalloc_node(sizeof(struct sched_domain_shared),
    1829             :                                         GFP_KERNEL, cpu_to_node(j));
    1830          16 :                         if (!sds)
    1831             :                                 return -ENOMEM;
    1832             : 
    1833          16 :                         *per_cpu_ptr(sdd->sds, j) = sds;
    1834             : 
    1835          16 :                         sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
    1836             :                                         GFP_KERNEL, cpu_to_node(j));
    1837          16 :                         if (!sg)
    1838             :                                 return -ENOMEM;
    1839             : 
    1840          16 :                         sg->next = sg;
    1841             : 
    1842          16 :                         *per_cpu_ptr(sdd->sg, j) = sg;
    1843             : 
    1844          16 :                         sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
    1845             :                                         GFP_KERNEL, cpu_to_node(j));
    1846          16 :                         if (!sgc)
    1847             :                                 return -ENOMEM;
    1848             : 
    1849             : #ifdef CONFIG_SCHED_DEBUG
    1850             :                         sgc->id = j;
    1851             : #endif
    1852             : 
    1853          16 :                         *per_cpu_ptr(sdd->sgc, j) = sgc;
    1854             :                 }
    1855             :         }
    1856             : 
    1857             :         return 0;
    1858             : }
    1859             : 
    1860           1 : static void __sdt_free(const struct cpumask *cpu_map)
    1861             : {
    1862           1 :         struct sched_domain_topology_level *tl;
    1863           1 :         int j;
    1864             : 
    1865           5 :         for_each_sd_topology(tl) {
    1866          20 :                 struct sd_data *sdd = &tl->data;
    1867             : 
    1868          20 :                 for_each_cpu(j, cpu_map) {
    1869          16 :                         struct sched_domain *sd;
    1870             : 
    1871          16 :                         if (sdd->sd) {
    1872          16 :                                 sd = *per_cpu_ptr(sdd->sd, j);
    1873          16 :                                 if (sd && (sd->flags & SD_OVERLAP))
    1874           0 :                                         free_sched_groups(sd->groups, 0);
    1875          16 :                                 kfree(*per_cpu_ptr(sdd->sd, j));
    1876             :                         }
    1877             : 
    1878          16 :                         if (sdd->sds)
    1879          16 :                                 kfree(*per_cpu_ptr(sdd->sds, j));
    1880          16 :                         if (sdd->sg)
    1881          16 :                                 kfree(*per_cpu_ptr(sdd->sg, j));
    1882          16 :                         if (sdd->sgc)
    1883          16 :                                 kfree(*per_cpu_ptr(sdd->sgc, j));
    1884             :                 }
    1885           4 :                 free_percpu(sdd->sd);
    1886           4 :                 sdd->sd = NULL;
    1887           4 :                 free_percpu(sdd->sds);
    1888           4 :                 sdd->sds = NULL;
    1889           4 :                 free_percpu(sdd->sg);
    1890           4 :                 sdd->sg = NULL;
    1891           4 :                 free_percpu(sdd->sgc);
    1892           4 :                 sdd->sgc = NULL;
    1893             :         }
    1894           1 : }
    1895             : 
    1896          12 : static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
    1897             :                 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
    1898             :                 struct sched_domain *child, int dflags, int cpu)
    1899             : {
    1900          12 :         struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu);
    1901             : 
    1902          12 :         if (child) {
    1903           8 :                 sd->level = child->level + 1;
    1904           8 :                 sched_domain_level_max = max(sched_domain_level_max, sd->level);
    1905           8 :                 child->parent = sd;
    1906             : 
    1907           8 :                 if (!cpumask_subset(sched_domain_span(child),
    1908           8 :                                     sched_domain_span(sd))) {
    1909           0 :                         pr_err("BUG: arch topology borken\n");
    1910             : #ifdef CONFIG_SCHED_DEBUG
    1911             :                         pr_err("     the %s domain not a subset of the %s domain\n",
    1912             :                                         child->name, sd->name);
    1913             : #endif
    1914             :                         /* Fixup, ensure @sd has at least @child CPUs. */
    1915           0 :                         cpumask_or(sched_domain_span(sd),
    1916           0 :                                    sched_domain_span(sd),
    1917           0 :                                    sched_domain_span(child));
    1918             :                 }
    1919             : 
    1920             :         }
    1921          12 :         set_domain_attribute(sd, attr);
    1922             : 
    1923          12 :         return sd;
    1924             : }
    1925             : 
    1926             : /*
    1927             :  * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
    1928             :  * any two given CPUs at this (non-NUMA) topology level.
    1929             :  */
    1930          12 : static bool topology_span_sane(struct sched_domain_topology_level *tl,
    1931             :                               const struct cpumask *cpu_map, int cpu)
    1932             : {
    1933          12 :         int i;
    1934             : 
    1935             :         /* NUMA levels are allowed to overlap */
    1936          12 :         if (tl->flags & SDTL_OVERLAP)
    1937             :                 return true;
    1938             : 
    1939             :         /*
    1940             :          * Non-NUMA levels cannot partially overlap - they must be either
    1941             :          * completely equal or completely disjoint. Otherwise we can end up
    1942             :          * breaking the sched_group lists - i.e. a later get_group() pass
    1943             :          * breaks the linking done for an earlier span.
    1944             :          */
    1945          60 :         for_each_cpu(i, cpu_map) {
    1946          48 :                 if (i == cpu)
    1947          12 :                         continue;
    1948             :                 /*
    1949             :                  * We should 'and' all those masks with 'cpu_map' to exactly
    1950             :                  * match the topology we're about to build, but that can only
    1951             :                  * remove CPUs, which only lessens our ability to detect
    1952             :                  * overlaps
    1953             :                  * overlaps.
    1954          60 :                 if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
    1955          24 :                     cpumask_intersects(tl->mask(cpu), tl->mask(i)))
    1956             :                         return false;
    1957             :         }
    1958             : 
    1959             :         return true;
    1960             : }
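
A hypothetical example of the partial overlap this guards against: if a buggy
cpu_coregroup_mask() reported 0-3 for CPU0 but 2-5 for CPU2, the two masks
would be neither equal nor disjoint, so this check fails and
build_sched_domains() below bails out on its WARN_ON() instead of linking
inconsistent sched_group lists.
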
    1961             : 
    1962             : /*
    1963             :  * Find the sched_domain_topology_level where all CPU capacities are visible
    1964             :  * for all CPUs.
    1965             :  */
    1966             : static struct sched_domain_topology_level
    1967           1 : *asym_cpu_capacity_level(const struct cpumask *cpu_map)
    1968             : {
    1969           1 :         int i, j, asym_level = 0;
    1970           1 :         bool asym = false;
    1971           1 :         struct sched_domain_topology_level *tl, *asym_tl = NULL;
    1972           1 :         unsigned long cap;
    1973             : 
    1974             :         /* Is there any asymmetry? */
    1975           1 :         cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));
    1976             : 
    1977           6 :         for_each_cpu(i, cpu_map) {
    1978           5 :                 if (arch_scale_cpu_capacity(i) != cap) {
    1979             :                         asym = true;
    1980             :                         break;
    1981             :                 }
    1982             :         }
    1983             : 
    1984           1 :         if (!asym)
    1985           1 :                 return NULL;
    1986             : 
    1987             :         /*
    1988             :          * Examine the topology from each CPU's point of view to detect the
    1989             :          * lowest sched_domain_topology_level at which the highest-capacity
    1990             :          * CPU is visible to everyone.
    1991             :          */
    1992             :         for_each_cpu(i, cpu_map) {
    1993             :                 unsigned long max_capacity = arch_scale_cpu_capacity(i);
    1994             :                 int tl_id = 0;
    1995             : 
    1996             :                 for_each_sd_topology(tl) {
    1997             :                         if (tl_id < asym_level)
    1998             :                                 goto next_level;
    1999             : 
    2000             :                         for_each_cpu_and(j, tl->mask(i), cpu_map) {
    2001             :                                 unsigned long capacity;
    2002             : 
    2003             :                                 capacity = arch_scale_cpu_capacity(j);
    2004             : 
    2005             :                                 if (capacity <= max_capacity)
    2006             :                                         continue;
    2007             : 
    2008             :                                 max_capacity = capacity;
    2009             :                                 asym_level = tl_id;
    2010             :                                 asym_tl = tl;
    2011             :                         }
    2012             : next_level:
    2013             :                         tl_id++;
    2014             :                 }
    2015             :         }
    2016             : 
    2017             :         return asym_tl;
    2018             : }
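
As a hypothetical example: on a 4+4 big.LITTLE system where CPUs 0-3 report
arch_scale_cpu_capacity() == 512 and CPUs 4-7 report 1024, the MC mask of a
little CPU contains only other 512-capacity CPUs, so the first level at which
the 1024-capacity CPUs become visible to everyone is DIE. That level is
returned here, and build_sched_domains() below sets SD_ASYM_CPUCAPACITY (via
dflags) on DIE and the levels above it.
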
    2019             : 
    2020             : 
    2021             : /*
    2022             :  * Build sched domains for a given set of CPUs and attach the sched domains
    2023             :  * to the individual CPUs
    2024             :  */
    2025             : static int
    2026           1 : build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
    2027             : {
    2028           1 :         enum s_alloc alloc_state = sa_none;
    2029           1 :         struct sched_domain *sd;
    2030           1 :         struct s_data d;
    2031           1 :         struct rq *rq = NULL;
    2032           1 :         int i, ret = -ENOMEM;
    2033           1 :         struct sched_domain_topology_level *tl_asym;
    2034           1 :         bool has_asym = false;
    2035             : 
    2036           1 :         if (WARN_ON(cpumask_empty(cpu_map)))
    2037           0 :                 goto error;
    2038             : 
    2039           1 :         alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
    2040           1 :         if (alloc_state != sa_rootdomain)
    2041           0 :                 goto error;
    2042             : 
    2043           1 :         tl_asym = asym_cpu_capacity_level(cpu_map);
    2044             : 
    2045             :         /* Set up domains for CPUs specified by the cpu_map: */
    2046           6 :         for_each_cpu(i, cpu_map) {
    2047           4 :                 struct sched_domain_topology_level *tl;
    2048           4 :                 int dflags = 0;
    2049             : 
    2050           4 :                 sd = NULL;
    2051          12 :                 for_each_sd_topology(tl) {
    2052          12 :                         if (tl == tl_asym) {
    2053           0 :                                 dflags |= SD_ASYM_CPUCAPACITY;
    2054           0 :                                 has_asym = true;
    2055             :                         }
    2056             : 
    2057          12 :                         if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
    2058           0 :                                 goto error;
    2059             : 
    2060          12 :                         sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
    2061             : 
    2062          12 :                         if (tl == sched_domain_topology)
    2063           4 :                                 *per_cpu_ptr(d.sd, i) = sd;
    2064          12 :                         if (tl->flags & SDTL_OVERLAP)
    2065           0 :                                 sd->flags |= SD_OVERLAP;
    2066          12 :                         if (cpumask_equal(cpu_map, sched_domain_span(sd)))
    2067             :                                 break;
    2068             :                 }
    2069             :         }
    2070             : 
    2071             :         /* Build the groups for the domains */
    2072           5 :         for_each_cpu(i, cpu_map) {
    2073          16 :                 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
    2074          12 :                         sd->span_weight = cpumask_weight(sched_domain_span(sd));
    2075          12 :                         if (sd->flags & SD_OVERLAP) {
    2076           0 :                                 if (build_overlap_sched_groups(sd, i))
    2077           0 :                                         goto error;
    2078             :                         } else {
    2079          12 :                                 if (build_sched_groups(sd, i))
    2080           0 :                                         goto error;
    2081             :                         }
    2082             :                 }
    2083             :         }
    2084             : 
    2085             :         /* Calculate CPU capacity for physical packages and nodes */
    2086          17 :         for (i = nr_cpumask_bits-1; i >= 0; i--) {
    2087          16 :                 if (!cpumask_test_cpu(i, cpu_map))
    2088          12 :                         continue;
    2089             : 
    2090          16 :                 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
    2091          12 :                         claim_allocations(i, sd);
    2092          12 :                         init_sched_groups_capacity(i, sd);
    2093             :                 }
    2094             :         }
    2095             : 
    2096             :         /* Attach the domains */
    2097           1 :         rcu_read_lock();
    2098           5 :         for_each_cpu(i, cpu_map) {
    2099           4 :                 rq = cpu_rq(i);
    2100           4 :                 sd = *per_cpu_ptr(d.sd, i);
    2101             : 
    2102             :                 /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
    2103           4 :                 if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
    2104           1 :                         WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
    2105             : 
    2106           4 :                 cpu_attach_domain(sd, d.rd, i);
    2107             :         }
    2108           1 :         rcu_read_unlock();
    2109             : 
    2110           1 :         if (has_asym)
    2111           0 :                 static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
    2112             : 
    2113             :         if (rq && sched_debug_enabled) {
    2114             :                 pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
    2115             :                         cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
    2116             :         }
    2117             : 
    2118             :         ret = 0;
    2119           1 : error:
    2120           1 :         __free_domain_allocs(&d, alloc_state, cpu_map);
    2121             : 
    2122           1 :         return ret;
    2123             : }
    2124             : 
    2125             : /* Current sched domains: */
    2126             : static cpumask_var_t                    *doms_cur;
    2127             : 
    2128             : /* Number of sched domains in 'doms_cur': */
    2129             : static int                              ndoms_cur;
    2130             : 
    2131             : /* Attributes of custom domains in 'doms_cur': */
    2132             : static struct sched_domain_attr         *dattr_cur;
    2133             : 
    2134             : /*
    2135             :  * Special case: If a kmalloc() of a doms_cur partition (array of
    2136             :  * cpumask) fails, then fall back to a single sched domain,
    2137             :  * as determined by the single cpumask fallback_doms.
    2138             :  */
    2139             : static cpumask_var_t                    fallback_doms;
    2140             : 
    2141             : /*
    2142             :  * arch_update_cpu_topology lets virtualized architectures update the
    2143             :  * CPU core maps. It is supposed to return 1 if the topology changed
    2144             :  * or 0 if it stayed the same.
    2145             :  */
    2146           0 : int __weak arch_update_cpu_topology(void)
    2147             : {
    2148           0 :         return 0;
    2149             : }
    2150             : 
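/*
 * Illustrative sketch (not from any real architecture): what an override of
 * the __weak default above could look like.  CONFIG_EXAMPLE_ARCH and the
 * example_firmware_core_map_changed()/example_refresh_core_masks() helpers
 * are hypothetical; the only real contract is the return value, 1 when the
 * core maps changed and 0 when they did not, so the next domain rebuild
 * knows whether the current partitioning can be reused.
 */
#ifdef CONFIG_EXAMPLE_ARCH
int arch_update_cpu_topology(void)
{
	if (!example_firmware_core_map_changed())
		return 0;

	/* Hypothetical: refresh the per-level topology masks. */
	example_refresh_core_masks();
	return 1;
}
#endif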
    2151           1 : cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
    2152             : {
    2153           1 :         int i;
    2154           1 :         cpumask_var_t *doms;
    2155             : 
    2156           1 :         doms = kmalloc_array(ndoms, sizeof(*doms), GFP_KERNEL);
    2157           1 :         if (!doms)
    2158             :                 return NULL;
    2159           1 :         for (i = 0; i < ndoms; i++) {
    2160             :                 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
    2161             :                         free_sched_domains(doms, i);
    2162             :                         return NULL;
    2163             :                 }
    2164             :         }
    2165             :         return doms;
    2166             : }
    2167             : 
    2168           0 : void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
    2169             : {
    2170           0 :         unsigned int i;
    2171           0 :         for (i = 0; i < ndoms; i++)
    2172             :                 free_cpumask_var(doms[i]);
    2173           0 :         kfree(doms);
    2174           0 : }
    2175             : 
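/*
 * Illustrative sketch (assumed caller, not part of topology.c): the expected
 * pairing of alloc_sched_domains() and free_sched_domains() when the array
 * is *not* handed off to partition_sched_domains().  'n' and the mask
 * contents are placeholders for this example.
 */
static int example_build_and_discard(unsigned int n)
{
	cpumask_var_t *doms = alloc_sched_domains(n);
	unsigned int i;

	if (!doms)
		return -ENOMEM;

	for (i = 0; i < n; i++)
		cpumask_copy(doms[i], cpu_online_mask);	/* placeholder contents */

	/* ... use doms[0..n-1] ... */

	free_sched_domains(doms, n);	/* frees each cpumask and the array itself */
	return 0;
}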
    2176             : /*
    2177             :  * Set up scheduler domains and groups.  For now this just excludes isolated
    2178             :  * CPUs, but could be used to exclude other special cases in the future.
    2179             :  */
    2180           1 : int sched_init_domains(const struct cpumask *cpu_map)
    2181             : {
    2182           1 :         int err;
    2183             : 
    2184           1 :         zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
    2185           1 :         zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
    2186           1 :         zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
    2187             : 
    2188           1 :         arch_update_cpu_topology();
    2189           1 :         ndoms_cur = 1;
    2190           1 :         doms_cur = alloc_sched_domains(ndoms_cur);
    2191           1 :         if (!doms_cur)
    2192           0 :                 doms_cur = &fallback_doms;
    2193           1 :         cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
    2194           1 :         err = build_sched_domains(doms_cur[0], NULL);
    2195           1 :         register_sched_domain_sysctl();
    2196             : 
    2197           1 :         return err;
    2198             : }
    2199             : 
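/*
 * Context (illustrative, reproduced from memory rather than verbatim): the
 * boot-time caller is sched_init_smp() in kernel/sched/core.c, which holds
 * sched_domains_mutex and passes the active CPUs, roughly:
 */
void __init example_sched_init_smp_excerpt(void)
{
	mutex_lock(&sched_domains_mutex);
	sched_init_domains(cpu_active_mask);
	mutex_unlock(&sched_domains_mutex);
}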
    2200             : /*
    2201             :  * Detach sched domains from a group of CPUs specified in cpu_map
    2202             :  * Detach sched domains from a group of CPUs specified in cpu_map.
    2203             :  * These CPUs will now be attached to the NULL domain.
    2204           0 : static void detach_destroy_domains(const struct cpumask *cpu_map)
    2205             : {
    2206           0 :         unsigned int cpu = cpumask_any(cpu_map);
    2207           0 :         int i;
    2208             : 
    2209           0 :         if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
    2210           0 :                 static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
    2211             : 
    2212           0 :         rcu_read_lock();
    2213           0 :         for_each_cpu(i, cpu_map)
    2214           0 :                 cpu_attach_domain(NULL, &def_root_domain, i);
    2215           0 :         rcu_read_unlock();
    2216           0 : }
    2217             : 
    2218             : /* Handle NULL as "default" */
    2219           0 : static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
    2220             :                         struct sched_domain_attr *new, int idx_new)
    2221             : {
    2222           0 :         struct sched_domain_attr tmp;
    2223             : 
    2224             :         /* Fast path: */
    2225           0 :         if (!new && !cur)
    2226             :                 return 1;
    2227             : 
    2228           0 :         tmp = SD_ATTR_INIT;
    2229             : 
    2230           0 :         return !memcmp(cur ? (cur + idx_cur) : &tmp,
    2231           0 :                         new ? (new + idx_new) : &tmp,
    2232             :                         sizeof(struct sched_domain_attr));
    2233             : }
    2234             : 
    2235             : /*
    2236             :  * Partition sched domains as specified by the 'ndoms_new' cpumasks in
    2237             :  * the array doms_new[]. This compares
    2238             :  * doms_new[] to the current sched domain partitioning, doms_cur[].
    2239             :  * It destroys each deleted domain and builds each new domain.
    2240             :  *
    2241             :  * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
    2242             :  * The masks don't intersect (don't overlap). We should set up one
    2243             :  * sched domain for each mask. CPUs not in any of the cpumasks will
    2244             :  * not be load balanced. If the same cpumask appears both in the
    2245             :  * current 'doms_cur' domains and in the new 'doms_new', we can leave
    2246             :  * it as it is.
    2247             :  *
    2248             :  * The passed-in 'doms_new' should be allocated using alloc_sched_domains().
    2249             :  * This routine takes ownership of it and will free_sched_domains() it when
    2250             :  * done (see the illustrative sketch after partition_sched_domains() below).
    2251             :  * If the caller's allocation failed, it can pass in doms_new == NULL &&
    2252             :  * ndoms_new == 1, and partition_sched_domains() will fall back to the single
    2253             :  * partition 'fallback_doms'; this also forces the domains to be rebuilt.
    2254             :  *
    2255             :  * If doms_new == NULL it will be replaced with a single domain covering the active housekeeping CPUs.
    2256             :  * ndoms_new == 0 is a special case for destroying existing domains,
    2257             :  * and it will not create the default domain.
    2258             :  *
    2259             :  * Call with hotplug lock and sched_domains_mutex held
    2260             :  */
    2261           0 : void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
    2262             :                                     struct sched_domain_attr *dattr_new)
    2263             : {
    2264           0 :         bool __maybe_unused has_eas = false;
    2265           0 :         int i, j, n;
    2266           0 :         int new_topology;
    2267             : 
    2268           0 :         lockdep_assert_held(&sched_domains_mutex);
    2269             : 
    2270             :         /* Always unregister in case we don't destroy any domains: */
    2271           0 :         unregister_sched_domain_sysctl();
    2272             : 
    2273             :         /* Let the architecture update CPU core mappings: */
    2274           0 :         new_topology = arch_update_cpu_topology();
    2275             : 
    2276           0 :         if (!doms_new) {
    2277           0 :                 WARN_ON_ONCE(dattr_new);
    2278           0 :                 n = 0;
    2279           0 :                 doms_new = alloc_sched_domains(1);
    2280           0 :                 if (doms_new) {
    2281           0 :                         n = 1;
    2282           0 :                         cpumask_and(doms_new[0], cpu_active_mask,
    2283             :                                     housekeeping_cpumask(HK_FLAG_DOMAIN));
    2284             :                 }
    2285             :         } else {
    2286             :                 n = ndoms_new;
    2287             :         }
    2288             : 
    2289             :         /* Destroy deleted domains: */
    2290           0 :         for (i = 0; i < ndoms_cur; i++) {
    2291           0 :                 for (j = 0; j < n && !new_topology; j++) {
    2292           0 :                         if (cpumask_equal(doms_cur[i], doms_new[j]) &&
    2293           0 :                             dattrs_equal(dattr_cur, i, dattr_new, j)) {
    2294           0 :                                 struct root_domain *rd;
    2295             : 
    2296             :                                 /*
    2297             :                                  * This domain won't be destroyed and as such
    2298             :                                  * its dl_bw->total_bw needs to be cleared.  It
    2299             :                                  * will be recomputed in function
    2300             :                                  * update_tasks_root_domain().
    2301             :                                  */
    2302           0 :                                 rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
    2303           0 :                                 dl_clear_root_domain(rd);
    2304           0 :                                 goto match1;
    2305             :                         }
    2306             :                 }
    2307             :                 /* No match - a current sched domain not in new doms_new[] */
    2308           0 :                 detach_destroy_domains(doms_cur[i]);
    2309           0 : match1:
    2310           0 :                 ;
    2311             :         }
    2312             : 
    2313           0 :         n = ndoms_cur;
    2314           0 :         if (!doms_new) {
    2315           0 :                 n = 0;
    2316           0 :                 doms_new = &fallback_doms;
    2317           0 :                 cpumask_and(doms_new[0], cpu_active_mask,
    2318             :                             housekeeping_cpumask(HK_FLAG_DOMAIN));
    2319             :         }
    2320             : 
    2321             :         /* Build new domains: */
    2322           0 :         for (i = 0; i < ndoms_new; i++) {
    2323           0 :                 for (j = 0; j < n && !new_topology; j++) {
    2324           0 :                         if (cpumask_equal(doms_new[i], doms_cur[j]) &&
    2325           0 :                             dattrs_equal(dattr_new, i, dattr_cur, j))
    2326           0 :                                 goto match2;
    2327             :                 }
    2328             :                 /* No match - add a new doms_new */
    2329           0 :                 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
    2330           0 : match2:
    2331           0 :                 ;
    2332             :         }
    2333             : 
    2334             : #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
    2335             :         /* Build perf. domains: */
    2336             :         for (i = 0; i < ndoms_new; i++) {
    2337             :                 for (j = 0; j < n && !sched_energy_update; j++) {
    2338             :                         if (cpumask_equal(doms_new[i], doms_cur[j]) &&
    2339             :                             cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
    2340             :                                 has_eas = true;
    2341             :                                 goto match3;
    2342             :                         }
    2343             :                 }
    2344             :                 /* No match - add perf. domains for a new rd */
    2345             :                 has_eas |= build_perf_domains(doms_new[i]);
    2346             : match3:
    2347             :                 ;
    2348             :         }
    2349             :         sched_energy_set(has_eas);
    2350             : #endif
    2351             : 
    2352             :         /* Remember the new sched domains: */
    2353           0 :         if (doms_cur != &fallback_doms)
    2354           0 :                 free_sched_domains(doms_cur, ndoms_cur);
    2355             : 
    2356           0 :         kfree(dattr_cur);
    2357           0 :         doms_cur = doms_new;
    2358           0 :         dattr_cur = dattr_new;
    2359           0 :         ndoms_cur = ndoms_new;
    2360             : 
    2361           0 :         register_sched_domain_sysctl();
    2362           0 : }
    2363             : 
    2364             : /*
    2365             :  * Call with hotplug lock held
    2366             :  */
    2367           0 : void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
    2368             :                              struct sched_domain_attr *dattr_new)
    2369             : {
    2370           0 :         mutex_lock(&sched_domains_mutex);
    2371           0 :         partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
    2372           0 :         mutex_unlock(&sched_domains_mutex);
    2373           0 : }
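/*
 * Illustrative sketch (assumed caller, loosely modelled on the cpuset-style
 * usage the comment above partition_sched_domains_locked() describes): build
 * a new two-partition layout and hand it over.  partition_sched_domains()
 * takes ownership of 'doms' (and of the previous doms_cur[]), so the caller
 * must not free it.  On allocation failure the documented fallback is to
 * pass doms_new == NULL and ndoms_new == 1.  example_part0/example_part1
 * are placeholder cpumasks for this example.
 */
static void example_repartition(const struct cpumask *example_part0,
				const struct cpumask *example_part1)
{
	cpumask_var_t *doms = alloc_sched_domains(2);

	if (doms) {
		cpumask_copy(doms[0], example_part0);
		cpumask_copy(doms[1], example_part1);
	}

	cpus_read_lock();	/* hotplug lock, as required by the comment above */
	if (doms)
		partition_sched_domains(2, doms, NULL);
	else
		partition_sched_domains(1, NULL, NULL);	/* fall back to fallback_doms */
	cpus_read_unlock();
}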

Generated by: LCOV version 1.14