// SPDX-License-Identifier: GPL-2.0-only
/*
 * kernel/sched/cpupri.c
 *
 * CPU priority management
 *
 * Copyright (C) 2007-2008 Novell
 *
 * Author: Gregory Haskins <ghaskins@novell.com>
 *
 * This code tracks the priority of each CPU so that global migration
 * decisions are easy to calculate.  Each CPU can be in a state as follows:
 *
 *             (INVALID), NORMAL, RT1, ... RT99, HIGHER
 *
 * going from the lowest priority to the highest.  CPUs in the INVALID state
 * are not eligible for routing.  The system maintains this state with
 * a 2 dimensional bitmap (the first for priority class, the second for CPUs
 * in that class).  Therefore a typical application without affinity
 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
 * searches).  For tasks with affinity restrictions, the algorithm has a
 * worst case complexity of O(min(101, nr_domcpus)), though the scenario that
 * yields the worst case search is fairly contrived.
 */
#include "sched.h"
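
/*
 * For reference (illustrative note, not part of the original file): the
 * backing structures are declared in cpupri.h, pulled in via sched.h, and
 * look roughly like
 *
 *	struct cpupri_vec { atomic_t count; cpumask_var_t mask; };
 *	struct cpupri     { struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
 *			    int *cpu_to_pri; };
 *
 * pri_to_cpu[] maps each priority class to the set of CPUs currently
 * running at that priority, and cpu_to_pri[] records each CPU's current
 * class.
 */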

/*
 * p->rt_priority   p->prio     newpri    cpupri
 *
 *                               -1          -1 (CPUPRI_INVALID)
 *
 *                               99           0 (CPUPRI_NORMAL)
 *
 *        1            98        98           1
 *      ...
 *       49            50        50          49
 *       50            49        49          50
 *      ...
 *       99             0         0          99
 *
 *                              100         100 (CPUPRI_HIGHER)
 */
static int convert_prio(int prio)
{
	int cpupri;

	switch (prio) {
	case CPUPRI_INVALID:
		cpupri = CPUPRI_INVALID;	/* -1 */
		break;

	case 0 ... 98:
		cpupri = MAX_RT_PRIO-1 - prio;	/* 1 ... 99 */
		break;

	case MAX_RT_PRIO-1:
		cpupri = CPUPRI_NORMAL;		/* 0 */
		break;

	case MAX_RT_PRIO:
		cpupri = CPUPRI_HIGHER;		/* 100 */
		break;
	}

	return cpupri;
}
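
/*
 * Worked example (illustrative note, not part of the original file): a
 * SCHED_FIFO task with p->rt_priority == 50 has p->prio == 49, so
 * convert_prio(49) returns MAX_RT_PRIO-1 - 49 == 50 and the CPU running it
 * is tracked in cp->pri_to_cpu[50], matching the "50 49 49 50" row of the
 * table above.
 */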

static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
				struct cpumask *lowest_mask, int idx)
{
	struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
	int skip = 0;

	if (!atomic_read(&(vec)->count))
		skip = 1;
	/*
	 * When looking at the vector, we need to read the counter,
	 * do a memory barrier, then read the mask.
	 *
	 * Note: This is still all racy, but we can deal with it.
	 * Ideally, we only want to look at masks that are set.
	 *
	 * If a mask is not set, then the only thing wrong is that we
	 * did a little more work than necessary.
	 *
	 * If we read a zero count but the mask is set, because of the
	 * memory barriers, that can only happen when the highest prio
	 * task for a run queue has left the run queue, in which case,
	 * it will be followed by a pull.  If the task we are processing
	 * fails to find a proper place to go, that pull request will
	 * pull this task if the run queue is running at a lower
	 * priority.
	 */
	smp_rmb();

	/* Need to do the rmb for every iteration */
	if (skip)
		return 0;

	if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids)
		return 0;

	if (lowest_mask) {
		cpumask_and(lowest_mask, &p->cpus_mask, vec->mask);

		/*
		 * We have to ensure that we have at least one bit
		 * still set in the array, since the map could have
		 * been concurrently emptied between the first and
		 * second reads of vec->mask.  If we hit this
		 * condition, simply act as though we never hit this
		 * priority level and continue on.
		 */
		if (cpumask_empty(lowest_mask))
			return 0;
	}

	return 1;
}
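
/*
 * Ordering sketch (illustrative note, not part of the original file):
 * __cpupri_find() is the reader side that pairs with the writer side in
 * cpupri_set() below, roughly:
 *
 *	cpupri_set() (writer)			__cpupri_find() (reader)
 *	cpumask_set_cpu(cpu, vec->mask);	atomic_read(&vec->count);
 *	smp_mb__before_atomic();		smp_rmb();
 *	atomic_inc(&vec->count);		cpumask_any_and(..., vec->mask);
 *
 * so a reader that observes the incremented count also observes the CPU
 * newly set in vec->mask; the remaining races are benign, as the comments
 * above and in cpupri_set() explain.
 */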

int cpupri_find(struct cpupri *cp, struct task_struct *p,
		struct cpumask *lowest_mask)
{
	return cpupri_find_fitness(cp, p, lowest_mask, NULL);
}

/**
 * cpupri_find_fitness - find the best (lowest-pri) CPU in the system
 * @cp: The cpupri context
 * @p: The task
 * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
 * @fitness_fn: A pointer to a function that does custom checks of whether
 *   the CPU fits specific criteria, so that we only return CPUs that fit.
 *
 * Note: This function returns the recommended CPUs as calculated during the
 * current invocation.  By the time the call returns, the CPUs may have in
 * fact changed priorities any number of times.  While not ideal, it is not
 * an issue of correctness since the normal rebalancer logic will correct
 * any discrepancies created by racing against the uncertainty of the current
 * priority configuration.
 *
 * Return: (int)bool - CPUs were found
 */
int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
			struct cpumask *lowest_mask,
			bool (*fitness_fn)(struct task_struct *p, int cpu))
{
	int task_pri = convert_prio(p->prio);
	int idx, cpu;

	BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);

	for (idx = 0; idx < task_pri; idx++) {

		if (!__cpupri_find(cp, p, lowest_mask, idx))
			continue;

		if (!lowest_mask || !fitness_fn)
			return 1;

		/* Ensure the capacity of the CPUs fits the task */
		for_each_cpu(cpu, lowest_mask) {
			if (!fitness_fn(p, cpu))
				cpumask_clear_cpu(cpu, lowest_mask);
		}

		/*
		 * If no CPU at the current priority can fit the task,
		 * continue looking.
		 */
		if (cpumask_empty(lowest_mask))
			continue;

		return 1;
	}

	/*
	 * If we failed to find a fitting lowest_mask, kick off a new search
	 * but without taking into account any fitness criteria this time.
	 *
	 * This rule favours honouring priority over fitting the task on the
	 * right CPU (Capacity Awareness being the only user now).
	 * The idea is that if a higher priority task can run, then it should
	 * run even if this ends up being on an unfitting CPU.
	 *
	 * The cost of this trade-off is not entirely clear and will probably
	 * be good for some workloads and bad for others.
	 *
	 * The main idea here is that if some CPUs were overcommitted, we try
	 * to spread the load, which is what the scheduler traditionally did.
	 * Sys admins must do proper RT planning to avoid overloading the
	 * system if they really care.
	 */
	if (fitness_fn)
		return cpupri_find(cp, p, lowest_mask);

	return 0;
}
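
/*
 * Illustrative sketch (not part of the original file): a caller can supply
 * a fitness callback to filter the lowest-priority CPUs, e.g. to implement
 * capacity awareness.  The names example_fits_cpu() and example_find_target()
 * below are assumptions made up for illustration only; the real in-tree user
 * of the fitness hook lives in kernel/sched/rt.c.
 */
static bool __maybe_unused example_fits_cpu(struct task_struct *p, int cpu)
{
	/* Toy criterion: only accept CPUs that are not capacity-reduced. */
	return arch_scale_cpu_capacity(cpu) == SCHED_CAPACITY_SCALE;
}

static int __maybe_unused example_find_target(struct cpupri *cp,
					      struct task_struct *p,
					      struct cpumask *lowest_mask)
{
	/*
	 * Prefer CPUs running at a lower priority than p that also satisfy
	 * the fitness criterion; cpupri_find_fitness() itself falls back to
	 * a priority-only search if no fitting CPU is found.
	 */
	return cpupri_find_fitness(cp, p, lowest_mask, example_fits_cpu);
}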

/**
 * cpupri_set - update the CPU priority setting
 * @cp: The cpupri context
 * @cpu: The target CPU
 * @newpri: The priority (INVALID, NORMAL, RT1-RT99, HIGHER) to assign to this CPU
 *
 * Note: Assumes cpu_rq(cpu)->lock is locked
 *
 * Returns: (void)
 */
void cpupri_set(struct cpupri *cp, int cpu, int newpri)
{
	int *currpri = &cp->cpu_to_pri[cpu];
	int oldpri = *currpri;
	int do_mb = 0;

	newpri = convert_prio(newpri);

	BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);

	if (newpri == oldpri)
		return;

	/*
	 * If the CPU was currently mapped to a different value, we
	 * need to map it to the new value, then remove the old value.
	 * Note, we must add the new value first, otherwise we risk the
	 * cpu being missed by the priority loop in cpupri_find.
	 */
	if (likely(newpri != CPUPRI_INVALID)) {
		struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];

		cpumask_set_cpu(cpu, vec->mask);
		/*
		 * When adding a new vector, we update the mask first,
		 * do a write memory barrier, and then update the count, to
		 * make sure the vector is visible when count is set.
		 */
		smp_mb__before_atomic();
		atomic_inc(&(vec)->count);
		do_mb = 1;
	}
	if (likely(oldpri != CPUPRI_INVALID)) {
		struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];

		/*
		 * Because the order of modification of the vec->count
		 * is important, we must make sure that the update
		 * of the new prio is seen before we decrement the
		 * old prio.  This makes sure that the loop sees
		 * one or the other when we raise the priority of
		 * the run queue.  We don't care about when we lower the
		 * priority, as that will trigger an rt pull anyway.
		 *
		 * We only need to do a memory barrier if we updated
		 * the new priority vec.
		 */
		if (do_mb)
			smp_mb__after_atomic();

		/*
		 * When removing from the vector, we decrement the counter
		 * first, do a memory barrier, and then clear the mask.
		 */
		atomic_dec(&(vec)->count);
		smp_mb__after_atomic();
		cpumask_clear_cpu(cpu, vec->mask);
	}

	*currpri = newpri;
}
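
/*
 * Illustrative sketch (not part of the original file): the RT scheduler is
 * the typical caller, updating a CPU's entry when the highest runnable RT
 * priority on its runqueue changes and invalidating it when the runqueue
 * leaves the root domain.  example_publish_prio() below is an assumption
 * made up for illustration; the real call sites live in kernel/sched/rt.c.
 */
static void __maybe_unused example_publish_prio(struct rq *rq, bool online)
{
	if (online)
		/* Publish the highest runnable RT priority of this runqueue. */
		cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
	else
		/* Stop routing RT tasks to this CPU. */
		cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
}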

/**
 * cpupri_init - initialize the cpupri structure
 * @cp: The cpupri context
 *
 * Return: 0 on success, or -ENOMEM on memory allocation failure.
 */
int cpupri_init(struct cpupri *cp)
{
	int i;

	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
		struct cpupri_vec *vec = &cp->pri_to_cpu[i];

		atomic_set(&vec->count, 0);
		if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
			goto cleanup;
	}

	cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL);
	if (!cp->cpu_to_pri)
		goto cleanup;

	for_each_possible_cpu(i)
		cp->cpu_to_pri[i] = CPUPRI_INVALID;

	return 0;

cleanup:
	for (i--; i >= 0; i--)
		free_cpumask_var(cp->pri_to_cpu[i].mask);
	return -ENOMEM;
}

/**
 * cpupri_cleanup - clean up the cpupri structure
 * @cp: The cpupri context
 */
void cpupri_cleanup(struct cpupri *cp)
{
	int i;

	kfree(cp->cpu_to_pri);
	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
		free_cpumask_var(cp->pri_to_cpu[i].mask);
}
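
/*
 * Illustrative sketch (not part of the original file): a struct cpupri is
 * embedded in each root domain, with cpupri_init() and cpupri_cleanup()
 * bracketing the domain's lifetime; the real callers are in the root-domain
 * setup code in kernel/sched/topology.c.  example_rd_lifetime() below is an
 * assumption made up only to show the pairing.
 */
static int __maybe_unused example_rd_lifetime(struct root_domain *rd)
{
	if (cpupri_init(&rd->cpupri))
		return -ENOMEM;

	/* ... cpupri_set()/cpupri_find() are used while the domain is live ... */

	cpupri_cleanup(&rd->cpupri);
	return 0;
}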