// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2016 Thomas Gleixner.
 * Copyright (C) 2016-2017 Christoph Hellwig.
 */
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/sort.h>

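/*
 * irq_spread_init_one - move @cpus_per_vec CPUs from @nmsk into @irqmsk
 *
 * CPUs are taken from @nmsk starting at its first set bit, and the
 * hyperthread siblings of each picked CPU are preferred before moving
 * on.  For example (illustrative topology): if @nmsk contains CPUs 0-3,
 * CPUs 0/2 and 1/3 are sibling pairs and @cpus_per_vec is 2, the vector
 * ends up with CPUs 0 and 2 rather than 0 and 1.
 */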
static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
                                unsigned int cpus_per_vec)
{
        const struct cpumask *siblmsk;
        int cpu, sibl;

        for ( ; cpus_per_vec > 0; ) {
                cpu = cpumask_first(nmsk);

                /* Should not happen, but I'm too lazy to think about it */
                if (cpu >= nr_cpu_ids)
                        return;

                cpumask_clear_cpu(cpu, nmsk);
                cpumask_set_cpu(cpu, irqmsk);
                cpus_per_vec--;

                /* If the cpu has siblings, use them first */
                siblmsk = topology_sibling_cpumask(cpu);
                for (sibl = -1; cpus_per_vec > 0; ) {
                        sibl = cpumask_next(sibl, siblmsk);
                        if (sibl >= nr_cpu_ids)
                                break;
                        if (!cpumask_test_and_clear_cpu(sibl, nmsk))
                                continue;
                        cpumask_set_cpu(sibl, irqmsk);
                        cpus_per_vec--;
                }
        }
}

static cpumask_var_t *alloc_node_to_cpumask(void)
{
        cpumask_var_t *masks;
        int node;

        masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL);
        if (!masks)
                return NULL;

        for (node = 0; node < nr_node_ids; node++) {
                if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL))
                        goto out_unwind;
        }

        return masks;

out_unwind:
        while (--node >= 0)
                free_cpumask_var(masks[node]);
        kfree(masks);
        return NULL;
}

static void free_node_to_cpumask(cpumask_var_t *masks)
{
        int node;

        for (node = 0; node < nr_node_ids; node++)
                free_cpumask_var(masks[node]);
        kfree(masks);
}

static void build_node_to_cpumask(cpumask_var_t *masks)
{
        int cpu;

        for_each_possible_cpu(cpu)
                cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
}

static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
                                const struct cpumask *mask, nodemask_t *nodemsk)
{
        int n, nodes = 0;

        /* Calculate the number of nodes in the supplied affinity mask */
        for_each_node(n) {
                if (cpumask_intersects(mask, node_to_cpumask[n])) {
                        node_set(n, *nodemsk);
                        nodes++;
                }
        }
        return nodes;
}

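/*
 * Per-node bookkeeping for the spreading algorithm.  The union is used
 * in two phases: alloc_nodes_vectors() first stores the node's active
 * CPU count in @ncpus (UINT_MAX marks a node with no active CPUs) and
 * later overwrites it with the vector count assigned to the node in
 * @nvectors.
 */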
struct node_vectors {
        unsigned id;

        union {
                unsigned nvectors;
                unsigned ncpus;
        };
};

static int ncpus_cmp_func(const void *l, const void *r)
{
        const struct node_vectors *ln = l;
        const struct node_vectors *rn = r;

        return ln->ncpus - rn->ncpus;
}

/*
 * Allocate a vector count for each node, so that for each node:
 *
 * 1) the allocated number is >= 1
 *
 * 2) the allocated number is <= the number of active CPUs of this node
 *
 * The actual total of allocated vectors may be less than @numvecs when
 * the total number of active CPUs is less than @numvecs.
 *
 * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
 * for each node.
 */
static void alloc_nodes_vectors(unsigned int numvecs,
                                cpumask_var_t *node_to_cpumask,
                                const struct cpumask *cpu_mask,
                                const nodemask_t nodemsk,
                                struct cpumask *nmsk,
                                struct node_vectors *node_vectors)
{
        unsigned n, remaining_ncpus = 0;

        for (n = 0; n < nr_node_ids; n++) {
                node_vectors[n].id = n;
                node_vectors[n].ncpus = UINT_MAX;
        }

        for_each_node_mask(n, nodemsk) {
                unsigned ncpus;

                cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
                ncpus = cpumask_weight(nmsk);

                if (!ncpus)
                        continue;
                remaining_ncpus += ncpus;
                node_vectors[n].ncpus = ncpus;
        }

        numvecs = min_t(unsigned, remaining_ncpus, numvecs);

        sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]),
             ncpus_cmp_func, NULL);

        /*
         * Allocate vectors for each node according to the ratio of this
         * node's nr_cpus to the remaining un-assigned ncpus. 'numvecs' is
         * bigger than the number of active numa nodes. Always start the
         * allocation from the node with the smallest nr_cpus.
         *
         * This guarantees that each active node gets allocated at least
         * one vector, and the reasoning is simple: over-allocation can
         * only happen when a node is assigned exactly one vector, so the
         * other nodes will still be allocated >= 1 vector, since 'numvecs'
         * is bigger than the number of numa nodes.
         *
         * One nice invariant is that the number of allocated vectors for
         * each node is <= the CPU count of this node:
         *
         * 1) suppose there are two nodes: A and B
         *      ncpu(X) is the CPU count of node X
         *      vecs(X) is the vector count allocated to node X via this
         *      algorithm
         *
         *      ncpu(A) <= ncpu(B)
         *      ncpu(A) + ncpu(B) = N
         *      vecs(A) + vecs(B) = V
         *
         *      vecs(A) = max(1, round_down(V * ncpu(A) / N))
         *      vecs(B) = V - vecs(A)
         *
         *      both N and V are integers, and 2 <= V <= N, suppose
         *      V = N - delta, and 0 <= delta <= N - 2
         *
         * 2) obviously vecs(A) <= ncpu(A) because:
         *
         *      if vecs(A) is 1, then vecs(A) <= ncpu(A) given
         *      ncpu(A) >= 1
         *
         *      otherwise,
         *              vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N
         *
         * 3) prove that vecs(B) <= ncpu(B):
         *
         *      if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be
         *      over-allocated, so vecs(B) <= ncpu(B),
         *
         *      otherwise:
         *
         *      vecs(A) =
         *              round_down(V * ncpu(A) / N) =
         *              round_down((N - delta) * ncpu(A) / N) =
         *              round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
         *              round_down((N * ncpu(A) - delta * N) / N) =
         *              ncpu(A) - delta
         *
         *      then:
         *
         *      vecs(A) - V >= ncpu(A) - delta - V
         *      =>
         *      V - vecs(A) <= V + delta - ncpu(A)
         *      =>
         *      vecs(B) <= N - ncpu(A)
         *      =>
         *      vecs(B) <= ncpu(B)
         *
         * For three or more nodes, the problem can be viewed as one node
         * versus the rest combined into a single big node, since that is
         * exactly how this algorithm proceeds: 'remaining_ncpus' and
         * 'numvecs' are re-calculated after every node, so finally for
         * each node X: vecs(X) <= ncpu(X).
         *
         */
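        /*
         * Worked example (illustrative numbers): numvecs = 8 and three
         * active nodes with 2, 4 and 10 CPUs, so remaining_ncpus = 16.
         * After sorting by ncpus the loop below assigns:
         *
         *      node with  2 CPUs: max(1, 8 *  2 / 16) = 1 vector,
         *              then remaining_ncpus = 14, numvecs = 7
         *      node with  4 CPUs: max(1, 7 *  4 / 14) = 2 vectors,
         *              then remaining_ncpus = 10, numvecs = 5
         *      node with 10 CPUs: max(1, 5 * 10 / 10) = 5 vectors
         *
         * Every node gets at least one vector and never more than its
         * CPU count.
         */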
        for (n = 0; n < nr_node_ids; n++) {
                unsigned nvectors, ncpus;

                if (node_vectors[n].ncpus == UINT_MAX)
                        continue;

                WARN_ON_ONCE(numvecs == 0);

                ncpus = node_vectors[n].ncpus;
                nvectors = max_t(unsigned, 1,
                                 numvecs * ncpus / remaining_ncpus);
                WARN_ON_ONCE(nvectors > ncpus);

                node_vectors[n].nvectors = nvectors;

                remaining_ncpus -= ncpus;
                numvecs -= nvectors;
        }
}

static int __irq_build_affinity_masks(unsigned int startvec,
                                      unsigned int numvecs,
                                      unsigned int firstvec,
                                      cpumask_var_t *node_to_cpumask,
                                      const struct cpumask *cpu_mask,
                                      struct cpumask *nmsk,
                                      struct irq_affinity_desc *masks)
{
        unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0;
        unsigned int last_affv = firstvec + numvecs;
        unsigned int curvec = startvec;
        nodemask_t nodemsk = NODE_MASK_NONE;
        struct node_vectors *node_vectors;

        if (!cpumask_weight(cpu_mask))
                return 0;

        nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);

        /*
         * If the number of nodes in the mask is greater than or equal to
         * the number of vectors, just spread the vectors across the nodes.
         */
        if (numvecs <= nodes) {
                for_each_node_mask(n, nodemsk) {
                        cpumask_or(&masks[curvec].mask, &masks[curvec].mask,
                                   node_to_cpumask[n]);
                        if (++curvec == last_affv)
                                curvec = firstvec;
                }
                return numvecs;
        }

        node_vectors = kcalloc(nr_node_ids,
                               sizeof(struct node_vectors),
                               GFP_KERNEL);
        if (!node_vectors)
                return -ENOMEM;

        /* allocate vector number for each node */
        alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask,
                            nodemsk, nmsk, node_vectors);

        for (i = 0; i < nr_node_ids; i++) {
                unsigned int ncpus, v;
                struct node_vectors *nv = &node_vectors[i];

                if (nv->nvectors == UINT_MAX)
                        continue;

                /* Get the cpus on this node which are in the mask */
                cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
                ncpus = cpumask_weight(nmsk);
                if (!ncpus)
                        continue;

                WARN_ON_ONCE(nv->nvectors > ncpus);

                /* Account for rounding errors */
                extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors);
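                /*
                 * E.g. ncpus = 5 and nv->nvectors = 2 gives
                 * extra_vecs = 5 - 2 * (5 / 2) = 1, so the first vector
                 * below is sized 3 CPUs and the second one 2 CPUs.
                 */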

                /* Spread allocated vectors on CPUs of the current node */
                for (v = 0; v < nv->nvectors; v++, curvec++) {
                        cpus_per_vec = ncpus / nv->nvectors;

                        /* Account for extra vectors to compensate rounding errors */
                        if (extra_vecs) {
                                cpus_per_vec++;
                                --extra_vecs;
                        }

                        /*
                         * wrapping has to be considered given 'startvec'
                         * may start anywhere
                         */
                        if (curvec >= last_affv)
                                curvec = firstvec;
                        irq_spread_init_one(&masks[curvec].mask, nmsk,
                                            cpus_per_vec);
                }
                done += nv->nvectors;
        }
        kfree(node_vectors);
        return done;
}

/*
 * build affinity in two stages:
 *	1) spread present CPUs on these vectors
 *	2) spread other possible CPUs on these vectors
 */
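/*
 * Covering the possible but not present CPUs as well means the spread
 * does not have to be redone when CPUs come online later: every vector
 * already has such CPUs in its mask.
 */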
static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
                                    unsigned int firstvec,
                                    struct irq_affinity_desc *masks)
{
        unsigned int curvec = startvec, nr_present = 0, nr_others = 0;
        cpumask_var_t *node_to_cpumask;
        cpumask_var_t nmsk, npresmsk;
        int ret = -ENOMEM;

        if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
                return ret;

        if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
                goto fail_nmsk;

        node_to_cpumask = alloc_node_to_cpumask();
        if (!node_to_cpumask)
                goto fail_npresmsk;

        /* Stabilize the cpumasks */
        get_online_cpus();
        build_node_to_cpumask(node_to_cpumask);

        /* Spread on present CPUs starting from affd->pre_vectors */
        ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
                                         node_to_cpumask, cpu_present_mask,
                                         nmsk, masks);
        if (ret < 0)
                goto fail_build_affinity;
        nr_present = ret;

        /*
         * Spread on non present CPUs starting from the next vector to be
         * handled. If the spreading of present CPUs already exhausted the
         * vector space, assign the non present CPUs to the already spread
         * out vectors.
         */
        if (nr_present >= numvecs)
                curvec = firstvec;
        else
                curvec = firstvec + nr_present;
        cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
        ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
                                         node_to_cpumask, npresmsk, nmsk,
                                         masks);
        if (ret >= 0)
                nr_others = ret;

 fail_build_affinity:
        put_online_cpus();

        if (ret >= 0)
                WARN_ON(nr_present + nr_others < numvecs);

        free_node_to_cpumask(node_to_cpumask);

 fail_npresmsk:
        free_cpumask_var(npresmsk);

 fail_nmsk:
        free_cpumask_var(nmsk);
        return ret < 0 ? ret : 0;
}

static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
{
        affd->nr_sets = 1;
        affd->set_size[0] = affvecs;
}

/**
 * irq_create_affinity_masks - Create affinity masks for multiqueue spreading
 * @nvecs:	The total number of vectors
 * @affd:	Description of the affinity requirements
 *
 * Returns the irq_affinity_desc pointer or NULL if allocation failed.
 */
struct irq_affinity_desc *
irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
{
        unsigned int affvecs, curvec, usedvecs, i;
        struct irq_affinity_desc *masks = NULL;

        /*
         * Determine the number of vectors which need interrupt affinities
         * assigned. If the pre/post request exhausts the available vectors
         * then there is nothing to do here except invoking the calc_sets()
         * callback so the device driver can adjust to the situation.
         */
        if (nvecs > affd->pre_vectors + affd->post_vectors)
                affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
        else
                affvecs = 0;

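        /*
         * Example with illustrative numbers: nvecs = 10, affd->pre_vectors
         * = 1 and affd->post_vectors = 1 leave affvecs = 8.  Vector 0 and
         * vector 9 keep the default affinity below, while vectors 1-8 are
         * spread over the CPUs and marked as managed.
         */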
        /*
         * Simple invocations do not provide a calc_sets() callback. Install
         * the generic one.
         */
        if (!affd->calc_sets)
                affd->calc_sets = default_calc_sets;

        /* Recalculate the sets */
        affd->calc_sets(affd, affvecs);

        if (WARN_ON_ONCE(affd->nr_sets > IRQ_AFFINITY_MAX_SETS))
                return NULL;

        /* Nothing to assign? */
        if (!affvecs)
                return NULL;

        masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
        if (!masks)
                return NULL;

        /* Fill out vectors at the beginning that don't need affinity */
        for (curvec = 0; curvec < affd->pre_vectors; curvec++)
                cpumask_copy(&masks[curvec].mask, irq_default_affinity);

        /*
         * Spread on present CPUs starting from affd->pre_vectors. If we
         * have multiple sets, build each set's affinity mask separately.
         */
        for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
                unsigned int this_vecs = affd->set_size[i];
                int ret;

                ret = irq_build_affinity_masks(curvec, this_vecs,
                                               curvec, masks);
                if (ret) {
                        kfree(masks);
                        return NULL;
                }
                curvec += this_vecs;
                usedvecs += this_vecs;
        }

        /* Fill out vectors at the end that don't need affinity */
        if (usedvecs >= affvecs)
                curvec = affd->pre_vectors + affvecs;
        else
                curvec = affd->pre_vectors + usedvecs;
        for (; curvec < nvecs; curvec++)
                cpumask_copy(&masks[curvec].mask, irq_default_affinity);

        /* Mark the managed interrupts */
        for (i = affd->pre_vectors; i < nvecs - affd->post_vectors; i++)
                masks[i].is_managed = 1;

        return masks;
}

/**
 * irq_calc_affinity_vectors - Calculate the optimal number of vectors
 * @minvec:	The minimum number of vectors available
 * @maxvec:	The maximum number of vectors available
 * @affd:	Description of the affinity requirements
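 *
 * For example (illustrative numbers): with minvec = 2, maxvec = 16, one
 * pre_vector, no post_vectors, no calc_sets() callback and eight possible
 * CPUs, this returns 1 + min(8, 15) = 9 vectors.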
 */
unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
                                       const struct irq_affinity *affd)
{
        unsigned int resv = affd->pre_vectors + affd->post_vectors;
        unsigned int set_vecs;

        if (resv > minvec)
                return 0;

        if (affd->calc_sets) {
                set_vecs = maxvec - resv;
        } else {
                get_online_cpus();
                set_vecs = cpumask_weight(cpu_possible_mask);
                put_online_cpus();
        }

        return resv + min(set_vecs, maxvec - resv);
}