Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Block multiqueue core code
4 : *
5 : * Copyright (C) 2013-2014 Jens Axboe
6 : * Copyright (C) 2013-2014 Christoph Hellwig
7 : */
8 : #include <linux/kernel.h>
9 : #include <linux/module.h>
10 : #include <linux/backing-dev.h>
11 : #include <linux/bio.h>
12 : #include <linux/blkdev.h>
13 : #include <linux/kmemleak.h>
14 : #include <linux/mm.h>
15 : #include <linux/init.h>
16 : #include <linux/slab.h>
17 : #include <linux/workqueue.h>
18 : #include <linux/smp.h>
19 : #include <linux/llist.h>
20 : #include <linux/list_sort.h>
21 : #include <linux/cpu.h>
22 : #include <linux/cache.h>
23 : #include <linux/sched/sysctl.h>
24 : #include <linux/sched/topology.h>
25 : #include <linux/sched/signal.h>
26 : #include <linux/delay.h>
27 : #include <linux/crash_dump.h>
28 : #include <linux/prefetch.h>
29 : #include <linux/blk-crypto.h>
30 :
31 : #include <trace/events/block.h>
32 :
33 : #include <linux/blk-mq.h>
34 : #include <linux/t10-pi.h>
35 : #include "blk.h"
36 : #include "blk-mq.h"
37 : #include "blk-mq-debugfs.h"
38 : #include "blk-mq-tag.h"
39 : #include "blk-pm.h"
40 : #include "blk-stat.h"
41 : #include "blk-mq-sched.h"
42 : #include "blk-rq-qos.h"
43 :
44 : static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
45 :
46 : static void blk_mq_poll_stats_start(struct request_queue *q);
47 : static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
48 :
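 : /*
 :  * Buckets are interleaved by data direction: even buckets hold reads,
 :  * odd buckets hold writes, and each read/write pair covers one
 :  * power-of-two range of request size (ddir + 2 * ilog2(sectors)).
 :  */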
49 0 : static int blk_mq_poll_stats_bkt(const struct request *rq)
50 : {
51 0 : int ddir, sectors, bucket;
52 :
53 0 : ddir = rq_data_dir(rq);
54 0 : sectors = blk_rq_stats_sectors(rq);
55 :
56 0 : bucket = ddir + 2 * ilog2(sectors);
57 :
58 0 : if (bucket < 0)
59 : return -1;
60 0 : else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
61 0 : return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
62 :
63 : return bucket;
64 : }
65 :
66 : /*
67 : * Check if any of the ctx, dispatch list or elevator
68 : * have pending work in this hardware queue.
69 : */
70 599 : static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
71 : {
72 945 : return !list_empty_careful(&hctx->dispatch) ||
73 678 : sbitmap_any_bit_set(&hctx->ctx_map) ||
74 332 : blk_mq_sched_has_work(hctx);
75 : }
76 :
77 : /*
78 : * Mark this ctx as having pending work in this hardware queue
79 : */
80 14 : static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
81 : struct blk_mq_ctx *ctx)
82 : {
83 14 : const int bit = ctx->index_hw[hctx->type];
84 :
85 14 : if (!sbitmap_test_bit(&hctx->ctx_map, bit))
86 14 : sbitmap_set_bit(&hctx->ctx_map, bit);
87 14 : }
88 :
89 0 : static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
90 : struct blk_mq_ctx *ctx)
91 : {
92 0 : const int bit = ctx->index_hw[hctx->type];
93 :
94 0 : sbitmap_clear_bit(&hctx->ctx_map, bit);
95 0 : }
96 :
97 : struct mq_inflight {
98 : struct block_device *part;
99 : unsigned int inflight[2];
100 : };
101 :
102 0 : static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
103 : struct request *rq, void *priv,
104 : bool reserved)
105 : {
106 0 : struct mq_inflight *mi = priv;
107 :
108 0 : if ((!mi->part->bd_partno || rq->part == mi->part) &&
109 0 : blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
110 0 : mi->inflight[rq_data_dir(rq)]++;
111 :
112 0 : return true;
113 : }
114 :
115 0 : unsigned int blk_mq_in_flight(struct request_queue *q,
116 : struct block_device *part)
117 : {
118 0 : struct mq_inflight mi = { .part = part };
119 :
120 0 : blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
121 :
122 0 : return mi.inflight[0] + mi.inflight[1];
123 : }
124 :
125 0 : void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
126 : unsigned int inflight[2])
127 : {
128 0 : struct mq_inflight mi = { .part = part };
129 :
130 0 : blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
131 0 : inflight[0] = mi.inflight[0];
132 0 : inflight[1] = mi.inflight[1];
133 0 : }
134 :
135 0 : void blk_freeze_queue_start(struct request_queue *q)
136 : {
137 0 : mutex_lock(&q->mq_freeze_lock);
138 0 : if (++q->mq_freeze_depth == 1) {
139 0 : percpu_ref_kill(&q->q_usage_counter);
140 0 : mutex_unlock(&q->mq_freeze_lock);
141 0 : if (queue_is_mq(q))
142 0 : blk_mq_run_hw_queues(q, false);
143 : } else {
144 0 : mutex_unlock(&q->mq_freeze_lock);
145 : }
146 0 : }
147 : EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
148 :
149 0 : void blk_mq_freeze_queue_wait(struct request_queue *q)
150 : {
151 0 : wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
152 0 : }
153 : EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
154 :
155 0 : int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
156 : unsigned long timeout)
157 : {
158 0 : return wait_event_timeout(q->mq_freeze_wq,
159 : percpu_ref_is_zero(&q->q_usage_counter),
160 : timeout);
161 : }
162 : EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
163 :
164 : /*
165 : * Guarantee no request is in use, so we can change any data structure of
166 : * the queue afterward.
167 : */
168 0 : void blk_freeze_queue(struct request_queue *q)
169 : {
170 : /*
171 : * In the !blk_mq case we are only calling this to kill the
172 : * q_usage_counter, otherwise this increases the freeze depth
173 : * and waits for it to return to zero. For this reason there is
174 : * no blk_unfreeze_queue(), and blk_freeze_queue() is not
175 : * exported to drivers as the only user for unfreeze is blk_mq.
176 : */
177 0 : blk_freeze_queue_start(q);
178 0 : blk_mq_freeze_queue_wait(q);
179 0 : }
180 :
181 0 : void blk_mq_freeze_queue(struct request_queue *q)
182 : {
183 : /*
184 : * ...just an alias to keep freeze and unfreeze actions balanced
185 : * in the blk_mq_* namespace
186 : */
187 0 : blk_freeze_queue(q);
188 0 : }
189 : EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
190 :
191 0 : void blk_mq_unfreeze_queue(struct request_queue *q)
192 : {
193 0 : mutex_lock(&q->mq_freeze_lock);
194 0 : q->mq_freeze_depth--;
195 0 : WARN_ON_ONCE(q->mq_freeze_depth < 0);
196 0 : if (!q->mq_freeze_depth) {
197 0 : percpu_ref_resurrect(&q->q_usage_counter);
198 0 : wake_up_all(&q->mq_freeze_wq);
199 : }
200 0 : mutex_unlock(&q->mq_freeze_lock);
201 0 : }
202 : EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
203 :
204 : /*
205 : * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
206 : * mpt3sas driver such that this function can be removed.
207 : */
208 0 : void blk_mq_quiesce_queue_nowait(struct request_queue *q)
209 : {
210 0 : blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
211 0 : }
212 : EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
213 :
214 : /**
215 : * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
216 : * @q: request queue.
217 : *
218 : * Note: this function does not prevent the struct request end_io()
219 : * callback from being invoked. Once this function returns, it is
220 : * guaranteed that no dispatch can happen until the queue is unquiesced via
221 : * blk_mq_unquiesce_queue().
222 : */
223 0 : void blk_mq_quiesce_queue(struct request_queue *q)
224 : {
225 0 : struct blk_mq_hw_ctx *hctx;
226 0 : unsigned int i;
227 0 : bool rcu = false;
228 :
229 0 : blk_mq_quiesce_queue_nowait(q);
230 :
231 0 : queue_for_each_hw_ctx(q, hctx, i) {
232 0 : if (hctx->flags & BLK_MQ_F_BLOCKING)
233 0 : synchronize_srcu(hctx->srcu);
234 : else
235 : rcu = true;
236 : }
237 0 : if (rcu)
238 0 : synchronize_rcu();
239 0 : }
240 : EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
241 :
242 : /*
243 : * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
244 : * @q: request queue.
245 : *
246 : * This function recovers queue into the state before quiescing
247 : * which is done by blk_mq_quiesce_queue.
248 : */
249 0 : void blk_mq_unquiesce_queue(struct request_queue *q)
250 : {
251 0 : blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
252 :
253 : /* dispatch requests which are inserted during quiescing */
254 0 : blk_mq_run_hw_queues(q, true);
255 0 : }
256 : EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
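 : /*
 :  * Illustrative only (not part of this file): a driver would typically
 :  * bracket an update of state that ->queue_rq() depends on with the pair
 :  * above, e.g.:
 :  *
 :  *     blk_mq_quiesce_queue(q);
 :  *     ... update driver state read by ->queue_rq() ...
 :  *     blk_mq_unquiesce_queue(q);
 :  *
 :  * blk_mq_quiesce_queue() waits for ongoing dispatches to finish and
 :  * blk_mq_unquiesce_queue() reruns the hardware queues afterwards.
 :  */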
257 :
258 0 : void blk_mq_wake_waiters(struct request_queue *q)
259 : {
260 0 : struct blk_mq_hw_ctx *hctx;
261 0 : unsigned int i;
262 :
263 0 : queue_for_each_hw_ctx(q, hctx, i)
264 0 : if (blk_mq_hw_queue_mapped(hctx))
265 0 : blk_mq_tag_wakeup_all(hctx->tags, true);
266 0 : }
267 :
268 : /*
269 : * Start/end time stamping is only needed if iostat or blk stats are
270 : * enabled, or an IO scheduler is in use.
271 : */
272 6759 : static inline bool blk_mq_need_time_stamp(struct request *rq)
273 : {
274 191 : return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator;
275 : }
276 :
277 3254 : static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
278 : unsigned int tag, u64 alloc_time_ns)
279 : {
280 3254 : struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
281 3254 : struct request *rq = tags->static_rqs[tag];
282 :
283 3254 : if (data->q->elevator) {
284 0 : rq->tag = BLK_MQ_NO_TAG;
285 0 : rq->internal_tag = tag;
286 : } else {
287 3254 : rq->tag = tag;
288 3254 : rq->internal_tag = BLK_MQ_NO_TAG;
289 : }
290 :
291 : /* csd/requeue_work/fifo_time is initialized before use */
292 3254 : rq->q = data->q;
293 3254 : rq->mq_ctx = data->ctx;
294 3254 : rq->mq_hctx = data->hctx;
295 3254 : rq->rq_flags = 0;
296 3254 : rq->cmd_flags = data->cmd_flags;
297 3254 : if (data->flags & BLK_MQ_REQ_PM)
298 0 : rq->rq_flags |= RQF_PM;
299 3254 : if (blk_queue_io_stat(data->q))
300 3253 : rq->rq_flags |= RQF_IO_STAT;
301 3254 : INIT_LIST_HEAD(&rq->queuelist);
302 3254 : INIT_HLIST_NODE(&rq->hash);
303 3254 : RB_CLEAR_NODE(&rq->rb_node);
304 3254 : rq->rq_disk = NULL;
305 3254 : rq->part = NULL;
306 : #ifdef CONFIG_BLK_RQ_ALLOC_TIME
307 : rq->alloc_time_ns = alloc_time_ns;
308 : #endif
309 3254 : if (blk_mq_need_time_stamp(rq))
310 3254 : rq->start_time_ns = ktime_get_ns();
311 : else
312 0 : rq->start_time_ns = 0;
313 3253 : rq->io_start_time_ns = 0;
314 3253 : rq->stats_sectors = 0;
315 3253 : rq->nr_phys_segments = 0;
316 : #if defined(CONFIG_BLK_DEV_INTEGRITY)
317 : rq->nr_integrity_segments = 0;
318 : #endif
319 3253 : blk_crypto_rq_set_defaults(rq);
320 : /* tag was already set */
321 3253 : WRITE_ONCE(rq->deadline, 0);
322 :
323 3253 : rq->timeout = 0;
324 :
325 3253 : rq->end_io = NULL;
326 3253 : rq->end_io_data = NULL;
327 :
328 3253 : data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++;
329 3253 : refcount_set(&rq->ref, 1);
330 :
331 3253 : if (!op_is_flush(data->cmd_flags)) {
332 3121 : struct elevator_queue *e = data->q->elevator;
333 :
334 3121 : rq->elv.icq = NULL;
335 3121 : if (e && e->type->ops.prepare_request) {
336 0 : if (e->type->icq_cache)
337 0 : blk_mq_sched_assign_ioc(rq);
338 :
339 0 : e->type->ops.prepare_request(rq);
340 0 : rq->rq_flags |= RQF_ELVPRIV;
341 : }
342 : }
343 :
344 3253 : data->hctx->queued++;
345 3253 : return rq;
346 : }
347 :
348 3254 : static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
349 : {
350 3254 : struct request_queue *q = data->q;
351 3254 : struct elevator_queue *e = q->elevator;
352 3254 : u64 alloc_time_ns = 0;
353 3254 : unsigned int tag;
354 :
355 : /* alloc_time includes depth and tag waits */
356 3254 : if (blk_queue_rq_alloc_time(q))
357 : alloc_time_ns = ktime_get_ns();
358 :
359 3254 : if (data->cmd_flags & REQ_NOWAIT)
360 0 : data->flags |= BLK_MQ_REQ_NOWAIT;
361 :
362 3254 : if (e) {
363 : /*
364 : * Flush requests are special and go directly to the
365 : * dispatch list. Don't include reserved tags in the
366 : * limiting, as it isn't useful.
367 : */
368 0 : if (!op_is_flush(data->cmd_flags) &&
369 0 : e->type->ops.limit_depth &&
370 0 : !(data->flags & BLK_MQ_REQ_RESERVED))
371 0 : e->type->ops.limit_depth(data->cmd_flags, data);
372 : }
373 :
374 3254 : retry:
375 3254 : data->ctx = blk_mq_get_ctx(q);
376 3254 : data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
377 3254 : if (!e)
378 3254 : blk_mq_tag_busy(data->hctx);
379 :
380 : /*
381 : * Waiting allocations only fail because of an inactive hctx. In that
382 : * case just retry the hctx assignment and tag allocation as CPU hotplug
383 : * should have migrated us to an online CPU by now.
384 : */
385 3254 : tag = blk_mq_get_tag(data);
386 3254 : if (tag == BLK_MQ_NO_TAG) {
387 0 : if (data->flags & BLK_MQ_REQ_NOWAIT)
388 : return NULL;
389 :
390 : /*
391 : * Give up the CPU and sleep for a short time to ensure that
392 : * threads using a realtime scheduling class are migrated off
393 : * the CPU, and thus off the hctx that is going away.
394 : */
395 0 : msleep(3);
396 0 : goto retry;
397 : }
398 3254 : return blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
399 : }
400 :
401 2 : struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
402 : blk_mq_req_flags_t flags)
403 : {
404 2 : struct blk_mq_alloc_data data = {
405 : .q = q,
406 : .flags = flags,
407 : .cmd_flags = op,
408 : };
409 2 : struct request *rq;
410 2 : int ret;
411 :
412 2 : ret = blk_queue_enter(q, flags);
413 2 : if (ret)
414 0 : return ERR_PTR(ret);
415 :
416 2 : rq = __blk_mq_alloc_request(&data);
417 2 : if (!rq)
418 0 : goto out_queue_exit;
419 2 : rq->__data_len = 0;
420 2 : rq->__sector = (sector_t) -1;
421 2 : rq->bio = rq->biotail = NULL;
422 2 : return rq;
423 0 : out_queue_exit:
424 0 : blk_queue_exit(q);
425 0 : return ERR_PTR(-EWOULDBLOCK);
426 : }
427 : EXPORT_SYMBOL(blk_mq_alloc_request);
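 : /*
 :  * Illustrative only (not part of this file): a sketch of how a driver
 :  * might use the allocator above for a passthrough command. Failures are
 :  * reported as ERR_PTR() values, so IS_ERR() must be checked:
 :  *
 :  *     struct request *rq;
 :  *
 :  *     rq = blk_mq_alloc_request(q, REQ_OP_DRV_IN, BLK_MQ_REQ_NOWAIT);
 :  *     if (IS_ERR(rq))
 :  *             return PTR_ERR(rq);
 :  *     ...
 :  *     blk_mq_free_request(rq);
 :  */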
428 :
429 0 : struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
430 : unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
431 : {
432 0 : struct blk_mq_alloc_data data = {
433 : .q = q,
434 : .flags = flags,
435 : .cmd_flags = op,
436 : };
437 0 : u64 alloc_time_ns = 0;
438 0 : unsigned int cpu;
439 0 : unsigned int tag;
440 0 : int ret;
441 :
442 : /* alloc_time includes depth and tag waits */
443 0 : if (blk_queue_rq_alloc_time(q))
444 : alloc_time_ns = ktime_get_ns();
445 :
446 : /*
447 : * If the tag allocator sleeps we could get an allocation for a
448 : * different hardware context. No need to complicate the low level
449 : * allocator just for the rare use case of a command tied to
450 : * a specific queue.
451 : */
452 0 : if (WARN_ON_ONCE(!(flags & (BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED))))
453 0 : return ERR_PTR(-EINVAL);
454 :
455 0 : if (hctx_idx >= q->nr_hw_queues)
456 0 : return ERR_PTR(-EIO);
457 :
458 0 : ret = blk_queue_enter(q, flags);
459 0 : if (ret)
460 0 : return ERR_PTR(ret);
461 :
462 : /*
463 : * Check if the hardware context is actually mapped to anything.
464 : * If not tell the caller that it should skip this queue.
465 : */
466 0 : ret = -EXDEV;
467 0 : data.hctx = q->queue_hw_ctx[hctx_idx];
468 0 : if (!blk_mq_hw_queue_mapped(data.hctx))
469 0 : goto out_queue_exit;
470 0 : cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
471 0 : data.ctx = __blk_mq_get_ctx(q, cpu);
472 :
473 0 : if (!q->elevator)
474 0 : blk_mq_tag_busy(data.hctx);
475 :
476 0 : ret = -EWOULDBLOCK;
477 0 : tag = blk_mq_get_tag(&data);
478 0 : if (tag == BLK_MQ_NO_TAG)
479 0 : goto out_queue_exit;
480 0 : return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns);
481 :
482 0 : out_queue_exit:
483 0 : blk_queue_exit(q);
484 0 : return ERR_PTR(ret);
485 : }
486 : EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
487 :
488 3254 : static void __blk_mq_free_request(struct request *rq)
489 : {
490 3254 : struct request_queue *q = rq->q;
491 3254 : struct blk_mq_ctx *ctx = rq->mq_ctx;
492 3254 : struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
493 3254 : const int sched_tag = rq->internal_tag;
494 :
495 3254 : blk_crypto_free_request(rq);
496 3254 : blk_pm_mark_last_busy(rq);
497 3254 : rq->mq_hctx = NULL;
498 3254 : if (rq->tag != BLK_MQ_NO_TAG)
499 3254 : blk_mq_put_tag(hctx->tags, ctx, rq->tag);
500 3254 : if (sched_tag != BLK_MQ_NO_TAG)
501 0 : blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
502 3254 : blk_mq_sched_restart(hctx);
503 3254 : blk_queue_exit(q);
504 3254 : }
505 :
506 3254 : void blk_mq_free_request(struct request *rq)
507 : {
508 3254 : struct request_queue *q = rq->q;
509 3254 : struct elevator_queue *e = q->elevator;
510 3254 : struct blk_mq_ctx *ctx = rq->mq_ctx;
511 3254 : struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
512 :
513 3254 : if (rq->rq_flags & RQF_ELVPRIV) {
514 0 : if (e && e->type->ops.finish_request)
515 0 : e->type->ops.finish_request(rq);
516 0 : if (rq->elv.icq) {
517 0 : put_io_context(rq->elv.icq->ioc);
518 0 : rq->elv.icq = NULL;
519 : }
520 : }
521 :
522 3254 : ctx->rq_completed[rq_is_sync(rq)]++;
523 3254 : if (rq->rq_flags & RQF_MQ_INFLIGHT)
524 0 : __blk_mq_dec_active_requests(hctx);
525 :
526 3254 : if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
527 0 : laptop_io_completion(q->backing_dev_info);
528 :
529 3254 : rq_qos_done(q, rq);
530 :
531 3254 : WRITE_ONCE(rq->state, MQ_RQ_IDLE);
532 3254 : if (refcount_dec_and_test(&rq->ref))
533 3254 : __blk_mq_free_request(rq);
534 3254 : }
535 : EXPORT_SYMBOL_GPL(blk_mq_free_request);
536 :
537 3505 : inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
538 : {
539 3505 : u64 now = 0;
540 :
541 3696 : if (blk_mq_need_time_stamp(rq))
542 3314 : now = ktime_get_ns();
543 :
544 3505 : if (rq->rq_flags & RQF_STATS) {
545 0 : blk_mq_poll_stats_start(rq->q);
546 0 : blk_stat_add(rq, now);
547 : }
548 :
549 3505 : blk_mq_sched_completed_request(rq, now);
550 :
551 3505 : blk_account_io_done(rq, now);
552 :
553 3505 : if (rq->end_io) {
554 253 : rq_qos_done(rq->q, rq);
555 253 : rq->end_io(rq, error);
556 : } else {
557 3252 : blk_mq_free_request(rq);
558 : }
559 3505 : }
560 : EXPORT_SYMBOL(__blk_mq_end_request);
561 :
562 3504 : void blk_mq_end_request(struct request *rq, blk_status_t error)
563 : {
564 3504 : if (blk_update_request(rq, error, blk_rq_bytes(rq)))
565 0 : BUG();
566 3505 : __blk_mq_end_request(rq, error);
567 3505 : }
568 : EXPORT_SYMBOL(blk_mq_end_request);
569 :
570 2937 : static void blk_complete_reqs(struct llist_head *list)
571 : {
572 5874 : struct llist_node *entry = llist_reverse_order(llist_del_all(list));
573 2937 : struct request *rq, *next;
574 :
575 6310 : llist_for_each_entry_safe(rq, next, entry, ipi_list)
576 3373 : rq->q->mq_ops->complete(rq);
577 2937 : }
578 :
579 2937 : static __latent_entropy void blk_done_softirq(struct softirq_action *h)
580 : {
581 2937 : blk_complete_reqs(this_cpu_ptr(&blk_cpu_done));
582 2937 : }
583 :
584 0 : static int blk_softirq_cpu_dead(unsigned int cpu)
585 : {
586 0 : blk_complete_reqs(&per_cpu(blk_cpu_done, cpu));
587 0 : return 0;
588 : }
589 :
590 2495 : static void __blk_mq_complete_request_remote(void *data)
591 : {
592 2495 : __raise_softirq_irqoff(BLOCK_SOFTIRQ);
593 2495 : }
594 :
595 3373 : static inline bool blk_mq_complete_need_ipi(struct request *rq)
596 : {
597 3373 : int cpu = raw_smp_processor_id();
598 :
599 6746 : if (!IS_ENABLED(CONFIG_SMP) ||
600 3373 : !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
601 : return false;
602 : /*
603 : * With force threaded interrupts enabled, raising softirq from an SMP
604 : * function call will always result in waking the ksoftirqd thread.
605 : * This is probably worse than completing the request on a different
606 : * cache domain.
607 : */
608 3373 : if (force_irqthreads)
609 : return false;
610 :
611 : /* same CPU or cache domain? Complete locally */
612 3373 : if (cpu == rq->mq_ctx->cpu ||
613 5642 : (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
614 2821 : cpus_share_cache(cpu, rq->mq_ctx->cpu)))
615 552 : return false;
616 :
617 : /* don't try to IPI to an offline CPU */
618 2821 : return cpu_online(rq->mq_ctx->cpu);
619 : }
620 :
621 2821 : static void blk_mq_complete_send_ipi(struct request *rq)
622 : {
623 2821 : struct llist_head *list;
624 2821 : unsigned int cpu;
625 :
626 2821 : cpu = rq->mq_ctx->cpu;
627 2821 : list = &per_cpu(blk_cpu_done, cpu);
628 2821 : if (llist_add(&rq->ipi_list, list)) {
629 2495 : INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq);
630 2495 : smp_call_function_single_async(cpu, &rq->csd);
631 : }
632 2821 : }
633 :
634 552 : static void blk_mq_raise_softirq(struct request *rq)
635 : {
636 552 : struct llist_head *list;
637 :
638 552 : preempt_disable();
639 552 : list = this_cpu_ptr(&blk_cpu_done);
640 552 : if (llist_add(&rq->ipi_list, list))
641 442 : raise_softirq(BLOCK_SOFTIRQ);
642 552 : preempt_enable();
643 552 : }
644 :
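 : /*
 :  * Try to steer the completion of @rq back towards the submitting CPU:
 :  * either via an IPI to that CPU's per-cpu llist, or by raising
 :  * BLOCK_SOFTIRQ locally for single-hw-queue devices. Returns false if
 :  * the caller must invoke ->complete() itself.
 :  */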
645 3373 : bool blk_mq_complete_request_remote(struct request *rq)
646 : {
647 3373 : WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
648 :
649 : /*
650 : * For a polled request, always complete locally; it's pointless
651 : * to redirect the completion.
652 : */
653 3373 : if (rq->cmd_flags & REQ_HIPRI)
654 : return false;
655 :
656 3373 : if (blk_mq_complete_need_ipi(rq)) {
657 2821 : blk_mq_complete_send_ipi(rq);
658 2821 : return true;
659 : }
660 :
661 552 : if (rq->q->nr_hw_queues == 1) {
662 552 : blk_mq_raise_softirq(rq);
663 552 : return true;
664 : }
665 : return false;
666 : }
667 : EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);
668 :
669 : /**
670 : * blk_mq_complete_request - end I/O on a request
671 : * @rq: the request being processed
672 : *
673 : * Description:
674 : * Complete a request by scheduling the ->complete operation.
675 : **/
676 3373 : void blk_mq_complete_request(struct request *rq)
677 : {
678 3373 : if (!blk_mq_complete_request_remote(rq))
679 0 : rq->q->mq_ops->complete(rq);
680 3373 : }
681 : EXPORT_SYMBOL(blk_mq_complete_request);
682 :
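 : /*
 :  * hctx_lock()/hctx_unlock() provide the read-side protection that queue
 :  * quiescing synchronizes against: plain RCU for normal hctxs and SRCU
 :  * when BLK_MQ_F_BLOCKING is set (see blk_mq_quiesce_queue()).
 :  */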
683 3972 : static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
684 : __releases(hctx->srcu)
685 : {
686 3972 : if (!(hctx->flags & BLK_MQ_F_BLOCKING))
687 3972 : rcu_read_unlock();
688 : else
689 0 : srcu_read_unlock(hctx->srcu, srcu_idx);
690 3972 : }
691 :
692 3972 : static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
693 : __acquires(hctx->srcu)
694 : {
695 3972 : if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
696 : /* shut up gcc false positive */
697 3972 : *srcu_idx = 0;
698 3972 : rcu_read_lock();
699 : } else
700 0 : *srcu_idx = srcu_read_lock(hctx->srcu);
701 3972 : }
702 :
703 : /**
704 : * blk_mq_start_request - Start processing a request
705 : * @rq: Pointer to request to be started
706 : *
707 : * Function used by device drivers to notify the block layer that a request
708 : * is going to be processed now, so blk layer can do proper initializations
709 : * such as starting the timeout timer.
710 : */
711 3373 : void blk_mq_start_request(struct request *rq)
712 : {
713 3373 : struct request_queue *q = rq->q;
714 :
715 3373 : trace_block_rq_issue(rq);
716 :
717 3373 : if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
718 0 : rq->io_start_time_ns = ktime_get_ns();
719 0 : rq->stats_sectors = blk_rq_sectors(rq);
720 0 : rq->rq_flags |= RQF_STATS;
721 0 : rq_qos_issue(q, rq);
722 : }
723 :
724 3373 : WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
725 :
726 3373 : blk_add_timer(rq);
727 3372 : WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
728 :
729 : #ifdef CONFIG_BLK_DEV_INTEGRITY
730 : if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
731 : q->integrity.profile->prepare_fn(rq);
732 : #endif
733 3372 : }
734 : EXPORT_SYMBOL(blk_mq_start_request);
735 :
736 0 : static void __blk_mq_requeue_request(struct request *rq)
737 : {
738 0 : struct request_queue *q = rq->q;
739 :
740 0 : blk_mq_put_driver_tag(rq);
741 :
742 0 : trace_block_rq_requeue(rq);
743 0 : rq_qos_requeue(q, rq);
744 :
745 0 : if (blk_mq_request_started(rq)) {
746 0 : WRITE_ONCE(rq->state, MQ_RQ_IDLE);
747 0 : rq->rq_flags &= ~RQF_TIMED_OUT;
748 : }
749 0 : }
750 :
751 0 : void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
752 : {
753 0 : __blk_mq_requeue_request(rq);
754 :
755 : /* this request will be re-inserted to io scheduler queue */
756 0 : blk_mq_sched_requeue_request(rq);
757 :
758 0 : BUG_ON(!list_empty(&rq->queuelist));
759 0 : blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
760 0 : }
761 : EXPORT_SYMBOL(blk_mq_requeue_request);
762 :
763 251 : static void blk_mq_requeue_work(struct work_struct *work)
764 : {
765 251 : struct request_queue *q =
766 251 : container_of(work, struct request_queue, requeue_work.work);
767 251 : LIST_HEAD(rq_list);
768 251 : struct request *rq, *next;
769 :
770 251 : spin_lock_irq(&q->requeue_lock);
771 251 : list_splice_init(&q->requeue_list, &rq_list);
772 251 : spin_unlock_irq(&q->requeue_lock);
773 :
774 502 : list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
775 251 : if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP)))
776 191 : continue;
777 :
778 60 : rq->rq_flags &= ~RQF_SOFTBARRIER;
779 60 : list_del_init(&rq->queuelist);
780 : /*
781 : * If RQF_DONTPREP, rq has contained some driver specific
782 : * data, so insert it to hctx dispatch list to avoid any
783 : * merge.
784 : */
785 60 : if (rq->rq_flags & RQF_DONTPREP)
786 0 : blk_mq_request_bypass_insert(rq, false, false);
787 : else
788 60 : blk_mq_sched_insert_request(rq, true, false, false);
789 : }
790 :
791 442 : while (!list_empty(&rq_list)) {
792 191 : rq = list_entry(rq_list.next, struct request, queuelist);
793 191 : list_del_init(&rq->queuelist);
794 191 : blk_mq_sched_insert_request(rq, false, false, false);
795 : }
796 :
797 251 : blk_mq_run_hw_queues(q, false);
798 251 : }
799 :
800 251 : void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
801 : bool kick_requeue_list)
802 : {
803 251 : struct request_queue *q = rq->q;
804 251 : unsigned long flags;
805 :
806 : /*
807 : * We abuse this flag that is otherwise used by the I/O scheduler to
808 : * request head insertion from the workqueue.
809 : */
810 251 : BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
811 :
812 251 : spin_lock_irqsave(&q->requeue_lock, flags);
813 251 : if (at_head) {
814 60 : rq->rq_flags |= RQF_SOFTBARRIER;
815 60 : list_add(&rq->queuelist, &q->requeue_list);
816 : } else {
817 191 : list_add_tail(&rq->queuelist, &q->requeue_list);
818 : }
819 251 : spin_unlock_irqrestore(&q->requeue_lock, flags);
820 :
821 251 : if (kick_requeue_list)
822 502 : blk_mq_kick_requeue_list(q);
823 251 : }
824 :
825 251 : void blk_mq_kick_requeue_list(struct request_queue *q)
826 : {
827 251 : kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
828 251 : }
829 : EXPORT_SYMBOL(blk_mq_kick_requeue_list);
830 :
831 0 : void blk_mq_delay_kick_requeue_list(struct request_queue *q,
832 : unsigned long msecs)
833 : {
834 0 : kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work,
835 : msecs_to_jiffies(msecs));
836 0 : }
837 : EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
838 :
839 0 : struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
840 : {
841 0 : if (tag < tags->nr_tags) {
842 0 : prefetch(tags->rqs[tag]);
843 0 : return tags->rqs[tag];
844 : }
845 :
846 : return NULL;
847 : }
848 : EXPORT_SYMBOL(blk_mq_tag_to_rq);
849 :
850 0 : static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
851 : void *priv, bool reserved)
852 : {
853 : /*
854 : * If we find a request that isn't idle and the queue matches,
855 : * we know the queue is busy. Return false to stop the iteration.
856 : */
857 0 : if (blk_mq_request_started(rq) && rq->q == hctx->queue) {
858 0 : bool *busy = priv;
859 :
860 0 : *busy = true;
861 0 : return false;
862 : }
863 :
864 : return true;
865 : }
866 :
867 0 : bool blk_mq_queue_inflight(struct request_queue *q)
868 : {
869 0 : bool busy = false;
870 :
871 0 : blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
872 0 : return busy;
873 : }
874 : EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);
875 :
876 0 : static void blk_mq_rq_timed_out(struct request *req, bool reserved)
877 : {
878 0 : req->rq_flags |= RQF_TIMED_OUT;
879 0 : if (req->q->mq_ops->timeout) {
880 0 : enum blk_eh_timer_return ret;
881 :
882 0 : ret = req->q->mq_ops->timeout(req, reserved);
883 0 : if (ret == BLK_EH_DONE)
884 : return;
885 0 : WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
886 : }
887 :
888 0 : blk_add_timer(req);
889 : }
890 :
891 1 : static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
892 : {
893 1 : unsigned long deadline;
894 :
895 1 : if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT)
896 : return false;
897 0 : if (rq->rq_flags & RQF_TIMED_OUT)
898 : return false;
899 :
900 0 : deadline = READ_ONCE(rq->deadline);
901 0 : if (time_after_eq(jiffies, deadline))
902 : return true;
903 :
904 0 : if (*next == 0)
905 0 : *next = deadline;
906 0 : else if (time_after(*next, deadline))
907 0 : *next = deadline;
908 : return false;
909 : }
910 :
911 1 : static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
912 : struct request *rq, void *priv, bool reserved)
913 : {
914 1 : unsigned long *next = priv;
915 :
916 : /*
917 : * Just do a quick check if it is expired before locking the request in
918 : * so we're not unnecessarily synchronizing across CPUs.
919 : */
920 1 : if (!blk_mq_req_expired(rq, next))
921 : return true;
922 :
923 : /*
924 : * We have reason to believe the request may be expired. Take a
925 : * reference on the request to lock this request lifetime into its
926 : * currently allocated context to prevent it from being reallocated in
927 : * the event the completion by-passes this timeout handler.
928 : *
929 : * If the reference was already released, then the driver beat the
930 : * timeout handler to posting a natural completion.
931 : */
932 0 : if (!refcount_inc_not_zero(&rq->ref))
933 : return true;
934 :
935 : /*
936 : * The request is now locked and cannot be reallocated underneath the
937 : * timeout handler's processing. Re-verify this exact request is truly
938 : * expired; if it is not expired, then the request was completed and
939 : * reallocated as a new request.
940 : */
941 0 : if (blk_mq_req_expired(rq, next))
942 0 : blk_mq_rq_timed_out(rq, reserved);
943 :
944 0 : if (is_flush_rq(rq, hctx))
945 0 : rq->end_io(rq, 0);
946 0 : else if (refcount_dec_and_test(&rq->ref))
947 0 : __blk_mq_free_request(rq);
948 :
949 : return true;
950 : }
951 :
952 8 : static void blk_mq_timeout_work(struct work_struct *work)
953 : {
954 8 : struct request_queue *q =
955 8 : container_of(work, struct request_queue, timeout_work);
956 8 : unsigned long next = 0;
957 8 : struct blk_mq_hw_ctx *hctx;
958 8 : int i;
959 :
960 : /* A deadlock might occur if a request is stuck requiring a
961 : * timeout at the same time a queue freeze is waiting
962 : * completion, since the timeout code would not be able to
963 : * acquire the queue reference here.
964 : *
965 : * That's why we don't use blk_queue_enter here; instead, we use
966 : * percpu_ref_tryget directly, because we need to be able to
967 : * obtain a reference even in the short window between the queue
968 : * starting to freeze, by dropping the first reference in
969 : * blk_freeze_queue_start, and the moment the last request is
970 : * consumed, marked by the instant q_usage_counter reaches
971 : * zero.
972 : */
973 8 : if (!percpu_ref_tryget(&q->q_usage_counter))
974 0 : return;
975 :
976 8 : blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
977 :
978 8 : if (next != 0) {
979 0 : mod_timer(&q->timeout, next);
980 : } else {
981 : /*
982 : * Request timeouts are handled as a forward rolling timer. If
983 : * we end up here it means that no requests are pending and
984 : * also that no request has been pending for a while. Mark
985 : * each hctx as idle.
986 : */
987 16 : queue_for_each_hw_ctx(q, hctx, i) {
988 : /* the hctx may be unmapped, so check it here */
989 16 : if (blk_mq_hw_queue_mapped(hctx))
990 8 : blk_mq_tag_idle(hctx);
991 : }
992 : }
993 8 : blk_queue_exit(q);
994 : }
995 :
996 : struct flush_busy_ctx_data {
997 : struct blk_mq_hw_ctx *hctx;
998 : struct list_head *list;
999 : };
1000 :
1001 14 : static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
1002 : {
1003 14 : struct flush_busy_ctx_data *flush_data = data;
1004 14 : struct blk_mq_hw_ctx *hctx = flush_data->hctx;
1005 14 : struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
1006 14 : enum hctx_type type = hctx->type;
1007 :
1008 14 : spin_lock(&ctx->lock);
1009 14 : list_splice_tail_init(&ctx->rq_lists[type], flush_data->list);
1010 14 : sbitmap_clear_bit(sb, bitnr);
1011 14 : spin_unlock(&ctx->lock);
1012 14 : return true;
1013 : }
1014 :
1015 : /*
1016 : * Process software queues that have been marked busy, splicing them
1017 : * to the for-dispatch list.
1018 : */
1019 14 : void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
1020 : {
1021 14 : struct flush_busy_ctx_data data = {
1022 : .hctx = hctx,
1023 : .list = list,
1024 : };
1025 :
1026 14 : sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
1027 14 : }
1028 : EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
1029 :
1030 : struct dispatch_rq_data {
1031 : struct blk_mq_hw_ctx *hctx;
1032 : struct request *rq;
1033 : };
1034 :
1035 0 : static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
1036 : void *data)
1037 : {
1038 0 : struct dispatch_rq_data *dispatch_data = data;
1039 0 : struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
1040 0 : struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
1041 0 : enum hctx_type type = hctx->type;
1042 :
1043 0 : spin_lock(&ctx->lock);
1044 0 : if (!list_empty(&ctx->rq_lists[type])) {
1045 0 : dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
1046 0 : list_del_init(&dispatch_data->rq->queuelist);
1047 0 : if (list_empty(&ctx->rq_lists[type]))
1048 0 : sbitmap_clear_bit(sb, bitnr);
1049 : }
1050 0 : spin_unlock(&ctx->lock);
1051 :
1052 0 : return !dispatch_data->rq;
1053 : }
1054 :
1055 0 : struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
1056 : struct blk_mq_ctx *start)
1057 : {
1058 0 : unsigned off = start ? start->index_hw[hctx->type] : 0;
1059 0 : struct dispatch_rq_data data = {
1060 : .hctx = hctx,
1061 : .rq = NULL,
1062 : };
1063 :
1064 0 : __sbitmap_for_each_set(&hctx->ctx_map, off,
1065 : dispatch_rq_from_ctx, &data);
1066 :
1067 0 : return data.rq;
1068 : }
1069 :
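 : /*
 :  * Map the number of requests dispatched in one batch to a log2 slot in
 :  * the hctx->dispatched[] histogram, capped at the last slot.
 :  */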
1070 267 : static inline unsigned int queued_to_index(unsigned int queued)
1071 : {
1072 267 : if (!queued)
1073 : return 0;
1074 :
1075 267 : return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
1076 : }
1077 :
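 : /*
 :  * Allocate a driver tag straight from the hctx's tag bitmap, using the
 :  * reserved pool (with no tag offset) when the scheduler tag was reserved.
 :  */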
1078 0 : static bool __blk_mq_get_driver_tag(struct request *rq)
1079 : {
1080 0 : struct sbitmap_queue *bt = rq->mq_hctx->tags->bitmap_tags;
1081 0 : unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
1082 0 : int tag;
1083 :
1084 0 : blk_mq_tag_busy(rq->mq_hctx);
1085 :
1086 0 : if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
1087 0 : bt = rq->mq_hctx->tags->breserved_tags;
1088 0 : tag_offset = 0;
1089 : } else {
1090 0 : if (!hctx_may_queue(rq->mq_hctx, bt))
1091 : return false;
1092 : }
1093 :
1094 0 : tag = __sbitmap_queue_get(bt);
1095 0 : if (tag == BLK_MQ_NO_TAG)
1096 : return false;
1097 :
1098 0 : rq->tag = tag + tag_offset;
1099 0 : return true;
1100 : }
1101 :
1102 3373 : static bool blk_mq_get_driver_tag(struct request *rq)
1103 : {
1104 3373 : struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1105 :
1106 3373 : if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq))
1107 : return false;
1108 :
1109 3373 : if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
1110 0 : !(rq->rq_flags & RQF_MQ_INFLIGHT)) {
1111 0 : rq->rq_flags |= RQF_MQ_INFLIGHT;
1112 0 : __blk_mq_inc_active_requests(hctx);
1113 : }
1114 3373 : hctx->tags->rqs[rq->tag] = rq;
1115 3373 : return true;
1116 : }
1117 :
1118 0 : static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
1119 : int flags, void *key)
1120 : {
1121 0 : struct blk_mq_hw_ctx *hctx;
1122 :
1123 0 : hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
1124 :
1125 0 : spin_lock(&hctx->dispatch_wait_lock);
1126 0 : if (!list_empty(&wait->entry)) {
1127 0 : struct sbitmap_queue *sbq;
1128 :
1129 0 : list_del_init(&wait->entry);
1130 0 : sbq = hctx->tags->bitmap_tags;
1131 0 : atomic_dec(&sbq->ws_active);
1132 : }
1133 0 : spin_unlock(&hctx->dispatch_wait_lock);
1134 :
1135 0 : blk_mq_run_hw_queue(hctx, true);
1136 0 : return 1;
1137 : }
1138 :
1139 : /*
1140 : * Mark us waiting for a tag. For shared tags, this involves hooking us into
1141 : * the tag wakeups. For non-shared tags, we can simply mark us needing a
1142 : * restart. For both cases, take care to check the condition again after
1143 : * marking us as waiting.
1144 : */
1145 0 : static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
1146 : struct request *rq)
1147 : {
1148 0 : struct sbitmap_queue *sbq = hctx->tags->bitmap_tags;
1149 0 : struct wait_queue_head *wq;
1150 0 : wait_queue_entry_t *wait;
1151 0 : bool ret;
1152 :
1153 0 : if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
1154 0 : blk_mq_sched_mark_restart_hctx(hctx);
1155 :
1156 : /*
1157 : * It's possible that a tag was freed in the window between the
1158 : * allocation failure and adding the hardware queue to the wait
1159 : * queue.
1160 : *
1161 : * Don't clear RESTART here, someone else could have set it.
1162 : * At most this will cost an extra queue run.
1163 : */
1164 0 : return blk_mq_get_driver_tag(rq);
1165 : }
1166 :
1167 0 : wait = &hctx->dispatch_wait;
1168 0 : if (!list_empty_careful(&wait->entry))
1169 : return false;
1170 :
1171 0 : wq = &bt_wait_ptr(sbq, hctx)->wait;
1172 :
1173 0 : spin_lock_irq(&wq->lock);
1174 0 : spin_lock(&hctx->dispatch_wait_lock);
1175 0 : if (!list_empty(&wait->entry)) {
1176 0 : spin_unlock(&hctx->dispatch_wait_lock);
1177 0 : spin_unlock_irq(&wq->lock);
1178 0 : return false;
1179 : }
1180 :
1181 0 : atomic_inc(&sbq->ws_active);
1182 0 : wait->flags &= ~WQ_FLAG_EXCLUSIVE;
1183 0 : __add_wait_queue(wq, wait);
1184 :
1185 : /*
1186 : * It's possible that a tag was freed in the window between the
1187 : * allocation failure and adding the hardware queue to the wait
1188 : * queue.
1189 : */
1190 0 : ret = blk_mq_get_driver_tag(rq);
1191 0 : if (!ret) {
1192 0 : spin_unlock(&hctx->dispatch_wait_lock);
1193 0 : spin_unlock_irq(&wq->lock);
1194 0 : return false;
1195 : }
1196 :
1197 : /*
1198 : * We got a tag, remove ourselves from the wait queue to ensure
1199 : * someone else gets the wakeup.
1200 : */
1201 0 : list_del_init(&wait->entry);
1202 0 : atomic_dec(&sbq->ws_active);
1203 0 : spin_unlock(&hctx->dispatch_wait_lock);
1204 0 : spin_unlock_irq(&wq->lock);
1205 :
1206 0 : return true;
1207 : }
1208 :
1209 : #define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8
1210 : #define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4
1211 : /*
1212 : * Update dispatch busy with the Exponential Weighted Moving Average(EWMA):
1213 : * - EWMA is a simple way to compute a running average
1214 : * - weights of 7/8 and 1/8 are applied so the average decays exponentially
1215 : * - a factor of 4 is used so the result doesn't round down to 0 too
1216 : * easily; the exact factor doesn't matter because EWMA decays exponentially
1217 : */
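 : /*
 :  * Worked example: with the weight of 8 and factor of 4 below, one busy
 :  * sample takes an idle ewma of 0 to (0 * 7 + (1 << 4)) / 8 = 2, and each
 :  * subsequent idle sample scales it back down by roughly 7/8.
 :  */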
1218 3373 : static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
1219 : {
1220 3373 : unsigned int ewma;
1221 :
1222 3373 : if (hctx->queue->elevator)
1223 : return;
1224 :
1225 3373 : ewma = hctx->dispatch_busy;
1226 :
1227 3373 : if (!ewma && !busy)
1228 : return;
1229 :
1230 0 : ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1;
1231 0 : if (busy)
1232 0 : ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR;
1233 0 : ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;
1234 :
1235 0 : hctx->dispatch_busy = ewma;
1236 : }
1237 :
1238 : #define BLK_MQ_RESOURCE_DELAY 3 /* ms units */
1239 :
1240 0 : static void blk_mq_handle_dev_resource(struct request *rq,
1241 : struct list_head *list)
1242 : {
1243 0 : struct request *next =
1244 0 : list_first_entry_or_null(list, struct request, queuelist);
1245 :
1246 : /*
1247 : * If an I/O scheduler has been configured and we got a driver tag for
1248 : * the next request already, free it.
1249 : */
1250 0 : if (next)
1251 0 : blk_mq_put_driver_tag(next);
1252 :
1253 0 : list_add(&rq->queuelist, list);
1254 0 : __blk_mq_requeue_request(rq);
1255 0 : }
1256 :
1257 0 : static void blk_mq_handle_zone_resource(struct request *rq,
1258 : struct list_head *zone_list)
1259 : {
1260 : /*
1261 : * If we end up here it is because we cannot dispatch a request to a
1262 : * specific zone due to LLD level zone-write locking or other zone
1263 : * related resource not being available. In this case, set the request
1264 : * aside in zone_list for retrying it later.
1265 : */
1266 0 : list_add(&rq->queuelist, zone_list);
1267 0 : __blk_mq_requeue_request(rq);
1268 0 : }
1269 :
1270 : enum prep_dispatch {
1271 : PREP_DISPATCH_OK,
1272 : PREP_DISPATCH_NO_TAG,
1273 : PREP_DISPATCH_NO_BUDGET,
1274 : };
1275 :
1276 267 : static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
1277 : bool need_budget)
1278 : {
1279 267 : struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1280 :
1281 267 : if (need_budget && !blk_mq_get_dispatch_budget(rq->q)) {
1282 0 : blk_mq_put_driver_tag(rq);
1283 0 : return PREP_DISPATCH_NO_BUDGET;
1284 : }
1285 :
1286 267 : if (!blk_mq_get_driver_tag(rq)) {
1287 : /*
1288 : * The initial allocation attempt failed, so we need to
1289 : * rerun the hardware queue when a tag is freed. The
1290 : * waitqueue takes care of that. If the queue is run
1291 : * before we add this entry back on the dispatch list,
1292 : * we'll re-run it below.
1293 : */
1294 0 : if (!blk_mq_mark_tag_wait(hctx, rq)) {
1295 : /*
1296 : * All budgets not got from this function will be put
1297 : * together during handling partial dispatch
1298 : */
1299 0 : if (need_budget)
1300 0 : blk_mq_put_dispatch_budget(rq->q);
1301 0 : return PREP_DISPATCH_NO_TAG;
1302 : }
1303 : }
1304 :
1305 : return PREP_DISPATCH_OK;
1306 : }
1307 :
1308 : /* release all allocated budgets before calling to blk_mq_dispatch_rq_list */
1309 0 : static void blk_mq_release_budgets(struct request_queue *q,
1310 : unsigned int nr_budgets)
1311 : {
1312 0 : int i;
1313 :
1314 0 : for (i = 0; i < nr_budgets; i++)
1315 0 : blk_mq_put_dispatch_budget(q);
1316 0 : }
1317 :
1318 : /*
1319 : * Returns true if we did some work AND can potentially do more.
1320 : */
1321 267 : bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
1322 : unsigned int nr_budgets)
1323 : {
1324 267 : enum prep_dispatch prep;
1325 267 : struct request_queue *q = hctx->queue;
1326 267 : struct request *rq, *nxt;
1327 267 : int errors, queued;
1328 267 : blk_status_t ret = BLK_STS_OK;
1329 267 : LIST_HEAD(zone_list);
1330 :
1331 267 : if (list_empty(list))
1332 : return false;
1333 :
1334 : /*
1335 : * Now process all the entries, sending them to the driver.
1336 : */
1337 : errors = queued = 0;
1338 267 : do {
1339 267 : struct blk_mq_queue_data bd;
1340 :
1341 267 : rq = list_first_entry(list, struct request, queuelist);
1342 :
1343 267 : WARN_ON_ONCE(hctx != rq->mq_hctx);
1344 267 : prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
1345 267 : if (prep != PREP_DISPATCH_OK)
1346 : break;
1347 :
1348 267 : list_del_init(&rq->queuelist);
1349 :
1350 267 : bd.rq = rq;
1351 :
1352 : /*
1353 : * Flag last if we have no more requests, or if we have more
1354 : * but can't assign a driver tag to it.
1355 : */
1356 267 : if (list_empty(list))
1357 267 : bd.last = true;
1358 : else {
1359 0 : nxt = list_first_entry(list, struct request, queuelist);
1360 0 : bd.last = !blk_mq_get_driver_tag(nxt);
1361 : }
1362 :
1363 : /*
1364 : * once the request is queued to lld, no need to cover the
1365 : * budget any more
1366 : */
1367 267 : if (nr_budgets)
1368 0 : nr_budgets--;
1369 267 : ret = q->mq_ops->queue_rq(hctx, &bd);
1370 267 : switch (ret) {
1371 267 : case BLK_STS_OK:
1372 267 : queued++;
1373 267 : break;
1374 0 : case BLK_STS_RESOURCE:
1375 : case BLK_STS_DEV_RESOURCE:
1376 0 : blk_mq_handle_dev_resource(rq, list);
1377 0 : goto out;
1378 : case BLK_STS_ZONE_RESOURCE:
1379 : /*
1380 : * Move the request to zone_list and keep going through
1381 : * the dispatch list to find more requests the drive can
1382 : * accept.
1383 : */
1384 0 : blk_mq_handle_zone_resource(rq, &zone_list);
1385 : break;
1386 0 : default:
1387 0 : errors++;
1388 0 : blk_mq_end_request(rq, ret);
1389 : }
1390 267 : } while (!list_empty(list));
1391 267 : out:
1392 267 : if (!list_empty(&zone_list))
1393 0 : list_splice_tail_init(&zone_list, list);
1394 :
1395 267 : hctx->dispatched[queued_to_index(queued)]++;
1396 :
1397 : /* If we didn't flush the entire list, we could have told the driver
1398 : * there was more coming, but that turned out to be a lie.
1399 : */
1400 267 : if ((!list_empty(list) || errors) && q->mq_ops->commit_rqs && queued)
1401 0 : q->mq_ops->commit_rqs(hctx);
1402 : /*
1403 : * Any items that need requeuing? Stuff them into hctx->dispatch,
1404 : * that is where we will continue on next queue run.
1405 : */
1406 267 : if (!list_empty(list)) {
1407 0 : bool needs_restart;
1408 : /* For non-shared tags, the RESTART check will suffice */
1409 0 : bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
1410 0 : (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED);
1411 0 : bool no_budget_avail = prep == PREP_DISPATCH_NO_BUDGET;
1412 :
1413 0 : blk_mq_release_budgets(q, nr_budgets);
1414 :
1415 0 : spin_lock(&hctx->lock);
1416 0 : list_splice_tail_init(list, &hctx->dispatch);
1417 0 : spin_unlock(&hctx->lock);
1418 :
1419 : /*
1420 : * Order adding requests to hctx->dispatch against checking the
1421 : * SCHED_RESTART flag. This smp_mb() pairs with the one in
1422 : * blk_mq_sched_restart(), keeping the restart code path from
1423 : * missing the newly added requests on hctx->dispatch while
1424 : * SCHED_RESTART is observed here.
1425 : */
1426 0 : smp_mb();
1427 :
1428 : /*
1429 : * If SCHED_RESTART was set by the caller of this function and
1430 : * it is no longer set that means that it was cleared by another
1431 : * thread and hence that a queue rerun is needed.
1432 : *
1433 : * If 'no_tag' is set, that means that we failed getting
1434 : * a driver tag with an I/O scheduler attached. If our dispatch
1435 : * waitqueue is no longer active, ensure that we run the queue
1436 : * AFTER adding our entries back to the list.
1437 : *
1438 : * If no I/O scheduler has been configured it is possible that
1439 : * the hardware queue got stopped and restarted before requests
1440 : * were pushed back onto the dispatch list. Rerun the queue to
1441 : * avoid starvation. Notes:
1442 : * - blk_mq_run_hw_queue() checks whether or not a queue has
1443 : * been stopped before rerunning a queue.
1444 : * - Some but not all block drivers stop a queue before
1445 : * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
1446 : * and dm-rq.
1447 : *
1448 : * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
1449 : * bit is set, run queue after a delay to avoid IO stalls
1450 : * that could otherwise occur if the queue is idle. We'll do
1451 : * similar if we couldn't get budget and SCHED_RESTART is set.
1452 : */
1453 0 : needs_restart = blk_mq_sched_needs_restart(hctx);
1454 0 : if (!needs_restart ||
1455 0 : (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
1456 0 : blk_mq_run_hw_queue(hctx, true);
1457 0 : else if (needs_restart && (ret == BLK_STS_RESOURCE ||
1458 : no_budget_avail))
1459 0 : blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
1460 :
1461 0 : blk_mq_update_dispatch_busy(hctx, true);
1462 0 : return false;
1463 : } else
1464 267 : blk_mq_update_dispatch_busy(hctx, false);
1465 :
1466 267 : return (queued + errors) != 0;
1467 : }
1468 :
1469 : /**
1470 : * __blk_mq_run_hw_queue - Run a hardware queue.
1471 : * @hctx: Pointer to the hardware queue to run.
1472 : *
1473 : * Send pending requests to the hardware.
1474 : */
1475 267 : static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
1476 : {
1477 267 : int srcu_idx;
1478 :
1479 : /*
1480 : * We can't run the queue inline with ints disabled. Ensure that
1481 : * we catch bad users of this early.
1482 : */
1483 267 : WARN_ON_ONCE(in_interrupt());
1484 :
1485 267 : might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
1486 :
1487 267 : hctx_lock(hctx, &srcu_idx);
1488 267 : blk_mq_sched_dispatch_requests(hctx);
1489 267 : hctx_unlock(hctx, srcu_idx);
1490 267 : }
1491 :
1492 9 : static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
1493 : {
1494 9 : int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);
1495 :
1496 9 : if (cpu >= nr_cpu_ids)
1497 0 : cpu = cpumask_first(hctx->cpumask);
1498 9 : return cpu;
1499 : }
1500 :
1501 : /*
1502 : * It'd be great if the workqueue API had a way to pass
1503 : * in a mask and had some smarts for more clever placement.
1504 : * For now we just round-robin here, switching for every
1505 : * BLK_MQ_CPU_WORK_BATCH queued items.
1506 : */
1507 14 : static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
1508 : {
1509 14 : bool tried = false;
1510 14 : int next_cpu = hctx->next_cpu;
1511 :
1512 14 : if (hctx->queue->nr_hw_queues == 1)
1513 : return WORK_CPU_UNBOUND;
1514 :
1515 0 : if (--hctx->next_cpu_batch <= 0) {
1516 0 : select_cpu:
1517 0 : next_cpu = cpumask_next_and(next_cpu, hctx->cpumask,
1518 : cpu_online_mask);
1519 0 : if (next_cpu >= nr_cpu_ids)
1520 0 : next_cpu = blk_mq_first_mapped_cpu(hctx);
1521 0 : hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
1522 : }
1523 :
1524 : /*
1525 : * Do unbound schedule if we can't find an online CPU for this hctx,
1526 : * which should only happen while handling the CPU DEAD hotplug event.
1527 : */
1528 0 : if (!cpu_online(next_cpu)) {
1529 0 : if (!tried) {
1530 0 : tried = true;
1531 0 : goto select_cpu;
1532 : }
1533 :
1534 : /*
1535 : * Make sure to re-select the CPU next time once CPUs in
1536 : * hctx->cpumask come online again.
1537 : */
1538 0 : hctx->next_cpu = next_cpu;
1539 0 : hctx->next_cpu_batch = 1;
1540 0 : return WORK_CPU_UNBOUND;
1541 : }
1542 :
1543 0 : hctx->next_cpu = next_cpu;
1544 0 : return next_cpu;
1545 : }
1546 :
1547 : /**
1548 : * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue.
1549 : * @hctx: Pointer to the hardware queue to run.
1550 : * @async: If we want to run the queue asynchronously.
1551 : * @msecs: Milliseconds of delay to wait before running the queue.
1552 : *
1553 : * If !@async, try to run the queue now. Else, run the queue asynchronously and
1554 : * with a delay of @msecs.
1555 : */
1556 267 : static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
1557 : unsigned long msecs)
1558 : {
1559 267 : if (unlikely(blk_mq_hctx_stopped(hctx)))
1560 : return;
1561 :
1562 267 : if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
1563 253 : int cpu = get_cpu();
1564 253 : if (cpumask_test_cpu(cpu, hctx->cpumask)) {
1565 253 : __blk_mq_run_hw_queue(hctx);
1566 253 : put_cpu();
1567 253 : return;
1568 : }
1569 :
1570 0 : put_cpu();
1571 : }
1572 :
1573 28 : kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
1574 : msecs_to_jiffies(msecs));
1575 : }
1576 :
1577 : /**
1578 : * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
1579 : * @hctx: Pointer to the hardware queue to run.
1580 : * @msecs: Milliseconds of delay to wait before running the queue.
1581 : *
1582 : * Run a hardware queue asynchronously with a delay of @msecs.
1583 : */
1584 0 : void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
1585 : {
1586 0 : __blk_mq_delay_run_hw_queue(hctx, true, msecs);
1587 0 : }
1588 : EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
1589 :
1590 : /**
1591 : * blk_mq_run_hw_queue - Start to run a hardware queue.
1592 : * @hctx: Pointer to the hardware queue to run.
1593 : * @async: If we want to run the queue asynchronously.
1594 : *
1595 : * Check if the request queue is not in a quiesced state and if there are
1596 : * pending requests to be sent. If this is true, run the queue to send requests
1597 : * to hardware.
1598 : */
1599 599 : void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
1600 : {
1601 599 : int srcu_idx;
1602 599 : bool need_run;
1603 :
1604 : /*
1605 : * When the queue is quiesced, we may be switching the io scheduler,
1606 : * updating nr_hw_queues, or similar; the queue can't be run any more
1607 : * and even __blk_mq_hctx_has_pending() can't be called safely.
1608 : *
1609 : * And queue will be rerun in blk_mq_unquiesce_queue() if it is
1610 : * quiesced.
1611 : */
1612 599 : hctx_lock(hctx, &srcu_idx);
1613 1198 : need_run = !blk_queue_quiesced(hctx->queue) &&
1614 599 : blk_mq_hctx_has_pending(hctx);
1615 599 : hctx_unlock(hctx, srcu_idx);
1616 :
1617 599 : if (need_run)
1618 267 : __blk_mq_delay_run_hw_queue(hctx, async, 0);
1619 599 : }
1620 : EXPORT_SYMBOL(blk_mq_run_hw_queue);
1621 :
1622 : /*
1623 : * Is the request queue handled by an IO scheduler that does not respect
1624 : * hardware queues when dispatching?
1625 : */
1626 251 : static bool blk_mq_has_sqsched(struct request_queue *q)
1627 : {
1628 251 : struct elevator_queue *e = q->elevator;
1629 :
1630 251 : if (e && e->type->ops.dispatch_request &&
1631 0 : !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE))
1632 0 : return true;
1633 : return false;
1634 : }
1635 :
1636 : /*
1637 : * Return the preferred queue to dispatch from (if any) for non-mq aware IO
1638 : * scheduler.
1639 : */
1640 0 : static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
1641 : {
1642 0 : struct blk_mq_hw_ctx *hctx;
1643 :
1644 : /*
1645 : * If the IO scheduler does not respect hardware queues when
1646 : * dispatching, we just don't bother with multiple HW queues and
1647 : * dispatch from hctx for the current CPU since running multiple queues
1648 : * just causes lock contention inside the scheduler and pointless cache
1649 : * bouncing.
1650 : */
1651 0 : hctx = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT,
1652 0 : raw_smp_processor_id());
1653 0 : if (!blk_mq_hctx_stopped(hctx))
1654 0 : return hctx;
1655 : return NULL;
1656 : }
1657 :
1658 : /**
1659 : * blk_mq_run_hw_queues - Run all hardware queues in a request queue.
1660 : * @q: Pointer to the request queue to run.
1661 : * @async: If we want to run the queue asynchronously.
1662 : */
1663 251 : void blk_mq_run_hw_queues(struct request_queue *q, bool async)
1664 : {
1665 251 : struct blk_mq_hw_ctx *hctx, *sq_hctx;
1666 251 : int i;
1667 :
1668 251 : sq_hctx = NULL;
1669 251 : if (blk_mq_has_sqsched(q))
1670 0 : sq_hctx = blk_mq_get_sq_hctx(q);
1671 502 : queue_for_each_hw_ctx(q, hctx, i) {
1672 251 : if (blk_mq_hctx_stopped(hctx))
1673 0 : continue;
1674 : /*
1675 : * Dispatch from this hctx either if there's no hctx preferred
1676 : * by IO scheduler or if it has requests that bypass the
1677 : * scheduler.
1678 : */
1679 251 : if (!sq_hctx || sq_hctx == hctx ||
1680 251 : !list_empty_careful(&hctx->dispatch))
1681 251 : blk_mq_run_hw_queue(hctx, async);
1682 : }
1683 251 : }
1684 : EXPORT_SYMBOL(blk_mq_run_hw_queues);
1685 :
1686 : /**
1687 : * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
1688 : * @q: Pointer to the request queue to run.
1689 : * @msecs: Milliseconds of delay to wait before running the queues.
1690 : */
1691 0 : void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
1692 : {
1693 0 : struct blk_mq_hw_ctx *hctx, *sq_hctx;
1694 0 : int i;
1695 :
1696 0 : sq_hctx = NULL;
1697 0 : if (blk_mq_has_sqsched(q))
1698 0 : sq_hctx = blk_mq_get_sq_hctx(q);
1699 0 : queue_for_each_hw_ctx(q, hctx, i) {
1700 0 : if (blk_mq_hctx_stopped(hctx))
1701 0 : continue;
1702 : /*
1703 : * Dispatch from this hctx either if there's no hctx preferred
1704 : * by IO scheduler or if it has requests that bypass the
1705 : * scheduler.
1706 : */
1707 0 : if (!sq_hctx || sq_hctx == hctx ||
1708 0 : !list_empty_careful(&hctx->dispatch))
1709 0 : blk_mq_delay_run_hw_queue(hctx, msecs);
1710 : }
1711 0 : }
1712 : EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
1713 :
1714 : /**
1715 : * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
1716 : * @q: request queue.
1717 : *
1718 : * The caller is responsible for serializing this function against
1719 : * blk_mq_{start,stop}_hw_queue().
1720 : */
1721 0 : bool blk_mq_queue_stopped(struct request_queue *q)
1722 : {
1723 0 : struct blk_mq_hw_ctx *hctx;
1724 0 : int i;
1725 :
1726 0 : queue_for_each_hw_ctx(q, hctx, i)
1727 0 : if (blk_mq_hctx_stopped(hctx))
1728 : return true;
1729 :
1730 : return false;
1731 : }
1732 : EXPORT_SYMBOL(blk_mq_queue_stopped);
1733 :
1734 : /*
1735 : * This function is often used by drivers to pause .queue_rq() when
1736 : * there aren't enough resources or some condition isn't satisfied;
1737 : * BLK_STS_RESOURCE is usually returned in that case.
1738 : *
1739 : * We do not guarantee that dispatch can be drained or blocked
1740 : * after blk_mq_stop_hw_queue() returns. Please use
1741 : * blk_mq_quiesce_queue() for that requirement.
1742 : */
1743 0 : void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
1744 : {
1745 0 : cancel_delayed_work(&hctx->run_work);
1746 :
1747 0 : set_bit(BLK_MQ_S_STOPPED, &hctx->state);
1748 0 : }
1749 : EXPORT_SYMBOL(blk_mq_stop_hw_queue);
1750 :
1751 : /*
1752 : * This function is often used by a driver to pause .queue_rq() when
1753 : * there aren't enough resources or some other condition isn't satisfied,
1754 : * and BLK_STS_RESOURCE is usually returned.
1755 : *
1756 : * We do not guarantee that dispatch can be drained or blocked
1757 : * after blk_mq_stop_hw_queues() returns. Please use
1758 : * blk_mq_quiesce_queue() for that requirement.
1759 : */
1760 0 : void blk_mq_stop_hw_queues(struct request_queue *q)
1761 : {
1762 0 : struct blk_mq_hw_ctx *hctx;
1763 0 : int i;
1764 :
1765 0 : queue_for_each_hw_ctx(q, hctx, i)
1766 0 : blk_mq_stop_hw_queue(hctx);
1767 0 : }
1768 : EXPORT_SYMBOL(blk_mq_stop_hw_queues);
1769 :
1770 0 : void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
1771 : {
1772 0 : clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
1773 :
1774 0 : blk_mq_run_hw_queue(hctx, false);
1775 0 : }
1776 : EXPORT_SYMBOL(blk_mq_start_hw_queue);
1777 :
1778 0 : void blk_mq_start_hw_queues(struct request_queue *q)
1779 : {
1780 0 : struct blk_mq_hw_ctx *hctx;
1781 0 : int i;
1782 :
1783 0 : queue_for_each_hw_ctx(q, hctx, i)
1784 0 : blk_mq_start_hw_queue(hctx);
1785 0 : }
1786 : EXPORT_SYMBOL(blk_mq_start_hw_queues);
1787 :
1788 2672 : void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
1789 : {
1790 2672 : if (!blk_mq_hctx_stopped(hctx))
1791 : return;
1792 :
1793 0 : clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
1794 0 : blk_mq_run_hw_queue(hctx, async);
1795 : }
1796 : EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
1797 :
1798 2672 : void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
1799 : {
1800 2672 : struct blk_mq_hw_ctx *hctx;
1801 2672 : int i;
1802 :
1803 5344 : queue_for_each_hw_ctx(q, hctx, i)
1804 2672 : blk_mq_start_stopped_hw_queue(hctx, async);
1805 2672 : }
1806 : EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
1807 :
1808 14 : static void blk_mq_run_work_fn(struct work_struct *work)
1809 : {
1810 14 : struct blk_mq_hw_ctx *hctx;
1811 :
1812 14 : hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
1813 :
1814 : /*
1815 : * If we are stopped, don't run the queue.
1816 : */
1817 14 : if (blk_mq_hctx_stopped(hctx))
1818 : return;
1819 :
1820 14 : __blk_mq_run_hw_queue(hctx);
1821 : }
1822 :
1823 0 : static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
1824 : struct request *rq,
1825 : bool at_head)
1826 : {
1827 0 : struct blk_mq_ctx *ctx = rq->mq_ctx;
1828 0 : enum hctx_type type = hctx->type;
1829 :
1830 0 : lockdep_assert_held(&ctx->lock);
1831 :
1832 0 : trace_block_rq_insert(rq);
1833 :
1834 0 : if (at_head)
1835 0 : list_add(&rq->queuelist, &ctx->rq_lists[type]);
1836 : else
1837 0 : list_add_tail(&rq->queuelist, &ctx->rq_lists[type]);
1838 0 : }
1839 :
1840 0 : void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
1841 : bool at_head)
1842 : {
1843 0 : struct blk_mq_ctx *ctx = rq->mq_ctx;
1844 :
1845 0 : lockdep_assert_held(&ctx->lock);
1846 :
1847 0 : __blk_mq_insert_req_list(hctx, rq, at_head);
1848 0 : blk_mq_hctx_mark_pending(hctx, ctx);
1849 0 : }
1850 :
1851 : /**
1852 : * blk_mq_request_bypass_insert - Insert a request at dispatch list.
1853 : * @rq: Pointer to request to be inserted.
1854 : * @at_head: true if the request should be inserted at the head of the list.
1855 : * @run_queue: If we should run the hardware queue after inserting the request.
1856 : *
1857 : * Should only be used carefully, when the caller knows we want to
1858 : * bypass a potential IO scheduler on the target device.
1859 : */
1860 253 : void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
1861 : bool run_queue)
1862 : {
1863 253 : struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1864 :
1865 253 : spin_lock(&hctx->lock);
1866 253 : if (at_head)
1867 251 : list_add(&rq->queuelist, &hctx->dispatch);
1868 : else
1869 2 : list_add_tail(&rq->queuelist, &hctx->dispatch);
1870 253 : spin_unlock(&hctx->lock);
1871 :
1872 253 : if (run_queue)
1873 0 : blk_mq_run_hw_queue(hctx, false);
1874 253 : }
1875 :
1876 14 : void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
1877 : struct list_head *list)
1878 :
1879 : {
1880 14 : struct request *rq;
1881 14 : enum hctx_type type = hctx->type;
1882 :
1883 : /*
1884 : * preemption doesn't flush the plug list, so it's possible that
1885 : * ctx->cpu is offline now
1886 : */
1887 28 : list_for_each_entry(rq, list, queuelist) {
1888 14 : BUG_ON(rq->mq_ctx != ctx);
1889 14 : trace_block_rq_insert(rq);
1890 : }
1891 :
1892 14 : spin_lock(&ctx->lock);
1893 14 : list_splice_tail_init(list, &ctx->rq_lists[type]);
1894 14 : blk_mq_hctx_mark_pending(hctx, ctx);
1895 14 : spin_unlock(&ctx->lock);
1896 14 : }
1897 :
1898 0 : static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
1899 : {
1900 0 : struct request *rqa = container_of(a, struct request, queuelist);
1901 0 : struct request *rqb = container_of(b, struct request, queuelist);
1902 :
1903 0 : if (rqa->mq_ctx != rqb->mq_ctx)
1904 0 : return rqa->mq_ctx > rqb->mq_ctx;
1905 0 : if (rqa->mq_hctx != rqb->mq_hctx)
1906 0 : return rqa->mq_hctx > rqb->mq_hctx;
1907 :
1908 0 : return blk_rq_pos(rqa) > blk_rq_pos(rqb);
1909 : }
1910 :
1911 1880 : void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
1912 : {
1913 1880 : LIST_HEAD(list);
1914 :
1915 1880 : if (list_empty(&plug->mq_list))
1916 0 : return;
1917 1880 : list_splice_init(&plug->mq_list, &list);
1918 :
1919 1880 : if (plug->rq_count > 2 && plug->multiple_queues)
1920 0 : list_sort(NULL, &list, plug_rq_cmp);
1921 :
1922 1880 : plug->rq_count = 0;
1923 :
1924 1882 : do {
1925 1882 : struct list_head rq_list;
1926 1882 : struct request *rq, *head_rq = list_entry_rq(list.next);
1927 1882 : struct list_head *pos = &head_rq->queuelist; /* skip first */
1928 1882 : struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx;
1929 1882 : struct blk_mq_ctx *this_ctx = head_rq->mq_ctx;
1930 1882 : unsigned int depth = 1;
1931 :
1932 2514 : list_for_each_continue(pos, &list) {
1933 634 : rq = list_entry_rq(pos);
1934 634 : BUG_ON(!rq->q);
1935 634 : if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx)
1936 : break;
1937 632 : depth++;
1938 : }
1939 :
1940 1882 : list_cut_before(&rq_list, &list, pos);
1941 1882 : trace_block_unplug(head_rq->q, depth, !from_schedule);
1942 1882 : blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list,
1943 : from_schedule);
1944 1882 : } while(!list_empty(&list));
1945 : }
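/*
 * Worked example (hypothetical requests) of the batching above: with a
 * sorted plug list A(hctx0,ctx0), B(hctx0,ctx0), C(hctx0,ctx1),
 * D(hctx1,ctx0), the loop cuts three runs, [A, B], [C] and [D], and hands
 * each run to blk_mq_sched_insert_requests() for its own (hctx, ctx) pair,
 * so a batch is never split across hardware or software queues.
 */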
1946 :
1947 3252 : static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
1948 : unsigned int nr_segs)
1949 : {
1950 3252 : int err;
1951 :
1952 3252 : if (bio->bi_opf & REQ_RAHEAD)
1953 1943 : rq->cmd_flags |= REQ_FAILFAST_MASK;
1954 :
1955 3252 : rq->__sector = bio->bi_iter.bi_sector;
1956 3252 : rq->write_hint = bio->bi_write_hint;
1957 3252 : blk_rq_bio_prep(rq, bio, nr_segs);
1958 :
1959 : /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */
1960 3252 : err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
1961 3252 : WARN_ON_ONCE(err);
1962 :
1963 3252 : blk_account_io_start(rq);
1964 3252 : }
1965 :
1966 3106 : static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
1967 : struct request *rq,
1968 : blk_qc_t *cookie, bool last)
1969 : {
1970 3106 : struct request_queue *q = rq->q;
1971 3106 : struct blk_mq_queue_data bd = {
1972 : .rq = rq,
1973 : .last = last,
1974 : };
1975 3106 : blk_qc_t new_cookie;
1976 3106 : blk_status_t ret;
1977 :
1978 3106 : new_cookie = request_to_qc_t(hctx, rq);
1979 :
1980 : /*
1981 : * If the queue reports OK, we are done. On a hard error, the caller
1982 : * may kill the request. For any other error (busy), just add it to
1983 : * our list as we previously would have done.
1984 : */
1985 3106 : ret = q->mq_ops->queue_rq(hctx, &bd);
1986 3106 : switch (ret) {
1987 3106 : case BLK_STS_OK:
1988 3106 : blk_mq_update_dispatch_busy(hctx, false);
1989 3106 : *cookie = new_cookie;
1990 3106 : break;
1991 0 : case BLK_STS_RESOURCE:
1992 : case BLK_STS_DEV_RESOURCE:
1993 0 : blk_mq_update_dispatch_busy(hctx, true);
1994 0 : __blk_mq_requeue_request(rq);
1995 0 : break;
1996 0 : default:
1997 0 : blk_mq_update_dispatch_busy(hctx, false);
1998 0 : *cookie = BLK_QC_T_NONE;
1999 0 : break;
2000 : }
2001 :
2002 3106 : return ret;
2003 : }
2004 :
2005 3106 : static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
2006 : struct request *rq,
2007 : blk_qc_t *cookie,
2008 : bool bypass_insert, bool last)
2009 : {
2010 3106 : struct request_queue *q = rq->q;
2011 3106 : bool run_queue = true;
2012 :
2013 : /*
2014 : * RCU or SRCU read lock is needed before checking quiesced flag.
2015 : *
2016 : * When the queue is stopped or quiesced, ignore 'bypass_insert' from
2017 : * blk_mq_request_issue_directly() and return BLK_STS_OK to the caller,
2018 : * so the driver does not try to dispatch again.
2019 : */
2020 3106 : if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
2021 0 : run_queue = false;
2022 0 : bypass_insert = false;
2023 0 : goto insert;
2024 : }
2025 :
2026 3106 : if (q->elevator && !bypass_insert)
2027 0 : goto insert;
2028 :
2029 3106 : if (!blk_mq_get_dispatch_budget(q))
2030 0 : goto insert;
2031 :
2032 3106 : if (!blk_mq_get_driver_tag(rq)) {
2033 0 : blk_mq_put_dispatch_budget(q);
2034 0 : goto insert;
2035 : }
2036 :
2037 3106 : return __blk_mq_issue_directly(hctx, rq, cookie, last);
2038 0 : insert:
2039 0 : if (bypass_insert)
2040 : return BLK_STS_RESOURCE;
2041 :
2042 0 : blk_mq_sched_insert_request(rq, false, run_queue, false);
2043 :
2044 0 : return BLK_STS_OK;
2045 : }
2046 :
2047 : /**
2048 : * blk_mq_try_issue_directly - Try to send a request directly to device driver.
2049 : * @hctx: Pointer of the associated hardware queue.
2050 : * @rq: Pointer to request to be sent.
2051 : * @cookie: Request queue cookie.
2052 : *
2053 : * If the device has enough resources to accept a new request now, send the
2054 : * request directly to the device driver. Otherwise, insert it at the
2055 : * hctx->dispatch queue, so we can try to send it again in the future.
2056 : * Requests inserted at this queue have higher priority.
2057 : */
2058 606 : static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
2059 : struct request *rq, blk_qc_t *cookie)
2060 : {
2061 606 : blk_status_t ret;
2062 606 : int srcu_idx;
2063 :
2064 606 : might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
2065 :
2066 606 : hctx_lock(hctx, &srcu_idx);
2067 :
2068 606 : ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
2069 606 : if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
2070 0 : blk_mq_request_bypass_insert(rq, false, true);
2071 606 : else if (ret != BLK_STS_OK)
2072 0 : blk_mq_end_request(rq, ret);
2073 :
2074 606 : hctx_unlock(hctx, srcu_idx);
2075 606 : }
2076 :
2077 2500 : blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
2078 : {
2079 2500 : blk_status_t ret;
2080 2500 : int srcu_idx;
2081 2500 : blk_qc_t unused_cookie;
2082 2500 : struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
2083 :
2084 2500 : hctx_lock(hctx, &srcu_idx);
2085 2500 : ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last);
2086 2500 : hctx_unlock(hctx, srcu_idx);
2087 :
2088 2500 : return ret;
2089 : }
2090 :
2091 1868 : void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
2092 : struct list_head *list)
2093 : {
2094 1868 : int queued = 0;
2095 1868 : int errors = 0;
2096 :
2097 4368 : while (!list_empty(list)) {
2098 2500 : blk_status_t ret;
2099 2500 : struct request *rq = list_first_entry(list, struct request,
2100 : queuelist);
2101 :
2102 2500 : list_del_init(&rq->queuelist);
2103 2500 : ret = blk_mq_request_issue_directly(rq, list_empty(list));
2104 2500 : if (ret != BLK_STS_OK) {
2105 0 : if (ret == BLK_STS_RESOURCE ||
2106 0 : ret == BLK_STS_DEV_RESOURCE) {
2107 0 : blk_mq_request_bypass_insert(rq, false,
2108 0 : list_empty(list));
2109 0 : break;
2110 : }
2111 0 : blk_mq_end_request(rq, ret);
2112 0 : errors++;
2113 : } else
2114 2500 : queued++;
2115 : }
2116 :
2117 : /*
2118 : * If we didn't flush the entire list, we could have told
2119 : * the driver there was more coming, but that turned out to
2120 : * be a lie.
2121 : */
2122 1868 : if ((!list_empty(list) || errors) &&
2123 0 : hctx->queue->mq_ops->commit_rqs && queued)
2124 0 : hctx->queue->mq_ops->commit_rqs(hctx);
2125 1868 : }
2126 :
2127 2514 : static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
2128 : {
2129 2514 : list_add_tail(&rq->queuelist, &plug->mq_list);
2130 2514 : plug->rq_count++;
2131 2514 : if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) {
2132 634 : struct request *tmp;
2133 :
2134 634 : tmp = list_first_entry(&plug->mq_list, struct request,
2135 : queuelist);
2136 634 : if (tmp->q != rq->q)
2137 0 : plug->multiple_queues = true;
2138 : }
2139 2514 : }
2140 :
2141 : /**
2142 : * blk_mq_submit_bio - Create and send a request to block device.
2143 : * @bio: Bio pointer.
2144 : *
2145 : * Builds up a request structure from @q and @bio and sends it to the device. The
2146 : * request may not be queued directly to hardware if:
2147 : * * This request can be merged with another one
2148 : * * We want to place request at plug queue for possible future merging
2149 : * * There is an IO scheduler active at this queue
2150 : *
2151 : * It will not queue the request if there is an error with the bio or during
2152 : * request creation.
2153 : *
2154 : * Returns: Request queue cookie.
2155 : */
2156 8737 : blk_qc_t blk_mq_submit_bio(struct bio *bio)
2157 : {
2158 8737 : struct request_queue *q = bio->bi_bdev->bd_disk->queue;
2159 8737 : const int is_sync = op_is_sync(bio->bi_opf);
2160 8737 : const int is_flush_fua = op_is_flush(bio->bi_opf);
2161 8737 : struct blk_mq_alloc_data data = {
2162 : .q = q,
2163 : };
2164 8737 : struct request *rq;
2165 8737 : struct blk_plug *plug;
2166 8737 : struct request *same_queue_rq = NULL;
2167 8737 : unsigned int nr_segs;
2168 8737 : blk_qc_t cookie;
2169 8737 : blk_status_t ret;
2170 8737 : bool hipri;
2171 :
2172 8737 : blk_queue_bounce(q, &bio);
2173 8737 : __blk_queue_split(&bio, &nr_segs);
2174 :
2175 8737 : if (!bio_integrity_prep(bio))
2176 : goto queue_exit;
2177 :
2178 17342 : if (!is_flush_fua && !blk_queue_nomerges(q) &&
2179 8605 : blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
2180 5485 : goto queue_exit;
2181 :
2182 3252 : if (blk_mq_sched_bio_merge(q, bio, nr_segs))
2183 0 : goto queue_exit;
2184 :
2185 3252 : rq_qos_throttle(q, bio);
2186 :
2187 3252 : hipri = bio->bi_opf & REQ_HIPRI;
2188 :
2189 3252 : data.cmd_flags = bio->bi_opf;
2190 3252 : rq = __blk_mq_alloc_request(&data);
2191 3250 : if (unlikely(!rq)) {
2192 0 : rq_qos_cleanup(q, bio);
2193 0 : if (bio->bi_opf & REQ_NOWAIT)
2194 0 : bio_wouldblock_error(bio);
2195 0 : goto queue_exit;
2196 : }
2197 :
2198 3250 : trace_block_getrq(bio);
2199 :
2200 3252 : rq_qos_track(q, rq, bio);
2201 :
2202 3252 : cookie = request_to_qc_t(data.hctx, rq);
2203 :
2204 3252 : blk_mq_bio_to_request(rq, bio, nr_segs);
2205 :
2206 3252 : ret = blk_crypto_init_request(rq);
2207 3252 : if (ret != BLK_STS_OK) {
2208 : bio->bi_status = ret;
2209 : bio_endio(bio);
2210 : blk_mq_free_request(rq);
2211 : return BLK_QC_T_NONE;
2212 : }
2213 :
2214 3252 : plug = blk_mq_plug(q, bio);
2215 3252 : if (unlikely(is_flush_fua)) {
2216 : /* Bypass scheduler for flush requests */
2217 132 : blk_insert_flush(rq);
2218 132 : blk_mq_run_hw_queue(data.hctx, true);
2219 3120 : } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs ||
2220 2514 : !blk_queue_nonrot(q))) {
2221 : /*
2222 : * Use plugging if we have a ->commit_rqs() hook as well, as
2223 : * we know the driver uses bd->last in a smart fashion.
2224 : *
2225 : * Use normal plugging if this disk is a slow HDD, as sequential
2226 : * IO may benefit a lot from plug merging.
2227 : */
2228 2514 : unsigned int request_count = plug->rq_count;
2229 2514 : struct request *last = NULL;
2230 :
2231 2514 : if (!request_count)
2232 1852 : trace_block_plug(q);
2233 : else
2234 662 : last = list_entry_rq(plug->mq_list.prev);
2235 :
2236 2514 : if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
2237 639 : blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
2238 28 : blk_flush_plug_list(plug, false);
2239 28 : trace_block_plug(q);
2240 : }
2241 :
2242 2514 : blk_add_rq_to_plug(plug, rq);
2243 606 : } else if (q->elevator) {
2244 : /* Insert the request at the IO scheduler queue */
2245 0 : blk_mq_sched_insert_request(rq, false, true, true);
2246 606 : } else if (plug && !blk_queue_nomerges(q)) {
2247 : /*
2248 : * We do limited plugging. If the bio can be merged, do that.
2249 : * Otherwise the existing request in the plug list will be
2250 : * issued. So the plug list will have one request at most.
2251 : * The plug list might get flushed before this. If that happens,
2252 : * the plug list is empty, and same_queue_rq is invalid.
2253 : */
2254 0 : if (list_empty(&plug->mq_list))
2255 0 : same_queue_rq = NULL;
2256 0 : if (same_queue_rq) {
2257 0 : list_del_init(&same_queue_rq->queuelist);
2258 0 : plug->rq_count--;
2259 : }
2260 0 : blk_add_rq_to_plug(plug, rq);
2261 0 : trace_block_plug(q);
2262 :
2263 0 : if (same_queue_rq) {
2264 0 : data.hctx = same_queue_rq->mq_hctx;
2265 0 : trace_block_unplug(q, 1, true);
2266 0 : blk_mq_try_issue_directly(data.hctx, same_queue_rq,
2267 : &cookie);
2268 : }
2269 606 : } else if ((q->nr_hw_queues > 1 && is_sync) ||
2270 606 : !data.hctx->dispatch_busy) {
2271 : /*
2272 : * There is no scheduler and we can try to send directly
2273 : * to the hardware.
2274 : */
2275 606 : blk_mq_try_issue_directly(data.hctx, rq, &cookie);
2276 : } else {
2277 : /* Default case. */
2278 0 : blk_mq_sched_insert_request(rq, false, true, true);
2279 : }
2280 :
2281 3252 : if (!hipri)
2282 : return BLK_QC_T_NONE;
2283 0 : return cookie;
2284 5485 : queue_exit:
2285 5485 : blk_queue_exit(q);
2286 5485 : return BLK_QC_T_NONE;
2287 : }
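/*
 * Rough summary (informal, for orientation only) of the dispatch decisions
 * made above once a request has been allocated:
 *
 *	if (flush/fua)            -> blk_insert_flush() + run hw queue
 *	else if (plug && (single hw queue || ->commit_rqs || rotational))
 *	                          -> add to plug list
 *	else if (elevator)        -> blk_mq_sched_insert_request()
 *	else if (plug && !nomerges)
 *	                          -> limited plugging with same_queue_rq
 *	else if ((multiple hw queues && sync) || hctx not busy)
 *	                          -> blk_mq_try_issue_directly()
 *	else                      -> blk_mq_sched_insert_request()
 */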
2288 :
2289 0 : void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
2290 : unsigned int hctx_idx)
2291 : {
2292 0 : struct page *page;
2293 :
2294 0 : if (tags->rqs && set->ops->exit_request) {
2295 : int i;
2296 :
2297 0 : for (i = 0; i < tags->nr_tags; i++) {
2298 0 : struct request *rq = tags->static_rqs[i];
2299 :
2300 0 : if (!rq)
2301 0 : continue;
2302 0 : set->ops->exit_request(set, rq, hctx_idx);
2303 0 : tags->static_rqs[i] = NULL;
2304 : }
2305 : }
2306 :
2307 0 : while (!list_empty(&tags->page_list)) {
2308 0 : page = list_first_entry(&tags->page_list, struct page, lru);
2309 0 : list_del_init(&page->lru);
2310 : /*
2311 : * Remove kmemleak object previously allocated in
2312 : * blk_mq_alloc_rqs().
2313 : */
2314 0 : kmemleak_free(page_address(page));
2315 0 : __free_pages(page, page->private);
2316 : }
2317 0 : }
2318 :
2319 0 : void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags)
2320 : {
2321 0 : kfree(tags->rqs);
2322 0 : tags->rqs = NULL;
2323 0 : kfree(tags->static_rqs);
2324 0 : tags->static_rqs = NULL;
2325 :
2326 0 : blk_mq_free_tags(tags, flags);
2327 0 : }
2328 :
2329 9 : struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
2330 : unsigned int hctx_idx,
2331 : unsigned int nr_tags,
2332 : unsigned int reserved_tags,
2333 : unsigned int flags)
2334 : {
2335 9 : struct blk_mq_tags *tags;
2336 9 : int node;
2337 :
2338 9 : node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
2339 9 : if (node == NUMA_NO_NODE)
2340 0 : node = set->numa_node;
2341 :
2342 9 : tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags);
2343 9 : if (!tags)
2344 : return NULL;
2345 :
2346 9 : tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
2347 : GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
2348 : node);
2349 9 : if (!tags->rqs) {
2350 0 : blk_mq_free_tags(tags, flags);
2351 0 : return NULL;
2352 : }
2353 :
2354 9 : tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *),
2355 : GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
2356 : node);
2357 9 : if (!tags->static_rqs) {
2358 0 : kfree(tags->rqs);
2359 0 : blk_mq_free_tags(tags, flags);
2360 0 : return NULL;
2361 : }
2362 :
2363 : return tags;
2364 : }
2365 :
2366 466 : static size_t order_to_size(unsigned int order)
2367 : {
2368 466 : return (size_t)PAGE_SIZE << order;
2369 : }
2370 :
2371 2057 : static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
2372 : unsigned int hctx_idx, int node)
2373 : {
2374 2057 : int ret;
2375 :
2376 2057 : if (set->ops->init_request) {
2377 2057 : ret = set->ops->init_request(set, rq, hctx_idx, node);
2378 2057 : if (ret)
2379 : return ret;
2380 : }
2381 :
2382 2057 : WRITE_ONCE(rq->state, MQ_RQ_IDLE);
2383 2057 : return 0;
2384 : }
2385 :
2386 9 : int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
2387 : unsigned int hctx_idx, unsigned int depth)
2388 : {
2389 9 : unsigned int i, j, entries_per_page, max_order = 4;
2390 9 : size_t rq_size, left;
2391 9 : int node;
2392 :
2393 9 : node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
2394 9 : if (node == NUMA_NO_NODE)
2395 0 : node = set->numa_node;
2396 :
2397 9 : INIT_LIST_HEAD(&tags->page_list);
2398 :
2399 : /*
2400 : * rq_size is the size of the request plus driver payload, rounded
2401 : * to the cacheline size
2402 : */
2403 9 : rq_size = round_up(sizeof(struct request) + set->cmd_size,
2404 : cache_line_size());
2405 9 : left = rq_size * depth;
2406 :
2407 164 : for (i = 0; i < depth; ) {
2408 : int this_order = max_order;
2409 : struct page *page;
2410 : int to_do;
2411 : void *p;
2412 :
2413 156 : while (this_order && left < order_to_size(this_order - 1))
2414 : this_order--;
2415 :
2416 155 : do {
2417 155 : page = alloc_pages_node(node,
2418 : GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
2419 : this_order);
2420 155 : if (page)
2421 : break;
2422 0 : if (!this_order--)
2423 : break;
2424 0 : if (order_to_size(this_order) < rq_size)
2425 : break;
2426 : } while (1);
2427 :
2428 155 : if (!page)
2429 0 : goto fail;
2430 :
2431 155 : page->private = this_order;
2432 155 : list_add_tail(&page->lru, &tags->page_list);
2433 :
2434 155 : p = page_address(page);
2435 : /*
2436 : * Allow kmemleak to scan these pages as they contain pointers
2437 : * to additional allocations made via ops->init_request().
2438 : */
2439 155 : kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
2440 155 : entries_per_page = order_to_size(this_order) / rq_size;
2441 155 : to_do = min(entries_per_page, depth - i);
2442 155 : left -= to_do * rq_size;
2443 2203 : for (j = 0; j < to_do; j++) {
2444 2048 : struct request *rq = p;
2445 :
2446 2048 : tags->static_rqs[i] = rq;
2447 2048 : if (blk_mq_init_request(set, rq, hctx_idx, node)) {
2448 0 : tags->static_rqs[i] = NULL;
2449 0 : goto fail;
2450 : }
2451 :
2452 2048 : p += rq_size;
2453 2048 : i++;
2454 : }
2455 : }
2456 : return 0;
2457 :
2458 0 : fail:
2459 0 : blk_mq_free_rqs(set, tags, hctx_idx);
2460 0 : return -ENOMEM;
2461 : }
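/*
 * Sizing example with assumed numbers (illustrative only): if
 * sizeof(struct request) + set->cmd_size rounds up to 512 bytes and the
 * max_order allocation succeeds (order 4, i.e. 64KiB on 4KiB pages), then
 * entries_per_page = 65536 / 512 = 128, so a queue depth of 256 needs two
 * such chunks on tags->page_list.
 */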
2462 :
2463 : struct rq_iter_data {
2464 : struct blk_mq_hw_ctx *hctx;
2465 : bool has_rq;
2466 : };
2467 :
2468 0 : static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
2469 : {
2470 0 : struct rq_iter_data *iter_data = data;
2471 :
2472 0 : if (rq->mq_hctx != iter_data->hctx)
2473 : return true;
2474 0 : iter_data->has_rq = true;
2475 0 : return false;
2476 : }
2477 :
2478 0 : static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
2479 : {
2480 0 : struct blk_mq_tags *tags = hctx->sched_tags ?
2481 0 : hctx->sched_tags : hctx->tags;
2482 0 : struct rq_iter_data data = {
2483 : .hctx = hctx,
2484 : };
2485 :
2486 0 : blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
2487 0 : return data.has_rq;
2488 : }
2489 :
2490 0 : static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
2491 : struct blk_mq_hw_ctx *hctx)
2492 : {
2493 0 : if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu)
2494 : return false;
2495 0 : if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
2496 0 : return false;
2497 : return true;
2498 : }
2499 :
2500 0 : static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
2501 : {
2502 0 : struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
2503 : struct blk_mq_hw_ctx, cpuhp_online);
2504 :
2505 0 : if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
2506 0 : !blk_mq_last_cpu_in_hctx(cpu, hctx))
2507 0 : return 0;
2508 :
2509 : /*
2510 : * Prevent new requests from being allocated on the current hctx.
2511 : *
2512 : * The smp_mb__after_atomic() pairs with the implied barrier in
2513 : * test_and_set_bit_lock() in sbitmap_get(); it ensures the inactive
2514 : * flag is seen once we return from the tag allocator.
2515 : */
2516 0 : set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
2517 0 : smp_mb__after_atomic();
2518 :
2519 : /*
2520 : * Try to grab a reference to the queue and wait for any outstanding
2521 : * requests. If we could not grab a reference the queue has been
2522 : * frozen and there are no requests.
2523 : */
2524 0 : if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
2525 0 : while (blk_mq_hctx_has_requests(hctx))
2526 0 : msleep(5);
2527 0 : percpu_ref_put(&hctx->queue->q_usage_counter);
2528 : }
2529 :
2530 : return 0;
2531 : }
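/*
 * Informal sketch of the ordering relied on above; the matching
 * allocator-side check lives in the tag allocation path, not in this
 * function:
 *
 *	CPU offline path:			tag allocator:
 *	  set_bit(BLK_MQ_S_INACTIVE)		  grab tag (implied barrier)
 *	  smp_mb__after_atomic()		  test BLK_MQ_S_INACTIVE
 *	  wait for in-flight requests		  if set, put the tag back
 *
 * Either the allocator observes the INACTIVE flag and backs out, or this
 * function observes the newly allocated request while draining.
 */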
2532 :
2533 0 : static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
2534 : {
2535 0 : struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
2536 : struct blk_mq_hw_ctx, cpuhp_online);
2537 :
2538 0 : if (cpumask_test_cpu(cpu, hctx->cpumask))
2539 0 : clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
2540 0 : return 0;
2541 : }
2542 :
2543 : /*
2544 : * 'cpu' is going away. Splice any existing rq_list entries from this
2545 : * software queue to the hw queue dispatch list, and ensure that it
2546 : * gets run.
2547 : */
2548 0 : static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
2549 : {
2550 0 : struct blk_mq_hw_ctx *hctx;
2551 0 : struct blk_mq_ctx *ctx;
2552 0 : LIST_HEAD(tmp);
2553 0 : enum hctx_type type;
2554 :
2555 0 : hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
2556 0 : if (!cpumask_test_cpu(cpu, hctx->cpumask))
2557 : return 0;
2558 :
2559 0 : ctx = __blk_mq_get_ctx(hctx->queue, cpu);
2560 0 : type = hctx->type;
2561 :
2562 0 : spin_lock(&ctx->lock);
2563 0 : if (!list_empty(&ctx->rq_lists[type])) {
2564 0 : list_splice_init(&ctx->rq_lists[type], &tmp);
2565 0 : blk_mq_hctx_clear_pending(hctx, ctx);
2566 : }
2567 0 : spin_unlock(&ctx->lock);
2568 :
2569 0 : if (list_empty(&tmp))
2570 : return 0;
2571 :
2572 0 : spin_lock(&hctx->lock);
2573 0 : list_splice_tail_init(&tmp, &hctx->dispatch);
2574 0 : spin_unlock(&hctx->lock);
2575 :
2576 0 : blk_mq_run_hw_queue(hctx, true);
2577 0 : return 0;
2578 : }
2579 :
2580 0 : static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
2581 : {
2582 0 : if (!(hctx->flags & BLK_MQ_F_STACKING))
2583 0 : cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
2584 : &hctx->cpuhp_online);
2585 0 : cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
2586 : &hctx->cpuhp_dead);
2587 0 : }
2588 :
2589 : /* hctx->ctxs will be freed in queue's release handler */
2590 0 : static void blk_mq_exit_hctx(struct request_queue *q,
2591 : struct blk_mq_tag_set *set,
2592 : struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
2593 : {
2594 0 : if (blk_mq_hw_queue_mapped(hctx))
2595 0 : blk_mq_tag_idle(hctx);
2596 :
2597 0 : if (set->ops->exit_request)
2598 0 : set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
2599 :
2600 0 : if (set->ops->exit_hctx)
2601 0 : set->ops->exit_hctx(hctx, hctx_idx);
2602 :
2603 0 : blk_mq_remove_cpuhp(hctx);
2604 :
2605 0 : spin_lock(&q->unused_hctx_lock);
2606 0 : list_add(&hctx->hctx_list, &q->unused_hctx_list);
2607 0 : spin_unlock(&q->unused_hctx_lock);
2608 0 : }
2609 :
2610 0 : static void blk_mq_exit_hw_queues(struct request_queue *q,
2611 : struct blk_mq_tag_set *set, int nr_queue)
2612 : {
2613 0 : struct blk_mq_hw_ctx *hctx;
2614 0 : unsigned int i;
2615 :
2616 0 : queue_for_each_hw_ctx(q, hctx, i) {
2617 0 : if (i == nr_queue)
2618 : break;
2619 0 : blk_mq_debugfs_unregister_hctx(hctx);
2620 0 : blk_mq_exit_hctx(q, set, hctx, i);
2621 : }
2622 0 : }
2623 :
2624 9 : static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
2625 : {
2626 9 : int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
2627 :
2628 9 : BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
2629 : __alignof__(struct blk_mq_hw_ctx)) !=
2630 : sizeof(struct blk_mq_hw_ctx));
2631 :
2632 9 : if (tag_set->flags & BLK_MQ_F_BLOCKING)
2633 0 : hw_ctx_size += sizeof(struct srcu_struct);
2634 :
2635 9 : return hw_ctx_size;
2636 : }
2637 :
2638 9 : static int blk_mq_init_hctx(struct request_queue *q,
2639 : struct blk_mq_tag_set *set,
2640 : struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
2641 : {
2642 9 : hctx->queue_num = hctx_idx;
2643 :
2644 9 : if (!(hctx->flags & BLK_MQ_F_STACKING))
2645 1 : cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
2646 : &hctx->cpuhp_online);
2647 9 : cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
2648 :
2649 9 : hctx->tags = set->tags[hctx_idx];
2650 :
2651 9 : if (set->ops->init_hctx &&
2652 0 : set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
2653 0 : goto unregister_cpu_notifier;
2654 :
2655 9 : if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
2656 9 : hctx->numa_node))
2657 0 : goto exit_hctx;
2658 : return 0;
2659 :
2660 0 : exit_hctx:
2661 0 : if (set->ops->exit_hctx)
2662 0 : set->ops->exit_hctx(hctx, hctx_idx);
2663 0 : unregister_cpu_notifier:
2664 0 : blk_mq_remove_cpuhp(hctx);
2665 0 : return -1;
2666 : }
2667 :
2668 : static struct blk_mq_hw_ctx *
2669 9 : blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
2670 : int node)
2671 : {
2672 9 : struct blk_mq_hw_ctx *hctx;
2673 9 : gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
2674 :
2675 9 : hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
2676 9 : if (!hctx)
2677 0 : goto fail_alloc_hctx;
2678 :
2679 9 : if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
2680 : goto free_hctx;
2681 :
2682 9 : atomic_set(&hctx->nr_active, 0);
2683 9 : if (node == NUMA_NO_NODE)
2684 0 : node = set->numa_node;
2685 9 : hctx->numa_node = node;
2686 :
2687 9 : INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
2688 9 : spin_lock_init(&hctx->lock);
2689 9 : INIT_LIST_HEAD(&hctx->dispatch);
2690 9 : hctx->queue = q;
2691 9 : hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;
2692 :
2693 9 : INIT_LIST_HEAD(&hctx->hctx_list);
2694 :
2695 : /*
2696 : * Allocate space for all possible cpus to avoid allocation at
2697 : * runtime
2698 : */
2699 9 : hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
2700 : gfp, node);
2701 9 : if (!hctx->ctxs)
2702 0 : goto free_cpumask;
2703 :
2704 9 : if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
2705 : gfp, node))
2706 0 : goto free_ctxs;
2707 9 : hctx->nr_ctx = 0;
2708 :
2709 9 : spin_lock_init(&hctx->dispatch_wait_lock);
2710 9 : init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
2711 9 : INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
2712 :
2713 9 : hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
2714 9 : if (!hctx->fq)
2715 0 : goto free_bitmap;
2716 :
2717 9 : if (hctx->flags & BLK_MQ_F_BLOCKING)
2718 0 : init_srcu_struct(hctx->srcu);
2719 9 : blk_mq_hctx_kobj_init(hctx);
2720 :
2721 9 : return hctx;
2722 :
2723 0 : free_bitmap:
2724 0 : sbitmap_free(&hctx->ctx_map);
2725 0 : free_ctxs:
2726 0 : kfree(hctx->ctxs);
2727 0 : free_cpumask:
2728 0 : free_cpumask_var(hctx->cpumask);
2729 0 : free_hctx:
2730 0 : kfree(hctx);
2731 : fail_alloc_hctx:
2732 : return NULL;
2733 : }
2734 :
2735 9 : static void blk_mq_init_cpu_queues(struct request_queue *q,
2736 : unsigned int nr_hw_queues)
2737 : {
2738 9 : struct blk_mq_tag_set *set = q->tag_set;
2739 9 : unsigned int i, j;
2740 :
2741 45 : for_each_possible_cpu(i) {
2742 36 : struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
2743 36 : struct blk_mq_hw_ctx *hctx;
2744 36 : int k;
2745 :
2746 36 : __ctx->cpu = i;
2747 36 : spin_lock_init(&__ctx->lock);
2748 180 : for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
2749 108 : INIT_LIST_HEAD(&__ctx->rq_lists[k]);
2750 :
2751 36 : __ctx->queue = q;
2752 :
2753 : /*
2754 : * Set local node, IFF we have more than one hw queue. If
2755 : * not, we remain on the home node of the device.
2756 : */
2757 72 : for (j = 0; j < set->nr_maps; j++) {
2758 36 : hctx = blk_mq_map_queue_type(q, j, i);
2759 36 : if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
2760 0 : hctx->numa_node = cpu_to_node(i);
2761 : }
2762 : }
2763 9 : }
2764 :
2765 9 : static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
2766 : int hctx_idx)
2767 : {
2768 9 : unsigned int flags = set->flags;
2769 9 : int ret = 0;
2770 :
2771 9 : set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
2772 : set->queue_depth, set->reserved_tags, flags);
2773 9 : if (!set->tags[hctx_idx])
2774 : return false;
2775 :
2776 9 : ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx,
2777 : set->queue_depth);
2778 9 : if (!ret)
2779 : return true;
2780 :
2781 0 : blk_mq_free_rq_map(set->tags[hctx_idx], flags);
2782 0 : set->tags[hctx_idx] = NULL;
2783 0 : return false;
2784 : }
2785 :
2786 0 : static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
2787 : unsigned int hctx_idx)
2788 : {
2789 0 : unsigned int flags = set->flags;
2790 :
2791 0 : if (set->tags && set->tags[hctx_idx]) {
2792 0 : blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
2793 0 : blk_mq_free_rq_map(set->tags[hctx_idx], flags);
2794 0 : set->tags[hctx_idx] = NULL;
2795 : }
2796 0 : }
2797 :
2798 9 : static void blk_mq_map_swqueue(struct request_queue *q)
2799 : {
2800 9 : unsigned int i, j, hctx_idx;
2801 9 : struct blk_mq_hw_ctx *hctx;
2802 9 : struct blk_mq_ctx *ctx;
2803 9 : struct blk_mq_tag_set *set = q->tag_set;
2804 :
2805 18 : queue_for_each_hw_ctx(q, hctx, i) {
2806 9 : cpumask_clear(hctx->cpumask);
2807 9 : hctx->nr_ctx = 0;
2808 9 : hctx->dispatch_from = NULL;
2809 : }
2810 :
2811 : /*
2812 : * Map software to hardware queues.
2813 : *
2814 : * If the cpu isn't present, the cpu is mapped to first hctx.
2815 : */
2816 45 : for_each_possible_cpu(i) {
2817 :
2818 36 : ctx = per_cpu_ptr(q->queue_ctx, i);
2819 72 : for (j = 0; j < set->nr_maps; j++) {
2820 36 : if (!set->map[j].nr_queues) {
2821 0 : ctx->hctxs[j] = blk_mq_map_queue_type(q,
2822 : HCTX_TYPE_DEFAULT, i);
2823 0 : continue;
2824 : }
2825 36 : hctx_idx = set->map[j].mq_map[i];
2826 : /* unmapped hw queue can be remapped after CPU topo changed */
2827 36 : if (!set->tags[hctx_idx] &&
2828 0 : !__blk_mq_alloc_map_and_request(set, hctx_idx)) {
2829 : /*
2830 : * If tags initialization fails for some hctx,
2831 : * that hctx won't be brought online. In this
2832 : * case, remap the current ctx to hctx[0] which
2833 : * is guaranteed to always have tags allocated.
2834 : */
2835 0 : set->map[j].mq_map[i] = 0;
2836 : }
2837 :
2838 36 : hctx = blk_mq_map_queue_type(q, j, i);
2839 36 : ctx->hctxs[j] = hctx;
2840 : /*
2841 : * If the CPU is already set in the mask, then we've
2842 : * mapped this one already. This can happen if
2843 : * devices share queues across queue maps.
2844 : */
2845 36 : if (cpumask_test_cpu(i, hctx->cpumask))
2846 0 : continue;
2847 :
2848 36 : cpumask_set_cpu(i, hctx->cpumask);
2849 36 : hctx->type = j;
2850 36 : ctx->index_hw[hctx->type] = hctx->nr_ctx;
2851 36 : hctx->ctxs[hctx->nr_ctx++] = ctx;
2852 :
2853 : /*
2854 : * If the nr_ctx type overflows, we have exceeded the
2855 : * amount of sw queues we can support.
2856 : */
2857 36 : BUG_ON(!hctx->nr_ctx);
2858 : }
2859 :
2860 108 : for (; j < HCTX_MAX_TYPES; j++)
2861 72 : ctx->hctxs[j] = blk_mq_map_queue_type(q,
2862 : HCTX_TYPE_DEFAULT, i);
2863 : }
2864 :
2865 18 : queue_for_each_hw_ctx(q, hctx, i) {
2866 : /*
2867 : * If no software queues are mapped to this hardware queue,
2868 : * disable it and free the request entries.
2869 : */
2870 9 : if (!hctx->nr_ctx) {
2871 : /* Never unmap queue 0. We need it as a
2872 : * fallback in case a new remap fails
2873 : * allocation.
2874 : */
2875 0 : if (i && set->tags[i])
2876 0 : blk_mq_free_map_and_requests(set, i);
2877 :
2878 0 : hctx->tags = NULL;
2879 0 : continue;
2880 : }
2881 :
2882 9 : hctx->tags = set->tags[i];
2883 9 : WARN_ON(!hctx->tags);
2884 :
2885 : /*
2886 : * Set the map size to the number of mapped software queues.
2887 : * This is more accurate and more efficient than looping
2888 : * over all possibly mapped software queues.
2889 : */
2890 9 : sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);
2891 :
2892 : /*
2893 : * Initialize batch roundrobin counts
2894 : */
2895 9 : hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
2896 9 : hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
2897 : }
2898 9 : }
2899 :
2900 : /*
2901 : * Caller needs to ensure that we're either frozen/quiesced, or that
2902 : * the queue isn't live yet.
2903 : */
2904 0 : static void queue_set_hctx_shared(struct request_queue *q, bool shared)
2905 : {
2906 0 : struct blk_mq_hw_ctx *hctx;
2907 0 : int i;
2908 :
2909 0 : queue_for_each_hw_ctx(q, hctx, i) {
2910 0 : if (shared)
2911 0 : hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
2912 : else
2913 0 : hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
2914 : }
2915 0 : }
2916 :
2917 0 : static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set,
2918 : bool shared)
2919 : {
2920 0 : struct request_queue *q;
2921 :
2922 0 : lockdep_assert_held(&set->tag_list_lock);
2923 :
2924 0 : list_for_each_entry(q, &set->tag_list, tag_set_list) {
2925 0 : blk_mq_freeze_queue(q);
2926 0 : queue_set_hctx_shared(q, shared);
2927 0 : blk_mq_unfreeze_queue(q);
2928 : }
2929 0 : }
2930 :
2931 0 : static void blk_mq_del_queue_tag_set(struct request_queue *q)
2932 : {
2933 0 : struct blk_mq_tag_set *set = q->tag_set;
2934 :
2935 0 : mutex_lock(&set->tag_list_lock);
2936 0 : list_del(&q->tag_set_list);
2937 0 : if (list_is_singular(&set->tag_list)) {
2938 : /* just transitioned to unshared */
2939 0 : set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
2940 : /* update existing queue */
2941 0 : blk_mq_update_tag_set_shared(set, false);
2942 : }
2943 0 : mutex_unlock(&set->tag_list_lock);
2944 0 : INIT_LIST_HEAD(&q->tag_set_list);
2945 0 : }
2946 :
2947 9 : static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
2948 : struct request_queue *q)
2949 : {
2950 9 : mutex_lock(&set->tag_list_lock);
2951 :
2952 : /*
2953 : * Check to see if we're transitioning to shared (from 1 to 2 queues).
2954 : */
2955 9 : if (!list_empty(&set->tag_list) &&
2956 0 : !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
2957 0 : set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
2958 : /* update existing queue */
2959 0 : blk_mq_update_tag_set_shared(set, true);
2960 : }
2961 9 : if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
2962 0 : queue_set_hctx_shared(q, true);
2963 9 : list_add_tail(&q->tag_set_list, &set->tag_list);
2964 :
2965 9 : mutex_unlock(&set->tag_list_lock);
2966 9 : }
2967 :
2968 : /* All allocations will be freed in release handler of q->mq_kobj */
2969 9 : static int blk_mq_alloc_ctxs(struct request_queue *q)
2970 : {
2971 9 : struct blk_mq_ctxs *ctxs;
2972 9 : int cpu;
2973 :
2974 9 : ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL);
2975 9 : if (!ctxs)
2976 : return -ENOMEM;
2977 :
2978 9 : ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx);
2979 9 : if (!ctxs->queue_ctx)
2980 0 : goto fail;
2981 :
2982 45 : for_each_possible_cpu(cpu) {
2983 36 : struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu);
2984 36 : ctx->ctxs = ctxs;
2985 : }
2986 :
2987 9 : q->mq_kobj = &ctxs->kobj;
2988 9 : q->queue_ctx = ctxs->queue_ctx;
2989 :
2990 9 : return 0;
2991 0 : fail:
2992 0 : kfree(ctxs);
2993 0 : return -ENOMEM;
2994 : }
2995 :
2996 : /*
2997 : * This is the actual release handler for mq, but we do it from the
2998 : * request queue's release handler to avoid use-after-free and other
2999 : * headaches; q->mq_kobj shouldn't have been introduced,
3000 : * but we can't group the ctx/kctx kobjects without it.
3001 : */
3002 0 : void blk_mq_release(struct request_queue *q)
3003 : {
3004 0 : struct blk_mq_hw_ctx *hctx, *next;
3005 0 : int i;
3006 :
3007 0 : queue_for_each_hw_ctx(q, hctx, i)
3008 0 : WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
3009 :
3010 : /* all hctx are in .unused_hctx_list now */
3011 0 : list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) {
3012 0 : list_del_init(&hctx->hctx_list);
3013 0 : kobject_put(&hctx->kobj);
3014 : }
3015 :
3016 0 : kfree(q->queue_hw_ctx);
3017 :
3018 : /*
3019 : * release .mq_kobj and the sw queues' kobjects now because
3020 : * both share their lifetime with the request queue.
3021 : */
3022 0 : blk_mq_sysfs_deinit(q);
3023 0 : }
3024 :
3025 9 : struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
3026 : void *queuedata)
3027 : {
3028 9 : struct request_queue *uninit_q, *q;
3029 :
3030 9 : uninit_q = blk_alloc_queue(set->numa_node);
3031 9 : if (!uninit_q)
3032 9 : return ERR_PTR(-ENOMEM);
3033 9 : uninit_q->queuedata = queuedata;
3034 :
3035 : /*
3036 : * Initialize the queue without an elevator. device_add_disk() will do
3037 : * the initialization.
3038 : */
3039 9 : q = blk_mq_init_allocated_queue(set, uninit_q, false);
3040 9 : if (IS_ERR(q))
3041 0 : blk_cleanup_queue(uninit_q);
3042 :
3043 : return q;
3044 : }
3045 : EXPORT_SYMBOL_GPL(blk_mq_init_queue_data);
3046 :
3047 9 : struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
3048 : {
3049 9 : return blk_mq_init_queue_data(set, NULL);
3050 : }
3051 : EXPORT_SYMBOL(blk_mq_init_queue);
3052 :
3053 : /*
3054 : * Helper for setting up a queue with mq ops, given queue depth, and
3055 : * the passed in mq ops flags.
3056 : */
3057 0 : struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
3058 : const struct blk_mq_ops *ops,
3059 : unsigned int queue_depth,
3060 : unsigned int set_flags)
3061 : {
3062 0 : struct request_queue *q;
3063 0 : int ret;
3064 :
3065 0 : memset(set, 0, sizeof(*set));
3066 0 : set->ops = ops;
3067 0 : set->nr_hw_queues = 1;
3068 0 : set->nr_maps = 1;
3069 0 : set->queue_depth = queue_depth;
3070 0 : set->numa_node = NUMA_NO_NODE;
3071 0 : set->flags = set_flags;
3072 :
3073 0 : ret = blk_mq_alloc_tag_set(set);
3074 0 : if (ret)
3075 0 : return ERR_PTR(ret);
3076 :
3077 0 : q = blk_mq_init_queue(set);
3078 0 : if (IS_ERR(q)) {
3079 0 : blk_mq_free_tag_set(set);
3080 0 : return q;
3081 : }
3082 :
3083 : return q;
3084 : }
3085 : EXPORT_SYMBOL(blk_mq_init_sq_queue);
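/*
 * Hedged usage sketch (not part of this file): a simple single-queue
 * driver could set up its queue in one call.  "mydev" and "my_mq_ops" are
 * hypothetical names; the tag_set must stay alive as long as the queue.
 *
 *	mydev->queue = blk_mq_init_sq_queue(&mydev->tag_set, &my_mq_ops,
 *					    64, BLK_MQ_F_SHOULD_MERGE);
 *	if (IS_ERR(mydev->queue))
 *		return PTR_ERR(mydev->queue);
 */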
3086 :
3087 9 : static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
3088 : struct blk_mq_tag_set *set, struct request_queue *q,
3089 : int hctx_idx, int node)
3090 : {
3091 9 : struct blk_mq_hw_ctx *hctx = NULL, *tmp;
3092 :
3093 : /* reuse dead hctx first */
3094 9 : spin_lock(&q->unused_hctx_lock);
3095 9 : list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) {
3096 0 : if (tmp->numa_node == node) {
3097 : hctx = tmp;
3098 : break;
3099 : }
3100 : }
3101 9 : if (hctx)
3102 0 : list_del_init(&hctx->hctx_list);
3103 9 : spin_unlock(&q->unused_hctx_lock);
3104 :
3105 9 : if (!hctx)
3106 9 : hctx = blk_mq_alloc_hctx(q, set, node);
3107 9 : if (!hctx)
3108 0 : goto fail;
3109 :
3110 9 : if (blk_mq_init_hctx(q, set, hctx, hctx_idx))
3111 0 : goto free_hctx;
3112 :
3113 : return hctx;
3114 :
3115 0 : free_hctx:
3116 0 : kobject_put(&hctx->kobj);
3117 : fail:
3118 : return NULL;
3119 : }
3120 :
3121 9 : static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
3122 : struct request_queue *q)
3123 : {
3124 9 : int i, j, end;
3125 9 : struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
3126 :
3127 9 : if (q->nr_hw_queues < set->nr_hw_queues) {
3128 9 : struct blk_mq_hw_ctx **new_hctxs;
3129 :
3130 9 : new_hctxs = kcalloc_node(set->nr_hw_queues,
3131 : sizeof(*new_hctxs), GFP_KERNEL,
3132 : set->numa_node);
3133 9 : if (!new_hctxs)
3134 : return;
3135 9 : if (hctxs)
3136 0 : memcpy(new_hctxs, hctxs, q->nr_hw_queues *
3137 : sizeof(*hctxs));
3138 9 : q->queue_hw_ctx = new_hctxs;
3139 9 : kfree(hctxs);
3140 9 : hctxs = new_hctxs;
3141 : }
3142 :
3143 : /* protect against switching io scheduler */
3144 9 : mutex_lock(&q->sysfs_lock);
3145 27 : for (i = 0; i < set->nr_hw_queues; i++) {
3146 9 : int node;
3147 9 : struct blk_mq_hw_ctx *hctx;
3148 :
3149 9 : node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
3150 : /*
3151 : * If the hw queue has been mapped to another numa node,
3152 : * we need to realloc the hctx. If allocation fails, fall back
3153 : * to the previous one.
3154 : */
3155 9 : if (hctxs[i] && (hctxs[i]->numa_node == node))
3156 0 : continue;
3157 :
3158 9 : hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
3159 9 : if (hctx) {
3160 9 : if (hctxs[i])
3161 0 : blk_mq_exit_hctx(q, set, hctxs[i], i);
3162 9 : hctxs[i] = hctx;
3163 : } else {
3164 0 : if (hctxs[i])
3165 0 : pr_warn("Allocate new hctx on node %d fails, fallback "
3166 : "to previous one on node %d\n",
3167 : node, hctxs[i]->numa_node);
3168 : else
3169 : break;
3170 : }
3171 : }
3172 : /*
3173 : * If increasing nr_hw_queues failed, free the newly allocated
3174 : * hctxs and keep the previous q->nr_hw_queues.
3175 : */
3176 9 : if (i != set->nr_hw_queues) {
3177 0 : j = q->nr_hw_queues;
3178 0 : end = i;
3179 : } else {
3180 9 : j = i;
3181 9 : end = q->nr_hw_queues;
3182 9 : q->nr_hw_queues = set->nr_hw_queues;
3183 : }
3184 :
3185 9 : for (; j < end; j++) {
3186 0 : struct blk_mq_hw_ctx *hctx = hctxs[j];
3187 :
3188 0 : if (hctx) {
3189 0 : if (hctx->tags)
3190 0 : blk_mq_free_map_and_requests(set, j);
3191 0 : blk_mq_exit_hctx(q, set, hctx, j);
3192 0 : hctxs[j] = NULL;
3193 : }
3194 : }
3195 9 : mutex_unlock(&q->sysfs_lock);
3196 : }
3197 :
3198 9 : struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
3199 : struct request_queue *q,
3200 : bool elevator_init)
3201 : {
3202 : /* mark the queue as mq asap */
3203 9 : q->mq_ops = set->ops;
3204 :
3205 9 : q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
3206 : blk_mq_poll_stats_bkt,
3207 : BLK_MQ_POLL_STATS_BKTS, q);
3208 9 : if (!q->poll_cb)
3209 0 : goto err_exit;
3210 :
3211 9 : if (blk_mq_alloc_ctxs(q))
3212 0 : goto err_poll;
3213 :
3214 : /* init q->mq_kobj and sw queues' kobjects */
3215 9 : blk_mq_sysfs_init(q);
3216 :
3217 9 : INIT_LIST_HEAD(&q->unused_hctx_list);
3218 9 : spin_lock_init(&q->unused_hctx_lock);
3219 :
3220 9 : blk_mq_realloc_hw_ctxs(set, q);
3221 9 : if (!q->nr_hw_queues)
3222 0 : goto err_hctxs;
3223 :
3224 9 : INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
3225 9 : blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
3226 :
3227 9 : q->tag_set = set;
3228 :
3229 9 : q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
3230 9 : if (set->nr_maps > HCTX_TYPE_POLL &&
3231 0 : set->map[HCTX_TYPE_POLL].nr_queues)
3232 0 : blk_queue_flag_set(QUEUE_FLAG_POLL, q);
3233 :
3234 9 : q->sg_reserved_size = INT_MAX;
3235 :
3236 9 : INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
3237 9 : INIT_LIST_HEAD(&q->requeue_list);
3238 9 : spin_lock_init(&q->requeue_lock);
3239 :
3240 9 : q->nr_requests = set->queue_depth;
3241 :
3242 : /*
3243 : * Default to classic polling
3244 : */
3245 9 : q->poll_nsec = BLK_MQ_POLL_CLASSIC;
3246 :
3247 9 : blk_mq_init_cpu_queues(q, set->nr_hw_queues);
3248 9 : blk_mq_add_queue_tag_set(set, q);
3249 9 : blk_mq_map_swqueue(q);
3250 :
3251 9 : if (elevator_init)
3252 0 : elevator_init_mq(q);
3253 :
3254 : return q;
3255 :
3256 0 : err_hctxs:
3257 0 : kfree(q->queue_hw_ctx);
3258 0 : q->nr_hw_queues = 0;
3259 0 : blk_mq_sysfs_deinit(q);
3260 0 : err_poll:
3261 0 : blk_stat_free_callback(q->poll_cb);
3262 0 : q->poll_cb = NULL;
3263 0 : err_exit:
3264 0 : q->mq_ops = NULL;
3265 0 : return ERR_PTR(-ENOMEM);
3266 : }
3267 : EXPORT_SYMBOL(blk_mq_init_allocated_queue);
3268 :
3269 : /* tags can _not_ be used after returning from blk_mq_exit_queue */
3270 0 : void blk_mq_exit_queue(struct request_queue *q)
3271 : {
3272 0 : struct blk_mq_tag_set *set = q->tag_set;
3273 :
3274 0 : blk_mq_del_queue_tag_set(q);
3275 0 : blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
3276 0 : }
3277 :
3278 9 : static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
3279 : {
3280 9 : int i;
3281 :
3282 18 : for (i = 0; i < set->nr_hw_queues; i++) {
3283 9 : if (!__blk_mq_alloc_map_and_request(set, i))
3284 0 : goto out_unwind;
3285 9 : cond_resched();
3286 : }
3287 :
3288 : return 0;
3289 :
3290 0 : out_unwind:
3291 0 : while (--i >= 0)
3292 0 : blk_mq_free_map_and_requests(set, i);
3293 :
3294 : return -ENOMEM;
3295 : }
3296 :
3297 : /*
3298 : * Allocate the request maps associated with this tag_set. Note that this
3299 : * may reduce the depth asked for, if memory is tight. set->queue_depth
3300 : * will be updated to reflect the allocated depth.
3301 : */
3302 9 : static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set)
3303 : {
3304 9 : unsigned int depth;
3305 9 : int err;
3306 :
3307 9 : depth = set->queue_depth;
3308 9 : do {
3309 9 : err = __blk_mq_alloc_rq_maps(set);
3310 9 : if (!err)
3311 : break;
3312 :
3313 0 : set->queue_depth >>= 1;
3314 0 : if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
3315 : err = -ENOMEM;
3316 : break;
3317 : }
3318 0 : } while (set->queue_depth);
3319 :
3320 9 : if (!set->queue_depth || err) {
3321 0 : pr_err("blk-mq: failed to allocate request map\n");
3322 0 : return -ENOMEM;
3323 : }
3324 :
3325 9 : if (depth != set->queue_depth)
3326 0 : pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
3327 : depth, set->queue_depth);
3328 :
3329 : return 0;
3330 : }
3331 :
3332 9 : static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
3333 : {
3334 : /*
3335 : * blk_mq_map_queues() and multiple .map_queues() implementations
3336 : * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the
3337 : * number of hardware queues.
3338 : */
3339 9 : if (set->nr_maps == 1)
3340 9 : set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;
3341 :
3342 9 : if (set->ops->map_queues && !is_kdump_kernel()) {
3343 : int i;
3344 :
3345 : /*
3346 : * transport .map_queues is usually done in the following
3347 : * way:
3348 : *
3349 : * for (queue = 0; queue < set->nr_hw_queues; queue++) {
3350 : * mask = get_cpu_mask(queue)
3351 : * for_each_cpu(cpu, mask)
3352 : * set->map[x].mq_map[cpu] = queue;
3353 : * }
3354 : *
3355 : * When we need to remap, the table has to be cleared to
3356 : * kill stale mappings, since a CPU may not be mapped
3357 : * to any hw queue.
3358 : */
3359 2 : for (i = 0; i < set->nr_maps; i++)
3360 1 : blk_mq_clear_mq_map(&set->map[i]);
3361 :
3362 1 : return set->ops->map_queues(set);
3363 : } else {
3364 8 : BUG_ON(set->nr_maps > 1);
3365 8 : return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
3366 : }
3367 : }
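/*
 * Hedged example (not from this file) of a transport ->map_queues()
 * following the pattern described in the comment above: a PCI driver
 * typically just spreads its hardware queues across the device's
 * interrupt vectors.  "my_dev" is hypothetical; blk_mq_pci_map_queues()
 * is the real helper from <linux/blk-mq-pci.h>.
 *
 *	static int my_map_queues(struct blk_mq_tag_set *set)
 *	{
 *		struct my_dev *mydev = set->driver_data;
 *
 *		return blk_mq_pci_map_queues(&set->map[HCTX_TYPE_DEFAULT],
 *					     mydev->pdev, 0);
 *	}
 */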
3368 :
3369 9 : static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
3370 : int cur_nr_hw_queues, int new_nr_hw_queues)
3371 : {
3372 9 : struct blk_mq_tags **new_tags;
3373 :
3374 9 : if (cur_nr_hw_queues >= new_nr_hw_queues)
3375 : return 0;
3376 :
3377 9 : new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *),
3378 : GFP_KERNEL, set->numa_node);
3379 9 : if (!new_tags)
3380 : return -ENOMEM;
3381 :
3382 9 : if (set->tags)
3383 0 : memcpy(new_tags, set->tags, cur_nr_hw_queues *
3384 : sizeof(*set->tags));
3385 9 : kfree(set->tags);
3386 9 : set->tags = new_tags;
3387 9 : set->nr_hw_queues = new_nr_hw_queues;
3388 :
3389 9 : return 0;
3390 : }
3391 :
3392 9 : static int blk_mq_alloc_tag_set_tags(struct blk_mq_tag_set *set,
3393 : int new_nr_hw_queues)
3394 : {
3395 9 : return blk_mq_realloc_tag_set_tags(set, 0, new_nr_hw_queues);
3396 : }
3397 :
3398 : /*
3399 : * Alloc a tag set to be associated with one or more request queues.
3400 : * May fail with EINVAL for various error conditions. May adjust the
3401 : * requested depth down, if it's too large. In that case, the set
3402 : * value will be stored in set->queue_depth.
3403 : */
3404 9 : int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
3405 : {
3406 9 : int i, ret;
3407 :
3408 9 : BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
3409 :
3410 9 : if (!set->nr_hw_queues)
3411 : return -EINVAL;
3412 9 : if (!set->queue_depth)
3413 : return -EINVAL;
3414 9 : if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
3415 : return -EINVAL;
3416 :
3417 9 : if (!set->ops->queue_rq)
3418 : return -EINVAL;
3419 :
3420 9 : if (!set->ops->get_budget ^ !set->ops->put_budget)
3421 : return -EINVAL;
3422 :
3423 9 : if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
3424 0 : pr_info("blk-mq: reduced tag depth to %u\n",
3425 : BLK_MQ_MAX_DEPTH);
3426 0 : set->queue_depth = BLK_MQ_MAX_DEPTH;
3427 : }
3428 :
3429 9 : if (!set->nr_maps)
3430 9 : set->nr_maps = 1;
3431 0 : else if (set->nr_maps > HCTX_MAX_TYPES)
3432 : return -EINVAL;
3433 :
3434 : /*
3435 : * If a crashdump is active, then we are potentially in a very
3436 : * memory constrained environment. Limit us to 1 queue and
3437 : * 64 tags to prevent using too much memory.
3438 : */
3439 9 : if (is_kdump_kernel()) {
3440 : set->nr_hw_queues = 1;
3441 : set->nr_maps = 1;
3442 : set->queue_depth = min(64U, set->queue_depth);
3443 : }
3444 : /*
3445 : * There is no use for more h/w queues than cpus if we just have
3446 : * a single map.
3447 : */
3448 9 : if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
3449 0 : set->nr_hw_queues = nr_cpu_ids;
3450 :
3451 9 : if (blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues) < 0)
3452 : return -ENOMEM;
3453 :
3454 18 : ret = -ENOMEM;
3455 18 : for (i = 0; i < set->nr_maps; i++) {
3456 9 : set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
3457 : sizeof(set->map[i].mq_map[0]),
3458 : GFP_KERNEL, set->numa_node);
3459 9 : if (!set->map[i].mq_map)
3460 0 : goto out_free_mq_map;
3461 9 : set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
3462 : }
3463 :
3464 9 : ret = blk_mq_update_queue_map(set);
3465 9 : if (ret)
3466 0 : goto out_free_mq_map;
3467 :
3468 9 : ret = blk_mq_alloc_map_and_requests(set);
3469 9 : if (ret)
3470 0 : goto out_free_mq_map;
3471 :
3472 9 : if (blk_mq_is_sbitmap_shared(set->flags)) {
3473 0 : atomic_set(&set->active_queues_shared_sbitmap, 0);
3474 :
3475 0 : if (blk_mq_init_shared_sbitmap(set, set->flags)) {
3476 0 : ret = -ENOMEM;
3477 0 : goto out_free_mq_rq_maps;
3478 : }
3479 : }
3480 :
3481 9 : mutex_init(&set->tag_list_lock);
3482 9 : INIT_LIST_HEAD(&set->tag_list);
3483 :
3484 9 : return 0;
3485 :
3486 0 : out_free_mq_rq_maps:
3487 0 : for (i = 0; i < set->nr_hw_queues; i++)
3488 0 : blk_mq_free_map_and_requests(set, i);
3489 0 : out_free_mq_map:
3490 0 : for (i = 0; i < set->nr_maps; i++) {
3491 0 : kfree(set->map[i].mq_map);
3492 0 : set->map[i].mq_map = NULL;
3493 : }
3494 0 : kfree(set->tags);
3495 0 : set->tags = NULL;
3496 0 : return ret;
3497 : }
3498 : EXPORT_SYMBOL(blk_mq_alloc_tag_set);
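/*
 * Hedged setup sketch (not part of this file): the usual driver-side
 * sequence is to fill in the tag set, allocate it, and then create one or
 * more request queues from it.  "mydev", "my_mq_ops" and "struct my_cmd"
 * are hypothetical names.
 *
 *	memset(&mydev->tag_set, 0, sizeof(mydev->tag_set));
 *	mydev->tag_set.ops = &my_mq_ops;
 *	mydev->tag_set.nr_hw_queues = 1;
 *	mydev->tag_set.queue_depth = 128;
 *	mydev->tag_set.numa_node = NUMA_NO_NODE;
 *	mydev->tag_set.cmd_size = sizeof(struct my_cmd);
 *	mydev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
 *	mydev->tag_set.driver_data = mydev;
 *
 *	ret = blk_mq_alloc_tag_set(&mydev->tag_set);
 *	if (ret)
 *		return ret;
 *
 *	mydev->queue = blk_mq_init_queue(&mydev->tag_set);
 *	if (IS_ERR(mydev->queue)) {
 *		blk_mq_free_tag_set(&mydev->tag_set);
 *		return PTR_ERR(mydev->queue);
 *	}
 */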
3499 :
3500 0 : void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
3501 : {
3502 0 : int i, j;
3503 :
3504 0 : for (i = 0; i < set->nr_hw_queues; i++)
3505 0 : blk_mq_free_map_and_requests(set, i);
3506 :
3507 0 : if (blk_mq_is_sbitmap_shared(set->flags))
3508 0 : blk_mq_exit_shared_sbitmap(set);
3509 :
3510 0 : for (j = 0; j < set->nr_maps; j++) {
3511 0 : kfree(set->map[j].mq_map);
3512 0 : set->map[j].mq_map = NULL;
3513 : }
3514 :
3515 0 : kfree(set->tags);
3516 0 : set->tags = NULL;
3517 0 : }
3518 : EXPORT_SYMBOL(blk_mq_free_tag_set);
3519 :
3520 0 : int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
3521 : {
3522 0 : struct blk_mq_tag_set *set = q->tag_set;
3523 0 : struct blk_mq_hw_ctx *hctx;
3524 0 : int i, ret;
3525 :
3526 0 : if (!set)
3527 : return -EINVAL;
3528 :
3529 0 : if (q->nr_requests == nr)
3530 : return 0;
3531 :
3532 0 : blk_mq_freeze_queue(q);
3533 0 : blk_mq_quiesce_queue(q);
3534 :
3535 0 : ret = 0;
3536 0 : queue_for_each_hw_ctx(q, hctx, i) {
3537 0 : if (!hctx->tags)
3538 0 : continue;
3539 : /*
3540 : * If we're using an MQ scheduler, just update the scheduler
3541 : * queue depth. This is similar to what the old code would do.
3542 : */
3543 0 : if (!hctx->sched_tags) {
3544 0 : ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
3545 : false);
3546 0 : if (!ret && blk_mq_is_sbitmap_shared(set->flags))
3547 0 : blk_mq_tag_resize_shared_sbitmap(set, nr);
3548 : } else {
3549 0 : ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
3550 : nr, true);
3551 : }
3552 0 : if (ret)
3553 : break;
3554 0 : if (q->elevator && q->elevator->type->ops.depth_updated)
3555 0 : q->elevator->type->ops.depth_updated(hctx);
3556 : }
3557 :
3558 0 : if (!ret)
3559 0 : q->nr_requests = nr;
3560 :
3561 0 : blk_mq_unquiesce_queue(q);
3562 0 : blk_mq_unfreeze_queue(q);
3563 :
3564 0 : return ret;
3565 : }
3566 :
3567 : /*
3568 : * request_queue and elevator_type pair.
3569 : * It is just used by __blk_mq_update_nr_hw_queues to cache
3570 : * the elevator_type associated with a request_queue.
3571 : */
3572 : struct blk_mq_qe_pair {
3573 : struct list_head node;
3574 : struct request_queue *q;
3575 : struct elevator_type *type;
3576 : };
3577 :
3578 : /*
3579 :  * Cache the elevator_type in the qe pair list and switch the
3580 :  * I/O scheduler to 'none'.
3581 : */
3582 0 : static bool blk_mq_elv_switch_none(struct list_head *head,
3583 : struct request_queue *q)
3584 : {
3585 0 : struct blk_mq_qe_pair *qe;
3586 :
3587 0 : if (!q->elevator)
3588 : return true;
3589 :
3590 0 : qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
3591 0 : if (!qe)
3592 : return false;
3593 :
3594 0 : INIT_LIST_HEAD(&qe->node);
3595 0 : qe->q = q;
3596 0 : qe->type = q->elevator->type;
3597 0 : list_add(&qe->node, head);
3598 :
3599 0 : mutex_lock(&q->sysfs_lock);
3600 : /*
3601 :          * After elevator_switch_mq(), the previous elevator_queue is
3602 :          * released by elevator_release(), and the module reference taken
3603 :          * by elevator_get() is dropped with it.  Take an extra reference
3604 :          * on the I/O scheduler module here so it cannot be unloaded while
3605 :          * the scheduler is temporarily switched to 'none'.
3606 : */
3607 0 : __module_get(qe->type->elevator_owner);
3608 0 : elevator_switch_mq(q, NULL);
3609 0 : mutex_unlock(&q->sysfs_lock);
3610 :
3611 0 : return true;
3612 : }
3613 :
3614 0 : static void blk_mq_elv_switch_back(struct list_head *head,
3615 : struct request_queue *q)
3616 : {
3617 0 : struct blk_mq_qe_pair *qe;
3618 0 : struct elevator_type *t = NULL;
3619 :
3620 0 : list_for_each_entry(qe, head, node)
3621 0 : if (qe->q == q) {
3622 0 : t = qe->type;
3623 0 : break;
3624 : }
3625 :
3626 0 : if (!t)
3627 : return;
3628 :
3629 0 : list_del(&qe->node);
3630 0 : kfree(qe);
3631 :
3632 0 : mutex_lock(&q->sysfs_lock);
3633 0 : elevator_switch_mq(q, t);
3634 0 : mutex_unlock(&q->sysfs_lock);
3635 : }
3636 :
3637 0 : static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
3638 : int nr_hw_queues)
3639 : {
3640 0 : struct request_queue *q;
3641 0 : LIST_HEAD(head);
3642 0 : int prev_nr_hw_queues;
3643 :
3644 0 : lockdep_assert_held(&set->tag_list_lock);
3645 :
3646 0 : if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
3647 0 : nr_hw_queues = nr_cpu_ids;
3648 0 : if (nr_hw_queues < 1)
3649 0 : return;
3650 0 : if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
3651 : return;
3652 :
3653 0 : list_for_each_entry(q, &set->tag_list, tag_set_list)
3654 0 : blk_mq_freeze_queue(q);
3655 : /*
3656 :          * Switch the I/O scheduler to 'none', cleaning up the data associated
3657 : * with the previous scheduler. We will switch back once we are done
3658 : * updating the new sw to hw queue mappings.
3659 : */
3660 0 : list_for_each_entry(q, &set->tag_list, tag_set_list)
3661 0 : if (!blk_mq_elv_switch_none(&head, q))
3662 0 : goto switch_back;
3663 :
3664 0 : list_for_each_entry(q, &set->tag_list, tag_set_list) {
3665 0 : blk_mq_debugfs_unregister_hctxs(q);
3666 0 : blk_mq_sysfs_unregister(q);
3667 : }
3668 :
3669 0 : prev_nr_hw_queues = set->nr_hw_queues;
3670 0 : if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) <
3671 : 0)
3672 0 : goto reregister;
3673 :
3674 0 : set->nr_hw_queues = nr_hw_queues;
3675 0 : fallback:
3676 0 : blk_mq_update_queue_map(set);
3677 0 : list_for_each_entry(q, &set->tag_list, tag_set_list) {
3678 0 : blk_mq_realloc_hw_ctxs(set, q);
3679 0 : if (q->nr_hw_queues != set->nr_hw_queues) {
3680            0 :                     pr_warn("Increasing nr_hw_queues to %d failed, falling back to %d\n",
3681 : nr_hw_queues, prev_nr_hw_queues);
3682 0 : set->nr_hw_queues = prev_nr_hw_queues;
3683 0 : blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
3684 0 : goto fallback;
3685 : }
3686 0 : blk_mq_map_swqueue(q);
3687 : }
3688 :
3689 0 : reregister:
3690 0 : list_for_each_entry(q, &set->tag_list, tag_set_list) {
3691 0 : blk_mq_sysfs_register(q);
3692 0 : blk_mq_debugfs_register_hctxs(q);
3693 : }
3694 :
3695 0 : switch_back:
3696 0 : list_for_each_entry(q, &set->tag_list, tag_set_list)
3697 0 : blk_mq_elv_switch_back(&head, q);
3698 :
3699 0 : list_for_each_entry(q, &set->tag_list, tag_set_list)
3700 0 : blk_mq_unfreeze_queue(q);
3701 : }
3702 :
3703 0 : void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
3704 : {
3705 0 : mutex_lock(&set->tag_list_lock);
3706 0 : __blk_mq_update_nr_hw_queues(set, nr_hw_queues);
3707 0 : mutex_unlock(&set->tag_list_lock);
3708 0 : }
3709 : EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
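/*
 * Hedged usage sketch (not part of blk-mq.c): a driver reshaping its
 * queue count after discovering how many hardware queues the device
 * actually supports, e.g. during a controller reset.  "example_dev" is
 * an illustrative assumption; drivers such as nvme do something similar.
 */
static void example_adjust_hw_queues(struct example_dev *dev,
				     unsigned int nr_queues)
{
	/*
	 * The core freezes every queue in the tag set, temporarily
	 * switches their schedulers to 'none', remaps sw to hw queues,
	 * restores the schedulers and unfreezes, so this is safe while
	 * requests are in flight.
	 */
	blk_mq_update_nr_hw_queues(&dev->tag_set, nr_queues);
}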
3710 :
3711 : /* Enable polling stats and return whether they were already enabled. */
3712 0 : static bool blk_poll_stats_enable(struct request_queue *q)
3713 : {
3714 0 : if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
3715 0 : blk_queue_flag_test_and_set(QUEUE_FLAG_POLL_STATS, q))
3716 0 : return true;
3717 0 : blk_stat_add_callback(q, q->poll_cb);
3718 0 : return false;
3719 : }
3720 :
3721 0 : static void blk_mq_poll_stats_start(struct request_queue *q)
3722 : {
3723 : /*
3724 : * We don't arm the callback if polling stats are not enabled or the
3725 : * callback is already active.
3726 : */
3727 0 : if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
3728 0 : blk_stat_is_active(q->poll_cb))
3729 : return;
3730 :
3731 0 : blk_stat_activate_msecs(q->poll_cb, 100);
3732 : }
3733 :
3734 0 : static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
3735 : {
3736 0 : struct request_queue *q = cb->data;
3737 0 : int bucket;
3738 :
3739 0 : for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
3740 0 : if (cb->stat[bucket].nr_samples)
3741 0 : q->poll_stat[bucket] = cb->stat[bucket];
3742 : }
3743 0 : }
3744 :
3745 0 : static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
3746 : struct request *rq)
3747 : {
3748 0 : unsigned long ret = 0;
3749 0 : int bucket;
3750 :
3751 : /*
3752 : * If stats collection isn't on, don't sleep but turn it on for
3753 : * future users
3754 : */
3755 0 : if (!blk_poll_stats_enable(q))
3756 : return 0;
3757 :
3758 : /*
3759 : * As an optimistic guess, use half of the mean service time
3760 : * for this type of request. We can (and should) make this smarter.
3761 : * For instance, if the completion latencies are tight, we can
3762 : * get closer than just half the mean. This is especially
3763 : * important on devices where the completion latencies are longer
3764 : * than ~10 usec. We do use the stats for the relevant IO size
3765 : * if available which does lead to better estimates.
3766 : */
3767 0 : bucket = blk_mq_poll_stats_bkt(rq);
3768 0 : if (bucket < 0)
3769 : return ret;
3770 :
3771 0 : if (q->poll_stat[bucket].nr_samples)
3772 0 : ret = (q->poll_stat[bucket].mean + 1) / 2;
3773 :
3774 : return ret;
3775 : }
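/*
 * Worked example with illustrative numbers: if the mean completion time
 * recorded for this request's bucket is 20000 ns, the heuristic above
 * returns (20000 + 1) / 2 = 10000 ns, so the hybrid-poll caller sleeps
 * roughly 10 us before switching to busy polling.
 */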
3776 :
3777 0 : static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
3778 : struct request *rq)
3779 : {
3780 0 : struct hrtimer_sleeper hs;
3781 0 : enum hrtimer_mode mode;
3782 0 : unsigned int nsecs;
3783 0 : ktime_t kt;
3784 :
3785 0 : if (rq->rq_flags & RQF_MQ_POLL_SLEPT)
3786 : return false;
3787 :
3788 : /*
3789 : * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
3790 : *
3791 : * 0: use half of prev avg
3792 : * >0: use this specific value
3793 : */
3794 0 : if (q->poll_nsec > 0)
3795 0 : nsecs = q->poll_nsec;
3796 : else
3797 0 : nsecs = blk_mq_poll_nsecs(q, rq);
3798 :
3799 0 : if (!nsecs)
3800 : return false;
3801 :
3802 0 : rq->rq_flags |= RQF_MQ_POLL_SLEPT;
3803 :
3804 : /*
3805 : * This will be replaced with the stats tracking code, using
3806 : * 'avg_completion_time / 2' as the pre-sleep target.
3807 : */
3808 0 : kt = nsecs;
3809 :
3810 0 : mode = HRTIMER_MODE_REL;
3811 0 : hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
3812 0 : hrtimer_set_expires(&hs.timer, kt);
3813 :
3814 0 : do {
3815 0 : if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
3816 : break;
3817 0 : set_current_state(TASK_UNINTERRUPTIBLE);
3818 0 : hrtimer_sleeper_start_expires(&hs, mode);
3819 0 : if (hs.task)
3820 0 : io_schedule();
3821 0 : hrtimer_cancel(&hs.timer);
3822 0 : mode = HRTIMER_MODE_ABS;
3823 0 : } while (hs.task && !signal_pending(current));
3824 :
3825 0 : __set_current_state(TASK_RUNNING);
3826 0 : destroy_hrtimer_on_stack(&hs.timer);
3827 0 : return true;
3828 : }
3829 :
3830 0 : static bool blk_mq_poll_hybrid(struct request_queue *q,
3831 : struct blk_mq_hw_ctx *hctx, blk_qc_t cookie)
3832 : {
3833 0 : struct request *rq;
3834 :
3835 0 : if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
3836 : return false;
3837 :
3838 0 : if (!blk_qc_t_is_internal(cookie))
3839 0 : rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
3840 : else {
3841 0 : rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
3842 : /*
3843 : * With scheduling, if the request has completed, we'll
3844 : * get a NULL return here, as we clear the sched tag when
3845 :           * that happens. The request itself remains valid, as always,
3846 : * so we should be safe with just the NULL check.
3847 : */
3848 0 : if (!rq)
3849 : return false;
3850 : }
3851 :
3852 0 : return blk_mq_poll_hybrid_sleep(q, rq);
3853 : }
3854 :
3855 : /**
3856 : * blk_poll - poll for IO completions
3857 : * @q: the queue
3858 : * @cookie: cookie passed back at IO submission time
3859 : * @spin: whether to spin for completions
3860 : *
3861 : * Description:
3862 : * Poll for completions on the passed in queue. Returns number of
3863 : * completed entries found. If @spin is true, then blk_poll will continue
3864 : * looping until at least one completion is found, unless the task is
3865 : * otherwise marked running (or we need to reschedule).
3866 : */
3867 0 : int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
3868 : {
3869 0 : struct blk_mq_hw_ctx *hctx;
3870 0 : long state;
3871 :
3872 0 : if (!blk_qc_t_valid(cookie) ||
3873 0 : !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
3874 0 : return 0;
3875 :
3876 0 : if (current->plug)
3877 0 : blk_flush_plug_list(current->plug, false);
3878 :
3879 0 : hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
3880 :
3881 : /*
3882 : * If we sleep, have the caller restart the poll loop to reset
3883 : * the state. Like for the other success return cases, the
3884 : * caller is responsible for checking if the IO completed. If
3885 : * the IO isn't complete, we'll get called again and will go
3886 : * straight to the busy poll loop. If specified not to spin,
3887 : * we also should not sleep.
3888 : */
3889 0 : if (spin && blk_mq_poll_hybrid(q, hctx, cookie))
3890 : return 1;
3891 :
3892 0 : hctx->poll_considered++;
3893 :
3894 0 : state = current->state;
3895 0 : do {
3896 0 : int ret;
3897 :
3898 0 : hctx->poll_invoked++;
3899 :
3900 0 : ret = q->mq_ops->poll(hctx);
3901 0 : if (ret > 0) {
3902 0 : hctx->poll_success++;
3903 0 : __set_current_state(TASK_RUNNING);
3904 0 : return ret;
3905 : }
3906 :
3907 0 : if (signal_pending_state(state, current))
3908 0 : __set_current_state(TASK_RUNNING);
3909 :
3910 0 : if (current->state == TASK_RUNNING)
3911 : return 1;
3912 0 : if (ret < 0 || !spin)
3913 : break;
3914 0 : cpu_relax();
3915 0 : } while (!need_resched());
3916 :
3917 0 : __set_current_state(TASK_RUNNING);
3918 0 : return 0;
3919 : }
3920 : EXPORT_SYMBOL_GPL(blk_poll);
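/*
 * Hedged usage sketch (not part of blk-mq.c): busy-waiting for a polled
 * (HIPRI) submission to complete.  The "done" flag and helper name are
 * illustrative assumptions; real callers such as the direct-I/O paths
 * track completion through their own bio end_io handlers.
 */
static void example_wait_polled(struct request_queue *q, blk_qc_t cookie,
				bool *done)
{
	while (!READ_ONCE(*done)) {
		/* spin == true: poll until a completion is reaped or rescheduling is needed */
		if (blk_poll(q, cookie, true) <= 0)
			blk_io_schedule();
	}
}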
3921 :
3922 0 : unsigned int blk_mq_rq_cpu(struct request *rq)
3923 : {
3924 0 : return rq->mq_ctx->cpu;
3925 : }
3926 : EXPORT_SYMBOL(blk_mq_rq_cpu);
3927 :
3928 1 : static int __init blk_mq_init(void)
3929 : {
3930 1 : int i;
3931 :
3932 6 : for_each_possible_cpu(i)
3933 5 : init_llist_head(&per_cpu(blk_cpu_done, i));
3934 1 : open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
3935 :
3936 1 : cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
3937 : "block/softirq:dead", NULL,
3938 : blk_softirq_cpu_dead);
3939 1 : cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
3940 : blk_mq_hctx_notify_dead);
3941 1 : cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
3942 : blk_mq_hctx_notify_online,
3943 : blk_mq_hctx_notify_offline);
3944 1 : return 0;
3945 : }
3946 : subsys_initcall(blk_mq_init);