Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk>
4 : */
5 : #include <linux/mm.h>
6 : #include <linux/swap.h>
7 : #include <linux/bio.h>
8 : #include <linux/blkdev.h>
9 : #include <linux/uio.h>
10 : #include <linux/iocontext.h>
11 : #include <linux/slab.h>
12 : #include <linux/init.h>
13 : #include <linux/kernel.h>
14 : #include <linux/export.h>
15 : #include <linux/mempool.h>
16 : #include <linux/workqueue.h>
17 : #include <linux/cgroup.h>
18 : #include <linux/blk-cgroup.h>
19 : #include <linux/highmem.h>
20 : #include <linux/sched/sysctl.h>
21 : #include <linux/blk-crypto.h>
22 : #include <linux/xarray.h>
23 :
24 : #include <trace/events/block.h>
25 : #include "blk.h"
26 : #include "blk-rq-qos.h"
27 :
28 : static struct biovec_slab {
29 : int nr_vecs;
30 : char *name;
31 : struct kmem_cache *slab;
32 : } bvec_slabs[] __read_mostly = {
33 : { .nr_vecs = 16, .name = "biovec-16" },
34 : { .nr_vecs = 64, .name = "biovec-64" },
35 : { .nr_vecs = 128, .name = "biovec-128" },
36 : { .nr_vecs = BIO_MAX_VECS, .name = "biovec-max" },
37 : };
38 :
39 2117 : static struct biovec_slab *biovec_slab(unsigned short nr_vecs)
40 : {
41 2117 : switch (nr_vecs) {
42 : /* smaller bios use inline vecs */
43 : case 5 ... 16:
44 : return &bvec_slabs[0];
45 1332 : case 17 ... 64:
46 1332 : return &bvec_slabs[1];
47 0 : case 65 ... 128:
48 0 : return &bvec_slabs[2];
49 129 : case 129 ... BIO_MAX_VECS:
50 129 : return &bvec_slabs[3];
51 0 : default:
52 0 : BUG();
53 : return NULL;
54 : }
55 : }
56 :
57 : /*
58 : * fs_bio_set is the bio_set containing bio and iovec memory pools used by
59 : * IO code that does not need private memory pools.
60 : */
61 : struct bio_set fs_bio_set;
62 : EXPORT_SYMBOL(fs_bio_set);
63 :
64 : /*
65 : * Our slab pool management
66 : */
67 : struct bio_slab {
68 : struct kmem_cache *slab;
69 : unsigned int slab_ref;
70 : unsigned int slab_size;
71 : char name[8];
72 : };
73 : static DEFINE_MUTEX(bio_slab_lock);
74 : static DEFINE_XARRAY(bio_slabs);
75 :
76 4 : static struct bio_slab *create_bio_slab(unsigned int size)
77 : {
78 4 : struct bio_slab *bslab = kzalloc(sizeof(*bslab), GFP_KERNEL);
79 :
80 4 : if (!bslab)
81 : return NULL;
82 :
83 4 : snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size);
84 4 : bslab->slab = kmem_cache_create(bslab->name, size,
85 : ARCH_KMALLOC_MINALIGN, SLAB_HWCACHE_ALIGN, NULL);
86 4 : if (!bslab->slab)
87 0 : goto fail_alloc_slab;
88 :
89 4 : bslab->slab_ref = 1;
90 4 : bslab->slab_size = size;
91 :
92 4 : if (!xa_err(xa_store(&bio_slabs, size, bslab, GFP_KERNEL)))
93 : return bslab;
94 :
95 0 : kmem_cache_destroy(bslab->slab);
96 :
97 0 : fail_alloc_slab:
98 0 : kfree(bslab);
99 0 : return NULL;
100 : }
101 :
102 12 : static inline unsigned int bs_bio_slab_size(struct bio_set *bs)
103 : {
104 12 : return bs->front_pad + sizeof(struct bio) + bs->back_pad;
105 : }
106 :
107 12 : static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs)
108 : {
109 12 : unsigned int size = bs_bio_slab_size(bs);
110 12 : struct bio_slab *bslab;
111 :
112 12 : mutex_lock(&bio_slab_lock);
113 12 : bslab = xa_load(&bio_slabs, size);
114 12 : if (bslab)
115 8 : bslab->slab_ref++;
116 : else
117 4 : bslab = create_bio_slab(size);
118 12 : mutex_unlock(&bio_slab_lock);
119 :
120 12 : if (bslab)
121 12 : return bslab->slab;
122 : return NULL;
123 : }
124 :
125 0 : static void bio_put_slab(struct bio_set *bs)
126 : {
127 0 : struct bio_slab *bslab = NULL;
128 0 : unsigned int slab_size = bs_bio_slab_size(bs);
129 :
130 0 : mutex_lock(&bio_slab_lock);
131 :
132 0 : bslab = xa_load(&bio_slabs, slab_size);
133 0 : if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
134 0 : goto out;
135 :
136 0 : WARN_ON_ONCE(bslab->slab != bs->bio_slab);
137 :
138 0 : WARN_ON(!bslab->slab_ref);
139 :
140 0 : if (--bslab->slab_ref)
141 0 : goto out;
142 :
143 0 : xa_erase(&bio_slabs, slab_size);
144 :
145 0 : kmem_cache_destroy(bslab->slab);
146 0 : kfree(bslab);
147 :
148 0 : out:
149 0 : mutex_unlock(&bio_slab_lock);
150 0 : }
151 :
152 8664 : void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs)
153 : {
154 8664 : BIO_BUG_ON(nr_vecs > BIO_MAX_VECS);
155 :
156 8664 : if (nr_vecs == BIO_MAX_VECS)
157 129 : mempool_free(bv, pool);
158 8535 : else if (nr_vecs > BIO_INLINE_VECS)
159 994 : kmem_cache_free(biovec_slab(nr_vecs)->slab, bv);
160 8664 : }
161 :
162 : /*
163 : * Make the first allocation restricted and don't dump info on allocation
164 : * failures, since we'll fall back to the mempool in case of failure.
165 : */
166 994 : static inline gfp_t bvec_alloc_gfp(gfp_t gfp)
167 : {
168 994 : return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) |
169 994 : __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
170 : }
171 :
172 1123 : struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
173 : gfp_t gfp_mask)
174 : {
175 1123 : struct biovec_slab *bvs = biovec_slab(*nr_vecs);
176 :
177 1123 : if (WARN_ON_ONCE(!bvs))
178 : return NULL;
179 :
180 : /*
181 : * Upgrade the nr_vecs request to take full advantage of the allocation.
182 : * We also rely on this in the bvec_free path.
183 : */
184 1123 : *nr_vecs = bvs->nr_vecs;
185 :
186 : /*
187 : * Try a slab allocation first for all smaller allocations. If that
188 : * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool.
189 : * The mempool is sized to handle up to BIO_MAX_VECS entries.
190 : */
191 1123 : if (*nr_vecs < BIO_MAX_VECS) {
192 994 : struct bio_vec *bvl;
193 :
194 994 : bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask));
195 994 : if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM))
196 : return bvl;
197 0 : *nr_vecs = BIO_MAX_VECS;
198 : }
199 :
200 129 : return mempool_alloc(pool, gfp_mask);
201 : }
202 :
203 17405 : void bio_uninit(struct bio *bio)
204 : {
205 : #ifdef CONFIG_BLK_CGROUP
206 : if (bio->bi_blkg) {
207 : blkg_put(bio->bi_blkg);
208 : bio->bi_blkg = NULL;
209 : }
210 : #endif
211 17405 : if (bio_integrity(bio))
212 17405 : bio_integrity_free(bio);
213 :
214 17405 : bio_crypt_free_ctx(bio);
215 0 : }
216 : EXPORT_SYMBOL(bio_uninit);
217 :
218 8666 : static void bio_free(struct bio *bio)
219 : {
220 8666 : struct bio_set *bs = bio->bi_pool;
221 8666 : void *p;
222 :
223 8666 : bio_uninit(bio);
224 :
225 8666 : if (bs) {
226 8664 : bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs);
227 :
228 : /*
229 : * If we have front padding, adjust the bio pointer before freeing
230 : */
231 8665 : p = bio;
232 8665 : p -= bs->front_pad;
233 :
234 8665 : mempool_free(p, &bs->bio_pool);
235 : } else {
236 : /* Bio was allocated by bio_kmalloc() */
237 2 : kfree(bio);
238 : }
239 8667 : }
240 :
241 : /*
242 : * Users of this function have their own bio allocation. Subsequently,
243 : * they must remember to pair any call to bio_init() with bio_uninit()
244 : * when IO has completed, or when the bio is released.
245 : */
246 8739 : void bio_init(struct bio *bio, struct bio_vec *table,
247 : unsigned short max_vecs)
248 : {
249 8739 : memset(bio, 0, sizeof(*bio));
250 8739 : atomic_set(&bio->__bi_remaining, 1);
251 8739 : atomic_set(&bio->__bi_cnt, 1);
252 :
253 8739 : bio->bi_io_vec = table;
254 8739 : bio->bi_max_vecs = max_vecs;
255 8739 : }
256 : EXPORT_SYMBOL(bio_init);
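/*
 * A minimal sketch of the pairing described above, assuming the caller owns
 * the storage for the bio and its single bio_vec (bdev, sector and page are
 * placeholders for values the caller already has):
 *
 *	struct bio_vec bvec;
 *	struct bio bio;
 *	int ret;
 *
 *	bio_init(&bio, &bvec, 1);
 *	bio_set_dev(&bio, bdev);
 *	bio.bi_opf = REQ_OP_READ;
 *	bio.bi_iter.bi_sector = sector;
 *	bio_add_page(&bio, page, PAGE_SIZE, 0);
 *	ret = submit_bio_wait(&bio);
 *	bio_uninit(&bio);
 */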
257 :
258 : /**
259 : * bio_reset - reinitialize a bio
260 : * @bio: bio to reset
261 : *
262 : * Description:
263 : * After calling bio_reset(), @bio will be in the same state as a freshly
264 : * allocated bio returned by bio_alloc_bioset() - the only fields that are
265 : * preserved are the ones that are initialized by bio_alloc_bioset(). See
266 : * comment in struct bio.
267 : */
268 0 : void bio_reset(struct bio *bio)
269 : {
270 0 : bio_uninit(bio);
271 0 : memset(bio, 0, BIO_RESET_BYTES);
272 0 : atomic_set(&bio->__bi_remaining, 1);
273 0 : }
274 : EXPORT_SYMBOL(bio_reset);
275 :
276 0 : static struct bio *__bio_chain_endio(struct bio *bio)
277 : {
278 0 : struct bio *parent = bio->bi_private;
279 :
280 0 : if (!parent->bi_status)
281 0 : parent->bi_status = bio->bi_status;
282 0 : bio_put(bio);
283 0 : return parent;
284 : }
285 :
286 0 : static void bio_chain_endio(struct bio *bio)
287 : {
288 0 : bio_endio(__bio_chain_endio(bio));
289 0 : }
290 :
291 : /**
292 : * bio_chain - chain bio completions
293 : * @bio: the target bio
294 : * @parent: the parent bio of @bio
295 : *
296 : * The caller won't have a bi_end_io called when @bio completes - instead,
297 : * @parent's bi_end_io won't be called until both @parent and @bio have
298 : * completed; the chained bio will also be freed when it completes.
299 : *
300 : * The caller must not set bi_private or bi_end_io in @bio.
301 : */
302 0 : void bio_chain(struct bio *bio, struct bio *parent)
303 : {
304 0 : BUG_ON(bio->bi_private || bio->bi_end_io);
305 :
306 0 : bio->bi_private = parent;
307 0 : bio->bi_end_io = bio_chain_endio;
308 0 : bio_inc_remaining(parent);
309 0 : }
310 : EXPORT_SYMBOL(bio_chain);
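/*
 * A hedged sketch of the usual chaining pattern (the discard/zeroout helpers
 * in blk-lib.c work this way): when an operation is issued as several bios,
 * each bio is chained to its successor so only the last bio's completion has
 * to be waited for; "prev" is an illustrative name for the bio built so far:
 *
 *	struct bio *next = bio_alloc(GFP_KERNEL, 1);	// successor, filled next
 *
 *	bio_chain(prev, next);	// prev must not have bi_end_io/bi_private set
 *	submit_bio(prev);	// next now completes only after prev has
 */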
311 :
312 0 : static void bio_alloc_rescue(struct work_struct *work)
313 : {
314 0 : struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
315 0 : struct bio *bio;
316 :
317 0 : while (1) {
318 0 : spin_lock(&bs->rescue_lock);
319 0 : bio = bio_list_pop(&bs->rescue_list);
320 0 : spin_unlock(&bs->rescue_lock);
321 :
322 0 : if (!bio)
323 : break;
324 :
325 0 : submit_bio_noacct(bio);
326 : }
327 0 : }
328 :
329 0 : static void punt_bios_to_rescuer(struct bio_set *bs)
330 : {
331 0 : struct bio_list punt, nopunt;
332 0 : struct bio *bio;
333 :
334 0 : if (WARN_ON_ONCE(!bs->rescue_workqueue))
335 0 : return;
336 : /*
337 : * In order to guarantee forward progress we must punt only bios that
338 : * were allocated from this bio_set; otherwise, if there was a bio on
339 : * there for a stacking driver higher up in the stack, processing it
340 : * could require allocating bios from this bio_set, and doing that from
341 : * our own rescuer would be bad.
342 : *
343 : * Since bio lists are singly linked, pop them all instead of trying to
344 : * remove from the middle of the list:
345 : */
346 :
347 0 : bio_list_init(&punt);
348 0 : bio_list_init(&nopunt);
349 :
350 0 : while ((bio = bio_list_pop(&current->bio_list[0])))
351 0 : bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
352 0 : current->bio_list[0] = nopunt;
353 :
354 0 : bio_list_init(&nopunt);
355 0 : while ((bio = bio_list_pop(&current->bio_list[1])))
356 0 : bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
357 0 : current->bio_list[1] = nopunt;
358 :
359 0 : spin_lock(&bs->rescue_lock);
360 0 : bio_list_merge(&bs->rescue_list, &punt);
361 0 : spin_unlock(&bs->rescue_lock);
362 :
363 0 : queue_work(bs->rescue_workqueue, &bs->rescue_work);
364 : }
365 :
366 : /**
367 : * bio_alloc_bioset - allocate a bio for I/O
368 : * @gfp_mask: the GFP_* mask given to the slab allocator
369 : * @nr_iovecs: number of iovecs to pre-allocate
370 : * @bs: the bio_set to allocate from.
371 : *
372 : * Allocate a bio from the mempools in @bs.
373 : *
374 : * If %__GFP_DIRECT_RECLAIM is set then bio_alloc will always be able to
375 : * allocate a bio. This is due to the mempool guarantees. To make this work,
376 : * callers must never allocate more than 1 bio at a time from the general pool.
377 : * Callers that need to allocate more than 1 bio must always submit the
378 : * previously allocated bio for IO before attempting to allocate a new one.
379 : * Failure to do so can cause deadlocks under memory pressure.
380 : *
381 : * Note that when running under submit_bio_noacct() (i.e. any block driver),
382 : * bios are not submitted until after you return - see the code in
383 : * submit_bio_noacct() that converts recursion into iteration, to prevent
384 : * stack overflows.
385 : *
386 : * This would normally mean allocating multiple bios under submit_bio_noacct()
387 : * would be susceptible to deadlocks, but we have
388 : * deadlock avoidance code that resubmits any blocked bios from a rescuer
389 : * thread.
390 : *
391 : * However, we do not guarantee forward progress for allocations from other
392 : * mempools. Doing multiple allocations from the same mempool under
393 : * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad
394 : * for per bio allocations.
395 : *
396 : * Returns: Pointer to new bio on success, NULL on failure.
397 : */
398 8665 : struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs,
399 : struct bio_set *bs)
400 : {
401 8665 : gfp_t saved_gfp = gfp_mask;
402 8665 : struct bio *bio;
403 8665 : void *p;
404 :
405 : /* should not use nobvec bioset for nr_iovecs > 0 */
406 17330 : if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_iovecs > 0))
407 : return NULL;
408 :
409 : /*
410 : * submit_bio_noacct() converts recursion to iteration; this means if
411 : * we're running beneath it, any bios we allocate and submit will not be
412 : * submitted (and thus freed) until after we return.
413 : *
414 : * This exposes us to a potential deadlock if we allocate multiple bios
415 : * from the same bio_set() while running underneath submit_bio_noacct().
416 : * If we were to allocate multiple bios (say a stacking block driver
417 : * that was splitting bios), we would deadlock if we exhausted the
418 : * mempool's reserve.
419 : *
420 : * We solve this, and guarantee forward progress, with a rescuer
421 : * workqueue per bio_set. If we go to allocate and there are bios on
422 : * current->bio_list, we first try the allocation without
423 : * __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be
424 : * blocking to the rescuer workqueue before we retry with the original
425 : * gfp_flags.
426 : */
427 8665 : if (current->bio_list &&
428 0 : (!bio_list_empty(¤t->bio_list[0]) ||
429 0 : !bio_list_empty(¤t->bio_list[1])) &&
430 0 : bs->rescue_workqueue)
431 0 : gfp_mask &= ~__GFP_DIRECT_RECLAIM;
432 :
433 8665 : p = mempool_alloc(&bs->bio_pool, gfp_mask);
434 8665 : if (!p && gfp_mask != saved_gfp) {
435 0 : punt_bios_to_rescuer(bs);
436 0 : gfp_mask = saved_gfp;
437 0 : p = mempool_alloc(&bs->bio_pool, gfp_mask);
438 : }
439 8665 : if (unlikely(!p))
440 : return NULL;
441 :
442 8665 : bio = p + bs->front_pad;
443 8665 : if (nr_iovecs > BIO_INLINE_VECS) {
444 1123 : struct bio_vec *bvl = NULL;
445 :
446 1123 : bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask);
447 1123 : if (!bvl && gfp_mask != saved_gfp) {
448 0 : punt_bios_to_rescuer(bs);
449 0 : gfp_mask = saved_gfp;
450 0 : bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask);
451 : }
452 1123 : if (unlikely(!bvl))
453 0 : goto err_free;
454 :
455 1123 : bio_init(bio, bvl, nr_iovecs);
456 7542 : } else if (nr_iovecs) {
457 7542 : bio_init(bio, bio->bi_inline_vecs, BIO_INLINE_VECS);
458 : } else {
459 0 : bio_init(bio, NULL, 0);
460 : }
461 :
462 8665 : bio->bi_pool = bs;
463 8665 : return bio;
464 :
465 0 : err_free:
466 0 : mempool_free(p, &bs->bio_pool);
467 0 : return NULL;
468 : }
469 : EXPORT_SYMBOL(bio_alloc_bioset);
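/*
 * A usage sketch under the rules above: one bio is allocated, filled and
 * submitted before any further allocation. bdev, sector, page and my_end_io
 * are placeholders; bio_alloc() is the inline wrapper that passes
 * &fs_bio_set to this function:
 *
 *	struct bio *bio = bio_alloc(GFP_NOIO, 1);
 *
 *	bio_set_dev(bio, bdev);
 *	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
 *	bio->bi_iter.bi_sector = sector;
 *	bio_add_page(bio, page, PAGE_SIZE, 0);
 *	bio->bi_end_io = my_end_io;	// must eventually call bio_put()
 *	submit_bio(bio);
 */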
470 :
471 : /**
472 : * bio_kmalloc - kmalloc a bio for I/O
473 : * @gfp_mask: the GFP_* mask given to the slab allocator
474 : * @nr_iovecs: number of iovecs to pre-allocate
475 : *
476 : * Use kmalloc to allocate and initialize a bio.
477 : *
478 : * Returns: Pointer to new bio on success, NULL on failure.
479 : */
480 2 : struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs)
481 : {
482 2 : struct bio *bio;
483 :
484 2 : if (nr_iovecs > UIO_MAXIOV)
485 : return NULL;
486 :
487 2 : bio = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask);
488 2 : if (unlikely(!bio))
489 : return NULL;
490 2 : bio_init(bio, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs);
491 2 : bio->bi_pool = NULL;
492 2 : return bio;
493 : }
494 : EXPORT_SYMBOL(bio_kmalloc);
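/*
 * A short sketch: a kmalloc-backed bio has no bi_pool, so the final
 * bio_put() ends up in the kfree() branch of bio_free(); the only size limit
 * enforced here is UIO_MAXIOV:
 *
 *	struct bio *bio = bio_kmalloc(GFP_KERNEL, 4);
 *
 *	if (!bio)
 *		return -ENOMEM;
 *	// ... fill and use the bio as usual ...
 *	bio_put(bio);
 */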
495 :
496 0 : void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
497 : {
498 0 : unsigned long flags;
499 0 : struct bio_vec bv;
500 0 : struct bvec_iter iter;
501 :
502 0 : __bio_for_each_segment(bv, bio, iter, start) {
503 0 : char *data = bvec_kmap_irq(&bv, &flags);
504 0 : memset(data, 0, bv.bv_len);
505 0 : flush_dcache_page(bv.bv_page);
506 0 : bvec_kunmap_irq(data, &flags);
507 : }
508 0 : }
509 : EXPORT_SYMBOL(zero_fill_bio_iter);
510 :
511 : /**
512 : * bio_truncate - truncate the bio to the new size @new_size
513 : * @bio: the bio to be truncated
514 : * @new_size: new size for truncating the bio
515 : *
516 : * Description:
517 : * Truncate the bio to the new size @new_size. If bio_op(bio) is
518 : * REQ_OP_READ, zero the truncated part. This function should only
519 : * be used for handling corner cases, such as bio eod.
520 : */
521 0 : void bio_truncate(struct bio *bio, unsigned new_size)
522 : {
523 0 : struct bio_vec bv;
524 0 : struct bvec_iter iter;
525 0 : unsigned int done = 0;
526 0 : bool truncated = false;
527 :
528 0 : if (new_size >= bio->bi_iter.bi_size)
529 0 : return;
530 :
531 0 : if (bio_op(bio) != REQ_OP_READ)
532 0 : goto exit;
533 :
534 0 : bio_for_each_segment(bv, bio, iter) {
535 0 : if (done + bv.bv_len > new_size) {
536 0 : unsigned offset;
537 :
538 0 : if (!truncated)
539 0 : offset = new_size - done;
540 : else
541 : offset = 0;
542 0 : zero_user(bv.bv_page, offset, bv.bv_len - offset);
543 0 : truncated = true;
544 : }
545 0 : done += bv.bv_len;
546 : }
547 :
548 0 : exit:
549 : /*
550 : * Don't touch bvec table here and make it really immutable, since
551 : * fs bio user has to retrieve all pages via bio_for_each_segment_all
552 : * in its .end_bio() callback.
553 : *
554 : * It is enough to truncate bio by updating .bi_size since we can make
555 : * correct bvec with the updated .bi_size for drivers.
556 : */
557 0 : bio->bi_iter.bi_size = new_size;
558 : }
559 :
560 : /**
561 : * guard_bio_eod - truncate a BIO to fit the block device
562 : * @bio: bio to truncate
563 : *
564 : * This allows us to do IO even on the odd last sectors of a device, even if the
565 : * block size is some multiple of the physical sector size.
566 : *
567 : * We'll just truncate the bio to the size of the device, and clear the end of
568 : * the buffer head manually. Truly out-of-range accesses will turn into actual
569 : * I/O errors; this only handles the "we need to be able to do I/O at the final
570 : * sector" case.
571 : */
572 6859 : void guard_bio_eod(struct bio *bio)
573 : {
574 6859 : sector_t maxsector = bdev_nr_sectors(bio->bi_bdev);
575 :
576 6859 : if (!maxsector)
577 : return;
578 :
579 : /*
580 : * If the *whole* IO is past the end of the device,
581 : * let it through, and the IO layer will turn it into
582 : * an EIO.
583 : */
584 6859 : if (unlikely(bio->bi_iter.bi_sector >= maxsector))
585 : return;
586 :
587 6859 : maxsector -= bio->bi_iter.bi_sector;
588 6859 : if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
589 : return;
590 :
591 0 : bio_truncate(bio, maxsector << 9);
592 : }
593 :
594 : /**
595 : * bio_put - release a reference to a bio
596 : * @bio: bio to release reference to
597 : *
598 : * Description:
599 : * Put a reference to a &struct bio, either one you have gotten with
600 : * bio_alloc, bio_get or bio_clone_*. The last put of a bio will free it.
601 : **/
602 8666 : void bio_put(struct bio *bio)
603 : {
604 8666 : if (!bio_flagged(bio, BIO_REFFED))
605 8666 : bio_free(bio);
606 : else {
607 0 : BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));
608 :
609 : /*
610 : * last put frees it
611 : */
612 0 : if (atomic_dec_and_test(&bio->__bi_cnt))
613 0 : bio_free(bio);
614 : }
615 8667 : }
616 : EXPORT_SYMBOL(bio_put);
617 :
618 : /**
619 : * __bio_clone_fast - clone a bio that shares the original bio's biovec
620 : * @bio: destination bio
621 : * @bio_src: bio to clone
622 : *
623 : * Clone the relevant fields of @bio_src into @bio. The caller owns @bio,
624 : * but not the actual data it points to; the biovec table is shared with
625 : * @bio_src rather than copied.
626 : *
627 : * Caller must ensure that @bio_src is not freed before @bio.
628 : */
629 0 : void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
630 : {
631 0 : WARN_ON_ONCE(bio->bi_pool && bio->bi_max_vecs);
632 :
633 : /*
634 : * most users will be overriding ->bi_bdev with a new target,
635 : * so we don't set nor calculate new physical/hw segment counts here
636 : */
637 0 : bio->bi_bdev = bio_src->bi_bdev;
638 0 : bio_set_flag(bio, BIO_CLONED);
639 0 : if (bio_flagged(bio_src, BIO_THROTTLED))
640 0 : bio_set_flag(bio, BIO_THROTTLED);
641 0 : if (bio_flagged(bio_src, BIO_REMAPPED))
642 0 : bio_set_flag(bio, BIO_REMAPPED);
643 0 : bio->bi_opf = bio_src->bi_opf;
644 0 : bio->bi_ioprio = bio_src->bi_ioprio;
645 0 : bio->bi_write_hint = bio_src->bi_write_hint;
646 0 : bio->bi_iter = bio_src->bi_iter;
647 0 : bio->bi_io_vec = bio_src->bi_io_vec;
648 :
649 0 : bio_clone_blkg_association(bio, bio_src);
650 0 : blkcg_bio_issue_init(bio);
651 0 : }
652 : EXPORT_SYMBOL(__bio_clone_fast);
653 :
654 : /**
655 : * bio_clone_fast - clone a bio that shares the original bio's biovec
656 : * @bio: bio to clone
657 : * @gfp_mask: allocation priority
658 : * @bs: bio_set to allocate from
659 : *
660 : * Like __bio_clone_fast, only also allocates the returned bio
661 : */
662 0 : struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
663 : {
664 0 : struct bio *b;
665 :
666 0 : b = bio_alloc_bioset(gfp_mask, 0, bs);
667 0 : if (!b)
668 : return NULL;
669 :
670 0 : __bio_clone_fast(b, bio);
671 :
672 0 : if (bio_crypt_clone(b, bio, gfp_mask) < 0)
673 : goto err_put;
674 :
675 0 : if (bio_integrity(bio) &&
676 : bio_integrity_clone(b, bio, gfp_mask) < 0)
677 : goto err_put;
678 :
679 0 : return b;
680 :
681 : err_put:
682 : bio_put(b);
683 : return NULL;
684 : }
685 : EXPORT_SYMBOL(bio_clone_fast);
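/*
 * A hedged sketch of the typical stacking-driver use: clone the incoming
 * bio, point the clone at a lower device and submit it. my_bio_set,
 * lower_bdev, remapped_sector and clone_end_io are placeholders supplied by
 * the driver; clone_end_io must complete the original bio and bio_put(clone):
 *
 *	struct bio *clone = bio_clone_fast(bio, GFP_NOIO, &my_bio_set);
 *
 *	if (!clone)
 *		return -ENOMEM;
 *	bio_set_dev(clone, lower_bdev);
 *	clone->bi_iter.bi_sector = remapped_sector;
 *	clone->bi_end_io = clone_end_io;
 *	clone->bi_private = bio;
 *	submit_bio_noacct(clone);
 */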
686 :
687 0 : const char *bio_devname(struct bio *bio, char *buf)
688 : {
689 0 : return bdevname(bio->bi_bdev, buf);
690 : }
691 : EXPORT_SYMBOL(bio_devname);
692 :
693 20444 : static inline bool page_is_mergeable(const struct bio_vec *bv,
694 : struct page *page, unsigned int len, unsigned int off,
695 : bool *same_page)
696 : {
697 20444 : size_t bv_end = bv->bv_offset + bv->bv_len;
698 20444 : phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1;
699 20444 : phys_addr_t page_addr = page_to_phys(page);
700 :
701 20444 : if (vec_end_addr + 1 != page_addr + off)
702 : return false;
703 16521 : if (xen_domain() && !xen_biovec_phys_mergeable(bv, page))
704 : return false;
705 :
706 16521 : *same_page = ((vec_end_addr & PAGE_MASK) == page_addr);
707 16521 : if (*same_page)
708 : return true;
709 16521 : return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE);
710 : }
711 :
712 : /*
713 : * Try to merge a page into a segment, while obeying the hardware segment
714 : * size limit. This is not for normal read/write bios, but for passthrough
715 : * or Zone Append operations that we can't split.
716 : */
717 0 : static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio,
718 : struct page *page, unsigned len,
719 : unsigned offset, bool *same_page)
720 : {
721 0 : struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
722 0 : unsigned long mask = queue_segment_boundary(q);
723 0 : phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset;
724 0 : phys_addr_t addr2 = page_to_phys(page) + offset + len - 1;
725 :
726 0 : if ((addr1 | mask) != (addr2 | mask))
727 : return false;
728 0 : if (bv->bv_len + len > queue_max_segment_size(q))
729 : return false;
730 0 : return __bio_try_merge_page(bio, page, len, offset, same_page);
731 : }
732 :
733 : /**
734 : * bio_add_hw_page - attempt to add a page to a bio with hw constraints
735 : * @q: the target queue
736 : * @bio: destination bio
737 : * @page: page to add
738 : * @len: vec entry length
739 : * @offset: vec entry offset
740 : * @max_sectors: maximum number of sectors that can be added
741 : * @same_page: return if the segment has been merged inside the same page
742 : *
743 : * Add a page to a bio while respecting the hardware max_sectors, max_segment
744 : * and gap limitations.
745 : */
746 2 : int bio_add_hw_page(struct request_queue *q, struct bio *bio,
747 : struct page *page, unsigned int len, unsigned int offset,
748 : unsigned int max_sectors, bool *same_page)
749 : {
750 2 : struct bio_vec *bvec;
751 :
752 2 : if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
753 : return 0;
754 :
755 2 : if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors)
756 : return 0;
757 :
758 2 : if (bio->bi_vcnt > 0) {
759 0 : if (bio_try_merge_hw_seg(q, bio, page, len, offset, same_page))
760 0 : return len;
761 :
762 : /*
763 : * If the queue doesn't support SG gaps and adding this segment
764 : * would create a gap, disallow it.
765 : */
766 0 : bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
767 0 : if (bvec_gap_to_prev(q, bvec, offset))
768 : return 0;
769 : }
770 :
771 2 : if (bio_full(bio, len))
772 : return 0;
773 :
774 2 : if (bio->bi_vcnt >= queue_max_segments(q))
775 : return 0;
776 :
777 2 : bvec = &bio->bi_io_vec[bio->bi_vcnt];
778 2 : bvec->bv_page = page;
779 2 : bvec->bv_len = len;
780 2 : bvec->bv_offset = offset;
781 2 : bio->bi_vcnt++;
782 2 : bio->bi_iter.bi_size += len;
783 2 : return len;
784 : }
785 :
786 : /**
787 : * bio_add_pc_page - attempt to add page to passthrough bio
788 : * @q: the target queue
789 : * @bio: destination bio
790 : * @page: page to add
791 : * @len: vec entry length
792 : * @offset: vec entry offset
793 : *
794 : * Attempt to add a page to the bio_vec maplist. This can fail for a
795 : * number of reasons, such as the bio being full or target block device
796 : * limitations. The target block device must allow bios up to PAGE_SIZE,
797 : * so it is always possible to add a single page to an empty bio.
798 : *
799 : * This should only be used by passthrough bios.
800 : */
801 2 : int bio_add_pc_page(struct request_queue *q, struct bio *bio,
802 : struct page *page, unsigned int len, unsigned int offset)
803 : {
804 2 : bool same_page = false;
805 2 : return bio_add_hw_page(q, bio, page, len, offset,
806 : queue_max_hw_sectors(q), &same_page);
807 : }
808 : EXPORT_SYMBOL(bio_add_pc_page);
809 :
810 : /**
811 : * bio_add_zone_append_page - attempt to add page to zone-append bio
812 : * @bio: destination bio
813 : * @page: page to add
814 : * @len: vec entry length
815 : * @offset: vec entry offset
816 : *
817 : * Attempt to add a page to the bio_vec maplist of a bio that will be submitted
818 : * for a zone-append request. This can fail for a number of reasons, such as the
819 : * bio being full, the target block device not being a zoned block device, or
820 : * other limitations of the target block device. The target block device must
821 : * allow bios up to PAGE_SIZE, so it is always possible to add a single page
822 : * to an empty bio.
823 : *
824 : * Returns: number of bytes added to the bio, or 0 in case of a failure.
825 : */
826 0 : int bio_add_zone_append_page(struct bio *bio, struct page *page,
827 : unsigned int len, unsigned int offset)
828 : {
829 0 : struct request_queue *q = bio->bi_bdev->bd_disk->queue;
830 0 : bool same_page = false;
831 :
832 0 : if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND))
833 : return 0;
834 :
835 0 : if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
836 0 : return 0;
837 :
838 : return bio_add_hw_page(q, bio, page, len, offset,
839 : queue_max_zone_append_sectors(q), &same_page);
840 : }
841 : EXPORT_SYMBOL_GPL(bio_add_zone_append_page);
842 :
843 : /**
844 : * __bio_try_merge_page - try appending data to an existing bvec.
845 : * @bio: destination bio
846 : * @page: start page to add
847 : * @len: length of the data to add
848 : * @off: offset of the data relative to @page
849 : * @same_page: return if the segment has been merged inside the same page
850 : *
851 : * Try to add the data at @page + @off to the last bvec of @bio. This is a
852 : * useful optimisation for file systems with a block size smaller than the
853 : * page size.
854 : *
855 : * Warn if (@len, @off) crosses pages when @same_page is true.
856 : *
857 : * Return %true on success or %false on failure.
858 : */
859 29109 : bool __bio_try_merge_page(struct bio *bio, struct page *page,
860 : unsigned int len, unsigned int off, bool *same_page)
861 : {
862 29109 : if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
863 : return false;
864 :
865 29109 : if (bio->bi_vcnt > 0) {
866 20444 : struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
867 :
868 20444 : if (page_is_mergeable(bv, page, len, off, same_page)) {
869 16521 : if (bio->bi_iter.bi_size > UINT_MAX - len) {
870 0 : *same_page = false;
871 0 : return false;
872 : }
873 16521 : bv->bv_len += len;
874 16521 : bio->bi_iter.bi_size += len;
875 16521 : return true;
876 : }
877 : }
878 : return false;
879 : }
880 : EXPORT_SYMBOL_GPL(__bio_try_merge_page);
881 :
882 : /**
883 : * __bio_add_page - add page(s) to a bio in a new segment
884 : * @bio: destination bio
885 : * @page: start page to add
886 : * @len: length of the data to add, may cross pages
887 : * @off: offset of the data relative to @page, may cross pages
888 : *
889 : * Add the data at @page + @off to @bio as a new bvec. The caller must ensure
890 : * that @bio has space for another bvec.
891 : */
892 12588 : void __bio_add_page(struct bio *bio, struct page *page,
893 : unsigned int len, unsigned int off)
894 : {
895 12588 : struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
896 :
897 12588 : WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
898 25176 : WARN_ON_ONCE(bio_full(bio, len));
899 :
900 12588 : bv->bv_page = page;
901 12588 : bv->bv_offset = off;
902 12588 : bv->bv_len = len;
903 :
904 12588 : bio->bi_iter.bi_size += len;
905 12588 : bio->bi_vcnt++;
906 :
907 25176 : if (!bio_flagged(bio, BIO_WORKINGSET) && unlikely(PageWorkingset(page)))
908 0 : bio_set_flag(bio, BIO_WORKINGSET);
909 12588 : }
910 : EXPORT_SYMBOL_GPL(__bio_add_page);
911 :
912 : /**
913 : * bio_add_page - attempt to add page(s) to bio
914 : * @bio: destination bio
915 : * @page: start page to add
916 : * @len: vec entry length, may cross pages
917 : * @offset: vec entry offset relative to @page, may cross pages
918 : *
919 : * Attempt to add page(s) to the bio_vec maplist. This will only fail
920 : * if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio.
921 : */
922 29109 : int bio_add_page(struct bio *bio, struct page *page,
923 : unsigned int len, unsigned int offset)
924 : {
925 29109 : bool same_page = false;
926 :
927 29109 : if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
928 12588 : if (bio_full(bio, len))
929 : return 0;
930 12588 : __bio_add_page(bio, page, len, offset);
931 : }
932 29109 : return len;
933 : }
934 : EXPORT_SYMBOL(bio_add_page);
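/*
 * Sketch of a common fill loop: bio_add_page() returns the number of bytes
 * added (here PAGE_SIZE) or 0 once no room is left, at which point the
 * caller submits the bio and carries on with a new one. pages/nr_pages are
 * placeholders:
 *
 *	for (i = 0; i < nr_pages; i++)
 *		if (bio_add_page(bio, pages[i], PAGE_SIZE, 0) != PAGE_SIZE)
 *			break;	// bio full: submit it, allocate another,
 *				// and retry this page
 */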
935 :
936 0 : void bio_release_pages(struct bio *bio, bool mark_dirty)
937 : {
938 0 : struct bvec_iter_all iter_all;
939 0 : struct bio_vec *bvec;
940 :
941 0 : if (bio_flagged(bio, BIO_NO_PAGE_REF))
942 0 : return;
943 :
944 0 : bio_for_each_segment_all(bvec, bio, iter_all) {
945 0 : if (mark_dirty && !PageCompound(bvec->bv_page))
946 0 : set_page_dirty_lock(bvec->bv_page);
947 0 : put_page(bvec->bv_page);
948 : }
949 : }
950 : EXPORT_SYMBOL_GPL(bio_release_pages);
951 :
952 0 : static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
953 : {
954 0 : WARN_ON_ONCE(bio->bi_max_vecs);
955 :
956 0 : bio->bi_vcnt = iter->nr_segs;
957 0 : bio->bi_io_vec = (struct bio_vec *)iter->bvec;
958 0 : bio->bi_iter.bi_bvec_done = iter->iov_offset;
959 0 : bio->bi_iter.bi_size = iter->count;
960 0 : bio_set_flag(bio, BIO_NO_PAGE_REF);
961 0 : bio_set_flag(bio, BIO_CLONED);
962 :
963 0 : iov_iter_advance(iter, iter->count);
964 0 : return 0;
965 : }
966 :
967 : #define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *))
968 :
969 : /**
970 : * __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
971 : * @bio: bio to add pages to
972 : * @iter: iov iterator describing the region to be mapped
973 : *
974 : * Pins pages from *iter and appends them to @bio's bvec array. The
975 : * pages will have to be released using put_page() when done.
976 : * For multi-segment *iter, this function only adds pages from the
977 : * next non-empty segment of the iov iterator.
978 : */
979 0 : static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
980 : {
981 0 : unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
982 0 : unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
983 0 : struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
984 0 : struct page **pages = (struct page **)bv;
985 0 : bool same_page = false;
986 0 : ssize_t size, left;
987 0 : unsigned len, i;
988 0 : size_t offset;
989 :
990 : /*
991 : * Move page array up in the allocated memory for the bio vecs as far as
992 : * possible so that we can start filling biovecs from the beginning
993 : * without overwriting the temporary page array.
994 : */
995 0 : BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
996 0 : pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
997 :
998 0 : size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
999 0 : if (unlikely(size <= 0))
1000 0 : return size ? size : -EFAULT;
1001 :
1002 0 : for (left = size, i = 0; left > 0; left -= len, i++) {
1003 0 : struct page *page = pages[i];
1004 :
1005 0 : len = min_t(size_t, PAGE_SIZE - offset, left);
1006 :
1007 0 : if (__bio_try_merge_page(bio, page, len, offset, &same_page)) {
1008 0 : if (same_page)
1009 0 : put_page(page);
1010 : } else {
1011 0 : if (WARN_ON_ONCE(bio_full(bio, len)))
1012 : return -EINVAL;
1013 0 : __bio_add_page(bio, page, len, offset);
1014 : }
1015 0 : offset = 0;
1016 : }
1017 :
1018 0 : iov_iter_advance(iter, size);
1019 0 : return 0;
1020 : }
1021 :
1022 0 : static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
1023 : {
1024 0 : unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
1025 0 : unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
1026 0 : struct request_queue *q = bio->bi_bdev->bd_disk->queue;
1027 0 : unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
1028 0 : struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
1029 0 : struct page **pages = (struct page **)bv;
1030 0 : ssize_t size, left;
1031 0 : unsigned len, i;
1032 0 : size_t offset;
1033 0 : int ret = 0;
1034 :
1035 0 : if (WARN_ON_ONCE(!max_append_sectors))
1036 : return 0;
1037 :
1038 : /*
1039 : * Move page array up in the allocated memory for the bio vecs as far as
1040 : * possible so that we can start filling biovecs from the beginning
1041 : * without overwriting the temporary page array.
1042 : */
1043 0 : BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
1044 0 : pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
1045 :
1046 0 : size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
1047 0 : if (unlikely(size <= 0))
1048 0 : return size ? size : -EFAULT;
1049 :
1050 0 : for (left = size, i = 0; left > 0; left -= len, i++) {
1051 0 : struct page *page = pages[i];
1052 0 : bool same_page = false;
1053 :
1054 0 : len = min_t(size_t, PAGE_SIZE - offset, left);
1055 0 : if (bio_add_hw_page(q, bio, page, len, offset,
1056 : max_append_sectors, &same_page) != len) {
1057 0 : ret = -EINVAL;
1058 0 : break;
1059 : }
1060 0 : if (same_page)
1061 0 : put_page(page);
1062 0 : offset = 0;
1063 : }
1064 :
1065 0 : iov_iter_advance(iter, size - left);
1066 0 : return ret;
1067 : }
1068 :
1069 : /**
1070 : * bio_iov_iter_get_pages - add user or kernel pages to a bio
1071 : * @bio: bio to add pages to
1072 : * @iter: iov iterator describing the region to be added
1073 : *
1074 : * This takes either an iterator pointing to user memory, or one pointing to
1075 : * kernel pages (BVEC iterator). If we're adding user pages, we pin them and
1076 : * map them into the kernel. On IO completion, the caller should put those
1077 : * pages. For bvec based iterators bio_iov_iter_get_pages() uses the provided
1078 : * bvecs rather than copying them. Hence anyone issuing kiocb based IO needs
1079 : * to ensure the bvecs and pages stay referenced until the submitted I/O is
1080 : * completed by a call to ->ki_complete() or returns with an error other than
1081 : * -EIOCBQUEUED. The caller needs to check if the bio is flagged BIO_NO_PAGE_REF
1082 : * on IO completion. If it isn't, then pages should be released.
1083 : *
1084 : * The function tries, but does not guarantee, to pin as many pages as
1085 : * fit into the bio, or are requested in @iter, whatever is smaller. If
1086 : * MM encounters an error pinning the requested pages, it stops. Error
1087 : * is returned only if 0 pages could be pinned.
1088 : *
1089 : * It's intended for direct IO, so doesn't do PSI tracking, the caller is
1090 : * responsible for setting BIO_WORKINGSET if necessary.
1091 : */
1092 0 : int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
1093 : {
1094 0 : int ret = 0;
1095 :
1096 0 : if (iov_iter_is_bvec(iter)) {
1097 0 : if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND))
1098 : return -EINVAL;
1099 0 : return bio_iov_bvec_set(bio, iter);
1100 : }
1101 :
1102 0 : do {
1103 0 : if (bio_op(bio) == REQ_OP_ZONE_APPEND)
1104 0 : ret = __bio_iov_append_get_pages(bio, iter);
1105 : else
1106 0 : ret = __bio_iov_iter_get_pages(bio, iter);
1107 0 : } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
1108 :
1109 : /* don't account direct I/O as memory stall */
1110 0 : bio_clear_flag(bio, BIO_WORKINGSET);
1111 0 : return bio->bi_vcnt ? 0 : ret;
1112 : }
1113 : EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
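/*
 * A direct-I/O flavoured sketch, assuming @iter was taken from the caller's
 * kiocb and the bio's device, sector and opf are already set; should_dirty
 * is a placeholder for "this was a READ into user pages":
 *
 *	ret = bio_iov_iter_get_pages(bio, iter);
 *	if (unlikely(ret)) {
 *		bio_put(bio);
 *		return ret;
 *	}
 *	submit_bio(bio);
 *
 * and in the matching bi_end_io handler (bio_release_pages() checks
 * BIO_NO_PAGE_REF itself):
 *
 *	bio_release_pages(bio, should_dirty);
 *	bio_put(bio);
 */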
1114 :
1115 72 : static void submit_bio_wait_endio(struct bio *bio)
1116 : {
1117 72 : complete(bio->bi_private);
1118 72 : }
1119 :
1120 : /**
1121 : * submit_bio_wait - submit a bio, and wait until it completes
1122 : * @bio: The &struct bio which describes the I/O
1123 : *
1124 : * Simple wrapper around submit_bio(). Returns 0 on success, or the error from
1125 : * bio_endio() on failure.
1126 : *
1127 : * WARNING: Unlike how submit_bio() is usually used, this function does not
1128 : * consume the bio reference. The caller must drop the reference
1129 : * on their own.
1130 : */
1131 72 : int submit_bio_wait(struct bio *bio)
1132 : {
1133 72 : DECLARE_COMPLETION_ONSTACK_MAP(done,
1134 : bio->bi_bdev->bd_disk->lockdep_map);
1135 72 : unsigned long hang_check;
1136 :
1137 72 : bio->bi_private = &done;
1138 72 : bio->bi_end_io = submit_bio_wait_endio;
1139 72 : bio->bi_opf |= REQ_SYNC;
1140 72 : submit_bio(bio);
1141 :
1142 : /* Prevent hang_check timer from firing at us during very long I/O */
1143 72 : hang_check = sysctl_hung_task_timeout_secs;
1144 72 : if (hang_check)
1145 : while (!wait_for_completion_io_timeout(&done,
1146 : hang_check * (HZ/2)))
1147 : ;
1148 : else
1149 72 : wait_for_completion_io(&done);
1150 :
1151 72 : return blk_status_to_errno(bio->bi_status);
1152 : }
1153 : EXPORT_SYMBOL(submit_bio_wait);
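/*
 * Sketch emphasizing the warning above: the reference is not consumed, so
 * the caller still has to bio_put(). bdev, sector and page are placeholders:
 *
 *	struct bio *bio = bio_alloc(GFP_KERNEL, 1);
 *	int ret;
 *
 *	bio_set_dev(bio, bdev);
 *	bio->bi_opf = REQ_OP_READ;
 *	bio->bi_iter.bi_sector = sector;
 *	bio_add_page(bio, page, PAGE_SIZE, 0);
 *	ret = submit_bio_wait(bio);
 *	bio_put(bio);
 */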
1154 :
1155 : /**
1156 : * bio_advance - increment/complete a bio by some number of bytes
1157 : * @bio: bio to advance
1158 : * @bytes: number of bytes to complete
1159 : *
1160 : * This updates bi_sector, bi_size and bi_idx; if the number of bytes to
1161 : * complete doesn't align with a bvec boundary, then bv_len and bv_offset will
1162 : * be updated on the last bvec as well.
1163 : *
1164 : * @bio will then represent the remaining, uncompleted portion of the io.
1165 : */
1166 8799 : void bio_advance(struct bio *bio, unsigned bytes)
1167 : {
1168 8799 : if (bio_integrity(bio))
1169 8799 : bio_integrity_advance(bio, bytes);
1170 :
1171 8799 : bio_crypt_advance(bio, bytes);
1172 8799 : bio_advance_iter(bio, &bio->bi_iter, bytes);
1173 8799 : }
1174 : EXPORT_SYMBOL(bio_advance);
1175 :
1176 0 : void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
1177 : struct bio *src, struct bvec_iter *src_iter)
1178 : {
1179 0 : struct bio_vec src_bv, dst_bv;
1180 0 : void *src_p, *dst_p;
1181 0 : unsigned bytes;
1182 :
1183 0 : while (src_iter->bi_size && dst_iter->bi_size) {
1184 0 : src_bv = bio_iter_iovec(src, *src_iter);
1185 0 : dst_bv = bio_iter_iovec(dst, *dst_iter);
1186 :
1187 0 : bytes = min(src_bv.bv_len, dst_bv.bv_len);
1188 :
1189 0 : src_p = kmap_atomic(src_bv.bv_page);
1190 0 : dst_p = kmap_atomic(dst_bv.bv_page);
1191 :
1192 0 : memcpy(dst_p + dst_bv.bv_offset,
1193 0 : src_p + src_bv.bv_offset,
1194 : bytes);
1195 :
1196 0 : kunmap_atomic(dst_p);
1197 0 : kunmap_atomic(src_p);
1198 :
1199 0 : flush_dcache_page(dst_bv.bv_page);
1200 :
1201 0 : bio_advance_iter_single(src, src_iter, bytes);
1202 0 : bio_advance_iter_single(dst, dst_iter, bytes);
1203 : }
1204 0 : }
1205 : EXPORT_SYMBOL(bio_copy_data_iter);
1206 :
1207 : /**
1208 : * bio_copy_data - copy contents of data buffers from one bio to another
1209 : * @src: source bio
1210 : * @dst: destination bio
1211 : *
1212 : * Stops when it reaches the end of either @src or @dst - that is, copies
1213 : * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
1214 : */
1215 0 : void bio_copy_data(struct bio *dst, struct bio *src)
1216 : {
1217 0 : struct bvec_iter src_iter = src->bi_iter;
1218 0 : struct bvec_iter dst_iter = dst->bi_iter;
1219 :
1220 0 : bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
1221 0 : }
1222 : EXPORT_SYMBOL(bio_copy_data);
1223 :
1224 : /**
1225 : * bio_list_copy_data - copy contents of data buffers from one chain of bios to
1226 : * another
1227 : * @src: source bio list
1228 : * @dst: destination bio list
1229 : *
1230 : * Stops when it reaches the end of either the @src list or @dst list - that is,
1231 : * copies min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of
1232 : * bios).
1233 : */
1234 0 : void bio_list_copy_data(struct bio *dst, struct bio *src)
1235 : {
1236 0 : struct bvec_iter src_iter = src->bi_iter;
1237 0 : struct bvec_iter dst_iter = dst->bi_iter;
1238 :
1239 0 : while (1) {
1240 0 : if (!src_iter.bi_size) {
1241 0 : src = src->bi_next;
1242 0 : if (!src)
1243 : break;
1244 :
1245 0 : src_iter = src->bi_iter;
1246 : }
1247 :
1248 0 : if (!dst_iter.bi_size) {
1249 0 : dst = dst->bi_next;
1250 0 : if (!dst)
1251 : break;
1252 :
1253 0 : dst_iter = dst->bi_iter;
1254 : }
1255 :
1256 0 : bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
1257 : }
1258 0 : }
1259 : EXPORT_SYMBOL(bio_list_copy_data);
1260 :
1261 2 : void bio_free_pages(struct bio *bio)
1262 : {
1263 2 : struct bio_vec *bvec;
1264 2 : struct bvec_iter_all iter_all;
1265 :
1266 4 : bio_for_each_segment_all(bvec, bio, iter_all)
1267 2 : __free_page(bvec->bv_page);
1268 2 : }
1269 : EXPORT_SYMBOL(bio_free_pages);
1270 :
1271 : /*
1272 : * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
1273 : * for performing direct-IO in BIOs.
1274 : *
1275 : * The problem is that we cannot run set_page_dirty() from interrupt context
1276 : * because the required locks are not interrupt-safe. So what we can do is to
1277 : * mark the pages dirty _before_ performing IO. And in interrupt context,
1278 : * check that the pages are still dirty. If so, fine. If not, redirty them
1279 : * in process context.
1280 : *
1281 : * We special-case compound pages here: normally this means reads into hugetlb
1282 : * pages. The logic in here doesn't really work right for compound pages
1283 : * because the VM does not uniformly chase down the head page in all cases.
1284 : * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
1285 : * handle them at all. So we skip compound pages here at an early stage.
1286 : *
1287 : * Note that this code is very hard to test under normal circumstances because
1288 : * direct-io pins the pages with get_user_pages(). This makes
1289 : * is_page_cache_freeable return false, and the VM will not clean the pages.
1290 : * But other code (eg, flusher threads) could clean the pages if they are mapped
1291 : * pagecache.
1292 : *
1293 : * Simply disabling the call to bio_set_pages_dirty() is a good way to test the
1294 : * deferred bio dirtying paths.
1295 : */
1296 :
1297 : /*
1298 : * bio_set_pages_dirty() will mark all the bio's pages as dirty.
1299 : */
1300 0 : void bio_set_pages_dirty(struct bio *bio)
1301 : {
1302 0 : struct bio_vec *bvec;
1303 0 : struct bvec_iter_all iter_all;
1304 :
1305 0 : bio_for_each_segment_all(bvec, bio, iter_all) {
1306 0 : if (!PageCompound(bvec->bv_page))
1307 0 : set_page_dirty_lock(bvec->bv_page);
1308 : }
1309 0 : }
1310 :
1311 : /*
1312 : * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
1313 : * If they are, then fine. If, however, some pages are clean then they must
1314 : * have been written out during the direct-IO read. So we take another ref on
1315 : * the BIO and re-dirty the pages in process context.
1316 : *
1317 : * It is expected that bio_check_pages_dirty() will wholly own the BIO from
1318 : * here on. It will run one put_page() against each page and will run one
1319 : * bio_put() against the BIO.
1320 : */
1321 :
1322 : static void bio_dirty_fn(struct work_struct *work);
1323 :
1324 : static DECLARE_WORK(bio_dirty_work, bio_dirty_fn);
1325 : static DEFINE_SPINLOCK(bio_dirty_lock);
1326 : static struct bio *bio_dirty_list;
1327 :
1328 : /*
1329 : * This runs in process context
1330 : */
1331 0 : static void bio_dirty_fn(struct work_struct *work)
1332 : {
1333 0 : struct bio *bio, *next;
1334 :
1335 0 : spin_lock_irq(&bio_dirty_lock);
1336 0 : next = bio_dirty_list;
1337 0 : bio_dirty_list = NULL;
1338 0 : spin_unlock_irq(&bio_dirty_lock);
1339 :
1340 0 : while ((bio = next) != NULL) {
1341 0 : next = bio->bi_private;
1342 :
1343 0 : bio_release_pages(bio, true);
1344 0 : bio_put(bio);
1345 : }
1346 0 : }
1347 :
1348 0 : void bio_check_pages_dirty(struct bio *bio)
1349 : {
1350 0 : struct bio_vec *bvec;
1351 0 : unsigned long flags;
1352 0 : struct bvec_iter_all iter_all;
1353 :
1354 0 : bio_for_each_segment_all(bvec, bio, iter_all) {
1355 0 : if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page))
1356 0 : goto defer;
1357 : }
1358 :
1359 0 : bio_release_pages(bio, false);
1360 0 : bio_put(bio);
1361 0 : return;
1362 0 : defer:
1363 0 : spin_lock_irqsave(&bio_dirty_lock, flags);
1364 0 : bio->bi_private = bio_dirty_list;
1365 0 : bio_dirty_list = bio;
1366 0 : spin_unlock_irqrestore(&bio_dirty_lock, flags);
1367 0 : schedule_work(&bio_dirty_work);
1368 : }
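/*
 * Sketch of how a direct-I/O read completion typically pairs with the two
 * helpers above (bio_set_pages_dirty() was called on the submission side);
 * should_dirty is a placeholder for "this was a READ into user pages":
 *
 *	static void my_dio_end_io(struct bio *bio)
 *	{
 *		if (should_dirty) {
 *			bio_check_pages_dirty(bio);	// also drops the bio
 *		} else {
 *			bio_release_pages(bio, false);
 *			bio_put(bio);
 *		}
 *	}
 */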
1369 :
1370 8738 : static inline bool bio_remaining_done(struct bio *bio)
1371 : {
1372 : /*
1373 : * If we're not chaining, then ->__bi_remaining is always 1 and
1374 : * we always end io on the first invocation.
1375 : */
1376 8738 : if (!bio_flagged(bio, BIO_CHAIN))
1377 : return true;
1378 :
1379 0 : BUG_ON(atomic_read(&bio->__bi_remaining) <= 0);
1380 :
1381 0 : if (atomic_dec_and_test(&bio->__bi_remaining)) {
1382 0 : bio_clear_flag(bio, BIO_CHAIN);
1383 0 : return true;
1384 : }
1385 :
1386 : return false;
1387 : }
1388 :
1389 : /**
1390 : * bio_endio - end I/O on a bio
1391 : * @bio: bio
1392 : *
1393 : * Description:
1394 : * bio_endio() will end I/O on the whole bio. bio_endio() is the preferred
1395 : * way to end I/O on a bio. No one should call bi_end_io() directly on a
1396 : * bio unless they own it and thus know that it has an end_io function.
1397 : *
1398 : * bio_endio() can be called several times on a bio that has been chained
1399 : * using bio_chain(). The ->bi_end_io() function will only be called the
1400 : * last time. At this point the BLK_TA_COMPLETE tracing event will be
1401 : * generated if BIO_TRACE_COMPLETION is set.
1402 : **/
1403 8738 : void bio_endio(struct bio *bio)
1404 : {
1405 8738 : again:
1406 8738 : if (!bio_remaining_done(bio))
1407 : return;
1408 8738 : if (!bio_integrity_endio(bio))
1409 : return;
1410 :
1411 8738 : if (bio->bi_bdev)
1412 8736 : rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio);
1413 :
1414 : /*
1415 : * Need to have a real endio function for chained bios, otherwise
1416 : * various corner cases will break (like stacking block devices that
1417 : * save/restore bi_end_io) - however, we want to avoid unbounded
1418 : * recursion and blowing the stack. Tail call optimization would
1419 : * handle this, but compiling with frame pointers also disables
1420 : * gcc's sibling call optimization.
1421 : */
1422 8738 : if (bio->bi_end_io == bio_chain_endio) {
1423 0 : bio = __bio_chain_endio(bio);
1424 0 : goto again;
1425 : }
1426 :
1427 8739 : if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
1428 0 : trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio);
1429 0 : bio_clear_flag(bio, BIO_TRACE_COMPLETION);
1430 : }
1431 :
1432 8739 : blk_throtl_bio_endio(bio);
1433 : /* release cgroup info */
1434 8739 : bio_uninit(bio);
1435 8739 : if (bio->bi_end_io)
1436 8739 : bio->bi_end_io(bio);
1437 : }
1438 : EXPORT_SYMBOL(bio_endio);
1439 :
1440 : /**
1441 : * bio_split - split a bio
1442 : * @bio: bio to split
1443 : * @sectors: number of sectors to split from the front of @bio
1444 : * @gfp: gfp mask
1445 : * @bs: bio set to allocate from
1446 : *
1447 : * Allocates and returns a new bio which represents @sectors from the start of
1448 : * @bio, and updates @bio to represent the remaining sectors.
1449 : *
1450 : * Unless this is a discard request the newly allocated bio will point
1451 : * to @bio's bi_io_vec. It is the caller's responsibility to ensure that
1452 : * neither @bio nor @bs are freed before the split bio.
1453 : */
1454 0 : struct bio *bio_split(struct bio *bio, int sectors,
1455 : gfp_t gfp, struct bio_set *bs)
1456 : {
1457 0 : struct bio *split;
1458 :
1459 0 : BUG_ON(sectors <= 0);
1460 0 : BUG_ON(sectors >= bio_sectors(bio));
1461 :
1462 : /* Zone append commands cannot be split */
1463 0 : if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND))
1464 : return NULL;
1465 :
1466 0 : split = bio_clone_fast(bio, gfp, bs);
1467 0 : if (!split)
1468 : return NULL;
1469 :
1470 0 : split->bi_iter.bi_size = sectors << 9;
1471 :
1472 0 : if (bio_integrity(split))
1473 0 : bio_integrity_trim(split);
1474 :
1475 0 : bio_advance(bio, split->bi_iter.bi_size);
1476 :
1477 0 : if (bio_flagged(bio, BIO_TRACE_COMPLETION))
1478 0 : bio_set_flag(split, BIO_TRACE_COMPLETION);
1479 :
1480 : return split;
1481 : }
1482 : EXPORT_SYMBOL(bio_split);
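/*
 * Sketch of the split-and-chain pattern (compare the splitting done in
 * blk-merge.c); max_sectors and the bio_set are whatever the caller uses,
 * fs_bio_set is shown only for illustration:
 *
 *	if (bio_sectors(bio) > max_sectors) {
 *		struct bio *split = bio_split(bio, max_sectors, GFP_NOIO,
 *					      &fs_bio_set);
 *
 *		bio_chain(split, bio);		// remainder completes last
 *		submit_bio_noacct(bio);		// requeue the remainder
 *		bio = split;			// keep working on the front
 *	}
 */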
1483 :
1484 : /**
1485 : * bio_trim - trim a bio
1486 : * @bio: bio to trim
1487 : * @offset: number of sectors to trim from the front of @bio
1488 : * @size: size we want to trim @bio to, in sectors
1489 : */
1490 0 : void bio_trim(struct bio *bio, int offset, int size)
1491 : {
1492 : /* 'bio' is a cloned bio which we need to trim to match
1493 : * the given offset and size.
1494 : */
1495 :
1496 0 : size <<= 9;
1497 0 : if (offset == 0 && size == bio->bi_iter.bi_size)
1498 : return;
1499 :
1500 0 : bio_advance(bio, offset << 9);
1501 0 : bio->bi_iter.bi_size = size;
1502 :
1503 0 : if (bio_integrity(bio))
1504 0 : bio_integrity_trim(bio);
1505 :
1506 : }
1507 : EXPORT_SYMBOL_GPL(bio_trim);
1508 :
1509 : /*
1510 : * create memory pools for biovec's in a bio_set.
1511 : * use the global biovec slabs created for general use.
1512 : */
1513 3 : int biovec_init_pool(mempool_t *pool, int pool_entries)
1514 : {
1515 3 : struct biovec_slab *bp = bvec_slabs + ARRAY_SIZE(bvec_slabs) - 1;
1516 :
1517 3 : return mempool_init_slab_pool(pool, pool_entries, bp->slab);
1518 : }
1519 :
1520 : /*
1521 : * bioset_exit - exit a bioset initialized with bioset_init()
1522 : *
1523 : * May be called on a zeroed but uninitialized bioset (i.e. allocated with
1524 : * kzalloc()).
1525 : */
1526 0 : void bioset_exit(struct bio_set *bs)
1527 : {
1528 0 : if (bs->rescue_workqueue)
1529 0 : destroy_workqueue(bs->rescue_workqueue);
1530 0 : bs->rescue_workqueue = NULL;
1531 :
1532 0 : mempool_exit(&bs->bio_pool);
1533 0 : mempool_exit(&bs->bvec_pool);
1534 :
1535 0 : bioset_integrity_free(bs);
1536 0 : if (bs->bio_slab)
1537 0 : bio_put_slab(bs);
1538 0 : bs->bio_slab = NULL;
1539 0 : }
1540 : EXPORT_SYMBOL(bioset_exit);
1541 :
1542 : /**
1543 : * bioset_init - Initialize a bio_set
1544 : * @bs: pool to initialize
1545 : * @pool_size: Number of bio and bio_vecs to cache in the mempool
1546 : * @front_pad: Number of bytes to allocate in front of the returned bio
1547 : * @flags: Flags to modify behavior, currently %BIOSET_NEED_BVECS
1548 : * and %BIOSET_NEED_RESCUER
1549 : *
1550 : * Description:
1551 : * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
1552 : * to ask for a number of bytes to be allocated in front of the bio.
1553 : * Front pad allocation is useful for embedding the bio inside
1554 : * another structure, to avoid allocating extra data to go with the bio.
1555 : * Note that the bio must be embedded at the END of that structure always,
1556 : * or things will break badly.
1557 : * If %BIOSET_NEED_BVECS is set in @flags, a separate pool will be allocated
1558 : * for allocating iovecs. This pool is not needed e.g. for bio_clone_fast().
1559 : * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used to
1560 : * dispatch queued requests when the mempool runs out of space.
1561 : *
1562 : */
1563 12 : int bioset_init(struct bio_set *bs,
1564 : unsigned int pool_size,
1565 : unsigned int front_pad,
1566 : int flags)
1567 : {
1568 12 : bs->front_pad = front_pad;
1569 12 : if (flags & BIOSET_NEED_BVECS)
1570 3 : bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
1571 : else
1572 9 : bs->back_pad = 0;
1573 :
1574 12 : spin_lock_init(&bs->rescue_lock);
1575 12 : bio_list_init(&bs->rescue_list);
1576 12 : INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
1577 :
1578 12 : bs->bio_slab = bio_find_or_create_slab(bs);
1579 12 : if (!bs->bio_slab)
1580 : return -ENOMEM;
1581 :
1582 12 : if (mempool_init_slab_pool(&bs->bio_pool, pool_size, bs->bio_slab))
1583 0 : goto bad;
1584 :
1585 15 : if ((flags & BIOSET_NEED_BVECS) &&
1586 3 : biovec_init_pool(&bs->bvec_pool, pool_size))
1587 0 : goto bad;
1588 :
1589 12 : if (!(flags & BIOSET_NEED_RESCUER))
1590 : return 0;
1591 :
1592 0 : bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
1593 0 : if (!bs->rescue_workqueue)
1594 0 : goto bad;
1595 :
1596 : return 0;
1597 0 : bad:
1598 0 : bioset_exit(bs);
1599 0 : return -ENOMEM;
1600 : }
1601 : EXPORT_SYMBOL(bioset_init);
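/*
 * Sketch of the front_pad technique described above: the driver embeds the
 * bio at the END of a per-I/O structure and recovers that structure with
 * container_of(). my_io and my_bio_set are illustrative names:
 *
 *	struct my_io {
 *		void		*private;
 *		struct bio	bio;	// must be the last member
 *	};
 *	static struct bio_set my_bio_set;
 *
 *	ret = bioset_init(&my_bio_set, BIO_POOL_SIZE,
 *			  offsetof(struct my_io, bio), BIOSET_NEED_BVECS);
 *
 * and then, per I/O:
 *
 *	struct bio *bio = bio_alloc_bioset(GFP_NOIO, 1, &my_bio_set);
 *	struct my_io *io = container_of(bio, struct my_io, bio);
 */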
1602 :
1603 : /*
1604 : * Initialize and setup a new bio_set, based on the settings from
1605 : * another bio_set.
1606 : */
1607 0 : int bioset_init_from_src(struct bio_set *bs, struct bio_set *src)
1608 : {
1609 0 : int flags;
1610 :
1611 0 : flags = 0;
1612 0 : if (src->bvec_pool.min_nr)
1613 0 : flags |= BIOSET_NEED_BVECS;
1614 0 : if (src->rescue_workqueue)
1615 0 : flags |= BIOSET_NEED_RESCUER;
1616 :
1617 0 : return bioset_init(bs, src->bio_pool.min_nr, src->front_pad, flags);
1618 : }
1619 : EXPORT_SYMBOL(bioset_init_from_src);
1620 :
1621 1 : static int __init init_bio(void)
1622 : {
1623 1 : int i;
1624 :
1625 1 : bio_integrity_init();
1626 :
1627 5 : for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) {
1628 4 : struct biovec_slab *bvs = bvec_slabs + i;
1629 :
1630 4 : bvs->slab = kmem_cache_create(bvs->name,
1631 4 : bvs->nr_vecs * sizeof(struct bio_vec), 0,
1632 : SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
1633 : }
1634 :
1635 1 : if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS))
1636 0 : panic("bio: can't allocate bios\n");
1637 :
1638 1 : if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE))
1639 : panic("bio: can't create integrity pool\n");
1640 :
1641 1 : return 0;
1642 : }
1643 : subsys_initcall(init_bio);
|