Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Copyright (C) 2010 Red Hat, Inc.
4 : * Copyright (c) 2016-2018 Christoph Hellwig.
5 : */
6 : #include <linux/module.h>
7 : #include <linux/compiler.h>
8 : #include <linux/fs.h>
9 : #include <linux/iomap.h>
10 : #include <linux/backing-dev.h>
11 : #include <linux/uio.h>
12 : #include <linux/task_io_accounting_ops.h>
13 : #include "trace.h"
14 :
15 : #include "../internal.h"
16 :
17 : /*
18 : * Private flags for iomap_dio, must not overlap with the public ones in
19 : * iomap.h:
20 : */
21 : #define IOMAP_DIO_WRITE_FUA (1 << 28)
22 : #define IOMAP_DIO_NEED_SYNC (1 << 29)
23 : #define IOMAP_DIO_WRITE (1 << 30)
24 : #define IOMAP_DIO_DIRTY (1 << 31)
25 :
26 : struct iomap_dio {
27 : struct kiocb *iocb;
28 : const struct iomap_dio_ops *dops;
29 : loff_t i_size;
30 : loff_t size;
31 : atomic_t ref;
32 : unsigned flags;
33 : int error;
34 : bool wait_for_completion;
35 :
36 : union {
37 : /* used during submission and for synchronous completion: */
38 : struct {
39 : struct iov_iter *iter;
40 : struct task_struct *waiter;
41 : struct request_queue *last_queue;
42 : blk_qc_t cookie;
43 : } submit;
44 :
45 : /* used for aio completion: */
46 : struct {
47 : struct work_struct work;
48 : } aio;
49 : };
50 : };
51 :
52 0 : int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
53 : {
54 0 : struct request_queue *q = READ_ONCE(kiocb->private);
55 :
56 0 : if (!q)
57 : return 0;
58 0 : return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
59 : }
60 : EXPORT_SYMBOL_GPL(iomap_dio_iopoll);
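/*
 * Illustrative sketch (not part of this file): a filesystem that wants
 * polled direct I/O points ->iopoll in its file_operations at
 * iomap_dio_iopoll(), which polls the queue and cookie that
 * __iomap_dio_rw() below stashes in iocb->private and iocb->ki_cookie.
 * The myfs_* names are hypothetical.
 */
static const struct file_operations myfs_file_operations = {
	.read_iter	= myfs_file_read_iter,	/* ends up in iomap_dio_rw() */
	.write_iter	= myfs_file_write_iter,	/* ends up in iomap_dio_rw() */
	.iopoll		= iomap_dio_iopoll,
};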
61 :
62 0 : static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap,
63 : struct bio *bio, loff_t pos)
64 : {
65 0 : atomic_inc(&dio->ref);
66 :
67 0 : if (dio->iocb->ki_flags & IOCB_HIPRI)
68 0 : bio_set_polled(bio, dio->iocb);
69 :
70 0 : dio->submit.last_queue = bdev_get_queue(iomap->bdev);
71 0 : if (dio->dops && dio->dops->submit_io)
72 0 : dio->submit.cookie = dio->dops->submit_io(
73 0 : file_inode(dio->iocb->ki_filp),
74 : iomap, bio, pos);
75 : else
76 0 : dio->submit.cookie = submit_bio(bio);
77 0 : }
78 :
79 0 : ssize_t iomap_dio_complete(struct iomap_dio *dio)
80 : {
81 0 : const struct iomap_dio_ops *dops = dio->dops;
82 0 : struct kiocb *iocb = dio->iocb;
83 0 : struct inode *inode = file_inode(iocb->ki_filp);
84 0 : loff_t offset = iocb->ki_pos;
85 0 : ssize_t ret = dio->error;
86 :
87 0 : if (dops && dops->end_io)
88 0 : ret = dops->end_io(iocb, dio->size, ret, dio->flags);
89 :
90 0 : if (likely(!ret)) {
91 0 : ret = dio->size;
92 : /* check for short read */
93 0 : if (offset + ret > dio->i_size &&
94 0 : !(dio->flags & IOMAP_DIO_WRITE))
95 0 : ret = dio->i_size - offset;
96 0 : iocb->ki_pos += ret;
97 : }
98 :
99 : /*
100 : * Try again to invalidate clean pages which might have been cached by
101 : * non-direct readahead, or faulted in by get_user_pages() if the source
102 : * of the write was an mmap'ed region of the file we're writing. Either
103 : * one is a pretty crazy thing to do, so we don't support it 100%. If
104 : * this invalidation fails, tough, the write still worked...
105 : *
106 : * And this page cache invalidation has to be after ->end_io(), as some
107 : * filesystems convert unwritten extents to real allocations in
108 : * ->end_io() when necessary, otherwise a racing buffer read would cache
109 : * zeros from unwritten extents.
110 : */
111 0 : if (!dio->error && dio->size &&
112 0 : (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
113 0 : int err;
114 0 : err = invalidate_inode_pages2_range(inode->i_mapping,
115 0 : offset >> PAGE_SHIFT,
116 0 : (offset + dio->size - 1) >> PAGE_SHIFT);
117 0 : if (err)
118 0 : dio_warn_stale_pagecache(iocb->ki_filp);
119 : }
120 :
121 0 : inode_dio_end(file_inode(iocb->ki_filp));
122 : /*
123 : * If this is a DSYNC write, make sure we push it to stable storage now
124 : * that we've written data.
125 : */
126 0 : if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
127 0 : ret = generic_write_sync(iocb, ret);
128 :
129 0 : kfree(dio);
130 :
131 0 : return ret;
132 : }
133 : EXPORT_SYMBOL_GPL(iomap_dio_complete);
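/*
 * Illustrative sketch (not part of this file): a minimal ->end_io hook as a
 * filesystem might supply it via struct iomap_dio_ops. iomap_dio_complete()
 * above calls it before advancing iocb->ki_pos, so ki_pos is still the start
 * offset of the I/O. The myfs_* names, including the unwritten-extent
 * conversion helper, are hypothetical.
 */
static int myfs_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error,
		unsigned int flags)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	if (error)
		return error;
	if (!size)
		return 0;

	/* Convert any unwritten extents the write covered. */
	if (flags & IOMAP_DIO_UNWRITTEN)
		return myfs_convert_unwritten(inode, iocb->ki_pos, size);
	return 0;
}

static const struct iomap_dio_ops myfs_dio_write_ops = {
	.end_io = myfs_dio_write_end_io,
};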
134 :
135 0 : static void iomap_dio_complete_work(struct work_struct *work)
136 : {
137 0 : struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
138 0 : struct kiocb *iocb = dio->iocb;
139 :
140 0 : iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
141 0 : }
142 :
143 : /*
144 : * Set an error in the dio if none is set yet. We have to use cmpxchg
145 : * as the submission context and the completion context(s) can race to
146 : * update the error.
147 : */
148 0 : static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
149 : {
150 0 : cmpxchg(&dio->error, 0, ret);
151 0 : }
152 :
153 0 : static void iomap_dio_bio_end_io(struct bio *bio)
154 : {
155 0 : struct iomap_dio *dio = bio->bi_private;
156 0 : bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
157 :
158 0 : if (bio->bi_status)
159 0 : iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
160 :
161 0 : if (atomic_dec_and_test(&dio->ref)) {
162 0 : if (dio->wait_for_completion) {
163 0 : struct task_struct *waiter = dio->submit.waiter;
164 0 : WRITE_ONCE(dio->submit.waiter, NULL);
165 0 : blk_wake_io_task(waiter);
166 0 : } else if (dio->flags & IOMAP_DIO_WRITE) {
167 0 : struct inode *inode = file_inode(dio->iocb->ki_filp);
168 :
169 0 : INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
170 0 : queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
171 : } else {
172 0 : iomap_dio_complete_work(&dio->aio.work);
173 : }
174 : }
175 :
176 0 : if (should_dirty) {
177 0 : bio_check_pages_dirty(bio);
178 : } else {
179 0 : bio_release_pages(bio, false);
180 0 : bio_put(bio);
181 : }
182 0 : }
183 :
184 : static void
185 0 : iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
186 : unsigned len)
187 : {
188 0 : struct page *page = ZERO_PAGE(0);
189 0 : int flags = REQ_SYNC | REQ_IDLE;
190 0 : struct bio *bio;
191 :
192 0 : bio = bio_alloc(GFP_KERNEL, 1);
193 0 : bio_set_dev(bio, iomap->bdev);
194 0 : bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
195 0 : bio->bi_private = dio;
196 0 : bio->bi_end_io = iomap_dio_bio_end_io;
197 :
198 0 : get_page(page);
199 0 : __bio_add_page(bio, page, len, 0);
200 0 : bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
201 0 : iomap_dio_submit_bio(dio, iomap, bio, pos);
202 0 : }
203 :
204 : /*
205 : * Figure out the bio's operation flags from the dio request, the
206 : * mapping, and whether or not we want FUA. Note that we can end up
207 : * clearing the WRITE_FUA flag in the dio request.
208 : */
209 : static inline unsigned int
210 0 : iomap_dio_bio_opflags(struct iomap_dio *dio, struct iomap *iomap, bool use_fua)
211 : {
212 0 : unsigned int opflags = REQ_SYNC | REQ_IDLE;
213 :
214 0 : if (!(dio->flags & IOMAP_DIO_WRITE)) {
215 0 : WARN_ON_ONCE(iomap->flags & IOMAP_F_ZONE_APPEND);
216 : return REQ_OP_READ;
217 : }
218 :
219 0 : if (iomap->flags & IOMAP_F_ZONE_APPEND)
220 : opflags |= REQ_OP_ZONE_APPEND;
221 : else
222 0 : opflags |= REQ_OP_WRITE;
223 :
224 0 : if (use_fua)
225 0 : opflags |= REQ_FUA;
226 : else
227 0 : dio->flags &= ~IOMAP_DIO_WRITE_FUA;
228 :
229 : return opflags;
230 : }
231 :
232 : static loff_t
233 0 : iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
234 : struct iomap_dio *dio, struct iomap *iomap)
235 : {
236 0 : unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
237 0 : unsigned int fs_block_size = i_blocksize(inode), pad;
238 0 : unsigned int align = iov_iter_alignment(dio->submit.iter);
239 0 : unsigned int bio_opf;
240 0 : struct bio *bio;
241 0 : bool need_zeroout = false;
242 0 : bool use_fua = false;
243 0 : int nr_pages, ret = 0;
244 0 : size_t copied = 0;
245 0 : size_t orig_count;
246 :
247 0 : if ((pos | length | align) & ((1 << blkbits) - 1))
248 : return -EINVAL;
249 :
250 0 : if (iomap->type == IOMAP_UNWRITTEN) {
251 0 : dio->flags |= IOMAP_DIO_UNWRITTEN;
252 0 : need_zeroout = true;
253 : }
254 :
255 0 : if (iomap->flags & IOMAP_F_SHARED)
256 0 : dio->flags |= IOMAP_DIO_COW;
257 :
258 0 : if (iomap->flags & IOMAP_F_NEW) {
259 : need_zeroout = true;
260 0 : } else if (iomap->type == IOMAP_MAPPED) {
261 : /*
262 : * Use a FUA write if we need datasync semantics, this is a pure
263 : * data IO that doesn't require any metadata updates (including
264 : * after IO completion such as unwritten extent conversion) and
265 : * the underlying device supports FUA. This allows us to avoid
266 : * cache flushes on IO completion.
267 : */
268 0 : if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
269 0 : (dio->flags & IOMAP_DIO_WRITE_FUA) &&
270 0 : blk_queue_fua(bdev_get_queue(iomap->bdev)))
271 0 : use_fua = true;
272 : }
273 :
274 : /*
275 : * Save the original count and trim the iter to just the extent we
276 : * are operating on right now. The iter will be re-expanded once
277 : * we are done.
278 : */
279 0 : orig_count = iov_iter_count(dio->submit.iter);
280 0 : iov_iter_truncate(dio->submit.iter, length);
281 :
282 0 : if (!iov_iter_count(dio->submit.iter))
283 0 : goto out;
284 :
285 0 : if (need_zeroout) {
286 : /* zero out from the start of the block to the write offset */
287 0 : pad = pos & (fs_block_size - 1);
288 0 : if (pad)
289 0 : iomap_dio_zero(dio, iomap, pos - pad, pad);
290 : }
291 :
292 : /*
293 : * Set the operation flags early so that bio_iov_iter_get_pages
294 : * can set up the page vector appropriately for a ZONE_APPEND
295 : * operation.
296 : */
297 0 : bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
298 :
299 0 : nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
300 0 : do {
301 0 : size_t n;
302 0 : if (dio->error) {
303 0 : iov_iter_revert(dio->submit.iter, copied);
304 0 : copied = ret = 0;
305 0 : goto out;
306 : }
307 :
308 0 : bio = bio_alloc(GFP_KERNEL, nr_pages);
309 0 : bio_set_dev(bio, iomap->bdev);
310 0 : bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
311 0 : bio->bi_write_hint = dio->iocb->ki_hint;
312 0 : bio->bi_ioprio = dio->iocb->ki_ioprio;
313 0 : bio->bi_private = dio;
314 0 : bio->bi_end_io = iomap_dio_bio_end_io;
315 0 : bio->bi_opf = bio_opf;
316 :
317 0 : ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
318 0 : if (unlikely(ret)) {
319 : /*
320 : * We have to stop part way through an IO. We must fall
321 : * through to the sub-block tail zeroing here, otherwise
322 : * this short IO may expose stale data in the tail of
323 : * the block we haven't written data to.
324 : */
325 0 : bio_put(bio);
326 0 : goto zero_tail;
327 : }
328 :
329 0 : n = bio->bi_iter.bi_size;
330 0 : if (dio->flags & IOMAP_DIO_WRITE) {
331 0 : task_io_account_write(n);
332 : } else {
333 0 : if (dio->flags & IOMAP_DIO_DIRTY)
334 0 : bio_set_pages_dirty(bio);
335 : }
336 :
337 0 : dio->size += n;
338 0 : copied += n;
339 :
340 0 : nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
341 : BIO_MAX_VECS);
342 0 : iomap_dio_submit_bio(dio, iomap, bio, pos);
343 0 : pos += n;
344 0 : } while (nr_pages);
345 :
346 : /*
347 : * We need to zeroout the tail of a sub-block write if the extent type
348 : * requires zeroing or the write extends beyond EOF. If we don't zero
349 : * the block tail in the latter case, we can expose stale data via mmap
350 : * reads of the EOF block.
351 : */
352 0 : zero_tail:
353 0 : if (need_zeroout ||
354 0 : ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
355 : /* zero out from the end of the write to the end of the block */
356 0 : pad = pos & (fs_block_size - 1);
357 0 : if (pad)
358 0 : iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
359 : }
360 0 : out:
361 : /* Undo iter limitation to current extent */
362 0 : iov_iter_reexpand(dio->submit.iter, orig_count - copied);
363 0 : if (copied)
364 0 : return copied;
365 0 : return ret;
366 : }
367 :
368 : static loff_t
369 0 : iomap_dio_hole_actor(loff_t length, struct iomap_dio *dio)
370 : {
371 0 : length = iov_iter_zero(length, dio->submit.iter);
372 0 : dio->size += length;
373 0 : return length;
374 : }
375 :
376 : static loff_t
377 0 : iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
378 : struct iomap_dio *dio, struct iomap *iomap)
379 : {
380 0 : struct iov_iter *iter = dio->submit.iter;
381 0 : size_t copied;
382 :
383 0 : BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data));
384 :
385 0 : if (dio->flags & IOMAP_DIO_WRITE) {
386 0 : loff_t size = inode->i_size;
387 :
388 0 : if (pos > size)
389 0 : memset(iomap->inline_data + size, 0, pos - size);
390 0 : copied = copy_from_iter(iomap->inline_data + pos, length, iter);
391 0 : if (copied) {
392 0 : if (pos + copied > size)
393 0 : i_size_write(inode, pos + copied);
394 0 : mark_inode_dirty(inode);
395 : }
396 : } else {
397 0 : copied = copy_to_iter(iomap->inline_data + pos, length, iter);
398 : }
399 0 : dio->size += copied;
400 0 : return copied;
401 : }
402 :
403 : static loff_t
404 0 : iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
405 : void *data, struct iomap *iomap, struct iomap *srcmap)
406 : {
407 0 : struct iomap_dio *dio = data;
408 :
409 0 : switch (iomap->type) {
410 0 : case IOMAP_HOLE:
411 0 : if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
412 : return -EIO;
413 0 : return iomap_dio_hole_actor(length, dio);
414 0 : case IOMAP_UNWRITTEN:
415 0 : if (!(dio->flags & IOMAP_DIO_WRITE))
416 0 : return iomap_dio_hole_actor(length, dio);
417 0 : return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
418 0 : case IOMAP_MAPPED:
419 0 : return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
420 0 : case IOMAP_INLINE:
421 0 : return iomap_dio_inline_actor(inode, pos, length, dio, iomap);
422 0 : case IOMAP_DELALLOC:
423 : /*
424 : * DIO is not serialised against mmap() access at all, and so
425 : * if the page_mkwrite occurs between the writeback and the
426 : * iomap_apply() call in the DIO path, then it will see the
427 : * DELALLOC block that the page_mkwrite allocated.
428 : */
429 0 : pr_warn_ratelimited("Direct I/O collision with buffered writes! File: %pD4 Comm: %.20s\n",
430 : dio->iocb->ki_filp, current->comm);
431 : return -EIO;
432 : default:
433 0 : WARN_ON_ONCE(1);
434 0 : return -EIO;
435 : }
436 : }
437 :
438 : /*
439 : * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
440 : * is being issued as AIO or not. This allows us to optimise pure data writes
441 : * to use REQ_FUA rather than requiring generic_write_sync() to issue a
442 : * REQ_FLUSH post write. This is slightly tricky because a single request here
443 : * can be mapped into multiple disjoint IOs and only a subset of the IOs issued
444 : * may be pure data writes. In that case, we still need to do a full data sync
445 : * completion.
446 : *
447 : * Returns -ENOTBLK in case of a page invalidation failure for writes. The
448 : * caller needs to fall back to buffered I/O in this case.
449 : */
450 : struct iomap_dio *
451 0 : __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
452 : const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
453 : unsigned int dio_flags)
454 : {
455 0 : struct address_space *mapping = iocb->ki_filp->f_mapping;
456 0 : struct inode *inode = file_inode(iocb->ki_filp);
457 0 : size_t count = iov_iter_count(iter);
458 0 : loff_t pos = iocb->ki_pos;
459 0 : loff_t end = iocb->ki_pos + count - 1, ret = 0;
460 0 : bool wait_for_completion =
461 0 : is_sync_kiocb(iocb) || (dio_flags & IOMAP_DIO_FORCE_WAIT);
462 0 : unsigned int iomap_flags = IOMAP_DIRECT;
463 0 : struct blk_plug plug;
464 0 : struct iomap_dio *dio;
465 :
466 0 : if (!count)
467 : return NULL;
468 :
469 0 : dio = kmalloc(sizeof(*dio), GFP_KERNEL);
470 0 : if (!dio)
471 0 : return ERR_PTR(-ENOMEM);
472 :
473 0 : dio->iocb = iocb;
474 0 : atomic_set(&dio->ref, 1);
475 0 : dio->size = 0;
476 0 : dio->i_size = i_size_read(inode);
477 0 : dio->dops = dops;
478 0 : dio->error = 0;
479 0 : dio->flags = 0;
480 :
481 0 : dio->submit.iter = iter;
482 0 : dio->submit.waiter = current;
483 0 : dio->submit.cookie = BLK_QC_T_NONE;
484 0 : dio->submit.last_queue = NULL;
485 :
486 0 : if (iov_iter_rw(iter) == READ) {
487 0 : if (pos >= dio->i_size)
488 0 : goto out_free_dio;
489 :
490 0 : if (iter_is_iovec(iter))
491 0 : dio->flags |= IOMAP_DIO_DIRTY;
492 : } else {
493 0 : iomap_flags |= IOMAP_WRITE;
494 0 : dio->flags |= IOMAP_DIO_WRITE;
495 :
496 : /* for data sync or sync, we need sync completion processing */
497 0 : if (iocb->ki_flags & IOCB_DSYNC)
498 0 : dio->flags |= IOMAP_DIO_NEED_SYNC;
499 :
500 : /*
501 : * For datasync only writes, we optimistically try using FUA for
502 : * this IO. Any non-FUA write that occurs will clear this flag,
503 : * hence we know before completion whether a cache flush is
504 : * necessary.
505 : */
506 0 : if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC)
507 0 : dio->flags |= IOMAP_DIO_WRITE_FUA;
508 : }
509 :
510 0 : if (iocb->ki_flags & IOCB_NOWAIT) {
511 0 : if (filemap_range_has_page(mapping, pos, end)) {
512 0 : ret = -EAGAIN;
513 0 : goto out_free_dio;
514 : }
515 0 : iomap_flags |= IOMAP_NOWAIT;
516 : }
517 :
518 0 : if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
519 0 : ret = -EAGAIN;
520 0 : if (pos >= dio->i_size || pos + count > dio->i_size)
521 0 : goto out_free_dio;
522 0 : iomap_flags |= IOMAP_OVERWRITE_ONLY;
523 : }
524 :
525 0 : ret = filemap_write_and_wait_range(mapping, pos, end);
526 0 : if (ret)
527 0 : goto out_free_dio;
528 :
529 0 : if (iov_iter_rw(iter) == WRITE) {
530 : /*
531 : * Try to invalidate cache pages for the range we are writing.
532 : * If this invalidation fails, let the caller fall back to
533 : * buffered I/O.
534 : */
535 0 : if (invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
536 0 : end >> PAGE_SHIFT)) {
537 0 : trace_iomap_dio_invalidate_fail(inode, pos, count);
538 0 : ret = -ENOTBLK;
539 0 : goto out_free_dio;
540 : }
541 :
542 0 : if (!wait_for_completion && !inode->i_sb->s_dio_done_wq) {
543 0 : ret = sb_init_dio_done_wq(inode->i_sb);
544 0 : if (ret < 0)
545 0 : goto out_free_dio;
546 : }
547 : }
548 :
549 0 : inode_dio_begin(inode);
550 :
551 0 : blk_start_plug(&plug);
552 0 : do {
553 0 : ret = iomap_apply(inode, pos, count, iomap_flags, ops, dio,
554 : iomap_dio_actor);
555 0 : if (ret <= 0) {
556 : /* magic error code to fall back to buffered I/O */
557 0 : if (ret == -ENOTBLK) {
558 0 : wait_for_completion = true;
559 0 : ret = 0;
560 : }
561 : break;
562 : }
563 0 : pos += ret;
564 :
565 0 : if (iov_iter_rw(iter) == READ && pos >= dio->i_size) {
566 : /*
567 : * We only report that we've read data up to i_size.
568 : * Revert iter to a state corresponding to that as
569 : * some callers (such as splice code) rely on it.
570 : */
571 0 : iov_iter_revert(iter, pos - dio->i_size);
572 0 : break;
573 : }
574 0 : } while ((count = iov_iter_count(iter)) > 0);
575 0 : blk_finish_plug(&plug);
576 :
577 0 : if (ret < 0)
578 0 : iomap_dio_set_error(dio, ret);
579 :
580 : /*
581 : * If all the writes we issued were FUA, we don't need to flush the
582 : * cache on IO completion. Clear the sync flag for this case.
583 : */
584 0 : if (dio->flags & IOMAP_DIO_WRITE_FUA)
585 0 : dio->flags &= ~IOMAP_DIO_NEED_SYNC;
586 :
587 0 : WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie);
588 0 : WRITE_ONCE(iocb->private, dio->submit.last_queue);
589 :
590 : /*
591 : * We are about to drop our additional submission reference, which
592 : * might be the last reference to the dio. There are three different
593 : * ways we can progress here:
594 : *
595 : * (a) If this is the last reference we will always complete and free
596 : * the dio ourselves.
597 : * (b) If this is not the last reference, and we serve an asynchronous
598 : * iocb, we must never touch the dio after the decrement, the
599 : * I/O completion handler will complete and free it.
600 : * (c) If this is not the last reference, but we serve a synchronous
601 : * iocb, the I/O completion handler will wake us up on the drop
602 : * of the final reference, and we will complete and free it here
603 : * after we got woken by the I/O completion handler.
604 : */
605 0 : dio->wait_for_completion = wait_for_completion;
606 0 : if (!atomic_dec_and_test(&dio->ref)) {
607 0 : if (!wait_for_completion)
608 0 : return ERR_PTR(-EIOCBQUEUED);
609 :
610 0 : for (;;) {
611 0 : set_current_state(TASK_UNINTERRUPTIBLE);
612 0 : if (!READ_ONCE(dio->submit.waiter))
613 : break;
614 :
615 0 : if (!(iocb->ki_flags & IOCB_HIPRI) ||
616 0 : !dio->submit.last_queue ||
617 0 : !blk_poll(dio->submit.last_queue,
618 : dio->submit.cookie, true))
619 0 : blk_io_schedule();
620 : }
621 0 : __set_current_state(TASK_RUNNING);
622 : }
623 :
624 : return dio;
625 :
626 0 : out_free_dio:
627 0 : kfree(dio);
628 0 : if (ret)
629 0 : return ERR_PTR(ret);
630 : return NULL;
631 : }
632 : EXPORT_SYMBOL_GPL(__iomap_dio_rw);
633 :
634 : ssize_t
635 0 : iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
636 : const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
637 : unsigned int dio_flags)
638 : {
639 0 : struct iomap_dio *dio;
640 :
641 0 : dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags);
642 0 : if (IS_ERR_OR_NULL(dio))
643 0 : return PTR_ERR_OR_ZERO(dio);
644 0 : return iomap_dio_complete(dio);
645 : }
646 : EXPORT_SYMBOL_GPL(iomap_dio_rw);
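/*
 * Illustrative sketch (not part of this file): a caller-side direct write
 * path using iomap_dio_rw() and honouring the -ENOTBLK contract documented
 * above __iomap_dio_rw() by falling back to buffered I/O. myfs_iomap_ops,
 * myfs_dio_write_ops and myfs_buffered_write() are hypothetical.
 */
static ssize_t myfs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
{
	ssize_t ret;

	ret = iomap_dio_rw(iocb, from, &myfs_iomap_ops, &myfs_dio_write_ops, 0);
	if (ret != -ENOTBLK)
		return ret;

	/* Page cache invalidation failed: retry the write through the cache. */
	return myfs_buffered_write(iocb, from);
}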