Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * linux/fs/buffer.c
4 : *
5 : * Copyright (C) 1991, 1992, 2002 Linus Torvalds
6 : */
7 :
8 : /*
9 : * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
10 : *
11 : * Removed a lot of unnecessary code and simplified things now that
12 : * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
13 : *
14 : * Speed up hash, lru, and free list operations. Use gfp() for allocating
15 : * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
16 : *
17 : * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
18 : *
19 : * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
20 : */
21 :
22 : #include <linux/kernel.h>
23 : #include <linux/sched/signal.h>
24 : #include <linux/syscalls.h>
25 : #include <linux/fs.h>
26 : #include <linux/iomap.h>
27 : #include <linux/mm.h>
28 : #include <linux/percpu.h>
29 : #include <linux/slab.h>
30 : #include <linux/capability.h>
31 : #include <linux/blkdev.h>
32 : #include <linux/file.h>
33 : #include <linux/quotaops.h>
34 : #include <linux/highmem.h>
35 : #include <linux/export.h>
36 : #include <linux/backing-dev.h>
37 : #include <linux/writeback.h>
38 : #include <linux/hash.h>
39 : #include <linux/suspend.h>
40 : #include <linux/buffer_head.h>
41 : #include <linux/task_io_accounting_ops.h>
42 : #include <linux/bio.h>
43 : #include <linux/cpu.h>
44 : #include <linux/bitops.h>
45 : #include <linux/mpage.h>
46 : #include <linux/bit_spinlock.h>
47 : #include <linux/pagevec.h>
48 : #include <linux/sched/mm.h>
49 : #include <trace/events/block.h>
50 : #include <linux/fscrypt.h>
51 :
52 : #include "internal.h"
53 :
54 : static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
55 : static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
56 : enum rw_hint hint, struct writeback_control *wbc);
57 :
58 : #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
59 :
60 21088 : inline void touch_buffer(struct buffer_head *bh)
61 : {
62 21088 : trace_block_touch_buffer(bh);
63 21088 : mark_page_accessed(bh->b_page);
64 21087 : }
65 : EXPORT_SYMBOL(touch_buffer);
66 :
67 0 : void __lock_buffer(struct buffer_head *bh)
68 : {
69 0 : wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
70 0 : }
71 : EXPORT_SYMBOL(__lock_buffer);
72 :
73 9737 : void unlock_buffer(struct buffer_head *bh)
74 : {
75 9737 : clear_bit_unlock(BH_Lock, &bh->b_state);
76 9737 : smp_mb__after_atomic();
77 9737 : wake_up_bit(&bh->b_state, BH_Lock);
78 9737 : }
79 : EXPORT_SYMBOL(unlock_buffer);
80 :
81 : /*
82 : * Returns whether the page has dirty or writeback buffers. If all the
83 : * buffers are unlocked and clean then the PageDirty information is stale.
84 : * If any of the buffers are locked, it is assumed they are locked for IO.
85 : */
86 0 : void buffer_check_dirty_writeback(struct page *page,
87 : bool *dirty, bool *writeback)
88 : {
89 0 : struct buffer_head *head, *bh;
90 0 : *dirty = false;
91 0 : *writeback = false;
92 :
93 0 : BUG_ON(!PageLocked(page));
94 :
95 0 : if (!page_has_buffers(page))
96 : return;
97 :
98 0 : if (PageWriteback(page))
99 0 : *writeback = true;
100 :
101 0 : head = page_buffers(page);
102 0 : bh = head;
103 0 : do {
104 0 : if (buffer_locked(bh))
105 0 : *writeback = true;
106 :
107 0 : if (buffer_dirty(bh))
108 0 : *dirty = true;
109 :
110 0 : bh = bh->b_this_page;
111 0 : } while (bh != head);
112 : }
113 : EXPORT_SYMBOL(buffer_check_dirty_writeback);
114 :
115 : /*
116 : * Block until a buffer comes unlocked. This doesn't stop it
117 : * from becoming locked again - you have to lock it yourself
118 : * if you want to preserve its state.
119 : */
120 795 : void __wait_on_buffer(struct buffer_head * bh)
121 : {
122 795 : wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
123 7 : }
124 : EXPORT_SYMBOL(__wait_on_buffer);
125 :
126 0 : static void buffer_io_error(struct buffer_head *bh, char *msg)
127 : {
128 0 : if (!test_bit(BH_Quiet, &bh->b_state))
129 0 : printk_ratelimited(KERN_ERR
130 : "Buffer I/O error on dev %pg, logical block %llu%s\n",
131 : bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
132 0 : }
133 :
134 : /*
135 : * End-of-IO handler helper function which does not touch the bh after
136 : * unlocking it.
137 : * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
138 : * a race there is benign: unlock_buffer() only uses the bh's address for
139 : * hashing after unlocking the buffer, so it doesn't actually touch the bh
140 : * itself.
141 : */
142 3890 : static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
143 : {
144 3890 : if (uptodate) {
145 3890 : set_buffer_uptodate(bh);
146 : } else {
147 : /* This happens, due to failed read-ahead attempts. */
148 0 : clear_buffer_uptodate(bh);
149 : }
150 3890 : unlock_buffer(bh);
151 3890 : }
152 :
153 : /*
154 : * Default synchronous end-of-IO handler.. Just mark it up-to-date and
155 : * unlock the buffer. This is what ll_rw_block uses too.
156 : */
157 3890 : void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
158 : {
159 3890 : __end_buffer_read_notouch(bh, uptodate);
160 3890 : put_bh(bh);
161 3890 : }
162 : EXPORT_SYMBOL(end_buffer_read_sync);
163 :
164 12 : void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
165 : {
166 12 : if (uptodate) {
167 12 : set_buffer_uptodate(bh);
168 : } else {
169 0 : buffer_io_error(bh, ", lost sync page write");
170 0 : mark_buffer_write_io_error(bh);
171 0 : clear_buffer_uptodate(bh);
172 : }
173 12 : unlock_buffer(bh);
174 12 : put_bh(bh);
175 12 : }
176 : EXPORT_SYMBOL(end_buffer_write_sync);
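
[Editorial aside] As a point of reference for how end_buffer_write_sync() is meant to be used, here is a minimal, illustrative sketch of writing a single buffer synchronously. It roughly follows the pattern that sync_dirty_buffer() uses; the helper name example_write_bh_sync is invented and error handling is kept minimal.

#include <linux/blkdev.h>
#include <linux/buffer_head.h>

/* Illustrative helper (name is made up): write one bh and wait for it. */
static int example_write_bh_sync(struct buffer_head *bh)
{
	lock_buffer(bh);
	if (!test_clear_buffer_dirty(bh)) {
		/* Nothing to write; drop the lock and report success. */
		unlock_buffer(bh);
		return 0;
	}

	get_bh(bh);				/* reference held across the I/O */
	bh->b_end_io = end_buffer_write_sync;	/* unlocks bh and drops the ref */
	submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
	wait_on_buffer(bh);

	return buffer_uptodate(bh) ? 0 : -EIO;
}
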
177 :
178 : /*
179 : * Various filesystems appear to want __find_get_block to be non-blocking.
180 : * But it's the page lock which protects the buffers. To get around this,
181 : * we get exclusion from try_to_free_buffers with the blockdev mapping's
182 : * private_lock.
183 : *
184 : * Hack idea: for the blockdev mapping, private_lock contention
185 : * may be quite high. This code could TryLock the page, and if that
186 : * succeeds, there is no need to take private_lock.
187 : */
188 : static struct buffer_head *
189 20039 : __find_get_block_slow(struct block_device *bdev, sector_t block)
190 : {
191 20039 : struct inode *bd_inode = bdev->bd_inode;
192 20039 : struct address_space *bd_mapping = bd_inode->i_mapping;
193 20039 : struct buffer_head *ret = NULL;
194 20039 : pgoff_t index;
195 20039 : struct buffer_head *bh;
196 20039 : struct buffer_head *head;
197 20039 : struct page *page;
198 20039 : int all_mapped = 1;
199 20039 : static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);
200 :
201 20039 : index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
202 20039 : page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
203 20039 : if (!page)
204 8258 : goto out;
205 :
206 11781 : spin_lock(&bd_mapping->private_lock);
207 11781 : if (!page_has_buffers(page))
208 0 : goto out_unlock;
209 11781 : head = page_buffers(page);
210 11781 : bh = head;
211 11783 : do {
212 11783 : if (!buffer_mapped(bh))
213 : all_mapped = 0;
214 11783 : else if (bh->b_blocknr == block) {
215 11781 : ret = bh;
216 11781 : get_bh(bh);
217 11781 : goto out_unlock;
218 : }
219 2 : bh = bh->b_this_page;
220 2 : } while (bh != head);
221 :
222 : /* We might be here because some of the buffers on this page are
223 : * not mapped. This is due to various races between
224 : * file I/O on the block device and getblk. It gets dealt with
225 : * elsewhere; don't report a buffer error if we had some unmapped buffers.
226 : */
227 0 : ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
228 0 : if (all_mapped && __ratelimit(&last_warned)) {
229 0 : printk("__find_get_block_slow() failed. block=%llu, "
230 : "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
231 : "device %pg blocksize: %d\n",
232 : (unsigned long long)block,
233 0 : (unsigned long long)bh->b_blocknr,
234 : bh->b_state, bh->b_size, bdev,
235 0 : 1 << bd_inode->i_blkbits);
236 : }
237 0 : out_unlock:
238 11781 : spin_unlock(&bd_mapping->private_lock);
239 11781 : put_page(page);
240 20039 : out:
241 20039 : return ret;
242 : }
243 :
244 1 : static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
245 : {
246 1 : unsigned long flags;
247 1 : struct buffer_head *first;
248 1 : struct buffer_head *tmp;
249 1 : struct page *page;
250 1 : int page_uptodate = 1;
251 :
252 1 : BUG_ON(!buffer_async_read(bh));
253 :
254 1 : page = bh->b_page;
255 1 : if (uptodate) {
256 1 : set_buffer_uptodate(bh);
257 : } else {
258 0 : clear_buffer_uptodate(bh);
259 0 : buffer_io_error(bh, ", async page read");
260 0 : SetPageError(page);
261 : }
262 :
263 : /*
264 : * Be _very_ careful from here on. Bad things can happen if
265 : * two buffer heads end IO at almost the same time and both
266 : * decide that the page is now completely done.
267 : */
268 1 : first = page_buffers(page);
269 1 : spin_lock_irqsave(&first->b_uptodate_lock, flags);
270 1 : clear_buffer_async_read(bh);
271 1 : unlock_buffer(bh);
272 1 : tmp = bh;
273 1 : do {
274 1 : if (!buffer_uptodate(tmp))
275 0 : page_uptodate = 0;
276 1 : if (buffer_async_read(tmp)) {
277 0 : BUG_ON(!buffer_locked(tmp));
278 0 : goto still_busy;
279 : }
280 1 : tmp = tmp->b_this_page;
281 1 : } while (tmp != bh);
282 1 : spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
283 :
284 : /*
285 : * If none of the buffers had errors and they are all
286 : * uptodate then we can set the page uptodate.
287 : */
288 2 : if (page_uptodate && !PageError(page))
289 1 : SetPageUptodate(page);
290 1 : unlock_page(page);
291 1 : return;
292 :
293 0 : still_busy:
294 0 : spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
295 : return;
296 : }
297 :
298 : struct decrypt_bh_ctx {
299 : struct work_struct work;
300 : struct buffer_head *bh;
301 : };
302 :
303 : static void decrypt_bh(struct work_struct *work)
304 : {
305 : struct decrypt_bh_ctx *ctx =
306 : container_of(work, struct decrypt_bh_ctx, work);
307 : struct buffer_head *bh = ctx->bh;
308 : int err;
309 :
310 : err = fscrypt_decrypt_pagecache_blocks(bh->b_page, bh->b_size,
311 : bh_offset(bh));
312 : end_buffer_async_read(bh, err == 0);
313 : kfree(ctx);
314 : }
315 :
316 : /*
317 : * I/O completion handler for block_read_full_page() - pages
318 : * which come unlocked at the end of I/O.
319 : */
320 1 : static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
321 : {
322 : /* Decrypt if needed */
323 1 : if (uptodate &&
324 : fscrypt_inode_uses_fs_layer_crypto(bh->b_page->mapping->host)) {
325 : struct decrypt_bh_ctx *ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC);
326 :
327 : if (ctx) {
328 : INIT_WORK(&ctx->work, decrypt_bh);
329 : ctx->bh = bh;
330 : fscrypt_enqueue_decrypt_work(&ctx->work);
331 : return;
332 : }
333 : uptodate = 0;
334 : }
335 1 : end_buffer_async_read(bh, uptodate);
336 : }
337 :
338 : /*
339 : * Completion handler for block_write_full_page() - pages which are unlocked
340 : * during I/O, and which have PageWriteback cleared upon I/O completion.
341 : */
342 867 : void end_buffer_async_write(struct buffer_head *bh, int uptodate)
343 : {
344 867 : unsigned long flags;
345 867 : struct buffer_head *first;
346 867 : struct buffer_head *tmp;
347 867 : struct page *page;
348 :
349 867 : BUG_ON(!buffer_async_write(bh));
350 :
351 867 : page = bh->b_page;
352 867 : if (uptodate) {
353 867 : set_buffer_uptodate(bh);
354 : } else {
355 0 : buffer_io_error(bh, ", lost async page write");
356 0 : mark_buffer_write_io_error(bh);
357 0 : clear_buffer_uptodate(bh);
358 0 : SetPageError(page);
359 : }
360 :
361 867 : first = page_buffers(page);
362 867 : spin_lock_irqsave(&first->b_uptodate_lock, flags);
363 :
364 867 : clear_buffer_async_write(bh);
365 867 : unlock_buffer(bh);
366 867 : tmp = bh->b_this_page;
367 867 : while (tmp != bh) {
368 0 : if (buffer_async_write(tmp)) {
369 0 : BUG_ON(!buffer_locked(tmp));
370 0 : goto still_busy;
371 : }
372 0 : tmp = tmp->b_this_page;
373 : }
374 867 : spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
375 867 : end_page_writeback(page);
376 867 : return;
377 :
378 0 : still_busy:
379 0 : spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
380 : return;
381 : }
382 : EXPORT_SYMBOL(end_buffer_async_write);
383 :
384 : /*
385 : * If a page's buffers are under async read-in (end_buffer_async_read
386 : * completion) then there is a possibility that another thread of
387 : * control could lock one of the buffers after it has completed
388 : * but while some of the other buffers have not completed. This
389 : * locked buffer would confuse end_buffer_async_read() into not unlocking
390 : * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
391 : * that this buffer is not under async I/O.
392 : *
393 : * The page comes unlocked when it has no locked buffer_async buffers
394 : * left.
395 : *
396 : * PageLocked prevents anyone from starting new async I/O against any of
397 : * the buffers.
398 : *
399 : * PageWriteback is used to prevent simultaneous writeout of the same
400 : * page.
401 : *
402 : * PageLocked prevents anyone from starting writeback of a page which is
403 : * under read I/O (PageWriteback is only ever set against a locked page).
404 : */
405 1 : static void mark_buffer_async_read(struct buffer_head *bh)
406 : {
407 1 : bh->b_end_io = end_buffer_async_read_io;
408 1 : set_buffer_async_read(bh);
409 1 : }
410 :
411 867 : static void mark_buffer_async_write_endio(struct buffer_head *bh,
412 : bh_end_io_t *handler)
413 : {
414 867 : bh->b_end_io = handler;
415 867 : set_buffer_async_write(bh);
416 867 : }
417 :
418 0 : void mark_buffer_async_write(struct buffer_head *bh)
419 : {
420 0 : mark_buffer_async_write_endio(bh, end_buffer_async_write);
421 0 : }
422 : EXPORT_SYMBOL(mark_buffer_async_write);
423 :
424 :
425 : /*
426 : * fs/buffer.c contains helper functions for buffer-backed address space's
427 : * fsync functions. A common requirement for buffer-based filesystems is
428 : * that certain data from the backing blockdev needs to be written out for
429 : * a successful fsync(). For example, ext2 indirect blocks need to be
430 : * written back and waited upon before fsync() returns.
431 : *
432 : * The functions mark_buffer_dirty_inode(), sync_mapping_buffers(),
433 : * inode_has_buffers() and invalidate_inode_buffers() are provided for the
434 : * management of a list of dependent buffers at ->i_mapping->private_list.
435 : *
436 : * Locking is a little subtle: try_to_free_buffers() will remove buffers
437 : * from their controlling inode's queue when they are being freed. But
438 : * try_to_free_buffers() will be operating against the *blockdev* mapping
439 : * at the time, not against the S_ISREG file which depends on those buffers.
440 : * So the locking for private_list is via the private_lock in the address_space
441 : * which backs the buffers. Which is different from the address_space
442 : * against which the buffers are listed. So for a particular address_space,
443 : * mapping->private_lock does *not* protect mapping->private_list! In fact,
444 : * mapping->private_list will always be protected by the backing blockdev's
445 : * ->private_lock.
446 : *
447 : * Which introduces a requirement: all buffers on an address_space's
448 : * ->private_list must be from the same address_space: the blockdev's.
449 : *
450 : * address_spaces which do not place buffers at ->private_list via these
451 : * utility functions are free to use private_lock and private_list for
452 : * whatever they want. The only requirement is that list_empty(private_list)
453 : * be true at clear_inode() time.
454 : *
455 : * FIXME: clear_inode should not call invalidate_inode_buffers(). The
456 : * filesystems should do that. invalidate_inode_buffers() should just go
457 : * BUG_ON(!list_empty).
458 : *
459 : * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
460 : * take an address_space, not an inode. And it should be called
461 : * mark_buffer_dirty_fsync() to clearly define why those buffers are being
462 : * queued up.
463 : *
464 : * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
465 : * list if it is already on a list. Because if the buffer is on a list,
466 : * it *must* already be on the right one. If not, the filesystem is being
467 : * silly. This will save a ton of locking. But first we have to ensure
468 : * that buffers are taken *off* the old inode's list when they are freed
469 : * (presumably in truncate). That requires careful auditing of all
470 : * filesystems (do it inside bforget()). It could also be done by bringing
471 : * b_inode back.
472 : */
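
[Editorial aside] To make the mechanism described above concrete, here is a hedged sketch, not taken from any in-tree filesystem ("foofs" and both function names are invented), of how a simple buffer-backed filesystem typically wires mark_buffer_dirty_inode() and sync_mapping_buffers() into its fsync path, much as generic_file_fsync() does for ext2-style filesystems.

#include <linux/fs.h>
#include <linux/buffer_head.h>

/* Invented helper: called whenever foofs modifies a metadata block of @inode. */
static void foofs_dirty_metadata(struct inode *inode, struct buffer_head *bh)
{
	/* Dirty bh and queue it on inode->i_mapping->private_list. */
	mark_buffer_dirty_inode(bh, inode);
}

/* Invented ->fsync: flush data pages, then the associated metadata buffers. */
static int foofs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct inode *inode = file_inode(file);
	int ret, err;

	ret = file_write_and_wait_range(file, start, end);

	/* Write out and wait upon everything queued on ->private_list. */
	err = sync_mapping_buffers(inode->i_mapping);
	if (!ret)
		ret = err;
	return ret;
}
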
473 :
474 : /*
475 : * The buffer's backing address_space's private_lock must be held
476 : */
477 0 : static void __remove_assoc_queue(struct buffer_head *bh)
478 : {
479 0 : list_del_init(&bh->b_assoc_buffers);
480 0 : WARN_ON(!bh->b_assoc_map);
481 0 : bh->b_assoc_map = NULL;
482 0 : }
483 :
484 5484 : int inode_has_buffers(struct inode *inode)
485 : {
486 5484 : return !list_empty(&inode->i_data.private_list);
487 : }
488 :
489 : /*
490 : * osync is designed to support O_SYNC io. It waits synchronously for
491 : * all already-submitted IO to complete, but does not queue any new
492 : * writes to the disk.
493 : *
494 : * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
495 : * you dirty the buffers, and then use osync_buffers_list to wait for
496 : * completion. Any other dirty buffers which are not yet queued for
497 : * write will not be flushed to disk by the osync.
498 : */
499 0 : static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
500 : {
501 0 : struct buffer_head *bh;
502 0 : struct list_head *p;
503 0 : int err = 0;
504 :
505 0 : spin_lock(lock);
506 0 : repeat:
507 0 : list_for_each_prev(p, list) {
508 0 : bh = BH_ENTRY(p);
509 0 : if (buffer_locked(bh)) {
510 0 : get_bh(bh);
511 0 : spin_unlock(lock);
512 0 : wait_on_buffer(bh);
513 0 : if (!buffer_uptodate(bh))
514 0 : err = -EIO;
515 0 : brelse(bh);
516 0 : spin_lock(lock);
517 0 : goto repeat;
518 : }
519 : }
520 0 : spin_unlock(lock);
521 0 : return err;
522 : }
523 :
524 0 : void emergency_thaw_bdev(struct super_block *sb)
525 : {
526 0 : while (sb->s_bdev && !thaw_bdev(sb->s_bdev))
527 0 : printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
528 0 : }
529 :
530 : /**
531 : * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
532 : * @mapping: the mapping which wants those buffers written
533 : *
534 : * Starts I/O against the buffers at mapping->private_list, and waits upon
535 : * that I/O.
536 : *
537 : * Basically, this is a convenience function for fsync().
538 : * @mapping is a file or directory which needs those buffers to be written for
539 : * a successful fsync().
540 : */
541 0 : int sync_mapping_buffers(struct address_space *mapping)
542 : {
543 0 : struct address_space *buffer_mapping = mapping->private_data;
544 :
545 0 : if (buffer_mapping == NULL || list_empty(&mapping->private_list))
546 : return 0;
547 :
548 0 : return fsync_buffers_list(&buffer_mapping->private_lock,
549 : &mapping->private_list);
550 : }
551 : EXPORT_SYMBOL(sync_mapping_buffers);
552 :
553 : /*
554 : * Called when we've recently written block `bblock', and it is known that
555 : * `bblock' was for a buffer_boundary() buffer. This means that the block at
556 : * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
557 : * dirty, schedule it for IO. So that indirects merge nicely with their data.
558 : */
559 0 : void write_boundary_block(struct block_device *bdev,
560 : sector_t bblock, unsigned blocksize)
561 : {
562 0 : struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
563 0 : if (bh) {
564 0 : if (buffer_dirty(bh))
565 0 : ll_rw_block(REQ_OP_WRITE, 0, 1, &bh);
566 0 : put_bh(bh);
567 : }
568 0 : }
569 :
570 0 : void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
571 : {
572 0 : struct address_space *mapping = inode->i_mapping;
573 0 : struct address_space *buffer_mapping = bh->b_page->mapping;
574 :
575 0 : mark_buffer_dirty(bh);
576 0 : if (!mapping->private_data) {
577 0 : mapping->private_data = buffer_mapping;
578 : } else {
579 0 : BUG_ON(mapping->private_data != buffer_mapping);
580 : }
581 0 : if (!bh->b_assoc_map) {
582 0 : spin_lock(&buffer_mapping->private_lock);
583 0 : list_move_tail(&bh->b_assoc_buffers,
584 : &mapping->private_list);
585 0 : bh->b_assoc_map = mapping;
586 0 : spin_unlock(&buffer_mapping->private_lock);
587 : }
588 0 : }
589 : EXPORT_SYMBOL(mark_buffer_dirty_inode);
590 :
591 : /*
592 : * Mark the page dirty, and set it dirty in the page cache, and mark the inode
593 : * dirty.
594 : *
595 : * If warn is true, then emit a warning if the page is not uptodate and has
596 : * not been truncated.
597 : *
598 : * The caller must hold lock_page_memcg().
599 : */
600 2719 : void __set_page_dirty(struct page *page, struct address_space *mapping,
601 : int warn)
602 : {
603 2719 : unsigned long flags;
604 :
605 2719 : xa_lock_irqsave(&mapping->i_pages, flags);
606 2719 : if (page->mapping) { /* Race with truncate? */
607 2719 : WARN_ON_ONCE(warn && !PageUptodate(page));
608 2719 : account_page_dirtied(page, mapping);
609 2719 : __xa_set_mark(&mapping->i_pages, page_index(page),
610 : PAGECACHE_TAG_DIRTY);
611 : }
612 2719 : xa_unlock_irqrestore(&mapping->i_pages, flags);
613 2719 : }
614 : EXPORT_SYMBOL_GPL(__set_page_dirty);
615 :
616 : /*
617 : * Add a page to the dirty page list.
618 : *
619 : * It is a sad fact of life that this function is called from several places
620 : * deeply under spinlocking. It may not sleep.
621 : *
622 : * If the page has buffers, the uptodate buffers are set dirty, to preserve
623 : * dirty-state coherency between the page and the buffers. If the page does
624 : * not have buffers then when they are later attached they will all be set
625 : * dirty.
626 : *
627 : * The buffers are dirtied before the page is dirtied. There's a small race
628 : * window in which a writepage caller may see the page cleanness but not the
629 : * buffer dirtiness. That's fine. If this code were to set the page dirty
630 : * before the buffers, a concurrent writepage caller could clear the page dirty
631 : * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
632 : * page on the dirty page list.
633 : *
634 : * We use private_lock to lock against try_to_free_buffers while using the
635 : * page's buffer list. Also use this to protect against clean buffers being
636 : * added to the page after it was set dirty.
637 : *
638 : * FIXME: may need to call ->reservepage here as well. That's rather up to the
639 : * address_space though.
640 : */
641 912 : int __set_page_dirty_buffers(struct page *page)
642 : {
643 912 : int newly_dirty;
644 912 : struct address_space *mapping = page_mapping(page);
645 :
646 912 : if (unlikely(!mapping))
647 0 : return !TestSetPageDirty(page);
648 :
649 912 : spin_lock(&mapping->private_lock);
650 912 : if (page_has_buffers(page)) {
651 912 : struct buffer_head *head = page_buffers(page);
652 912 : struct buffer_head *bh = head;
653 :
654 912 : do {
655 912 : set_buffer_dirty(bh);
656 912 : bh = bh->b_this_page;
657 912 : } while (bh != head);
658 : }
659 : /*
660 : * Lock out page's memcg migration to keep PageDirty
661 : * synchronized with per-memcg dirty page counters.
662 : */
663 912 : lock_page_memcg(page);
664 912 : newly_dirty = !TestSetPageDirty(page);
665 912 : spin_unlock(&mapping->private_lock);
666 :
667 912 : if (newly_dirty)
668 0 : __set_page_dirty(page, mapping, 1);
669 :
670 912 : unlock_page_memcg(page);
671 :
672 912 : if (newly_dirty)
673 0 : __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
674 :
675 : return newly_dirty;
676 : }
677 : EXPORT_SYMBOL(__set_page_dirty_buffers);
678 :
679 : /*
680 : * Write out and wait upon a list of buffers.
681 : *
682 : * We have conflicting pressures: we want to make sure that all
683 : * initially dirty buffers get waited on, but that any subsequently
684 : * dirtied buffers don't. After all, we don't want fsync to last
685 : * forever if somebody is actively writing to the file.
686 : *
687 : * Do this in two main stages: first we copy dirty buffers to a
688 : * temporary inode list, queueing the writes as we go. Then we clean
689 : * up, waiting for those writes to complete.
690 : *
691 : * During this second stage, any subsequent updates to the file may end
692 : * up refiling the buffer on the original inode's dirty list again, so
693 : * there is a chance we will end up with a buffer queued for write but
694 : * not yet completed on that list. So, as a final cleanup we go through
695 : * the osync code to catch these locked, dirty buffers without requeuing
696 : * any newly dirty buffers for write.
697 : */
698 0 : static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
699 : {
700 0 : struct buffer_head *bh;
701 0 : struct list_head tmp;
702 0 : struct address_space *mapping;
703 0 : int err = 0, err2;
704 0 : struct blk_plug plug;
705 :
706 0 : INIT_LIST_HEAD(&tmp);
707 0 : blk_start_plug(&plug);
708 :
709 0 : spin_lock(lock);
710 0 : while (!list_empty(list)) {
711 0 : bh = BH_ENTRY(list->next);
712 0 : mapping = bh->b_assoc_map;
713 0 : __remove_assoc_queue(bh);
714 : /* Avoid race with mark_buffer_dirty_inode() which does
715 : * a lockless check and we rely on seeing the dirty bit */
716 0 : smp_mb();
717 0 : if (buffer_dirty(bh) || buffer_locked(bh)) {
718 0 : list_add(&bh->b_assoc_buffers, &tmp);
719 0 : bh->b_assoc_map = mapping;
720 0 : if (buffer_dirty(bh)) {
721 0 : get_bh(bh);
722 0 : spin_unlock(lock);
723 : /*
724 : * Ensure any pending I/O completes so that
725 : * write_dirty_buffer() actually writes the
726 : * current contents - it is a noop if I/O is
727 : * still in flight on potentially older
728 : * contents.
729 : */
730 0 : write_dirty_buffer(bh, REQ_SYNC);
731 :
732 : /*
733 : * Kick off IO for the previous mapping. Note
734 : * that we will not run the very last mapping,
735 : * wait_on_buffer() will do that for us
736 : * through sync_buffer().
737 : */
738 0 : brelse(bh);
739 0 : spin_lock(lock);
740 : }
741 : }
742 : }
743 :
744 0 : spin_unlock(lock);
745 0 : blk_finish_plug(&plug);
746 0 : spin_lock(lock);
747 :
748 0 : while (!list_empty(&tmp)) {
749 0 : bh = BH_ENTRY(tmp.prev);
750 0 : get_bh(bh);
751 0 : mapping = bh->b_assoc_map;
752 0 : __remove_assoc_queue(bh);
753 : /* Avoid race with mark_buffer_dirty_inode() which does
754 : * a lockless check and we rely on seeing the dirty bit */
755 0 : smp_mb();
756 0 : if (buffer_dirty(bh)) {
757 0 : list_add(&bh->b_assoc_buffers,
758 : &mapping->private_list);
759 0 : bh->b_assoc_map = mapping;
760 : }
761 0 : spin_unlock(lock);
762 0 : wait_on_buffer(bh);
763 0 : if (!buffer_uptodate(bh))
764 0 : err = -EIO;
765 0 : brelse(bh);
766 0 : spin_lock(lock);
767 : }
768 :
769 0 : spin_unlock(lock);
770 0 : err2 = osync_buffers_list(lock, list);
771 0 : if (err)
772 : return err;
773 : else
774 0 : return err2;
775 : }
776 :
777 : /*
778 : * Invalidate any and all dirty buffers on a given inode. We are
779 : * probably unmounting the fs, but that doesn't mean we have already
780 : * done a sync(). Just drop the buffers from the inode list.
781 : *
782 : * NOTE: we take the inode's blockdev's mapping's private_lock. Which
783 : * assumes that all the buffers are against the blockdev. Not true
784 : * for reiserfs.
785 : */
786 193 : void invalidate_inode_buffers(struct inode *inode)
787 : {
788 193 : if (inode_has_buffers(inode)) {
789 0 : struct address_space *mapping = &inode->i_data;
790 0 : struct list_head *list = &mapping->private_list;
791 0 : struct address_space *buffer_mapping = mapping->private_data;
792 :
793 0 : spin_lock(&buffer_mapping->private_lock);
794 0 : while (!list_empty(list))
795 0 : __remove_assoc_queue(BH_ENTRY(list->next));
796 0 : spin_unlock(&buffer_mapping->private_lock);
797 : }
798 193 : }
799 : EXPORT_SYMBOL(invalidate_inode_buffers);
800 :
801 : /*
802 : * Remove any clean buffers from the inode's buffer list. This is called
803 : * when we're trying to free the inode itself. Those buffers can pin it.
804 : *
805 : * Returns true if all buffers were removed.
806 : */
807 0 : int remove_inode_buffers(struct inode *inode)
808 : {
809 0 : int ret = 1;
810 :
811 0 : if (inode_has_buffers(inode)) {
812 0 : struct address_space *mapping = &inode->i_data;
813 0 : struct list_head *list = &mapping->private_list;
814 0 : struct address_space *buffer_mapping = mapping->private_data;
815 :
816 0 : spin_lock(&buffer_mapping->private_lock);
817 0 : while (!list_empty(list)) {
818 0 : struct buffer_head *bh = BH_ENTRY(list->next);
819 0 : if (buffer_dirty(bh)) {
820 : ret = 0;
821 : break;
822 : }
823 0 : __remove_assoc_queue(bh);
824 : }
825 0 : spin_unlock(&buffer_mapping->private_lock);
826 : }
827 0 : return ret;
828 : }
829 :
830 : /*
831 : * Create the appropriate buffers when given a page for a data area and
832 : * the size of each buffer. Use the bh->b_this_page linked list to
833 : * follow the buffers created. Return NULL if unable to create more
834 : * buffers.
835 : *
836 : * The retry flag is used to differentiate async IO (paging, swapping)
837 : * which may not fail from ordinary buffer allocations.
838 : */
839 5842 : struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
840 : bool retry)
841 : {
842 5842 : struct buffer_head *bh, *head;
843 5842 : gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
844 5842 : long offset;
845 5842 : struct mem_cgroup *memcg, *old_memcg;
846 :
847 5842 : if (retry)
848 5842 : gfp |= __GFP_NOFAIL;
849 :
850 : /* The page lock pins the memcg */
851 5842 : memcg = page_memcg(page);
852 5842 : old_memcg = set_active_memcg(memcg);
853 :
854 5842 : head = NULL;
855 5842 : offset = PAGE_SIZE;
856 11690 : while ((offset -= size) >= 0) {
857 5848 : bh = alloc_buffer_head(gfp);
858 5848 : if (!bh)
859 0 : goto no_grow;
860 :
861 5848 : bh->b_this_page = head;
862 5848 : bh->b_blocknr = -1;
863 5848 : head = bh;
864 :
865 5848 : bh->b_size = size;
866 :
867 : /* Link the buffer to its page */
868 5848 : set_bh_page(bh, page, offset);
869 : }
870 5842 : out:
871 5842 : set_active_memcg(old_memcg);
872 5842 : return head;
873 : /*
874 : * In case anything failed, we just free everything we got.
875 : */
876 0 : no_grow:
877 0 : if (head) {
878 0 : do {
879 0 : bh = head;
880 0 : head = head->b_this_page;
881 0 : free_buffer_head(bh);
882 0 : } while (head);
883 : }
884 :
885 0 : goto out;
886 : }
887 : EXPORT_SYMBOL_GPL(alloc_page_buffers);
888 :
889 : static inline void
890 4129 : link_dev_buffers(struct page *page, struct buffer_head *head)
891 : {
892 4129 : struct buffer_head *bh, *tail;
893 :
894 4129 : bh = head;
895 4135 : do {
896 4135 : tail = bh;
897 4135 : bh = bh->b_this_page;
898 4135 : } while (bh);
899 4129 : tail->b_this_page = head;
900 4129 : attach_page_private(page, head);
901 4129 : }
902 :
903 4129 : static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
904 : {
905 4129 : sector_t retval = ~((sector_t)0);
906 4129 : loff_t sz = i_size_read(bdev->bd_inode);
907 :
908 4129 : if (sz) {
909 4129 : unsigned int sizebits = blksize_bits(size);
910 4129 : retval = (sz >> sizebits);
911 : }
912 4129 : return retval;
913 : }
914 :
915 : /*
916 : * Initialise the state of a blockdev page's buffers.
917 : */
918 : static sector_t
919 4129 : init_page_buffers(struct page *page, struct block_device *bdev,
920 : sector_t block, int size)
921 : {
922 4129 : struct buffer_head *head = page_buffers(page);
923 4129 : struct buffer_head *bh = head;
924 4129 : int uptodate = PageUptodate(page);
925 8258 : sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
926 :
927 4135 : do {
928 4135 : if (!buffer_mapped(bh)) {
929 4135 : bh->b_end_io = NULL;
930 4135 : bh->b_private = NULL;
931 4135 : bh->b_bdev = bdev;
932 4135 : bh->b_blocknr = block;
933 4135 : if (uptodate)
934 0 : set_buffer_uptodate(bh);
935 4135 : if (block < end_block)
936 4135 : set_buffer_mapped(bh);
937 : }
938 4135 : block++;
939 4135 : bh = bh->b_this_page;
940 4135 : } while (bh != head);
941 :
942 : /*
943 : * Caller needs to validate requested block against end of device.
944 : */
945 4129 : return end_block;
946 : }
947 :
948 : /*
949 : * Create the page-cache page that contains the requested block.
950 : *
951 : * This is used purely for blockdev mappings.
952 : */
953 : static int
954 4129 : grow_dev_page(struct block_device *bdev, sector_t block,
955 : pgoff_t index, int size, int sizebits, gfp_t gfp)
956 : {
957 4129 : struct inode *inode = bdev->bd_inode;
958 4129 : struct page *page;
959 4129 : struct buffer_head *bh;
960 4129 : sector_t end_block;
961 4129 : int ret = 0;
962 4129 : gfp_t gfp_mask;
963 :
964 4129 : gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
965 :
966 : /*
967 : * XXX: __getblk_slow() can not really deal with failure and
968 : * will endlessly loop on improvised global reclaim. Prefer
969 : * looping in the allocator rather than here, at least that
970 : * code knows what it's doing.
971 : */
972 4129 : gfp_mask |= __GFP_NOFAIL;
973 :
974 4129 : page = find_or_create_page(inode->i_mapping, index, gfp_mask);
975 :
976 8258 : BUG_ON(!PageLocked(page));
977 :
978 4129 : if (page_has_buffers(page)) {
979 0 : bh = page_buffers(page);
980 0 : if (bh->b_size == size) {
981 0 : end_block = init_page_buffers(page, bdev,
982 : (sector_t)index << sizebits,
983 : size);
984 0 : goto done;
985 : }
986 0 : if (!try_to_free_buffers(page))
987 0 : goto failed;
988 : }
989 :
990 : /*
991 : * Allocate some buffers for this page
992 : */
993 4129 : bh = alloc_page_buffers(page, size, true);
994 :
995 : /*
996 : * Link the page to the buffers and initialise them. Take the
997 : * lock to be atomic wrt __find_get_block(), which does not
998 : * run under the page lock.
999 : */
1000 4129 : spin_lock(&inode->i_mapping->private_lock);
1001 4129 : link_dev_buffers(page, bh);
1002 4129 : end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits,
1003 : size);
1004 4129 : spin_unlock(&inode->i_mapping->private_lock);
1005 4129 : done:
1006 4129 : ret = (block < end_block) ? 1 : -ENXIO;
1007 4129 : failed:
1008 4129 : unlock_page(page);
1009 4129 : put_page(page);
1010 4129 : return ret;
1011 : }
1012 :
1013 : /*
1014 : * Create buffers for the specified block device block's page. If
1015 : * that page was dirty, the buffers are set dirty also.
1016 : */
1017 : static int
1018 4129 : grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
1019 : {
1020 4129 : pgoff_t index;
1021 4129 : int sizebits;
1022 :
1023 4129 : sizebits = -1;
1024 4133 : do {
1025 4133 : sizebits++;
1026 4133 : } while ((size << sizebits) < PAGE_SIZE);
1027 :
1028 4129 : index = block >> sizebits;
1029 :
1030 : /*
1031 : * Check for a block which wants to lie outside our maximum possible
1032 : * pagecache index. (this comparison is done using sector_t types).
1033 : */
1034 4129 : if (unlikely(index != block >> sizebits)) {
1035 : printk(KERN_ERR "%s: requested out-of-range block %llu for "
1036 : "device %pg\n",
1037 : __func__, (unsigned long long)block,
1038 : bdev);
1039 : return -EIO;
1040 : }
1041 :
1042 : /* Create a page with the proper size buffers.. */
1043 4129 : return grow_dev_page(bdev, block, index, size, sizebits, gfp);
1044 : }
1045 :
1046 : static struct buffer_head *
1047 4129 : __getblk_slow(struct block_device *bdev, sector_t block,
1048 : unsigned size, gfp_t gfp)
1049 : {
1050 : /* Size must be multiple of hard sectorsize */
1051 8258 : if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1052 : (size < 512 || size > PAGE_SIZE))) {
1053 0 : printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1054 : size);
1055 0 : printk(KERN_ERR "logical block size: %d\n",
1056 : bdev_logical_block_size(bdev));
1057 :
1058 0 : dump_stack();
1059 0 : return NULL;
1060 : }
1061 :
1062 8258 : for (;;) {
1063 8258 : struct buffer_head *bh;
1064 8258 : int ret;
1065 :
1066 8258 : bh = __find_get_block(bdev, block, size);
1067 8258 : if (bh)
1068 4129 : return bh;
1069 :
1070 4129 : ret = grow_buffers(bdev, block, size, gfp);
1071 4129 : if (ret < 0)
1072 : return NULL;
1073 : }
1074 : }
1075 :
1076 : /*
1077 : * The relationship between dirty buffers and dirty pages:
1078 : *
1079 : * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1080 : * the page is tagged dirty in the page cache.
1081 : *
1082 : * At all times, the dirtiness of the buffers represents the dirtiness of
1083 : * subsections of the page. If the page has buffers, the page dirty bit is
1084 : * merely a hint about the true dirty state.
1085 : *
1086 : * When a page is set dirty in its entirety, all its buffers are marked dirty
1087 : * (if the page has buffers).
1088 : *
1089 : * When a buffer is marked dirty, its page is dirtied, but the page's other
1090 : * buffers are not.
1091 : *
1092 : * Also. When blockdev buffers are explicitly read with bread(), they
1093 : * individually become uptodate. But their backing page remains not
1094 : * uptodate - even if all of its buffers are uptodate. A subsequent
1095 : * block_read_full_page() against that page will discover all the uptodate
1096 : * buffers, will set the page uptodate and will perform no I/O.
1097 : */
1098 :
1099 : /**
1100 : * mark_buffer_dirty - mark a buffer_head as needing writeout
1101 : * @bh: the buffer_head to mark dirty
1102 : *
1103 : * mark_buffer_dirty() will set the dirty bit against the buffer, then set
1104 : * its backing page dirty, then tag the page as dirty in the page cache
1105 : * and then attach the address_space's inode to its superblock's dirty
1106 : * inode list.
1107 : *
1108 : * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1109 : * i_pages lock and mapping->host->i_lock.
1110 : */
1111 4185 : void mark_buffer_dirty(struct buffer_head *bh)
1112 : {
1113 4185 : WARN_ON_ONCE(!buffer_uptodate(bh));
1114 :
1115 4185 : trace_block_dirty_buffer(bh);
1116 :
1117 : /*
1118 : * Very *carefully* optimize the it-is-already-dirty case.
1119 : *
1120 : * Don't let the final "is it dirty" escape to before we
1121 : * perhaps modified the buffer.
1122 : */
1123 4185 : if (buffer_dirty(bh)) {
1124 621 : smp_mb();
1125 621 : if (buffer_dirty(bh))
1126 : return;
1127 : }
1128 :
1129 3564 : if (!test_set_buffer_dirty(bh)) {
1130 3564 : struct page *page = bh->b_page;
1131 3564 : struct address_space *mapping = NULL;
1132 :
1133 3564 : lock_page_memcg(page);
1134 7128 : if (!TestSetPageDirty(page)) {
1135 2719 : mapping = page_mapping(page);
1136 2719 : if (mapping)
1137 2719 : __set_page_dirty(page, mapping, 0);
1138 : }
1139 2719 : unlock_page_memcg(page);
1140 2719 : if (mapping)
1141 2719 : __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1142 : }
1143 : }
1144 : EXPORT_SYMBOL(mark_buffer_dirty);
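
[Editorial aside] For orientation, this is the usual read-modify-dirty sequence a filesystem performs on one metadata block through the buffer cache. It is only a sketch (example_update_block is an invented name, and the memset stands in for a real edit), but each call is the standard API: sb_bread(), lock_buffer(), mark_buffer_dirty(), brelse(), and optionally sync_dirty_buffer() to force the write out immediately.

#include <linux/fs.h>
#include <linux/string.h>
#include <linux/buffer_head.h>

/* Invented helper: zero one block of @sb and mark it for writeback. */
static int example_update_block(struct super_block *sb, sector_t blocknr)
{
	struct buffer_head *bh;

	bh = sb_bread(sb, blocknr);		/* read (or find) the block */
	if (!bh)
		return -EIO;

	lock_buffer(bh);
	memset(bh->b_data, 0, bh->b_size);	/* modify the in-memory copy */
	set_buffer_uptodate(bh);
	unlock_buffer(bh);

	mark_buffer_dirty(bh);			/* dirties bh, its page and the backing inode */
	/* sync_dirty_buffer(bh) could be called here to write synchronously. */
	brelse(bh);
	return 0;
}
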
1145 :
1146 0 : void mark_buffer_write_io_error(struct buffer_head *bh)
1147 : {
1148 0 : struct super_block *sb;
1149 :
1150 0 : set_buffer_write_io_error(bh);
1151 : /* FIXME: do we need to set this in both places? */
1152 0 : if (bh->b_page && bh->b_page->mapping)
1153 0 : mapping_set_error(bh->b_page->mapping, -EIO);
1154 0 : if (bh->b_assoc_map)
1155 0 : mapping_set_error(bh->b_assoc_map, -EIO);
1156 0 : rcu_read_lock();
1157 0 : sb = READ_ONCE(bh->b_bdev->bd_super);
1158 0 : if (sb)
1159 0 : errseq_set(&sb->s_wb_err, -EIO);
1160 0 : rcu_read_unlock();
1161 0 : }
1162 : EXPORT_SYMBOL(mark_buffer_write_io_error);
1163 :
1164 : /*
1165 : * Decrement a buffer_head's reference count. If all buffers against a page
1166 : * have zero reference count, are clean and unlocked, and if the page is clean
1167 : * and unlocked then try_to_free_buffers() may strip the buffers from the page
1168 : * in preparation for freeing it (sometimes, rarely, buffers are removed from
1169 : * a page but it ends up not being freed, and buffers may later be reattached).
1170 : */
1171 50185 : void __brelse(struct buffer_head * buf)
1172 : {
1173 50185 : if (atomic_read(&buf->b_count)) {
1174 50186 : put_bh(buf);
1175 50186 : return;
1176 : }
1177 0 : WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1178 : }
1179 : EXPORT_SYMBOL(__brelse);
1180 :
1181 : /*
1182 : * bforget() is like brelse(), except it discards any
1183 : * potentially dirty data.
1184 : */
1185 5 : void __bforget(struct buffer_head *bh)
1186 : {
1187 5 : clear_buffer_dirty(bh);
1188 5 : if (bh->b_assoc_map) {
1189 0 : struct address_space *buffer_mapping = bh->b_page->mapping;
1190 :
1191 0 : spin_lock(&buffer_mapping->private_lock);
1192 0 : list_del_init(&bh->b_assoc_buffers);
1193 0 : bh->b_assoc_map = NULL;
1194 0 : spin_unlock(&buffer_mapping->private_lock);
1195 : }
1196 5 : __brelse(bh);
1197 5 : }
1198 : EXPORT_SYMBOL(__bforget);
1199 :
1200 0 : static struct buffer_head *__bread_slow(struct buffer_head *bh)
1201 : {
1202 0 : lock_buffer(bh);
1203 0 : if (buffer_uptodate(bh)) {
1204 0 : unlock_buffer(bh);
1205 0 : return bh;
1206 : } else {
1207 0 : get_bh(bh);
1208 0 : bh->b_end_io = end_buffer_read_sync;
1209 0 : submit_bh(REQ_OP_READ, 0, bh);
1210 0 : wait_on_buffer(bh);
1211 0 : if (buffer_uptodate(bh))
1212 : return bh;
1213 : }
1214 0 : brelse(bh);
1215 : return NULL;
1216 : }
1217 :
1218 : /*
1219 : * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1220 : * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1221 : * refcount elevated by one when they're in an LRU. A buffer can only appear
1222 : * once in a particular CPU's LRU. A single buffer can be present in multiple
1223 : * CPU's LRUs at the same time.
1224 : *
1225 : * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1226 : * sb_find_get_block().
1227 : *
1228 : * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1229 : * a local interrupt disable for that.
1230 : */
1231 :
1232 : #define BH_LRU_SIZE 16
1233 :
1234 : struct bh_lru {
1235 : struct buffer_head *bhs[BH_LRU_SIZE];
1236 : };
1237 :
1238 : static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1239 :
1240 : #ifdef CONFIG_SMP
1241 : #define bh_lru_lock() local_irq_disable()
1242 : #define bh_lru_unlock() local_irq_enable()
1243 : #else
1244 : #define bh_lru_lock() preempt_disable()
1245 : #define bh_lru_unlock() preempt_enable()
1246 : #endif
1247 :
1248 52907 : static inline void check_irqs_on(void)
1249 : {
1250 : #ifdef irqs_disabled
1251 52907 : BUG_ON(irqs_disabled());
1252 : #endif
1253 52907 : }
1254 :
1255 : /*
1256 : * Install a buffer_head into this cpu's LRU. If not already in the LRU, it is
1257 : * inserted at the front, and the buffer_head at the back if any is evicted.
1258 : * Or, if already in the LRU it is moved to the front.
1259 : */
1260 11781 : static void bh_lru_install(struct buffer_head *bh)
1261 : {
1262 11781 : struct buffer_head *evictee = bh;
1263 11781 : struct bh_lru *b;
1264 11781 : int i;
1265 :
1266 11781 : check_irqs_on();
1267 11781 : bh_lru_lock();
1268 :
1269 11781 : b = this_cpu_ptr(&bh_lrus);
1270 200277 : for (i = 0; i < BH_LRU_SIZE; i++) {
1271 188496 : swap(evictee, b->bhs[i]);
1272 188496 : if (evictee == bh) {
1273 0 : bh_lru_unlock();
1274 0 : return;
1275 : }
1276 : }
1277 :
1278 11781 : get_bh(bh);
1279 11781 : bh_lru_unlock();
1280 11781 : brelse(evictee);
1281 : }
1282 :
1283 : /*
1284 : * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1285 : */
1286 : static struct buffer_head *
1287 41126 : lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1288 : {
1289 41126 : struct buffer_head *ret = NULL;
1290 41126 : unsigned int i;
1291 :
1292 41126 : check_irqs_on();
1293 41126 : bh_lru_lock();
1294 408537 : for (i = 0; i < BH_LRU_SIZE; i++) {
1295 388500 : struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1296 :
1297 388500 : if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
1298 21088 : bh->b_size == size) {
1299 21088 : if (i) {
1300 60046 : while (i) {
1301 46788 : __this_cpu_write(bh_lrus.bhs[i],
1302 : __this_cpu_read(bh_lrus.bhs[i - 1]));
1303 46788 : i--;
1304 : }
1305 21088 : __this_cpu_write(bh_lrus.bhs[0], bh);
1306 : }
1307 21088 : get_bh(bh);
1308 21088 : ret = bh;
1309 21088 : break;
1310 : }
1311 : }
1312 41125 : bh_lru_unlock();
1313 41127 : return ret;
1314 : }
1315 :
1316 : /*
1317 : * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1318 : * it in the LRU and mark it as accessed. If it is not present then return
1319 : * NULL
1320 : */
1321 : struct buffer_head *
1322 41126 : __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1323 : {
1324 41126 : struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1325 :
1326 41127 : if (bh == NULL) {
1327 : /* __find_get_block_slow will mark the page accessed */
1328 20039 : bh = __find_get_block_slow(bdev, block);
1329 20039 : if (bh)
1330 11781 : bh_lru_install(bh);
1331 : } else
1332 21088 : touch_buffer(bh);
1333 :
1334 41126 : return bh;
1335 : }
1336 : EXPORT_SYMBOL(__find_get_block);
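
[Editorial aside] A brief usage note: filesystems normally reach this lookup through the sb_find_get_block() wrapper when they want to peek at the cache without doing any I/O. The helper below is an invented example of that non-blocking check.

#include <linux/fs.h>
#include <linux/buffer_head.h>

/* Invented helper: is this block already cached and up to date? */
static bool example_block_is_cached(struct super_block *sb, sector_t blocknr)
{
	struct buffer_head *bh = sb_find_get_block(sb, blocknr);
	bool uptodate = false;

	if (bh) {
		uptodate = buffer_uptodate(bh);
		brelse(bh);		/* drop the reference taken by the lookup */
	}
	return uptodate;
}
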
1337 :
1338 : /*
1339 : * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
1340 : * which corresponds to the passed block_device, block and size. The
1341 : * returned buffer has its reference count incremented.
1342 : *
1343 : * __getblk_gfp() will lock up the machine if grow_dev_page's
1344 : * try_to_free_buffers() attempt is failing. FIXME, perhaps?
1345 : */
1346 : struct buffer_head *
1347 31809 : __getblk_gfp(struct block_device *bdev, sector_t block,
1348 : unsigned size, gfp_t gfp)
1349 : {
1350 31809 : struct buffer_head *bh = __find_get_block(bdev, block, size);
1351 :
1352 31809 : might_sleep();
1353 31809 : if (bh == NULL)
1354 4129 : bh = __getblk_slow(bdev, block, size, gfp);
1355 31809 : return bh;
1356 : }
1357 : EXPORT_SYMBOL(__getblk_gfp);
1358 :
1359 : /*
1360 : * Do async read-ahead on a buffer..
1361 : */
1362 0 : void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1363 : {
1364 0 : struct buffer_head *bh = __getblk(bdev, block, size);
1365 0 : if (likely(bh)) {
1366 0 : ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh);
1367 0 : brelse(bh);
1368 : }
1369 0 : }
1370 : EXPORT_SYMBOL(__breadahead);
1371 :
1372 0 : void __breadahead_gfp(struct block_device *bdev, sector_t block, unsigned size,
1373 : gfp_t gfp)
1374 : {
1375 0 : struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
1376 0 : if (likely(bh)) {
1377 0 : ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh);
1378 0 : brelse(bh);
1379 : }
1380 0 : }
1381 : EXPORT_SYMBOL(__breadahead_gfp);
1382 :
1383 : /**
1384 : * __bread_gfp() - reads a specified block and returns the bh
1385 : * @bdev: the block_device to read from
1386 : * @block: number of block
1387 : * @size: size (in bytes) to read
1388 : * @gfp: page allocation flag
1389 : *
1390 : * Reads a specified block, and returns the buffer head that contains it.
1391 : * If @gfp is zero, the page cache is allocated from the non-movable area
1392 : * so that it does not interfere with page migration.
1393 : * It returns NULL if the block was unreadable.
1394 : */
1395 : struct buffer_head *
1396 0 : __bread_gfp(struct block_device *bdev, sector_t block,
1397 : unsigned size, gfp_t gfp)
1398 : {
1399 0 : struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
1400 :
1401 0 : if (likely(bh) && !buffer_uptodate(bh))
1402 0 : bh = __bread_slow(bh);
1403 0 : return bh;
1404 : }
1405 : EXPORT_SYMBOL(__bread_gfp);
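
[Editorial aside] For completeness, a small illustrative sketch (example_read_block is an invented name) of the usual way callers combine the read-ahead and blocking-read wrappers declared in <linux/buffer_head.h>.

#include <linux/blkdev.h>
#include <linux/buffer_head.h>

/* Invented helper: read one block, hinting read-ahead for the next one. */
static int example_read_block(struct block_device *bdev, sector_t blocknr,
			      unsigned int size)
{
	struct buffer_head *bh;

	__breadahead(bdev, blocknr + 1, size);	/* async hint, may do nothing */

	bh = __bread(bdev, blocknr, size);	/* returns NULL if unreadable */
	if (!bh)
		return -EIO;

	/* bh->b_data now holds the block contents; use them, then release. */
	brelse(bh);
	return 0;
}
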
1406 :
1407 : /*
1408 : * invalidate_bh_lrus() is called rarely - but not only at unmount.
1409 : * This doesn't race because it runs in each cpu either in irq
1410 : * or with preempt disabled.
1411 : */
1412 2 : static void invalidate_bh_lru(void *arg)
1413 : {
1414 2 : struct bh_lru *b = &get_cpu_var(bh_lrus);
1415 2 : int i;
1416 :
1417 34 : for (i = 0; i < BH_LRU_SIZE; i++) {
1418 32 : brelse(b->bhs[i]);
1419 32 : b->bhs[i] = NULL;
1420 : }
1421 2 : put_cpu_var(bh_lrus);
1422 2 : }
1423 :
1424 12 : static bool has_bh_in_lru(int cpu, void *dummy)
1425 : {
1426 12 : struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
1427 12 : int i;
1428 :
1429 172 : for (i = 0; i < BH_LRU_SIZE; i++) {
1430 162 : if (b->bhs[i])
1431 : return true;
1432 : }
1433 :
1434 : return false;
1435 : }
1436 :
1437 3 : void invalidate_bh_lrus(void)
1438 : {
1439 3 : on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
1440 3 : }
1441 : EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1442 :
1443 7659 : void set_bh_page(struct buffer_head *bh,
1444 : struct page *page, unsigned long offset)
1445 : {
1446 7659 : bh->b_page = page;
1447 7659 : BUG_ON(offset >= PAGE_SIZE);
1448 7659 : if (PageHighMem(page))
1449 : /*
1450 : * This catches illegal uses and preserves the offset:
1451 : */
1452 : bh->b_data = (char *)(0 + offset);
1453 : else
1454 7659 : bh->b_data = page_address(page) + offset;
1455 7659 : }
1456 : EXPORT_SYMBOL(set_bh_page);
1457 :
1458 : /*
1459 : * Called when truncating a buffer on a page completely.
1460 : */
1461 :
1462 : /* Bits that are cleared during an invalidate */
1463 : #define BUFFER_FLAGS_DISCARD \
1464 : (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
1465 : 1 << BH_Delay | 1 << BH_Unwritten)
1466 :
1467 505 : static void discard_buffer(struct buffer_head * bh)
1468 : {
1469 505 : unsigned long b_state, b_state_old;
1470 :
1471 505 : lock_buffer(bh);
1472 505 : clear_buffer_dirty(bh);
1473 505 : bh->b_bdev = NULL;
1474 505 : b_state = bh->b_state;
1475 505 : for (;;) {
1476 505 : b_state_old = cmpxchg(&bh->b_state, b_state,
1477 : (b_state & ~BUFFER_FLAGS_DISCARD));
1478 505 : if (b_state_old == b_state)
1479 : break;
1480 : b_state = b_state_old;
1481 : }
1482 505 : unlock_buffer(bh);
1483 505 : }
1484 :
1485 : /**
1486 : * block_invalidatepage - invalidate part or all of a buffer-backed page
1487 : *
1488 : * @page: the page which is affected
1489 : * @offset: start of the range to invalidate
1490 : * @length: length of the range to invalidate
1491 : *
1492 : * block_invalidatepage() is called when all or part of the page has become
1493 : * invalidated by a truncate operation.
1494 : *
1495 : * block_invalidatepage() does not have to release all buffers, but it must
1496 : * ensure that no dirty buffer is left outside @offset and that no I/O
1497 : * is underway against any of the blocks which are outside the truncation
1498 : * point. Because the caller is about to free (and possibly reuse) those
1499 : * blocks on-disk.
1500 : */
1501 501 : void block_invalidatepage(struct page *page, unsigned int offset,
1502 : unsigned int length)
1503 : {
1504 501 : struct buffer_head *head, *bh, *next;
1505 501 : unsigned int curr_off = 0;
1506 501 : unsigned int stop = length + offset;
1507 :
1508 1002 : BUG_ON(!PageLocked(page));
1509 501 : if (!page_has_buffers(page))
1510 0 : goto out;
1511 :
1512 : /*
1513 : * Check for overflow
1514 : */
1515 501 : BUG_ON(stop > PAGE_SIZE || stop < length);
1516 :
1517 501 : head = page_buffers(page);
1518 501 : bh = head;
1519 507 : do {
1520 507 : unsigned int next_off = curr_off + bh->b_size;
1521 507 : next = bh->b_this_page;
1522 :
1523 : /*
1524 : * Are we still fully in range ?
1525 : */
1526 507 : if (next_off > stop)
1527 0 : goto out;
1528 :
1529 : /*
1530 : * is this block fully invalidated?
1531 : */
1532 507 : if (offset <= curr_off)
1533 505 : discard_buffer(bh);
1534 507 : curr_off = next_off;
1535 507 : bh = next;
1536 507 : } while (bh != head);
1537 :
1538 : /*
1539 : * We release buffers only if the entire page is being invalidated.
1540 : * The get_block cached value has been unconditionally invalidated,
1541 : * so real IO is not possible anymore.
1542 : */
1543 501 : if (length == PAGE_SIZE)
1544 499 : try_to_release_page(page, 0);
1545 2 : out:
1546 501 : return;
1547 : }
1548 : EXPORT_SYMBOL(block_invalidatepage);
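
[Editorial aside] A hedged note on how this helper is reached: when part of a page is truncated, do_invalidatepage() invokes the mapping's ->invalidatepage operation and, if none is set, falls back to block_invalidatepage(). A buffer-backed filesystem can also wire it up explicitly, as in this sketch (the foofs_aops name is invented and most operations are omitted).

#include <linux/fs.h>
#include <linux/buffer_head.h>

/* Invented aops table: route truncation-time invalidation to the helper above. */
static const struct address_space_operations foofs_aops = {
	.invalidatepage	= block_invalidatepage,
	/* .readpage, .writepage, .write_begin, ... omitted in this sketch */
};
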
1549 :
1550 :
1551 : /*
1552 : * We attach and possibly dirty the buffers atomically wrt
1553 : * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
1554 : * is already excluded via the page lock.
1555 : */
1556 1713 : void create_empty_buffers(struct page *page,
1557 : unsigned long blocksize, unsigned long b_state)
1558 : {
1559 1713 : struct buffer_head *bh, *head, *tail;
1560 :
1561 1713 : head = alloc_page_buffers(page, blocksize, true);
1562 1713 : bh = head;
1563 1713 : do {
1564 1713 : bh->b_state |= b_state;
1565 1713 : tail = bh;
1566 1713 : bh = bh->b_this_page;
1567 1713 : } while (bh);
1568 1713 : tail->b_this_page = head;
1569 :
1570 1713 : spin_lock(&page->mapping->private_lock);
1571 3422 : if (PageUptodate(page) || PageDirty(page)) {
1572 : bh = head;
1573 4 : do {
1574 8 : if (PageDirty(page))
1575 0 : set_buffer_dirty(bh);
1576 4 : if (PageUptodate(page))
1577 4 : set_buffer_uptodate(bh);
1578 4 : bh = bh->b_this_page;
1579 4 : } while (bh != head);
1580 : }
1581 1713 : attach_page_private(page, head);
1582 1713 : spin_unlock(&page->mapping->private_lock);
1583 1713 : }
1584 : EXPORT_SYMBOL(create_empty_buffers);
1585 :
1586 : /**
1587 : * clean_bdev_aliases: clean a range of buffers in block device
1588 : * @bdev: Block device to clean buffers in
1589 : * @block: Start of a range of blocks to clean
1590 : * @len: Number of blocks to clean
1591 : *
1592 : * We are taking a range of blocks for data and we don't want writeback of any
1593 : * buffer-cache aliases starting from return from this function and until the
1594 : * moment when something will explicitly mark the buffer dirty (hopefully that
1595 : * will not happen until we will free that block ;-) We don't even need to mark
1596 : * it not-uptodate - nobody can expect anything from a newly allocated buffer
1597 : * anyway. We used to use unmap_buffer() for such invalidation, but that was
1598 : * wrong. We definitely don't want to mark the alias unmapped, for example - it
1599 : * would confuse anyone who might pick it with bread() afterwards...
1600 : *
1601 : * Also.. Note that bforget() doesn't lock the buffer. So there can be
1602 : * writeout I/O going on against recently-freed buffers. We don't wait on that
1603 : * I/O in bforget() - it's more efficient to wait on the I/O only if we really
1604 : * need to. That happens here.
1605 : */
1606 1607 : void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
1607 : {
1608 1607 : struct inode *bd_inode = bdev->bd_inode;
1609 1607 : struct address_space *bd_mapping = bd_inode->i_mapping;
1610 1607 : struct pagevec pvec;
1611 1607 : pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
1612 1607 : pgoff_t end;
1613 1607 : int i, count;
1614 1607 : struct buffer_head *bh;
1615 1607 : struct buffer_head *head;
1616 :
1617 1607 : end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
1618 1607 : pagevec_init(&pvec);
1619 1607 : while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) {
1620 0 : count = pagevec_count(&pvec);
1621 0 : for (i = 0; i < count; i++) {
1622 0 : struct page *page = pvec.pages[i];
1623 :
1624 0 : if (!page_has_buffers(page))
1625 0 : continue;
1626 : /*
1627 : * We use page lock instead of bd_mapping->private_lock
1628 : * to pin buffers here since we can afford to sleep and
1629 : * it scales better than a global spinlock lock.
1630 : */
1631 0 : lock_page(page);
1632 : /* Recheck when the page is locked which pins bhs */
1633 0 : if (!page_has_buffers(page))
1634 0 : goto unlock_page;
1635 0 : head = page_buffers(page);
1636 0 : bh = head;
1637 0 : do {
1638 0 : if (!buffer_mapped(bh) || (bh->b_blocknr < block))
1639 0 : goto next;
1640 0 : if (bh->b_blocknr >= block + len)
1641 : break;
1642 0 : clear_buffer_dirty(bh);
1643 0 : wait_on_buffer(bh);
1644 0 : clear_buffer_req(bh);
1645 0 : next:
1646 0 : bh = bh->b_this_page;
1647 0 : } while (bh != head);
1648 0 : unlock_page:
1649 0 : unlock_page(page);
1650 : }
1651 0 : pagevec_release(&pvec);
1652 0 : cond_resched();
1653 : /* End of range already reached? */
1654 0 : if (index > end || !index)
1655 : break;
1656 : }
1657 1607 : }
1658 : EXPORT_SYMBOL(clean_bdev_aliases);
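
[Editorial aside] To illustrate where this is typically called, here is a speculative sketch of a block-allocation path: foofs_alloc_extent() is an invented allocator, and a real filesystem may instead rely on the generic write helpers, which perform an equivalent per-buffer cleanup via clean_bdev_bh_alias() when they see a buffer marked new.

#include <linux/fs.h>
#include <linux/buffer_head.h>

/* Invented allocator: reserves @len contiguous blocks, returns the first (0 on failure). */
static sector_t foofs_alloc_extent(struct inode *inode, sector_t iblock,
				   sector_t len);

/* Sketch: map @len newly allocated blocks and drop any stale aliases for them. */
static int foofs_map_new_extent(struct inode *inode, sector_t iblock,
				sector_t len, struct buffer_head *bh_result)
{
	sector_t phys = foofs_alloc_extent(inode, iblock, len);

	if (!phys)
		return -ENOSPC;

	/* Old blockdev buffers for these blocks must never be written back. */
	clean_bdev_aliases(inode->i_sb->s_bdev, phys, len);

	map_bh(bh_result, inode->i_sb, phys);
	set_buffer_new(bh_result);
	bh_result->b_size = len << inode->i_blkbits;
	return 0;
}
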
1659 :
1660 : /*
1661 : * Size is a power-of-two in the range 512..PAGE_SIZE,
1662 : * and the case we care about most is PAGE_SIZE.
1663 : *
1664 : * So this *could* possibly be written with those
1665 : * constraints in mind (relevant mostly if some
1666 : * architecture has a slow bit-scan instruction)
1667 : */
1668 3281 : static inline int block_size_bits(unsigned int blocksize)
1669 : {
1670 3281 : return ilog2(blocksize);
1671 : }
1672 :
1673 3281 : static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
1674 : {
1675 6562 : BUG_ON(!PageLocked(page));
1676 :
1677 3281 : if (!page_has_buffers(page))
1678 1713 : create_empty_buffers(page, 1 << READ_ONCE(inode->i_blkbits),
1679 : b_state);
1680 3281 : return page_buffers(page);
1681 : }
1682 :
1683 : /*
1684 : * NOTE! All mapped/uptodate combinations are valid:
1685 : *
1686 : * Mapped Uptodate Meaning
1687 : *
1688 : * No No "unknown" - must do get_block()
1689 : * No Yes "hole" - zero-filled
1690 : * Yes No "allocated" - allocated on disk, not read in
1691 : * Yes Yes "valid" - allocated and up-to-date in memory.
1692 : *
1693 : * "Dirty" is valid only with the last case (mapped+uptodate).
1694 : */
1695 :
1696 : /*
1697 : * While block_write_full_page is writing back the dirty buffers under
1698 : * the page lock, whoever dirtied the buffers may decide to clean them
1699 : * again at any time. We handle that by only looking at the buffer
1700 : * state inside lock_buffer().
1701 : *
1702 : * If block_write_full_page() is called for regular writeback
1703 : * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1704 : * locked buffer. This can only happen if someone has written the buffer
1705 : * directly, with submit_bh(). At the address_space level PageWriteback
1706 : * prevents this contention from occurring.
1707 : *
1708 : * If block_write_full_page() is called with wbc->sync_mode ==
1709 : * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
1710 : * causes the writes to be flagged as synchronous writes.
1711 : */
1712 872 : int __block_write_full_page(struct inode *inode, struct page *page,
1713 : get_block_t *get_block, struct writeback_control *wbc,
1714 : bh_end_io_t *handler)
1715 : {
1716 872 : int err;
1717 872 : sector_t block;
1718 872 : sector_t last_block;
1719 872 : struct buffer_head *bh, *head;
1720 872 : unsigned int blocksize, bbits;
1721 872 : int nr_underway = 0;
1722 872 : int write_flags = wbc_to_write_flags(wbc);
1723 :
1724 872 : head = create_page_buffers(page, inode,
1725 : (1 << BH_Dirty)|(1 << BH_Uptodate));
1726 :
1727 : /*
1728 : * Be very careful. We have no exclusion from __set_page_dirty_buffers
1729 : * here, and the (potentially unmapped) buffers may become dirty at
1730 : * any time. If a buffer becomes dirty here after we've inspected it
1731 : * then we just miss that fact, and the page stays dirty.
1732 : *
1733 : * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1734 : * handle that here by just cleaning them.
1735 : */
1736 :
1737 872 : bh = head;
1738 872 : blocksize = bh->b_size;
1739 872 : bbits = block_size_bits(blocksize);
1740 :
1741 872 : block = (sector_t)page->index << (PAGE_SHIFT - bbits);
1742 872 : last_block = (i_size_read(inode) - 1) >> bbits;
1743 :
1744 : /*
1745 : * Get all the dirty buffers mapped to disk addresses and
1746 : * handle any aliases from the underlying blockdev's mapping.
1747 : */
1748 872 : do {
1749 872 : if (block > last_block) {
1750 : /*
1751 : * mapped buffers outside i_size will occur, because
1752 : * this page can be outside i_size when there is a
1753 : * truncate in progress.
1754 : */
1755 : /*
1756 : * The buffer was zeroed by block_write_full_page()
1757 : */
1758 0 : clear_buffer_dirty(bh);
1759 0 : set_buffer_uptodate(bh);
1760 1744 : } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1761 0 : buffer_dirty(bh)) {
1762 0 : WARN_ON(bh->b_size != blocksize);
1763 0 : err = get_block(inode, block, bh, 1);
1764 0 : if (err)
1765 0 : goto recover;
1766 0 : clear_buffer_delay(bh);
1767 0 : if (buffer_new(bh)) {
1768 : /* blockdev mappings never come here */
1769 0 : clear_buffer_new(bh);
1770 0 : clean_bdev_bh_alias(bh);
1771 : }
1772 : }
1773 872 : bh = bh->b_this_page;
1774 872 : block++;
1775 872 : } while (bh != head);
1776 :
1777 872 : do {
1778 872 : if (!buffer_mapped(bh))
1779 0 : continue;
1780 : /*
1781 : * If it's a fully non-blocking write attempt and we cannot
1782 : * lock the buffer then redirty the page. Note that this can
1783 : * potentially cause a busy-wait loop from writeback threads
1784 : * and kswapd activity, but those code paths have their own
1785 : * higher-level throttling.
1786 : */
1787 872 : if (wbc->sync_mode != WB_SYNC_NONE) {
1788 0 : lock_buffer(bh);
1789 872 : } else if (!trylock_buffer(bh)) {
1790 0 : redirty_page_for_writepage(wbc, page);
1791 0 : continue;
1792 : }
1793 872 : if (test_clear_buffer_dirty(bh)) {
1794 867 : mark_buffer_async_write_endio(bh, handler);
1795 : } else {
1796 5 : unlock_buffer(bh);
1797 : }
1798 872 : } while ((bh = bh->b_this_page) != head);
1799 :
1800 : /*
1801 : * The page and its buffers are protected by PageWriteback(), so we can
1802 : * drop the bh refcounts early.
1803 : */
1804 1744 : BUG_ON(PageWriteback(page));
1805 872 : set_page_writeback(page);
1806 :
1807 872 : do {
1808 872 : struct buffer_head *next = bh->b_this_page;
1809 872 : if (buffer_async_write(bh)) {
1810 867 : submit_bh_wbc(REQ_OP_WRITE, write_flags, bh,
1811 867 : inode->i_write_hint, wbc);
1812 867 : nr_underway++;
1813 : }
1814 872 : bh = next;
1815 872 : } while (bh != head);
1816 872 : unlock_page(page);
1817 :
1818 872 : err = 0;
1819 872 : done:
1820 872 : if (nr_underway == 0) {
1821 : /*
1822 : * The page was marked dirty, but the buffers were
1823 : * clean. Someone wrote them back by hand with
1824 : * ll_rw_block/submit_bh. A rare case.
1825 : */
1826 5 : end_page_writeback(page);
1827 :
1828 : /*
1829 : * The page and buffer_heads can be released at any time from
1830 : * here on.
1831 : */
1832 : }
1833 872 : return err;
1834 :
1835 0 : recover:
1836 : /*
1837 : * ENOSPC, or some other error. We may already have added some
1838 : * blocks to the file, so we need to write these out to avoid
1839 : * exposing stale data.
1840 : * The page is currently locked and not marked for writeback
1841 : * The page is currently locked and not marked for writeback.
1842 0 : bh = head;
1843 : /* Recovery: lock and submit the mapped buffers */
1844 0 : do {
1845 0 : if (buffer_mapped(bh) && buffer_dirty(bh) &&
1846 0 : !buffer_delay(bh)) {
1847 0 : lock_buffer(bh);
1848 0 : mark_buffer_async_write_endio(bh, handler);
1849 : } else {
1850 : /*
1851 : * The buffer may have been set dirty during
1852 : * attachment to a dirty page.
1853 : */
1854 0 : clear_buffer_dirty(bh);
1855 : }
1856 0 : } while ((bh = bh->b_this_page) != head);
1857 0 : SetPageError(page);
1858 0 : BUG_ON(PageWriteback(page));
1859 0 : mapping_set_error(page->mapping, err);
1860 0 : set_page_writeback(page);
1861 0 : do {
1862 0 : struct buffer_head *next = bh->b_this_page;
1863 0 : if (buffer_async_write(bh)) {
1864 0 : clear_buffer_dirty(bh);
1865 0 : submit_bh_wbc(REQ_OP_WRITE, write_flags, bh,
1866 0 : inode->i_write_hint, wbc);
1867 0 : nr_underway++;
1868 : }
1869 0 : bh = next;
1870 0 : } while (bh != head);
1871 0 : unlock_page(page);
1872 0 : goto done;
1873 : }
1874 : EXPORT_SYMBOL(__block_write_full_page);
1875 :
1876 : /*
1877 : * If a page has any new buffers, zero them out here, and mark them uptodate
1878 : * and dirty so they'll be written out (in order to prevent uninitialised
1879 : * block data from leaking). And clear the new bit.
1880 : */
1881 0 : void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1882 : {
1883 0 : unsigned int block_start, block_end;
1884 0 : struct buffer_head *head, *bh;
1885 :
1886 0 : BUG_ON(!PageLocked(page));
1887 0 : if (!page_has_buffers(page))
1888 : return;
1889 :
1890 0 : bh = head = page_buffers(page);
1891 0 : block_start = 0;
1892 0 : do {
1893 0 : block_end = block_start + bh->b_size;
1894 :
1895 0 : if (buffer_new(bh)) {
1896 0 : if (block_end > from && block_start < to) {
1897 0 : if (!PageUptodate(page)) {
1898 0 : unsigned start, size;
1899 :
1900 0 : start = max(from, block_start);
1901 0 : size = min(to, block_end) - start;
1902 :
1903 0 : zero_user(page, start, size);
1904 0 : set_buffer_uptodate(bh);
1905 : }
1906 :
1907 0 : clear_buffer_new(bh);
1908 0 : mark_buffer_dirty(bh);
1909 : }
1910 : }
1911 :
1912 0 : block_start = block_end;
1913 0 : bh = bh->b_this_page;
1914 0 : } while (bh != head);
1915 : }
1916 : EXPORT_SYMBOL(page_zero_new_buffers);
1917 :
1918 : static void
1919 0 : iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
1920 : struct iomap *iomap)
1921 : {
1922 0 : loff_t offset = block << inode->i_blkbits;
1923 :
1924 0 : bh->b_bdev = iomap->bdev;
1925 :
1926 : /*
1927 : * Block points to offset in file we need to map, iomap contains
1928 : * the offset at which the map starts. If the map ends before the
1929 : * current block, then do not map the buffer and let the caller
1930 : * handle it.
1931 : */
1932 0 : BUG_ON(offset >= iomap->offset + iomap->length);
1933 :
1934 0 : switch (iomap->type) {
1935 : case IOMAP_HOLE:
1936 : /*
1937 : * If the buffer is not up to date or beyond the current EOF,
1938 : * we need to mark it as new to ensure sub-block zeroing is
1939 : * executed if necessary.
1940 : */
1941 0 : if (!buffer_uptodate(bh) ||
1942 0 : (offset >= i_size_read(inode)))
1943 0 : set_buffer_new(bh);
1944 : break;
1945 : case IOMAP_DELALLOC:
1946 0 : if (!buffer_uptodate(bh) ||
1947 0 : (offset >= i_size_read(inode)))
1948 0 : set_buffer_new(bh);
1949 0 : set_buffer_uptodate(bh);
1950 0 : set_buffer_mapped(bh);
1951 0 : set_buffer_delay(bh);
1952 : break;
1953 : case IOMAP_UNWRITTEN:
1954 : /*
1955 : * For unwritten regions, we always need to ensure that regions
1956 : * in the block we are not writing to are zeroed. Mark the
1957 : * buffer as new to ensure this.
1958 : */
1959 0 : set_buffer_new(bh);
1960 0 : set_buffer_unwritten(bh);
1961 0 : fallthrough;
1962 0 : case IOMAP_MAPPED:
1963 0 : if ((iomap->flags & IOMAP_F_NEW) ||
1964 0 : offset >= i_size_read(inode))
1965 0 : set_buffer_new(bh);
1966 0 : bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
1967 0 : inode->i_blkbits;
1968 0 : set_buffer_mapped(bh);
1969 : break;
1970 : }
1971 0 : }
1972 :
1973 2398 : int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
1974 : get_block_t *get_block, struct iomap *iomap)
1975 : {
1976 2398 : unsigned from = pos & (PAGE_SIZE - 1);
1977 2398 : unsigned to = from + len;
1978 2398 : struct inode *inode = page->mapping->host;
1979 2398 : unsigned block_start, block_end;
1980 2398 : sector_t block;
1981 2398 : int err = 0;
1982 2398 : unsigned blocksize, bbits;
1983 2398 : struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1984 :
1985 4796 : BUG_ON(!PageLocked(page));
1986 2398 : BUG_ON(from > PAGE_SIZE);
1987 2398 : BUG_ON(to > PAGE_SIZE);
1988 2398 : BUG_ON(from > to);
1989 :
1990 2398 : head = create_page_buffers(page, inode, 0);
1991 2398 : blocksize = head->b_size;
1992 2398 : bbits = block_size_bits(blocksize);
1993 :
1994 2398 : block = (sector_t)page->index << (PAGE_SHIFT - bbits);
1995 :
1996 4796 : for(bh = head, block_start = 0; bh != head || !block_start;
1997 2398 : block++, block_start=block_end, bh = bh->b_this_page) {
1998 2398 : block_end = block_start + blocksize;
1999 2398 : if (block_end <= from || block_start >= to) {
2000 0 : if (PageUptodate(page)) {
2001 0 : if (!buffer_uptodate(bh))
2002 0 : set_buffer_uptodate(bh);
2003 : }
2004 0 : continue;
2005 : }
2006 2398 : if (buffer_new(bh))
2007 0 : clear_buffer_new(bh);
2008 2398 : if (!buffer_mapped(bh)) {
2009 1712 : WARN_ON(bh->b_size != blocksize);
2010 1712 : if (get_block) {
2011 1712 : err = get_block(inode, block, bh, 1);
2012 1712 : if (err)
2013 : break;
2014 : } else {
2015 0 : iomap_to_bh(inode, block, bh, iomap);
2016 : }
2017 :
2018 1712 : if (buffer_new(bh)) {
2019 1607 : clean_bdev_bh_alias(bh);
2020 1607 : if (PageUptodate(page)) {
2021 0 : clear_buffer_new(bh);
2022 0 : set_buffer_uptodate(bh);
2023 0 : mark_buffer_dirty(bh);
2024 0 : continue;
2025 : }
2026 1607 : if (block_end > to || block_start < from)
2027 413 : zero_user_segments(page,
2028 : to, block_end,
2029 : block_start, from);
2030 1607 : continue;
2031 : }
2032 : }
2033 791 : if (PageUptodate(page)) {
2034 690 : if (!buffer_uptodate(bh))
2035 0 : set_buffer_uptodate(bh);
2036 690 : continue;
2037 : }
2038 101 : if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
2039 101 : !buffer_unwritten(bh) &&
2040 101 : (block_start < from || block_end > to)) {
2041 8 : ll_rw_block(REQ_OP_READ, 0, 1, &bh);
2042 8 : *wait_bh++=bh;
2043 : }
2044 : }
2045 : /*
2046 : * If we issued read requests - let them complete.
2047 : */
2048 2406 : while(wait_bh > wait) {
2049 8 : wait_on_buffer(*--wait_bh);
2050 8 : if (!buffer_uptodate(*wait_bh))
2051 0 : err = -EIO;
2052 : }
2053 2398 : if (unlikely(err))
2054 0 : page_zero_new_buffers(page, from, to);
2055 2398 : return err;
2056 : }
2057 :
2058 2398 : int __block_write_begin(struct page *page, loff_t pos, unsigned len,
2059 : get_block_t *get_block)
2060 : {
2061 2094 : return __block_write_begin_int(page, pos, len, get_block, NULL);
2062 : }
2063 : EXPORT_SYMBOL(__block_write_begin);
2064 :
2065 2398 : static int __block_commit_write(struct inode *inode, struct page *page,
2066 : unsigned from, unsigned to)
2067 : {
2068 2398 : unsigned block_start, block_end;
2069 2398 : int partial = 0;
2070 2398 : unsigned blocksize;
2071 2398 : struct buffer_head *bh, *head;
2072 :
2073 2398 : bh = head = page_buffers(page);
2074 2398 : blocksize = bh->b_size;
2075 :
2076 2398 : block_start = 0;
2077 2398 : do {
2078 2398 : block_end = block_start + blocksize;
2079 2398 : if (block_end <= from || block_start >= to) {
2080 0 : if (!buffer_uptodate(bh))
2081 0 : partial = 1;
2082 : } else {
2083 2398 : set_buffer_uptodate(bh);
2084 2398 : mark_buffer_dirty(bh);
2085 : }
2086 2398 : if (buffer_new(bh))
2087 1607 : clear_buffer_new(bh);
2088 :
2089 2398 : block_start = block_end;
2090 2398 : bh = bh->b_this_page;
2091 2398 : } while (bh != head);
2092 :
2093 : /*
2094 : * If this is a partial write which happened to make all buffers
2095 : * uptodate then we can optimize away a bogus readpage() for
2096 : * the next read(). Here we 'discover' whether the page went
2097 : * uptodate as a result of this (potentially partial) write.
2098 : */
2099 2398 : if (!partial)
2100 2398 : SetPageUptodate(page);
2101 2398 : return 0;
2102 : }
2103 :
2104 : /*
2105 : * block_write_begin takes care of the basic task of block allocation and
2106 : * bringing partial write blocks uptodate first.
2107 : *
2108 : * The filesystem needs to handle block truncation upon failure.
2109 : */
2110 0 : int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
2111 : unsigned flags, struct page **pagep, get_block_t *get_block)
2112 : {
2113 0 : pgoff_t index = pos >> PAGE_SHIFT;
2114 0 : struct page *page;
2115 0 : int status;
2116 :
2117 0 : page = grab_cache_page_write_begin(mapping, index, flags);
2118 0 : if (!page)
2119 : return -ENOMEM;
2120 :
2121 0 : status = __block_write_begin(page, pos, len, get_block);
2122 0 : if (unlikely(status)) {
2123 0 : unlock_page(page);
2124 0 : put_page(page);
2125 0 : page = NULL;
2126 : }
2127 :
2128 0 : *pagep = page;
2129 0 : return status;
2130 : }
2131 : EXPORT_SYMBOL(block_write_begin);
2132 :
2133 2094 : int block_write_end(struct file *file, struct address_space *mapping,
2134 : loff_t pos, unsigned len, unsigned copied,
2135 : struct page *page, void *fsdata)
2136 : {
2137 2094 : struct inode *inode = mapping->host;
2138 2094 : unsigned start;
2139 :
2140 2094 : start = pos & (PAGE_SIZE - 1);
2141 :
2142 2094 : if (unlikely(copied < len)) {
2143 : /*
2144 : * The buffers that were written will now be uptodate, so we
2145 : * don't have to worry about a readpage reading them and
2146 : * overwriting a partial write. However if we have encountered
2147 : * a short write and only partially written into a buffer, it
2148 : * will not be marked uptodate, so a readpage might come in and
2149 : * destroy our partial write.
2150 : *
2151 : * Do the simplest thing, and just treat any short write to a
2152 : * non uptodate page as a zero-length write, and force the
2153 : * caller to redo the whole thing.
2154 : */
2155 0 : if (!PageUptodate(page))
2156 0 : copied = 0;
2157 :
2158 0 : page_zero_new_buffers(page, start+copied, start+len);
2159 : }
2160 2094 : flush_dcache_page(page);
2161 :
2162 : /* This could be a short (even 0-length) commit */
2163 2094 : __block_commit_write(inode, page, start, start+copied);
2164 :
2165 2094 : return copied;
2166 : }
2167 : EXPORT_SYMBOL(block_write_end);
2168 :
2169 2094 : int generic_write_end(struct file *file, struct address_space *mapping,
2170 : loff_t pos, unsigned len, unsigned copied,
2171 : struct page *page, void *fsdata)
2172 : {
2173 2094 : struct inode *inode = mapping->host;
2174 2094 : loff_t old_size = inode->i_size;
2175 2094 : bool i_size_changed = false;
2176 :
2177 2094 : copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2178 :
2179 : /*
2180 : * No need to use i_size_read() here, the i_size cannot change under us
2181 : * because we hold i_rwsem.
2182 : *
2183 : * But it's important to update i_size while still holding page lock:
2184 : * page writeout could otherwise come in and zero beyond i_size.
2185 : */
2186 2094 : if (pos + copied > inode->i_size) {
2187 2001 : i_size_write(inode, pos + copied);
2188 2001 : i_size_changed = true;
2189 : }
2190 :
2191 2094 : unlock_page(page);
2192 2094 : put_page(page);
2193 :
2194 2094 : if (old_size < pos)
2195 0 : pagecache_isize_extended(inode, old_size, pos);
2196 : /*
2197 : * Don't mark the inode dirty under page lock. First, it unnecessarily
2198 : * makes the holding time of page lock longer. Second, it forces lock
2199 : * ordering of page lock and transaction start for journaling
2200 : * filesystems.
2201 : */
2202 2094 : if (i_size_changed)
2203 2001 : mark_inode_dirty(inode);
2204 2094 : return copied;
2205 : }
2206 : EXPORT_SYMBOL(generic_write_end);
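
Tying the two halves together, a simple buffer-head based filesystem's ->write_begin/->write_end pair is usually a thin wrapper around block_write_begin() and generic_write_end(). The sketch below is hypothetical: myfs_get_block and myfs_truncate_failed_write() stand in for real filesystem code, and the error path reflects the note above that the filesystem must handle block truncation when write_begin fails.

/* Hypothetical ->write_begin/->write_end pair (needs <linux/buffer_head.h>).
 * myfs_get_block and myfs_truncate_failed_write() are placeholders. */
static int myfs_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned flags,
			    struct page **pagep, void **fsdata)
{
	int ret;

	ret = block_write_begin(mapping, pos, len, flags, pagep,
				myfs_get_block);
	/* On failure, trim any blocks we instantiated beyond i_size. */
	if (ret < 0 && pos + len > mapping->host->i_size)
		myfs_truncate_failed_write(mapping->host);
	return ret;
}

static const struct address_space_operations myfs_aops = {
	.write_begin	= myfs_write_begin,
	.write_end	= generic_write_end,
	/* ... other hooks ... */
};

generic_write_end() then unlocks and releases the page and updates i_size, as shown above, so the filesystem normally needs no write_end code of its own.
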
2207 :
2208 : /*
2209 : * block_is_partially_uptodate checks whether buffers within a page are
2210 : * uptodate or not.
2211 : *
2212 : * Returns true if all buffers which correspond to a file portion
2213 : * we want to read are uptodate.
2214 : */
2215 0 : int block_is_partially_uptodate(struct page *page, unsigned long from,
2216 : unsigned long count)
2217 : {
2218 0 : unsigned block_start, block_end, blocksize;
2219 0 : unsigned to;
2220 0 : struct buffer_head *bh, *head;
2221 0 : int ret = 1;
2222 :
2223 0 : if (!page_has_buffers(page))
2224 : return 0;
2225 :
2226 0 : head = page_buffers(page);
2227 0 : blocksize = head->b_size;
2228 0 : to = min_t(unsigned, PAGE_SIZE - from, count);
2229 0 : to = from + to;
2230 0 : if (from < blocksize && to > PAGE_SIZE - blocksize)
2231 : return 0;
2232 :
2233 : bh = head;
2234 : block_start = 0;
2235 0 : do {
2236 0 : block_end = block_start + blocksize;
2237 0 : if (block_end > from && block_start < to) {
2238 0 : if (!buffer_uptodate(bh)) {
2239 : ret = 0;
2240 : break;
2241 : }
2242 0 : if (block_end >= to)
2243 : break;
2244 : }
2245 0 : block_start = block_end;
2246 0 : bh = bh->b_this_page;
2247 0 : } while (bh != head);
2248 :
2249 : return ret;
2250 : }
2251 : EXPORT_SYMBOL(block_is_partially_uptodate);
2252 :
2253 : /*
2254 : * Generic "read page" function for block devices that have the normal
2255 : * get_block functionality. This covers most block-device-based filesystems.
2256 : * Reads the page asynchronously --- the unlock_buffer() and
2257 : * set/clear_buffer_uptodate() functions propagate buffer state into the
2258 : * page struct once IO has completed.
2259 : */
2260 11 : int block_read_full_page(struct page *page, get_block_t *get_block)
2261 : {
2262 11 : struct inode *inode = page->mapping->host;
2263 11 : sector_t iblock, lblock;
2264 11 : struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2265 11 : unsigned int blocksize, bbits;
2266 11 : int nr, i;
2267 11 : int fully_mapped = 1;
2268 :
2269 11 : head = create_page_buffers(page, inode, 0);
2270 11 : blocksize = head->b_size;
2271 11 : bbits = block_size_bits(blocksize);
2272 :
2273 11 : iblock = (sector_t)page->index << (PAGE_SHIFT - bbits);
2274 11 : lblock = (i_size_read(inode)+blocksize-1) >> bbits;
2275 11 : bh = head;
2276 11 : nr = 0;
2277 11 : i = 0;
2278 :
2279 11 : do {
2280 11 : if (buffer_uptodate(bh))
2281 10 : continue;
2282 :
2283 1 : if (!buffer_mapped(bh)) {
2284 1 : int err = 0;
2285 :
2286 1 : fully_mapped = 0;
2287 1 : if (iblock < lblock) {
2288 1 : WARN_ON(bh->b_size != blocksize);
2289 1 : err = get_block(inode, iblock, bh, 0);
2290 1 : if (err)
2291 0 : SetPageError(page);
2292 : }
2293 1 : if (!buffer_mapped(bh)) {
2294 0 : zero_user(page, i * blocksize, blocksize);
2295 0 : if (!err)
2296 0 : set_buffer_uptodate(bh);
2297 0 : continue;
2298 : }
2299 : /*
2300 : * get_block() might have updated the buffer
2301 : * synchronously
2302 : */
2303 1 : if (buffer_uptodate(bh))
2304 0 : continue;
2305 : }
2306 1 : arr[nr++] = bh;
2307 11 : } while (i++, iblock++, (bh = bh->b_this_page) != head);
2308 :
2309 11 : if (fully_mapped)
2310 10 : SetPageMappedToDisk(page);
2311 :
2312 11 : if (!nr) {
2313 : /*
2314 : * All buffers are uptodate - we can set the page uptodate
2315 : * as well. But not if get_block() returned an error.
2316 : */
2317 20 : if (!PageError(page))
2318 10 : SetPageUptodate(page);
2319 10 : unlock_page(page);
2320 10 : return 0;
2321 : }
2322 :
2323 : /* Stage two: lock the buffers */
2324 2 : for (i = 0; i < nr; i++) {
2325 1 : bh = arr[i];
2326 1 : lock_buffer(bh);
2327 1 : mark_buffer_async_read(bh);
2328 : }
2329 :
2330 : /*
2331 : * Stage 3: start the IO. Check for uptodateness
2332 : * inside the buffer lock in case another process reading
2333 : * the underlying blockdev brought it uptodate (the sct fix).
2334 : */
2335 2 : for (i = 0; i < nr; i++) {
2336 1 : bh = arr[i];
2337 1 : if (buffer_uptodate(bh))
2338 0 : end_buffer_async_read(bh, 1);
2339 : else
2340 1 : submit_bh(REQ_OP_READ, 0, bh);
2341 : }
2342 : return 0;
2343 : }
2344 : EXPORT_SYMBOL(block_read_full_page);
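
A typical ->readpage hook for such a filesystem is just a thin wrapper; myfs_get_block is again a hypothetical get_block_t implementation.

/* Hypothetical ->readpage: block_read_full_page() does the buffer mapping,
 * zero-filling of holes and asynchronous read submission described above. */
static int myfs_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, myfs_get_block);
}
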
2345 :
2346 : /* utility function for filesystems that need to do work on expanding
2347 : /* Utility function for filesystems that need to do work on expanding
2348 : * deal with the hole.
2349 : */
2350 0 : int generic_cont_expand_simple(struct inode *inode, loff_t size)
2351 : {
2352 0 : struct address_space *mapping = inode->i_mapping;
2353 0 : struct page *page;
2354 0 : void *fsdata;
2355 0 : int err;
2356 :
2357 0 : err = inode_newsize_ok(inode, size);
2358 0 : if (err)
2359 0 : goto out;
2360 :
2361 0 : err = pagecache_write_begin(NULL, mapping, size, 0,
2362 : AOP_FLAG_CONT_EXPAND, &page, &fsdata);
2363 0 : if (err)
2364 0 : goto out;
2365 :
2366 0 : err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2367 0 : BUG_ON(err > 0);
2368 :
2369 0 : out:
2370 0 : return err;
2371 : }
2372 : EXPORT_SYMBOL(generic_cont_expand_simple);
2373 :
2374 0 : static int cont_expand_zero(struct file *file, struct address_space *mapping,
2375 : loff_t pos, loff_t *bytes)
2376 : {
2377 0 : struct inode *inode = mapping->host;
2378 0 : unsigned int blocksize = i_blocksize(inode);
2379 0 : struct page *page;
2380 0 : void *fsdata;
2381 0 : pgoff_t index, curidx;
2382 0 : loff_t curpos;
2383 0 : unsigned zerofrom, offset, len;
2384 0 : int err = 0;
2385 :
2386 0 : index = pos >> PAGE_SHIFT;
2387 0 : offset = pos & ~PAGE_MASK;
2388 :
2389 0 : while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) {
2390 0 : zerofrom = curpos & ~PAGE_MASK;
2391 0 : if (zerofrom & (blocksize-1)) {
2392 0 : *bytes |= (blocksize-1);
2393 0 : (*bytes)++;
2394 : }
2395 0 : len = PAGE_SIZE - zerofrom;
2396 :
2397 0 : err = pagecache_write_begin(file, mapping, curpos, len, 0,
2398 : &page, &fsdata);
2399 0 : if (err)
2400 0 : goto out;
2401 0 : zero_user(page, zerofrom, len);
2402 0 : err = pagecache_write_end(file, mapping, curpos, len, len,
2403 : page, fsdata);
2404 0 : if (err < 0)
2405 0 : goto out;
2406 0 : BUG_ON(err != len);
2407 0 : err = 0;
2408 :
2409 0 : balance_dirty_pages_ratelimited(mapping);
2410 :
2411 0 : if (fatal_signal_pending(current)) {
2412 0 : err = -EINTR;
2413 0 : goto out;
2414 : }
2415 : }
2416 :
2417 : /* page covers the boundary, find the boundary offset */
2418 0 : if (index == curidx) {
2419 0 : zerofrom = curpos & ~PAGE_MASK;
2420 : /* if we will expand the thing last block will be filled */
2421 : /* if we are going to expand the file, the last block will be filled */
2422 0 : goto out;
2423 : }
2424 0 : if (zerofrom & (blocksize-1)) {
2425 0 : *bytes |= (blocksize-1);
2426 0 : (*bytes)++;
2427 : }
2428 0 : len = offset - zerofrom;
2429 :
2430 0 : err = pagecache_write_begin(file, mapping, curpos, len, 0,
2431 : &page, &fsdata);
2432 0 : if (err)
2433 0 : goto out;
2434 0 : zero_user(page, zerofrom, len);
2435 0 : err = pagecache_write_end(file, mapping, curpos, len, len,
2436 : page, fsdata);
2437 0 : if (err < 0)
2438 0 : goto out;
2439 0 : BUG_ON(err != len);
2440 : err = 0;
2441 : }
2442 0 : out:
2443 0 : return err;
2444 : }
2445 :
2446 : /*
2447 : * For moronic filesystems that do not allow holes in files.
2448 : * We may have to extend the file.
2449 : */
2450 0 : int cont_write_begin(struct file *file, struct address_space *mapping,
2451 : loff_t pos, unsigned len, unsigned flags,
2452 : struct page **pagep, void **fsdata,
2453 : get_block_t *get_block, loff_t *bytes)
2454 : {
2455 0 : struct inode *inode = mapping->host;
2456 0 : unsigned int blocksize = i_blocksize(inode);
2457 0 : unsigned int zerofrom;
2458 0 : int err;
2459 :
2460 0 : err = cont_expand_zero(file, mapping, pos, bytes);
2461 0 : if (err)
2462 : return err;
2463 :
2464 0 : zerofrom = *bytes & ~PAGE_MASK;
2465 0 : if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2466 0 : *bytes |= (blocksize-1);
2467 0 : (*bytes)++;
2468 : }
2469 :
2470 0 : return block_write_begin(mapping, pos, len, flags, pagep, get_block);
2471 : }
2472 : EXPORT_SYMBOL(cont_write_begin);
2473 :
2474 304 : int block_commit_write(struct page *page, unsigned from, unsigned to)
2475 : {
2476 304 : struct inode *inode = page->mapping->host;
2477 0 : __block_commit_write(inode,page,from,to);
2478 304 : return 0;
2479 : }
2480 : EXPORT_SYMBOL(block_commit_write);
2481 :
2482 : /*
2483 : * block_page_mkwrite() is not allowed to change the file size as it gets
2484 : * called from a page fault handler when a page is first dirtied. Hence we must
2485 : * be careful to check for EOF conditions here. We set the page up correctly
2486 : * for a written page which means we get ENOSPC checking when writing into
2487 : * holes and correct delalloc and unwritten extent mapping on filesystems that
2488 : * support these features.
2489 : *
2490 : * We are not allowed to take the i_mutex here so we have to play games to
2491 : * protect against truncate races as the page could now be beyond EOF. Because
2492 : * truncate writes the inode size before removing pages, once we have the
2493 : * page lock we can determine safely if the page is beyond EOF. If it is not
2494 : * beyond EOF, then the page is guaranteed safe against truncation until we
2495 : * unlock the page.
2496 : *
2497 : * Direct callers of this function should protect against filesystem freezing
2498 : * using sb_start_pagefault() - sb_end_pagefault() functions.
2499 : */
2500 304 : int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2501 : get_block_t get_block)
2502 : {
2503 304 : struct page *page = vmf->page;
2504 304 : struct inode *inode = file_inode(vma->vm_file);
2505 304 : unsigned long end;
2506 304 : loff_t size;
2507 304 : int ret;
2508 :
2509 304 : lock_page(page);
2510 304 : size = i_size_read(inode);
2511 304 : if ((page->mapping != inode->i_mapping) ||
2512 304 : (page_offset(page) > size)) {
2513 : /* We overload EFAULT to mean page got truncated */
2514 0 : ret = -EFAULT;
2515 0 : goto out_unlock;
2516 : }
2517 :
2518 : /* page is wholly or partially inside EOF */
2519 304 : if (((page->index + 1) << PAGE_SHIFT) > size)
2520 0 : end = size & ~PAGE_MASK;
2521 : else
2522 : end = PAGE_SIZE;
2523 :
2524 304 : ret = __block_write_begin(page, 0, end, get_block);
2525 304 : if (!ret)
2526 304 : ret = block_commit_write(page, 0, end);
2527 :
2528 304 : if (unlikely(ret < 0))
2529 0 : goto out_unlock;
2530 304 : set_page_dirty(page);
2531 304 : wait_for_stable_page(page);
2532 304 : return 0;
2533 0 : out_unlock:
2534 0 : unlock_page(page);
2535 0 : return ret;
2536 : }
2537 : EXPORT_SYMBOL(block_page_mkwrite);
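
As the comment notes, direct callers are expected to guard against filesystem freezing themselves. A hypothetical ->page_mkwrite wrapper might look like the sketch below; myfs_get_block is an assumption, while block_page_mkwrite_return() is the buffer_head.h helper that converts the error code into a VM_FAULT_* value.

/* Hypothetical ->page_mkwrite guarding against freezing, as suggested above. */
static vm_fault_t myfs_page_mkwrite(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	int err;

	sb_start_pagefault(inode->i_sb);
	err = block_page_mkwrite(vmf->vma, vmf, myfs_get_block);
	sb_end_pagefault(inode->i_sb);
	return block_page_mkwrite_return(err);
}
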
2538 :
2539 : /*
2540 : * nobh_write_begin()'s prereads are special: the buffer_heads are freed
2541 : * immediately, while under the page lock. So it needs a special end_io
2542 : * handler which does not touch the bh after unlocking it.
2543 : */
2544 0 : static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2545 : {
2546 0 : __end_buffer_read_notouch(bh, uptodate);
2547 0 : }
2548 :
2549 : /*
2550 : * Attach the singly-linked list of buffers created by nobh_write_begin to
2551 : * the page (converting it to a circular linked list and taking care of page
2552 : * dirty races).
2553 : */
2554 0 : static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2555 : {
2556 0 : struct buffer_head *bh;
2557 :
2558 0 : BUG_ON(!PageLocked(page));
2559 :
2560 0 : spin_lock(&page->mapping->private_lock);
2561 0 : bh = head;
2562 0 : do {
2563 0 : if (PageDirty(page))
2564 0 : set_buffer_dirty(bh);
2565 0 : if (!bh->b_this_page)
2566 0 : bh->b_this_page = head;
2567 0 : bh = bh->b_this_page;
2568 0 : } while (bh != head);
2569 0 : attach_page_private(page, head);
2570 0 : spin_unlock(&page->mapping->private_lock);
2571 0 : }
2572 :
2573 : /*
2574 : * On entry, the page is fully not uptodate.
2575 : * On exit the page is fully uptodate in the areas outside (from,to)
2576 : * The filesystem needs to handle block truncation upon failure.
2577 : */
2578 0 : int nobh_write_begin(struct address_space *mapping,
2579 : loff_t pos, unsigned len, unsigned flags,
2580 : struct page **pagep, void **fsdata,
2581 : get_block_t *get_block)
2582 : {
2583 0 : struct inode *inode = mapping->host;
2584 0 : const unsigned blkbits = inode->i_blkbits;
2585 0 : const unsigned blocksize = 1 << blkbits;
2586 0 : struct buffer_head *head, *bh;
2587 0 : struct page *page;
2588 0 : pgoff_t index;
2589 0 : unsigned from, to;
2590 0 : unsigned block_in_page;
2591 0 : unsigned block_start, block_end;
2592 0 : sector_t block_in_file;
2593 0 : int nr_reads = 0;
2594 0 : int ret = 0;
2595 0 : int is_mapped_to_disk = 1;
2596 :
2597 0 : index = pos >> PAGE_SHIFT;
2598 0 : from = pos & (PAGE_SIZE - 1);
2599 0 : to = from + len;
2600 :
2601 0 : page = grab_cache_page_write_begin(mapping, index, flags);
2602 0 : if (!page)
2603 : return -ENOMEM;
2604 0 : *pagep = page;
2605 0 : *fsdata = NULL;
2606 :
2607 0 : if (page_has_buffers(page)) {
2608 0 : ret = __block_write_begin(page, pos, len, get_block);
2609 0 : if (unlikely(ret))
2610 0 : goto out_release;
2611 : return ret;
2612 : }
2613 :
2614 0 : if (PageMappedToDisk(page))
2615 : return 0;
2616 :
2617 : /*
2618 : * Allocate buffers so that we can keep track of state, and potentially
2619 : * attach them to the page if an error occurs. In the common case of
2620 : * no error, they will just be freed again without ever being attached
2621 : * to the page (which is all OK, because we're under the page lock).
2622 : *
2623 : * Be careful: the buffer linked list is a NULL terminated one, rather
2624 : * than the circular one we're used to.
2625 : */
2626 0 : head = alloc_page_buffers(page, blocksize, false);
2627 0 : if (!head) {
2628 0 : ret = -ENOMEM;
2629 0 : goto out_release;
2630 : }
2631 :
2632 0 : block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
2633 :
2634 : /*
2635 : * We loop across all blocks in the page, whether or not they are
2636 : * part of the affected region. This is so we can discover if the
2637 : * page is fully mapped-to-disk.
2638 : */
2639 0 : for (block_start = 0, block_in_page = 0, bh = head;
2640 0 : block_start < PAGE_SIZE;
2641 0 : block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2642 0 : int create;
2643 :
2644 0 : block_end = block_start + blocksize;
2645 0 : bh->b_state = 0;
2646 0 : create = 1;
2647 0 : if (block_start >= to)
2648 0 : create = 0;
2649 0 : ret = get_block(inode, block_in_file + block_in_page,
2650 : bh, create);
2651 0 : if (ret)
2652 0 : goto failed;
2653 0 : if (!buffer_mapped(bh))
2654 0 : is_mapped_to_disk = 0;
2655 0 : if (buffer_new(bh))
2656 0 : clean_bdev_bh_alias(bh);
2657 0 : if (PageUptodate(page)) {
2658 0 : set_buffer_uptodate(bh);
2659 0 : continue;
2660 : }
2661 0 : if (buffer_new(bh) || !buffer_mapped(bh)) {
2662 0 : zero_user_segments(page, block_start, from,
2663 : to, block_end);
2664 0 : continue;
2665 : }
2666 0 : if (buffer_uptodate(bh))
2667 0 : continue; /* reiserfs does this */
2668 0 : if (block_start < from || block_end > to) {
2669 0 : lock_buffer(bh);
2670 0 : bh->b_end_io = end_buffer_read_nobh;
2671 0 : submit_bh(REQ_OP_READ, 0, bh);
2672 0 : nr_reads++;
2673 : }
2674 : }
2675 :
2676 0 : if (nr_reads) {
2677 : /*
2678 : * The page is locked, so these buffers are protected from
2679 : * any VM or truncate activity. Hence we don't need to care
2680 : * for the buffer_head refcounts.
2681 : */
2682 0 : for (bh = head; bh; bh = bh->b_this_page) {
2683 0 : wait_on_buffer(bh);
2684 0 : if (!buffer_uptodate(bh))
2685 0 : ret = -EIO;
2686 : }
2687 0 : if (ret)
2688 0 : goto failed;
2689 : }
2690 :
2691 0 : if (is_mapped_to_disk)
2692 0 : SetPageMappedToDisk(page);
2693 :
2694 0 : *fsdata = head; /* to be released by nobh_write_end */
2695 :
2696 0 : return 0;
2697 :
2698 0 : failed:
2699 0 : BUG_ON(!ret);
2700 : /*
2701 : * Error recovery is a bit difficult. We need to zero out blocks that
2702 : * were newly allocated, and dirty them to ensure they get written out.
2703 : * Buffers need to be attached to the page at this point, otherwise
2704 : * the handling of potential IO errors during writeout would be hard
2705 : * (could try doing synchronous writeout, but what if that fails too?)
2706 : */
2707 0 : attach_nobh_buffers(page, head);
2708 0 : page_zero_new_buffers(page, from, to);
2709 :
2710 0 : out_release:
2711 0 : unlock_page(page);
2712 0 : put_page(page);
2713 0 : *pagep = NULL;
2714 :
2715 0 : return ret;
2716 : }
2717 : EXPORT_SYMBOL(nobh_write_begin);
2718 :
2719 0 : int nobh_write_end(struct file *file, struct address_space *mapping,
2720 : loff_t pos, unsigned len, unsigned copied,
2721 : struct page *page, void *fsdata)
2722 : {
2723 0 : struct inode *inode = page->mapping->host;
2724 0 : struct buffer_head *head = fsdata;
2725 0 : struct buffer_head *bh;
2726 0 : BUG_ON(fsdata != NULL && page_has_buffers(page));
2727 :
2728 0 : if (unlikely(copied < len) && head)
2729 0 : attach_nobh_buffers(page, head);
2730 0 : if (page_has_buffers(page))
2731 0 : return generic_write_end(file, mapping, pos, len,
2732 : copied, page, fsdata);
2733 :
2734 0 : SetPageUptodate(page);
2735 0 : set_page_dirty(page);
2736 0 : if (pos+copied > inode->i_size) {
2737 0 : i_size_write(inode, pos+copied);
2738 0 : mark_inode_dirty(inode);
2739 : }
2740 :
2741 0 : unlock_page(page);
2742 0 : put_page(page);
2743 :
2744 0 : while (head) {
2745 0 : bh = head;
2746 0 : head = head->b_this_page;
2747 0 : free_buffer_head(bh);
2748 : }
2749 :
2750 0 : return copied;
2751 : }
2752 : EXPORT_SYMBOL(nobh_write_end);
2753 :
2754 : /*
2755 : * nobh_writepage() - based on block_write_full_page() except
2756 : * that it tries to operate without attaching bufferheads to
2757 : * the page.
2758 : */
2759 0 : int nobh_writepage(struct page *page, get_block_t *get_block,
2760 : struct writeback_control *wbc)
2761 : {
2762 0 : struct inode * const inode = page->mapping->host;
2763 0 : loff_t i_size = i_size_read(inode);
2764 0 : const pgoff_t end_index = i_size >> PAGE_SHIFT;
2765 0 : unsigned offset;
2766 0 : int ret;
2767 :
2768 : /* Is the page fully inside i_size? */
2769 0 : if (page->index < end_index)
2770 0 : goto out;
2771 :
2772 : /* Is the page fully outside i_size? (truncate in progress) */
2773 0 : offset = i_size & (PAGE_SIZE-1);
2774 0 : if (page->index >= end_index+1 || !offset) {
2775 0 : unlock_page(page);
2776 0 : return 0; /* don't care */
2777 : }
2778 :
2779 : /*
2780 : * The page straddles i_size. It must be zeroed out on each and every
2781 : * writepage invocation because it may be mmapped. "A file is mapped
2782 : * in multiples of the page size. For a file that is not a multiple of
2783 : * the page size, the remaining memory is zeroed when mapped, and
2784 : * writes to that region are not written out to the file."
2785 : */
2786 0 : zero_user_segment(page, offset, PAGE_SIZE);
2787 0 : out:
2788 0 : ret = mpage_writepage(page, get_block, wbc);
2789 0 : if (ret == -EAGAIN)
2790 0 : ret = __block_write_full_page(inode, page, get_block, wbc,
2791 : end_buffer_async_write);
2792 : return ret;
2793 : }
2794 : EXPORT_SYMBOL(nobh_writepage);
2795 :
2796 0 : int nobh_truncate_page(struct address_space *mapping,
2797 : loff_t from, get_block_t *get_block)
2798 : {
2799 0 : pgoff_t index = from >> PAGE_SHIFT;
2800 0 : unsigned offset = from & (PAGE_SIZE-1);
2801 0 : unsigned blocksize;
2802 0 : sector_t iblock;
2803 0 : unsigned length, pos;
2804 0 : struct inode *inode = mapping->host;
2805 0 : struct page *page;
2806 0 : struct buffer_head map_bh;
2807 0 : int err;
2808 :
2809 0 : blocksize = i_blocksize(inode);
2810 0 : length = offset & (blocksize - 1);
2811 :
2812 : /* Block boundary? Nothing to do */
2813 0 : if (!length)
2814 : return 0;
2815 :
2816 0 : length = blocksize - length;
2817 0 : iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
2818 :
2819 0 : page = grab_cache_page(mapping, index);
2820 0 : err = -ENOMEM;
2821 0 : if (!page)
2822 0 : goto out;
2823 :
2824 0 : if (page_has_buffers(page)) {
2825 0 : has_buffers:
2826 0 : unlock_page(page);
2827 0 : put_page(page);
2828 0 : return block_truncate_page(mapping, from, get_block);
2829 : }
2830 :
2831 : /* Find the buffer that contains "offset" */
2832 : pos = blocksize;
2833 0 : while (offset >= pos) {
2834 0 : iblock++;
2835 0 : pos += blocksize;
2836 : }
2837 :
2838 0 : map_bh.b_size = blocksize;
2839 0 : map_bh.b_state = 0;
2840 0 : err = get_block(inode, iblock, &map_bh, 0);
2841 0 : if (err)
2842 0 : goto unlock;
2843 : /* unmapped? It's a hole - nothing to do */
2844 0 : if (!buffer_mapped(&map_bh))
2845 0 : goto unlock;
2846 :
2847 : /* Ok, it's mapped. Make sure it's up-to-date */
2848 0 : if (!PageUptodate(page)) {
2849 0 : err = mapping->a_ops->readpage(NULL, page);
2850 0 : if (err) {
2851 0 : put_page(page);
2852 0 : goto out;
2853 : }
2854 0 : lock_page(page);
2855 0 : if (!PageUptodate(page)) {
2856 0 : err = -EIO;
2857 0 : goto unlock;
2858 : }
2859 0 : if (page_has_buffers(page))
2860 0 : goto has_buffers;
2861 : }
2862 0 : zero_user(page, offset, length);
2863 0 : set_page_dirty(page);
2864 0 : err = 0;
2865 :
2866 0 : unlock:
2867 0 : unlock_page(page);
2868 0 : put_page(page);
2869 : out:
2870 : return err;
2871 : }
2872 : EXPORT_SYMBOL(nobh_truncate_page);
2873 :
2874 0 : int block_truncate_page(struct address_space *mapping,
2875 : loff_t from, get_block_t *get_block)
2876 : {
2877 0 : pgoff_t index = from >> PAGE_SHIFT;
2878 0 : unsigned offset = from & (PAGE_SIZE-1);
2879 0 : unsigned blocksize;
2880 0 : sector_t iblock;
2881 0 : unsigned length, pos;
2882 0 : struct inode *inode = mapping->host;
2883 0 : struct page *page;
2884 0 : struct buffer_head *bh;
2885 0 : int err;
2886 :
2887 0 : blocksize = i_blocksize(inode);
2888 0 : length = offset & (blocksize - 1);
2889 :
2890 : /* Block boundary? Nothing to do */
2891 0 : if (!length)
2892 : return 0;
2893 :
2894 0 : length = blocksize - length;
2895 0 : iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
2896 :
2897 0 : page = grab_cache_page(mapping, index);
2898 0 : err = -ENOMEM;
2899 0 : if (!page)
2900 0 : goto out;
2901 :
2902 0 : if (!page_has_buffers(page))
2903 0 : create_empty_buffers(page, blocksize, 0);
2904 :
2905 : /* Find the buffer that contains "offset" */
2906 0 : bh = page_buffers(page);
2907 0 : pos = blocksize;
2908 0 : while (offset >= pos) {
2909 0 : bh = bh->b_this_page;
2910 0 : iblock++;
2911 0 : pos += blocksize;
2912 : }
2913 :
2914 0 : err = 0;
2915 0 : if (!buffer_mapped(bh)) {
2916 0 : WARN_ON(bh->b_size != blocksize);
2917 0 : err = get_block(inode, iblock, bh, 0);
2918 0 : if (err)
2919 0 : goto unlock;
2920 : /* unmapped? It's a hole - nothing to do */
2921 0 : if (!buffer_mapped(bh))
2922 0 : goto unlock;
2923 : }
2924 :
2925 : /* Ok, it's mapped. Make sure it's up-to-date */
2926 0 : if (PageUptodate(page))
2927 0 : set_buffer_uptodate(bh);
2928 :
2929 0 : if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2930 0 : err = -EIO;
2931 0 : ll_rw_block(REQ_OP_READ, 0, 1, &bh);
2932 0 : wait_on_buffer(bh);
2933 : /* Uhhuh. Read error. Complain and punt. */
2934 0 : if (!buffer_uptodate(bh))
2935 0 : goto unlock;
2936 : }
2937 :
2938 0 : zero_user(page, offset, length);
2939 0 : mark_buffer_dirty(bh);
2940 0 : err = 0;
2941 :
2942 0 : unlock:
2943 0 : unlock_page(page);
2944 0 : put_page(page);
2945 : out:
2946 : return err;
2947 : }
2948 : EXPORT_SYMBOL(block_truncate_page);
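
For context, block_truncate_page() is what a filesystem's truncate path typically uses to zero the tail of the final, partially-used block so stale data is not exposed past the new EOF. The fragment below is a hypothetical sketch (myfs_get_block is assumed, and the exact ordering against block freeing differs per filesystem); truncate_setsize() is the real VM helper that updates i_size and drops the now-out-of-range pages.

/* Hypothetical fragment of a shrinking-truncate path. */
static int myfs_shrink(struct inode *inode, loff_t newsize)
{
	int err;

	err = block_truncate_page(inode->i_mapping, newsize, myfs_get_block);
	if (err)
		return err;
	truncate_setsize(inode, newsize);
	return 0;
}
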
2949 :
2950 : /*
2951 : * The generic ->writepage function for buffer-backed address_spaces
2952 : */
2953 872 : int block_write_full_page(struct page *page, get_block_t *get_block,
2954 : struct writeback_control *wbc)
2955 : {
2956 872 : struct inode * const inode = page->mapping->host;
2957 872 : loff_t i_size = i_size_read(inode);
2958 872 : const pgoff_t end_index = i_size >> PAGE_SHIFT;
2959 872 : unsigned offset;
2960 :
2961 : /* Is the page fully inside i_size? */
2962 872 : if (page->index < end_index)
2963 872 : return __block_write_full_page(inode, page, get_block, wbc,
2964 : end_buffer_async_write);
2965 :
2966 : /* Is the page fully outside i_size? (truncate in progress) */
2967 0 : offset = i_size & (PAGE_SIZE-1);
2968 0 : if (page->index >= end_index+1 || !offset) {
2969 0 : unlock_page(page);
2970 0 : return 0; /* don't care */
2971 : }
2972 :
2973 : /*
2974 : * The page straddles i_size. It must be zeroed out on each and every
2975 : * writepage invocation because it may be mmapped. "A file is mapped
2976 : * in multiples of the page size. For a file that is not a multiple of
2977 : * the page size, the remaining memory is zeroed when mapped, and
2978 : * writes to that region are not written out to the file."
2979 : */
2980 0 : zero_user_segment(page, offset, PAGE_SIZE);
2981 0 : return __block_write_full_page(inode, page, get_block, wbc,
2982 : end_buffer_async_write);
2983 : }
2984 : EXPORT_SYMBOL(block_write_full_page);
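
The matching ->writepage hook is equally thin; as before, myfs_get_block is a hypothetical get_block_t.

/* Hypothetical ->writepage: the i_size boundary handling is done entirely by
 * block_write_full_page() itself, as shown above. */
static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, myfs_get_block, wbc);
}
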
2985 :
2986 0 : sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2987 : get_block_t *get_block)
2988 : {
2989 0 : struct inode *inode = mapping->host;
2990 0 : struct buffer_head tmp = {
2991 0 : .b_size = i_blocksize(inode),
2992 : };
2993 :
2994 0 : get_block(inode, block, &tmp, 0);
2995 0 : return tmp.b_blocknr;
2996 : }
2997 : EXPORT_SYMBOL(generic_block_bmap);
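
The legacy FIBMAP path is usually wired up the same way (hypothetical wrapper, myfs_get_block assumed).

/* Hypothetical ->bmap: translate a file block number to a device block number
 * by probing get_block() without allocating (create == 0). */
static sector_t myfs_bmap(struct address_space *mapping, sector_t block)
{
	return generic_block_bmap(mapping, block, myfs_get_block);
}
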
2998 :
2999 6715 : static void end_bio_bh_io_sync(struct bio *bio)
3000 : {
3001 6715 : struct buffer_head *bh = bio->bi_private;
3002 :
3003 6715 : if (unlikely(bio_flagged(bio, BIO_QUIET)))
3004 0 : set_bit(BH_Quiet, &bh->b_state);
3005 :
3006 6715 : bh->b_end_io(bh, !bio->bi_status);
3007 6715 : bio_put(bio);
3008 6715 : }
3009 :
3010 6715 : static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
3011 : enum rw_hint write_hint, struct writeback_control *wbc)
3012 : {
3013 6715 : struct bio *bio;
3014 :
3015 6715 : BUG_ON(!buffer_locked(bh));
3016 6715 : BUG_ON(!buffer_mapped(bh));
3017 6715 : BUG_ON(!bh->b_end_io);
3018 6715 : BUG_ON(buffer_delay(bh));
3019 6715 : BUG_ON(buffer_unwritten(bh));
3020 :
3021 : /*
3022 : * Only clear out a write error when rewriting
3023 : */
3024 6715 : if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
3025 768 : clear_buffer_write_io_error(bh);
3026 :
3027 6715 : bio = bio_alloc(GFP_NOIO, 1);
3028 :
3029 6715 : fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
3030 :
3031 6715 : bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
3032 6715 : bio_set_dev(bio, bh->b_bdev);
3033 6715 : bio->bi_write_hint = write_hint;
3034 :
3035 6715 : bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
3036 6715 : BUG_ON(bio->bi_iter.bi_size != bh->b_size);
3037 :
3038 6715 : bio->bi_end_io = end_bio_bh_io_sync;
3039 6715 : bio->bi_private = bh;
3040 :
3041 6715 : if (buffer_meta(bh))
3042 867 : op_flags |= REQ_META;
3043 6715 : if (buffer_prio(bh))
3044 867 : op_flags |= REQ_PRIO;
3045 6715 : bio_set_op_attrs(bio, op, op_flags);
3046 :
3047 : /* Take care of bh's that straddle the end of the device */
3048 6715 : guard_bio_eod(bio);
3049 :
3050 6715 : if (wbc) {
3051 : wbc_init_bio(wbc, bio);
3052 6715 : wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
3053 : }
3054 :
3055 6715 : submit_bio(bio);
3056 6715 : return 0;
3057 : }
3058 :
3059 5848 : int submit_bh(int op, int op_flags, struct buffer_head *bh)
3060 : {
3061 5828 : return submit_bh_wbc(op, op_flags, bh, 0, NULL);
3062 : }
3063 : EXPORT_SYMBOL(submit_bh);
3064 :
3065 : /**
3066 : * ll_rw_block: low-level access to block devices (DEPRECATED)
3067 : * @op: whether to %READ or %WRITE
3068 : * @op_flags: req_flag_bits
3069 : * @nr: number of &struct buffer_heads in the array
3070 : * @bhs: array of pointers to &struct buffer_head
3071 : *
3072 : * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
3073 : * requests an I/O operation on them, either a %REQ_OP_READ or a %REQ_OP_WRITE.
3074 : * @op_flags contains flags modifying the detailed I/O behavior, most notably
3075 : * %REQ_RAHEAD.
3076 : *
3077 : * This function drops any buffer that it cannot get a lock on (with the
3078 : * BH_Lock state bit), any buffer that appears to be clean when doing a write
3079 : * request, and any buffer that appears to be up-to-date when doing a read
3080 : * request. Further it marks as clean buffers that are processed for
3081 : * writing (the buffer cache won't assume that they are actually clean
3082 : * until the buffer gets unlocked).
3083 : *
3084 : * ll_rw_block sets b_end_io to a simple completion handler that marks
3085 : * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
3086 : * any waiters.
3087 : *
3088 : * All of the buffers must be for the same device, and must also be a
3089 : * multiple of the current approved size for the device.
3090 : */
3091 9 : void ll_rw_block(int op, int op_flags, int nr, struct buffer_head *bhs[])
3092 : {
3093 9 : int i;
3094 :
3095 18 : for (i = 0; i < nr; i++) {
3096 9 : struct buffer_head *bh = bhs[i];
3097 :
3098 9 : if (!trylock_buffer(bh))
3099 0 : continue;
3100 9 : if (op == WRITE) {
3101 0 : if (test_clear_buffer_dirty(bh)) {
3102 0 : bh->b_end_io = end_buffer_write_sync;
3103 0 : get_bh(bh);
3104 0 : submit_bh(op, op_flags, bh);
3105 0 : continue;
3106 : }
3107 : } else {
3108 9 : if (!buffer_uptodate(bh)) {
3109 9 : bh->b_end_io = end_buffer_read_sync;
3110 9 : get_bh(bh);
3111 9 : submit_bh(op, op_flags, bh);
3112 9 : continue;
3113 : }
3114 : }
3115 0 : unlock_buffer(bh);
3116 : }
3117 9 : }
3118 : EXPORT_SYMBOL(ll_rw_block);
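
A typical (and deliberately fire-and-forget) use of ll_rw_block() is opportunistic read-ahead of metadata blocks. The sketch below is hypothetical - 'myfs', the superblock and the block range are assumptions - and relies only on the documented behaviour above that locked or already-uptodate buffers are simply skipped.

/*
 * Hypothetical read-ahead of up to 16 consecutive metadata blocks.
 * Buffers that are already uptodate, or that someone else holds locked,
 * are skipped by ll_rw_block(), so this never waits for I/O.
 */
static void myfs_readahead_blocks(struct super_block *sb, sector_t first,
				  int nr)
{
	struct buffer_head *bhs[16];
	int i;

	if (nr > 16)
		nr = 16;
	for (i = 0; i < nr; i++) {
		bhs[i] = sb_getblk(sb, first + i);
		if (!bhs[i]) {
			nr = i;
			break;
		}
	}
	ll_rw_block(REQ_OP_READ, REQ_RAHEAD, nr, bhs);
	for (i = 0; i < nr; i++)
		brelse(bhs[i]);
}
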
3119 :
3120 10 : void write_dirty_buffer(struct buffer_head *bh, int op_flags)
3121 : {
3122 10 : lock_buffer(bh);
3123 10 : if (!test_clear_buffer_dirty(bh)) {
3124 0 : unlock_buffer(bh);
3125 0 : return;
3126 : }
3127 10 : bh->b_end_io = end_buffer_write_sync;
3128 10 : get_bh(bh);
3129 10 : submit_bh(REQ_OP_WRITE, op_flags, bh);
3130 : }
3131 : EXPORT_SYMBOL(write_dirty_buffer);
3132 :
3133 : /*
3134 : * For a data-integrity writeout, we need to wait upon any in-progress I/O,
3135 : * then start new I/O and wait upon it. The caller must have a ref on
3136 : * the buffer_head.
3137 : */
3138 1 : int __sync_dirty_buffer(struct buffer_head *bh, int op_flags)
3139 : {
3140 1 : int ret = 0;
3141 :
3142 1 : WARN_ON(atomic_read(&bh->b_count) < 1);
3143 1 : lock_buffer(bh);
3144 1 : if (test_clear_buffer_dirty(bh)) {
3145 : /*
3146 : * The bh should be mapped, but it might not be if the
3147 : * device was hot-removed. Not much we can do but fail the I/O.
3148 : */
3149 1 : if (!buffer_mapped(bh)) {
3150 0 : unlock_buffer(bh);
3151 0 : return -EIO;
3152 : }
3153 :
3154 1 : get_bh(bh);
3155 1 : bh->b_end_io = end_buffer_write_sync;
3156 1 : ret = submit_bh(REQ_OP_WRITE, op_flags, bh);
3157 1 : wait_on_buffer(bh);
3158 2 : if (!ret && !buffer_uptodate(bh))
3159 0 : ret = -EIO;
3160 : } else {
3161 0 : unlock_buffer(bh);
3162 : }
3163 : return ret;
3164 : }
3165 : EXPORT_SYMBOL(__sync_dirty_buffer);
3166 :
3167 0 : int sync_dirty_buffer(struct buffer_head *bh)
3168 : {
3169 0 : return __sync_dirty_buffer(bh, REQ_SYNC);
3170 : }
3171 : EXPORT_SYMBOL(sync_dirty_buffer);
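
A common caller pattern is synchronously flushing a single metadata buffer such as an on-disk superblock. A minimal hypothetical example ('sbh' is assumed to be the filesystem's superblock buffer_head, held with a reference as required above):

/* Hypothetical: write the superblock buffer and wait for it. A non-zero
 * return means the I/O failed or the buffer is no longer uptodate. */
static int myfs_commit_super(struct buffer_head *sbh)
{
	mark_buffer_dirty(sbh);
	return sync_dirty_buffer(sbh);
}
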
3172 :
3173 : /*
3174 : * try_to_free_buffers() checks if all the buffers on this particular page
3175 : * are unused, and releases them if so.
3176 : *
3177 : * Exclusion against try_to_free_buffers may be obtained by either
3178 : * locking the page or by holding its mapping's private_lock.
3179 : *
3180 : * If the page is dirty but all the buffers are clean then we need to
3181 : * be sure to mark the page clean as well. This is because the page
3182 : * may be against a block device, and a later reattachment of buffers
3183 : * to a dirty page will set *all* buffers dirty. Which would corrupt
3184 : * filesystem data on the same device.
3185 : *
3186 : * The same applies to regular filesystem pages: if all the buffers are
3187 : * clean then we set the page clean and proceed. To do that, we require
3188 : * total exclusion from __set_page_dirty_buffers(). That is obtained with
3189 : * private_lock.
3190 : *
3191 : * try_to_free_buffers() is non-blocking.
3192 : */
3193 503 : static inline int buffer_busy(struct buffer_head *bh)
3194 : {
3195 1006 : return atomic_read(&bh->b_count) |
3196 503 : (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3197 : }
3198 :
3199 : static int
3200 499 : drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3201 : {
3202 499 : struct buffer_head *head = page_buffers(page);
3203 499 : struct buffer_head *bh;
3204 :
3205 499 : bh = head;
3206 503 : do {
3207 503 : if (buffer_busy(bh))
3208 1 : goto failed;
3209 502 : bh = bh->b_this_page;
3210 502 : } while (bh != head);
3211 :
3212 501 : do {
3213 501 : struct buffer_head *next = bh->b_this_page;
3214 :
3215 501 : if (bh->b_assoc_map)
3216 0 : __remove_assoc_queue(bh);
3217 501 : bh = next;
3218 501 : } while (bh != head);
3219 498 : *buffers_to_free = head;
3220 498 : detach_page_private(page);
3221 498 : return 1;
3222 1 : failed:
3223 1 : return 0;
3224 : }
3225 :
3226 499 : int try_to_free_buffers(struct page *page)
3227 : {
3228 499 : struct address_space * const mapping = page->mapping;
3229 499 : struct buffer_head *buffers_to_free = NULL;
3230 499 : int ret = 0;
3231 :
3232 998 : BUG_ON(!PageLocked(page));
3233 998 : if (PageWriteback(page))
3234 : return 0;
3235 :
3236 499 : if (mapping == NULL) { /* can this still happen? */
3237 0 : ret = drop_buffers(page, &buffers_to_free);
3238 0 : goto out;
3239 : }
3240 :
3241 499 : spin_lock(&mapping->private_lock);
3242 499 : ret = drop_buffers(page, &buffers_to_free);
3243 :
3244 : /*
3245 : * If the filesystem writes its buffers by hand (eg ext3)
3246 : * then we can have clean buffers against a dirty page. We
3247 : * clean the page here; otherwise the VM will never notice
3248 : * that the filesystem did any IO at all.
3249 : *
3250 : * Also, during truncate, discard_buffer will have marked all
3251 : * the page's buffers clean. We discover that here and clean
3252 : * the page also.
3253 : *
3254 : * private_lock must be held over this entire operation in order
3255 : * to synchronise against __set_page_dirty_buffers and prevent the
3256 : * dirty bit from being lost.
3257 : */
3258 499 : if (ret)
3259 498 : cancel_dirty_page(page);
3260 499 : spin_unlock(&mapping->private_lock);
3261 499 : out:
3262 499 : if (buffers_to_free) {
3263 : struct buffer_head *bh = buffers_to_free;
3264 :
3265 501 : do {
3266 501 : struct buffer_head *next = bh->b_this_page;
3267 501 : free_buffer_head(bh);
3268 501 : bh = next;
3269 501 : } while (bh != buffers_to_free);
3270 : }
3271 : return ret;
3272 : }
3273 : EXPORT_SYMBOL(try_to_free_buffers);
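
Many buffer-head based filesystems expose this directly as their ->releasepage hook; a hypothetical wiring (filesystems with extra per-page state, e.g. journalled ones, do more work first):

/* Hypothetical ->releasepage: let the VM drop clean, unused buffers. The
 * gfp mask is ignored here because try_to_free_buffers() never allocates. */
static int myfs_releasepage(struct page *page, gfp_t gfp)
{
	return try_to_free_buffers(page);
}
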
3274 :
3275 : /*
3276 : * There are no bdflush tunables left. But distributions are
3277 : * still running obsolete flush daemons, so we terminate them here.
3278 : *
3279 : * Use of bdflush() is deprecated and will be removed in a future kernel.
3280 : * The `flush-X' kernel threads fully replace bdflush daemons and this call.
3281 : */
3282 0 : SYSCALL_DEFINE2(bdflush, int, func, long, data)
3283 : {
3284 0 : static int msg_count;
3285 :
3286 0 : if (!capable(CAP_SYS_ADMIN))
3287 : return -EPERM;
3288 :
3289 0 : if (msg_count < 5) {
3290 0 : msg_count++;
3291 0 : printk(KERN_INFO
3292 : "warning: process `%s' used the obsolete bdflush"
3293 0 : " system call\n", current->comm);
3294 0 : printk(KERN_INFO "Fix your initscripts?\n");
3295 : }
3296 :
3297 0 : if (func == 1)
3298 0 : do_exit(0);
3299 : return 0;
3300 : }
3301 :
3302 : /*
3303 : * Buffer-head allocation
3304 : */
3305 : static struct kmem_cache *bh_cachep __read_mostly;
3306 :
3307 : /*
3308 : * Once the number of bh's in the machine exceeds this level, we start
3309 : * stripping them in writeback.
3310 : */
3311 : static unsigned long max_buffer_heads;
3312 :
3313 : int buffer_heads_over_limit;
3314 :
3315 : struct bh_accounting {
3316 : int nr; /* Number of live bh's */
3317 : int ratelimit; /* Limit cacheline bouncing */
3318 : };
3319 :
3320 : static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3321 :
3322 9971 : static void recalc_bh_state(void)
3323 : {
3324 9971 : int i;
3325 9971 : int tot = 0;
3326 :
3327 9971 : if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
3328 : return;
3329 0 : __this_cpu_write(bh_accounting.ratelimit, 0);
3330 0 : for_each_online_cpu(i)
3331 0 : tot += per_cpu(bh_accounting, i).nr;
3332 0 : buffer_heads_over_limit = (tot > max_buffer_heads);
3333 : }
3334 :
3335 7659 : struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3336 : {
3337 7659 : struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3338 7659 : if (ret) {
3339 7659 : INIT_LIST_HEAD(&ret->b_assoc_buffers);
3340 7659 : spin_lock_init(&ret->b_uptodate_lock);
3341 7659 : preempt_disable();
3342 7659 : __this_cpu_inc(bh_accounting.nr);
3343 7659 : recalc_bh_state();
3344 7659 : preempt_enable();
3345 : }
3346 7659 : return ret;
3347 : }
3348 : EXPORT_SYMBOL(alloc_buffer_head);
3349 :
3350 2312 : void free_buffer_head(struct buffer_head *bh)
3351 : {
3352 2312 : BUG_ON(!list_empty(&bh->b_assoc_buffers));
3353 2312 : kmem_cache_free(bh_cachep, bh);
3354 2312 : preempt_disable();
3355 2312 : __this_cpu_dec(bh_accounting.nr);
3356 2312 : recalc_bh_state();
3357 2312 : preempt_enable();
3358 2312 : }
3359 : EXPORT_SYMBOL(free_buffer_head);
3360 :
3361 0 : static int buffer_exit_cpu_dead(unsigned int cpu)
3362 : {
3363 0 : int i;
3364 0 : struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3365 :
3366 0 : for (i = 0; i < BH_LRU_SIZE; i++) {
3367 0 : brelse(b->bhs[i]);
3368 0 : b->bhs[i] = NULL;
3369 : }
3370 0 : this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3371 0 : per_cpu(bh_accounting, cpu).nr = 0;
3372 0 : return 0;
3373 : }
3374 :
3375 : /**
3376 : * bh_uptodate_or_lock - Test whether the buffer is uptodate
3377 : * @bh: struct buffer_head
3378 : *
3379 : * Returns 1 if the buffer is up-to-date, or 0 with the buffer
3380 : * locked if it is not.
3381 : */
3382 54 : int bh_uptodate_or_lock(struct buffer_head *bh)
3383 : {
3384 54 : if (!buffer_uptodate(bh)) {
3385 26 : lock_buffer(bh);
3386 26 : if (!buffer_uptodate(bh))
3387 : return 0;
3388 0 : unlock_buffer(bh);
3389 : }
3390 : return 1;
3391 : }
3392 : EXPORT_SYMBOL(bh_uptodate_or_lock);
3393 :
3394 : /**
3395 : * bh_submit_read - Submit a locked buffer for reading
3396 : * @bh: struct buffer_head
3397 : *
3398 : * Returns zero on success and -EIO on error.
3399 : */
3400 0 : int bh_submit_read(struct buffer_head *bh)
3401 : {
3402 0 : BUG_ON(!buffer_locked(bh));
3403 :
3404 0 : if (buffer_uptodate(bh)) {
3405 0 : unlock_buffer(bh);
3406 0 : return 0;
3407 : }
3408 :
3409 0 : get_bh(bh);
3410 0 : bh->b_end_io = end_buffer_read_sync;
3411 0 : submit_bh(REQ_OP_READ, 0, bh);
3412 0 : wait_on_buffer(bh);
3413 0 : if (buffer_uptodate(bh))
3414 0 : return 0;
3415 : return -EIO;
3416 : }
3417 : EXPORT_SYMBOL(bh_submit_read);
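
The two helpers above are designed to be combined: check (and lock) first, then submit and wait only when needed. A hypothetical synchronous metadata read:

/* Hypothetical synchronous read of one metadata buffer. If the buffer is
 * already uptodate we return immediately; otherwise bh_uptodate_or_lock()
 * leaves it locked, which is exactly what bh_submit_read() expects. */
static int myfs_read_buffer(struct buffer_head *bh)
{
	if (bh_uptodate_or_lock(bh))
		return 0;
	return bh_submit_read(bh);
}
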
3418 :
3419 1 : void __init buffer_init(void)
3420 : {
3421 1 : unsigned long nrpages;
3422 1 : int ret;
3423 :
3424 1 : bh_cachep = kmem_cache_create("buffer_head",
3425 : sizeof(struct buffer_head), 0,
3426 : (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3427 : SLAB_MEM_SPREAD),
3428 : NULL);
3429 :
3430 : /*
3431 : * Limit the bh occupancy to 10% of ZONE_NORMAL
3432 : */
3433 1 : nrpages = (nr_free_buffer_pages() * 10) / 100;
3434 1 : max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3435 1 : ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
3436 : NULL, buffer_exit_cpu_dead);
3437 1 : WARN_ON(ret < 0);
3438 1 : }
|