Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * mm/readahead.c - address_space-level file readahead.
4 : *
5 : * Copyright (C) 2002, Linus Torvalds
6 : *
7 : * 09Apr2002 Andrew Morton
8 : * Initial version.
9 : */
10 :
11 : #include <linux/kernel.h>
12 : #include <linux/dax.h>
13 : #include <linux/gfp.h>
14 : #include <linux/export.h>
15 : #include <linux/blkdev.h>
16 : #include <linux/backing-dev.h>
17 : #include <linux/task_io_accounting_ops.h>
18 : #include <linux/pagevec.h>
19 : #include <linux/pagemap.h>
20 : #include <linux/syscalls.h>
21 : #include <linux/file.h>
22 : #include <linux/mm_inline.h>
23 : #include <linux/blk-cgroup.h>
24 : #include <linux/fadvise.h>
25 : #include <linux/sched/mm.h>
26 :
27 : #include "internal.h"
28 :
29 : /*
30 : * Initialise a struct file's readahead state. Assumes that the caller has
31 : * memset *ra to zero.
32 : */
33 : void
34 14509 : file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
35 : {
36 14509 : ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
37 14509 : ra->prev_pos = -1;
38 14509 : }
39 : EXPORT_SYMBOL_GPL(file_ra_state_init);
40 :
41 : /*
42 : * See if a page needs releasing upon read_cache_pages() failure.
43 : * The caller of read_cache_pages() may have set PG_private or PG_fscache
44 : * before calling (such as NFS marking pages that are cached locally
45 : * on disk), so we need to give the filesystem a chance to clean up in
46 : * the event of an error.
47 : */
48 0 : static void read_cache_pages_invalidate_page(struct address_space *mapping,
49 : struct page *page)
50 : {
51 0 : if (page_has_private(page)) {
52 0 : if (!trylock_page(page))
53 0 : BUG();
54 0 : page->mapping = mapping;
55 0 : do_invalidatepage(page, 0, PAGE_SIZE);
56 0 : page->mapping = NULL;
57 0 : unlock_page(page);
58 : }
59 0 : put_page(page);
60 0 : }
61 :
62 : /*
63 : * release a list of pages, invalidating them first if need be
64 : */
65 0 : static void read_cache_pages_invalidate_pages(struct address_space *mapping,
66 : struct list_head *pages)
67 : {
68 0 : struct page *victim;
69 :
70 0 : while (!list_empty(pages)) {
71 0 : victim = lru_to_page(pages);
72 0 : list_del(&victim->lru);
73 0 : read_cache_pages_invalidate_page(mapping, victim);
74 : }
75 0 : }
76 :
77 : /**
78 : * read_cache_pages - populate an address space with some pages & start reads against them
79 : * @mapping: the address_space
80 : * @pages: The address of a list_head which contains the target pages. These
81 : * pages have their ->index populated and are otherwise uninitialised.
82 : * @filler: callback routine for filling a single page.
83 : * @data: private data for the callback routine.
84 : *
85 : * Hides the details of the LRU cache etc from the filesystems.
86 : *
87 : * Returns: %0 on success, or the error returned by @filler otherwise
88 : */
89 0 : int read_cache_pages(struct address_space *mapping, struct list_head *pages,
90 : int (*filler)(void *, struct page *), void *data)
91 : {
92 0 : struct page *page;
93 0 : int ret = 0;
94 :
95 0 : while (!list_empty(pages)) {
96 0 : page = lru_to_page(pages);
97 0 : list_del(&page->lru);
98 0 : if (add_to_page_cache_lru(page, mapping, page->index,
99 : readahead_gfp_mask(mapping))) {
100 0 : read_cache_pages_invalidate_page(mapping, page);
101 0 : continue;
102 : }
103 0 : put_page(page);
104 :
105 0 : ret = filler(data, page);
106 0 : if (unlikely(ret)) {
107 0 : read_cache_pages_invalidate_pages(mapping, pages);
108 0 : break;
109 : }
110 0 : task_io_account_read(PAGE_SIZE);
111 : }
112 0 : return ret;
113 : }
114 :
115 : EXPORT_SYMBOL(read_cache_pages);
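/*
 * Illustrative sketch, not part of this file: how a filesystem's legacy
 * ->readpages() implementation might drive read_cache_pages(). The names
 * my_fs_readpages(), my_fs_fill_page() and my_fs_read_one_page() are
 * hypothetical. The filler is called once per page that was successfully
 * added to the page cache; it must return 0 on success or an error to
 * abort the remainder of the batch.
 */
static int my_fs_fill_page(void *data, struct page *page)
{
	struct file *file = data;

	/* Issue the read for the locked @page; it is unlocked on completion. */
	return my_fs_read_one_page(file, page);	/* hypothetical helper */
}

static int my_fs_readpages(struct file *file, struct address_space *mapping,
			   struct list_head *pages, unsigned int nr_pages)
{
	return read_cache_pages(mapping, pages, my_fs_fill_page, file);
}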
116 :
117 6942 : static void read_pages(struct readahead_control *rac, struct list_head *pages,
118 : bool skip_page)
119 : {
120 6942 : const struct address_space_operations *aops = rac->mapping->a_ops;
121 6942 : struct page *page;
122 6942 : struct blk_plug plug;
123 :
124 6942 : if (!readahead_count(rac))
125 5342 : goto out;
126 :
127 1600 : blk_start_plug(&plug);
128 :
129 1600 : if (aops->readahead) {
130 1600 : aops->readahead(rac);
131 : /* Clean up the remaining pages */
132 1600 : while ((page = readahead_page(rac))) {
133 0 : unlock_page(page);
134 0 : put_page(page);
135 : }
136 0 : } else if (aops->readpages) {
137 0 : aops->readpages(rac->file, rac->mapping, pages,
138 : readahead_count(rac));
139 : /* Clean up the remaining pages */
140 0 : put_pages_list(pages);
141 0 : rac->_index += rac->_nr_pages;
142 0 : rac->_nr_pages = 0;
143 : } else {
144 0 : while ((page = readahead_page(rac))) {
145 0 : aops->readpage(rac->file, page);
146 0 : put_page(page);
147 : }
148 : }
149 :
150 1600 : blk_finish_plug(&plug);
151 :
152 1600 : BUG_ON(!list_empty(pages));
153 1600 : BUG_ON(readahead_count(rac));
154 :
155 1600 : out:
156 6942 : if (skip_page)
157 5286 : rac->_index++;
158 6942 : }
159 :
160 : /**
161 : * page_cache_ra_unbounded - Start unchecked readahead.
162 : * @ractl: Readahead control.
163 : * @nr_to_read: The number of pages to read.
164 : * @lookahead_size: Distance from the end of this read at which to set the PG_readahead marker.
165 : *
166 : * This function is for filesystems to call when they want to start
167 : * readahead beyond a file's stated i_size. This is almost certainly
168 : * not the function you want to call. Use page_cache_async_readahead()
169 : * or page_cache_sync_readahead() instead.
170 : *
171 : * Context: File is referenced by caller. Mutexes may be held by caller.
172 : * May sleep, but will not reenter filesystem to reclaim memory.
173 : */
174 1656 : void page_cache_ra_unbounded(struct readahead_control *ractl,
175 : unsigned long nr_to_read, unsigned long lookahead_size)
176 : {
177 1656 : struct address_space *mapping = ractl->mapping;
178 1656 : unsigned long index = readahead_index(ractl);
179 1656 : LIST_HEAD(page_pool);
180 1656 : gfp_t gfp_mask = readahead_gfp_mask(mapping);
181 1656 : unsigned long i;
182 :
183 : /*
184 : * Partway through the readahead operation, we will have added
185 : * locked pages to the page cache, but will not yet have submitted
186 : * them for I/O. Adding another page may need to allocate memory,
187 : * which can trigger memory reclaim. Telling the VM we're in
188 : * the middle of a filesystem operation will cause it to not
189 : * touch file-backed pages, preventing a deadlock. Most (all?)
190 : * filesystems already specify __GFP_NOFS in their mapping's
191 : * gfp_mask, but let's be explicit here.
192 : */
193 1656 : unsigned int nofs = memalloc_nofs_save();
194 :
195 : /*
196 : * Preallocate as many pages as we will need.
197 : */
198 28497 : for (i = 0; i < nr_to_read; i++) {
199 26841 : struct page *page = xa_load(&mapping->i_pages, index + i);
200 :
201 26841 : BUG_ON(index + i != ractl->_index + ractl->_nr_pages);
202 :
203 26841 : if (page && !xa_is_value(page)) {
204 : /*
205 : * Page already present? Kick off the current batch
206 : * of contiguous pages before continuing with the
207 : * next batch. This page may be the one we would
208 : * have intended to mark as Readahead, but we don't
209 : * have a stable reference to this page, and it's
210 : * not worth getting one just for that.
211 : */
212 5286 : read_pages(ractl, &page_pool, true);
213 5286 : continue;
214 : }
215 :
216 21555 : page = __page_cache_alloc(gfp_mask);
217 21555 : if (!page)
218 : break;
219 21555 : if (mapping->a_ops->readpages) {
220 0 : page->index = index + i;
221 0 : list_add(&page->lru, &page_pool);
222 21555 : } else if (add_to_page_cache_lru(page, mapping, index + i,
223 : gfp_mask) < 0) {
224 0 : put_page(page);
225 0 : read_pages(ractl, &page_pool, true);
226 0 : continue;
227 : }
228 21555 : if (i == nr_to_read - lookahead_size)
229 1002 : SetPageReadahead(page);
230 21555 : ractl->_nr_pages++;
231 : }
232 :
233 : /*
234 : * Now start the IO. We ignore I/O errors - if the page is not
235 : * uptodate then the caller will launch readpage again, and
236 : * will then handle the error.
237 : */
238 1656 : read_pages(ractl, &page_pool, false);
239 1656 : memalloc_nofs_restore(nofs);
240 1656 : }
241 : EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);
242 :
243 : /*
244 : * do_page_cache_ra() actually reads a chunk of disk. It allocates
245 : * the pages first, then submits them for I/O. This avoids the very bad
246 : * behaviour which would occur if page allocations caused VM writeback.
247 : * We really don't want to intermingle reads and writes like that.
248 : */
249 1813 : void do_page_cache_ra(struct readahead_control *ractl,
250 : unsigned long nr_to_read, unsigned long lookahead_size)
251 : {
252 1813 : struct inode *inode = ractl->mapping->host;
253 1813 : unsigned long index = readahead_index(ractl);
254 1813 : loff_t isize = i_size_read(inode);
255 1813 : pgoff_t end_index; /* The last page we want to read */
256 :
257 1813 : if (isize == 0)
258 : return;
259 :
260 1812 : end_index = (isize - 1) >> PAGE_SHIFT;
261 1812 : if (index > end_index)
262 : return;
263 : /* Don't read past the page containing the last byte of the file */
264 1656 : if (nr_to_read > end_index - index)
265 642 : nr_to_read = end_index - index + 1;
266 :
267 1656 : page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
268 : }
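/*
 * Illustrative arithmetic, not part of the original file: with 4 KiB pages
 * (PAGE_SHIFT == 12) and i_size == 10000 bytes, end_index above is
 * (10000 - 1) >> 12 == 2. A request for 8 pages starting at index 1 is
 * therefore trimmed to end_index - index + 1 == 2 pages, so readahead never
 * reads past the page containing the last byte of the file.
 */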
269 :
270 : /*
271 : * Chunk the readahead into 2 megabyte units, so that we don't pin too much
272 : * memory at once.
273 : */
274 50 : void force_page_cache_ra(struct readahead_control *ractl,
275 : struct file_ra_state *ra, unsigned long nr_to_read)
276 : {
277 50 : struct address_space *mapping = ractl->mapping;
278 50 : struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
279 50 : unsigned long max_pages, index;
280 :
281 50 : if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages &&
282 : !mapping->a_ops->readahead))
283 : return;
284 :
285 : /*
286 : * If the request exceeds the readahead window, allow the read to
287 : * be up to the optimal hardware IO size
288 : */
289 50 : index = readahead_index(ractl);
290 50 : max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
291 50 : nr_to_read = min_t(unsigned long, nr_to_read, max_pages);
292 100 : while (nr_to_read) {
293 50 : unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;
294 :
295 50 : if (this_chunk > nr_to_read)
296 50 : this_chunk = nr_to_read;
297 50 : ractl->_index = index;
298 50 : do_page_cache_ra(ractl, this_chunk, 0);
299 :
300 50 : index += this_chunk;
301 50 : nr_to_read -= this_chunk;
302 : }
303 : }
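/*
 * Illustrative arithmetic, not part of the original file: with 4 KiB pages
 * each chunk above is (2 * 1024 * 1024) / 4096 == 512 pages. Assuming the
 * clamp to max(bdi->io_pages, ra->ra_pages) still allows 2560 pages, a
 * 10 MiB forced readahead is issued as five 512-page calls to
 * do_page_cache_ra().
 */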
304 :
305 : /*
306 : * Set the initial window size: round the request up to the next power of 2,
307 : * then quadruple it for small requests (<= max/32), double it for medium
308 : * ones (<= max/4), and clamp it to the maximum otherwise.
309 : * For a 128k (32 page) max ra, requests larger than 8 pages start at 128k.
310 : */
311 611 : static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
312 : {
313 611 : unsigned long newsize = roundup_pow_of_two(size);
314 :
315 611 : if (newsize <= max / 32)
316 540 : newsize = newsize * 4;
317 71 : else if (newsize <= max / 4)
318 56 : newsize = newsize * 2;
319 : else
320 : newsize = max;
321 :
322 611 : return newsize;
323 : }
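/*
 * Illustrative arithmetic, not part of the original file: with a 32-page
 * (128 KiB) maximum window, max/32 == 1 and max/4 == 8, so the function
 * above yields:
 *
 *	request (pages)	rounded up	initial window (pages)
 *	1		1		4	(quadrupled)
 *	4		4		8	(doubled)
 *	8		8		16	(doubled)
 *	12		16		32	(clamped to max)
 */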
324 :
325 : /*
326 : * Get the previous window size, ramp it up, and
327 : * return it as the new window size.
328 : */
329 810 : static unsigned long get_next_ra_size(struct file_ra_state *ra,
330 : unsigned long max)
331 : {
332 810 : unsigned long cur = ra->size;
333 :
334 810 : if (cur < max / 16)
335 0 : return 4 * cur;
336 810 : if (cur <= max / 2)
337 251 : return 2 * cur;
338 : return max;
339 : }
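/*
 * Illustrative arithmetic, not part of the original file: with max == 32,
 * a previous window of 1 page grows to 4, a window of 8 grows to 16, a
 * window of 16 grows to 32, and anything already larger than max/2 jumps
 * straight to the 32-page maximum.
 */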
340 :
341 : /*
342 : * On-demand readahead design.
343 : *
344 : * The fields in struct file_ra_state represent the most-recently-executed
345 : * readahead attempt:
346 : *
347 : *                        |<----- async_size ---------|
348 : *     |------------------- size -------------------->|
349 : *     |==================#===========================|
350 : *     ^start             ^page marked with PG_readahead
351 : *
352 : * To overlap application thinking time and disk I/O time, we do
353 : * `readahead pipelining': Do not wait until the application consumed all
354 : * readahead pages and stalled on the missing page at readahead_index;
355 : * Instead, submit an asynchronous readahead I/O as soon as there are
356 : * only async_size pages left in the readahead window. Normally async_size
357 : * will be equal to size, for maximum pipelining.
358 : *
359 : * In interleaved sequential reads, concurrent streams on the same fd can
360 : * invalidate each other's readahead state. So we flag the new readahead
361 : * page at (start+size-async_size) with PG_readahead, and use it as readahead
362 : * indicator. The flag won't be set on already cached pages, to avoid the
363 : * readahead-for-nothing fuss, saving pointless page cache lookups.
364 : *
365 : * prev_pos tracks the last visited byte in the _previous_ read request.
366 : * It should be maintained by the caller, and will be used for detecting
367 : * small random reads. Note that the readahead algorithm checks loosely
368 : * for sequential patterns. Hence interleaved reads might be served as
369 : * sequential ones.
370 : *
371 : * There is a special case: if the first page which the application tries to
372 : * read happens to be the first page of the file, it is assumed that a linear
373 : * read is about to happen and the window is immediately set to the initial size
374 : * based on I/O request size and the max_readahead.
375 : *
376 : * The code ramps up the readahead size aggressively at first, but slows down as
377 : * it approaches max_readahead.
378 : */
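/*
 * Illustrative walk-through, not part of the original file, assuming
 * ra_pages == 32 and a cold cache: a sequential reader asks for 4 pages at
 * index 0. ondemand_readahead() takes the initial_readahead path, giving
 * start = 0, size = get_init_ra_size(4, 32) = 8 and async_size = 8 - 4 = 4,
 * and page 4 (start + size - async_size) is marked PG_readahead. When the
 * reader later touches page 4, page_cache_async_ra() calls back in with
 * index == start + size - async_size, which is treated as sequential
 * access: start becomes 8, size ramps up via get_next_ra_size() to 16 with
 * async_size = 16, and page 8 is marked PG_readahead, keeping the I/O one
 * full window ahead of the application.
 */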
379 :
380 : /*
381 : * Count contiguously cached pages from @index-1 to @index-@max;
382 : * this count is a conservative estimate of
383 : * - the length of the sequential read sequence, or
384 : * - the thrashing threshold in memory-tight systems
385 : */
386 1 : static pgoff_t count_history_pages(struct address_space *mapping,
387 : pgoff_t index, unsigned long max)
388 : {
389 1 : pgoff_t head;
390 :
391 1 : rcu_read_lock();
392 1 : head = page_cache_prev_miss(mapping, index - 1, max);
393 1 : rcu_read_unlock();
394 :
395 1 : return index - 1 - head;
396 : }
397 :
398 : /*
399 : * page cache context based read-ahead
400 : */
401 1 : static int try_context_readahead(struct address_space *mapping,
402 : struct file_ra_state *ra,
403 : pgoff_t index,
404 : unsigned long req_size,
405 : unsigned long max)
406 : {
407 1 : pgoff_t size;
408 :
409 1 : size = count_history_pages(mapping, index, max);
410 :
411 : /*
412 : * not enough history pages:
413 : * it could be a random read
414 : */
415 1 : if (size <= req_size)
416 : return 0;
417 :
418 : /*
419 : * starts from the beginning of the file:
420 : * a strong indication of a long-running stream (or a whole-file read)
421 : */
422 0 : if (size >= index)
423 0 : size *= 2;
424 :
425 0 : ra->start = index;
426 0 : ra->size = min(size + req_size, max);
427 0 : ra->async_size = 1;
428 :
429 0 : return 1;
430 : }
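/*
 * Illustrative example, not part of the original file: suppose pages 1-5
 * are cached, page 0 is not, and a read of req_size == 2 arrives at
 * index == 6 with max == 32. count_history_pages() finds the previous miss
 * at 0, so size == 6 - 1 - 0 == 5. Since 5 > req_size the access is not
 * treated as random, and since 5 < index the size is not doubled; the new
 * window becomes start = 6, size = min(5 + 2, 32) = 7, async_size = 1.
 */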
431 :
432 : /*
433 : * A minimal readahead algorithm for trivial sequential/random reads.
434 : */
435 1421 : static void ondemand_readahead(struct readahead_control *ractl,
436 : struct file_ra_state *ra, bool hit_readahead_marker,
437 : unsigned long req_size)
438 : {
439 1421 : struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
440 1421 : unsigned long max_pages = ra->ra_pages;
441 1421 : unsigned long add_pages;
442 1421 : unsigned long index = readahead_index(ractl);
443 1421 : pgoff_t prev_index;
444 :
445 : /*
446 : * If the request exceeds the readahead window, allow the read to
447 : * be up to the optimal hardware IO size
448 : */
449 1421 : if (req_size > max_pages && bdi->io_pages > max_pages)
450 0 : max_pages = min(req_size, bdi->io_pages);
451 :
452 : /*
453 : * start of file
454 : */
455 1421 : if (!index)
456 584 : goto initial_readahead;
457 :
458 : /*
459 : * It's the expected callback index, assume sequential access.
460 : * Ramp up sizes, and push forward the readahead window.
461 : */
462 837 : if ((index == (ra->start + ra->size - ra->async_size) ||
463 : index == (ra->start + ra->size))) {
464 553 : ra->start += ra->size;
465 553 : ra->size = get_next_ra_size(ra, max_pages);
466 553 : ra->async_size = ra->size;
467 553 : goto readit;
468 : }
469 :
470 : /*
471 : * Hit a marked page without valid readahead state.
472 : * E.g. interleaved reads.
473 : * Query the pagecache for async_size, which normally equals the
474 : * readahead size. Ramp it up and use it as the new readahead size.
475 : */
476 284 : if (hit_readahead_marker) {
477 256 : pgoff_t start;
478 :
479 256 : rcu_read_lock();
480 256 : start = page_cache_next_miss(ractl->mapping, index + 1,
481 : max_pages);
482 256 : rcu_read_unlock();
483 :
484 256 : if (!start || start - index > max_pages)
485 : return;
486 :
487 256 : ra->start = start;
488 256 : ra->size = start - index; /* old async_size */
489 256 : ra->size += req_size;
490 256 : ra->size = get_next_ra_size(ra, max_pages);
491 256 : ra->async_size = ra->size;
492 256 : goto readit;
493 : }
494 :
495 : /*
496 : * oversize read
497 : */
498 28 : if (req_size > max_pages)
499 0 : goto initial_readahead;
500 :
501 : /*
502 : * sequential cache miss
503 : * trivial case: (index - prev_index) == 1
504 : * unaligned reads: (index - prev_index) == 0
505 : */
506 28 : prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
507 28 : if (index - prev_index <= 1UL)
508 27 : goto initial_readahead;
509 :
510 : /*
511 : * Query the page cache and look for the traces (cached history pages)
512 : * that a sequential stream would leave behind.
513 : */
514 1 : if (try_context_readahead(ractl->mapping, ra, index, req_size,
515 : max_pages))
516 0 : goto readit;
517 :
518 : /*
519 : * standalone, small random read
520 : * Read as is, and do not pollute the readahead state.
521 : */
522 1 : do_page_cache_ra(ractl, req_size, 0);
523 1 : return;
524 :
525 611 : initial_readahead:
526 611 : ra->start = index;
527 611 : ra->size = get_init_ra_size(req_size, max_pages);
528 611 : ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
529 :
530 1420 : readit:
531 : /*
532 : * Will this read hit the readahead marker made by itself?
533 : * If so, trigger the readahead marker hit now, and merge
534 : * the resulting next readahead window into the current one.
535 : * Take care of maximum IO pages as above.
536 : */
537 1420 : if (index == ra->start && ra->size == ra->async_size) {
538 1 : add_pages = get_next_ra_size(ra, max_pages);
539 1 : if (ra->size + add_pages <= max_pages) {
540 0 : ra->async_size = add_pages;
541 0 : ra->size += add_pages;
542 : } else {
543 1 : ra->size = max_pages;
544 1 : ra->async_size = max_pages >> 1;
545 : }
546 : }
547 :
548 1420 : ractl->_index = ra->start;
549 1420 : do_page_cache_ra(ractl, ra->size, ra->async_size);
550 : }
551 :
552 656 : void page_cache_sync_ra(struct readahead_control *ractl,
553 : struct file_ra_state *ra, unsigned long req_count)
554 : {
555 656 : bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);
556 :
557 : /*
558 : * Even if read-ahead is disabled, issue this request as read-ahead
559 : * as we'll need it to satisfy the requested range. The forced
560 : * read-ahead will do the right thing and limit the read to just the
561 : * requested range, which we'll set to 1 page for this case.
562 : */
563 656 : if (!ra->ra_pages || blk_cgroup_congested()) {
564 0 : if (!ractl->file)
565 : return;
566 : req_count = 1;
567 : do_forced_ra = true;
568 : }
569 :
570 : /* be dumb */
571 656 : if (do_forced_ra) {
572 50 : force_page_cache_ra(ractl, ra, req_count);
573 50 : return;
574 : }
575 :
576 : /* do read-ahead */
577 606 : ondemand_readahead(ractl, ra, false, req_count);
578 : }
579 : EXPORT_SYMBOL_GPL(page_cache_sync_ra);
580 :
581 815 : void page_cache_async_ra(struct readahead_control *ractl,
582 : struct file_ra_state *ra, struct page *page,
583 : unsigned long req_count)
584 : {
585 : /* no read-ahead */
586 815 : if (!ra->ra_pages)
587 : return;
588 :
589 : /*
590 : * Same bit is used for PG_readahead and PG_reclaim.
591 : */
592 1630 : if (PageWriteback(page))
593 : return;
594 :
595 815 : ClearPageReadahead(page);
596 :
597 : /*
598 : * Defer asynchronous read-ahead on IO congestion.
599 : */
600 815 : if (inode_read_congested(ractl->mapping->host))
601 : return;
602 :
603 815 : if (blk_cgroup_congested())
604 : return;
605 :
606 : /* do read-ahead */
607 815 : ondemand_readahead(ractl, ra, true, req_count);
608 : }
609 : EXPORT_SYMBOL_GPL(page_cache_async_ra);
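/*
 * Illustrative sketch, not part of this file, of how a buffered read path
 * typically reaches the two entry points above. It assumes the
 * page_cache_sync_readahead()/page_cache_async_readahead() wrappers from
 * <linux/pagemap.h> of this kernel era, which package the arguments into a
 * readahead_control. my_buffered_read_page() is a hypothetical caller.
 */
static struct page *my_buffered_read_page(struct file *file, pgoff_t index,
					  unsigned long nr_to_read)
{
	struct address_space *mapping = file->f_mapping;
	struct file_ra_state *ra = &file->f_ra;
	struct page *page;

	page = find_get_page(mapping, index);
	if (!page) {
		/* Cache miss: kick off synchronous readahead around @index. */
		page_cache_sync_readahead(mapping, ra, file, index, nr_to_read);
		page = find_get_page(mapping, index);
	} else if (PageReadahead(page)) {
		/* Hit the PG_readahead marker: pipeline the next window. */
		page_cache_async_readahead(mapping, ra, file, page, index,
					   nr_to_read);
	}
	return page;	/* may still be NULL or !PageUptodate() */
}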
610 :
611 0 : ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
612 : {
613 0 : ssize_t ret;
614 0 : struct fd f;
615 :
616 0 : ret = -EBADF;
617 0 : f = fdget(fd);
618 0 : if (!f.file || !(f.file->f_mode & FMODE_READ))
619 0 : goto out;
620 :
621 : /*
622 : * The readahead() syscall is intended to run only on files
623 : * on which readahead can be performed. If readahead is not
624 : * possible on this file, then we must return -EINVAL.
625 : */
626 0 : ret = -EINVAL;
627 0 : if (!f.file->f_mapping || !f.file->f_mapping->a_ops ||
628 0 : !S_ISREG(file_inode(f.file)->i_mode))
629 0 : goto out;
630 :
631 0 : ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED);
632 0 : out:
633 0 : fdput(f);
634 0 : return ret;
635 : }
636 :
637 0 : SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
638 : {
639 0 : return ksys_readahead(fd, offset, count);
640 : }