Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * linux/mm/swap.c
4 : *
5 : * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
6 : */
7 :
8 : /*
9 : * This file contains the default values for the operation of the
10 : * Linux VM subsystem. Fine-tuning documentation can be found in
11 : * Documentation/admin-guide/sysctl/vm.rst.
12 : * Started 18.12.91
13 : * Swap aging added 23.2.95, Stephen Tweedie.
14 : * Buffermem limits added 12.3.98, Rik van Riel.
15 : */
16 :
17 : #include <linux/mm.h>
18 : #include <linux/sched.h>
19 : #include <linux/kernel_stat.h>
20 : #include <linux/swap.h>
21 : #include <linux/mman.h>
22 : #include <linux/pagemap.h>
23 : #include <linux/pagevec.h>
24 : #include <linux/init.h>
25 : #include <linux/export.h>
26 : #include <linux/mm_inline.h>
27 : #include <linux/percpu_counter.h>
28 : #include <linux/memremap.h>
29 : #include <linux/percpu.h>
30 : #include <linux/cpu.h>
31 : #include <linux/notifier.h>
32 : #include <linux/backing-dev.h>
33 : #include <linux/memcontrol.h>
34 : #include <linux/gfp.h>
35 : #include <linux/uio.h>
36 : #include <linux/hugetlb.h>
37 : #include <linux/page_idle.h>
38 : #include <linux/local_lock.h>
39 :
40 : #include "internal.h"
41 :
42 : #define CREATE_TRACE_POINTS
43 : #include <trace/events/pagemap.h>
44 :
45 : /* How many pages do we try to swap or page in/out together? */
46 : int page_cluster;
47 :
48 : /* Protecting only lru_rotate.pvec which requires disabling interrupts */
49 : struct lru_rotate {
50 : local_lock_t lock;
51 : struct pagevec pvec;
52 : };
53 : static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = {
54 : .lock = INIT_LOCAL_LOCK(lock),
55 : };
56 :
57 : /*
58 : * The following struct pagevec instances are grouped together because they
59 : * are protected by disabling preemption (and interrupts remain enabled).
60 : */
61 : struct lru_pvecs {
62 : local_lock_t lock;
63 : struct pagevec lru_add;
64 : struct pagevec lru_deactivate_file;
65 : struct pagevec lru_deactivate;
66 : struct pagevec lru_lazyfree;
67 : #ifdef CONFIG_SMP
68 : struct pagevec activate_page;
69 : #endif
70 : };
71 : static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
72 : .lock = INIT_LOCAL_LOCK(lock),
73 : };
74 :
75 : /*
76 : * This path almost never happens for VM activity - pages are normally
77 : * freed via pagevecs. But it gets used by networking.
78 : */
79 32141 : static void __page_cache_release(struct page *page)
80 : {
81 64282 : if (PageLRU(page)) {
82 90 : struct lruvec *lruvec;
83 90 : unsigned long flags;
84 :
85 90 : lruvec = lock_page_lruvec_irqsave(page, &flags);
86 90 : del_page_from_lru_list(page, lruvec);
87 90 : __clear_page_lru_flags(page);
88 90 : unlock_page_lruvec_irqrestore(lruvec, flags);
89 : }
90 32141 : __ClearPageWaiters(page);
91 32142 : }
92 :
93 32078 : static void __put_single_page(struct page *page)
94 : {
95 32078 : __page_cache_release(page);
96 32079 : mem_cgroup_uncharge(page);
97 32079 : free_unref_page(page);
98 32079 : }
99 :
100 63 : static void __put_compound_page(struct page *page)
101 : {
102 : /*
103 : * __page_cache_release() is supposed to be called for thp, not for
104 : * hugetlb. This is because a hugetlb page never has PageLRU set
105 : * (it is never added to any LRU list) and no memcg routines should
106 : * be called for hugetlb (it has a separate hugetlb_cgroup).
107 : */
108 63 : if (!PageHuge(page))
109 63 : __page_cache_release(page);
110 63 : destroy_compound_page(page);
111 46 : }
112 :
113 32125 : void __put_page(struct page *page)
114 : {
115 32125 : if (is_zone_device_page(page)) {
116 : put_dev_pagemap(page->pgmap);
117 :
118 : /*
119 : * The page belongs to the device that created pgmap. Do
120 : * not return it to page allocator.
121 : */
122 : return;
123 : }
124 :
125 64249 : if (unlikely(PageCompound(page)))
126 46 : __put_compound_page(page);
127 : else
128 32078 : __put_single_page(page);
129 : }
130 : EXPORT_SYMBOL(__put_page);
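
For context, __put_page() only runs once the reference count has already dropped to zero; callers use the put_page() inline from <linux/mm.h>. A simplified sketch of that relationship, ignoring the devmap-managed special case (the helper name is made up for illustration; compound_head() and put_page_testzero() are real helpers used elsewhere in this file):

        static inline void example_put_page(struct page *page)
        {
                page = compound_head(page);
                if (put_page_testzero(page))    /* last reference dropped */
                        __put_page(page);       /* freed via the paths above */
        }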
131 :
132 : /**
133 : * put_pages_list() - release a list of pages
134 : * @pages: list of pages threaded on page->lru
135 : *
136 : * Release a list of pages which are strung together on page->lru. Currently
137 : * used by read_cache_pages() and related error recovery code.
138 : */
139 0 : void put_pages_list(struct list_head *pages)
140 : {
141 0 : while (!list_empty(pages)) {
142 0 : struct page *victim;
143 :
144 0 : victim = lru_to_page(pages);
145 0 : list_del(&victim->lru);
146 0 : put_page(victim);
147 : }
148 0 : }
149 : EXPORT_SYMBOL(put_pages_list);
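
A usage sketch: pages collected on a private list through their page->lru links can be dropped with a single call (the helper name is hypothetical):

        static void example_free_page_list(void)
        {
                LIST_HEAD(pages);
                int i;

                for (i = 0; i < 4; i++) {
                        struct page *page = alloc_page(GFP_KERNEL);

                        if (!page)
                                break;
                        list_add(&page->lru, &pages);
                }
                put_pages_list(&pages);         /* list_del() + put_page() on each */
        }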
150 :
151 : /*
152 : * get_kernel_pages() - pin kernel pages in memory
153 : * @kiov: An array of struct kvec structures
154 : * @nr_segs: number of segments to pin
155 : * @write: pinning for read/write, currently ignored
156 : * @pages: array that receives pointers to the pages pinned.
157 : * Should be at least nr_segs long.
158 : *
159 : * Returns number of pages pinned. This may be fewer than the number
160 : * requested. If nr_pages is 0 or negative, returns 0. If no pages
161 : * were pinned, returns -errno. Each page returned must be released
162 : * with a put_page() call when it is finished with.
163 : */
164 0 : int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
165 : struct page **pages)
166 : {
167 0 : int seg;
168 :
169 0 : for (seg = 0; seg < nr_segs; seg++) {
170 0 : if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
171 0 : return seg;
172 :
173 0 : pages[seg] = kmap_to_page(kiov[seg].iov_base);
174 0 : get_page(pages[seg]);
175 : }
176 :
177 : return seg;
178 : }
179 : EXPORT_SYMBOL_GPL(get_kernel_pages);
180 :
181 : /*
182 : * get_kernel_page() - pin a kernel page in memory
183 : * @start: starting kernel address
184 : * @write: pinning for read/write, currently ignored
185 : * @pages: array that receives pointer to the page pinned.
186 : * Must have room for at least one page pointer.
187 : *
188 : * Returns 1 if page is pinned. If the page was not pinned, returns
189 : * -errno. The page returned must be released with a put_page() call
190 : * when it is finished with.
191 : */
192 0 : int get_kernel_page(unsigned long start, int write, struct page **pages)
193 : {
194 0 : const struct kvec kiov = {
195 0 : .iov_base = (void *)start,
196 : .iov_len = PAGE_SIZE
197 : };
198 :
199 0 : return get_kernel_pages(&kiov, 1, write, pages);
200 : }
201 : EXPORT_SYMBOL_GPL(get_kernel_page);
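
A usage sketch, assuming @buf is a page-sized, directly-mapped (or kmap'ed) kernel buffer, since the lookup goes through kmap_to_page(); the helper name is hypothetical:

        static struct page *example_pin_kernel_buf(void *buf)
        {
                struct page *page;

                if (get_kernel_page((unsigned long)buf, 0, &page) != 1)
                        return NULL;
                /* ... hand the page to a bio or scatterlist ... */
                return page;            /* caller releases it with put_page() */
        }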
202 :
203 1586 : static void pagevec_lru_move_fn(struct pagevec *pvec,
204 : void (*move_fn)(struct page *page, struct lruvec *lruvec))
205 : {
206 1586 : int i;
207 1586 : struct lruvec *lruvec = NULL;
208 1586 : unsigned long flags = 0;
209 :
210 18953 : for (i = 0; i < pagevec_count(pvec); i++) {
211 17367 : struct page *page = pvec->pages[i];
212 :
213 : /* block memcg migration during page moving between lru */
214 34734 : if (!TestClearPageLRU(page))
215 1 : continue;
216 :
217 17366 : lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags);
218 17366 : (*move_fn)(page, lruvec);
219 :
220 34733 : SetPageLRU(page);
221 : }
222 1586 : if (lruvec)
223 1586 : unlock_page_lruvec_irqrestore(lruvec, flags);
224 1586 : release_pages(pvec->pages, pvec->nr);
225 1586 : pagevec_reinit(pvec);
226 1586 : }
227 :
228 0 : static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec)
229 : {
230 0 : if (!PageUnevictable(page)) {
231 0 : del_page_from_lru_list(page, lruvec);
232 0 : ClearPageActive(page);
233 0 : add_page_to_lru_list_tail(page, lruvec);
234 0 : __count_vm_events(PGROTATED, thp_nr_pages(page));
235 : }
236 0 : }
237 :
238 : /*
239 : * Writeback is about to end against a page which has been marked for immediate
240 : * reclaim. If it still appears to be reclaimable, move it to the tail of the
241 : * inactive list.
242 : *
243 : * rotate_reclaimable_page() must disable IRQs, to prevent nasty races.
244 : */
245 0 : void rotate_reclaimable_page(struct page *page)
246 : {
247 0 : if (!PageLocked(page) && !PageDirty(page) &&
248 0 : !PageUnevictable(page) && PageLRU(page)) {
249 0 : struct pagevec *pvec;
250 0 : unsigned long flags;
251 :
252 0 : get_page(page);
253 0 : local_lock_irqsave(&lru_rotate.lock, flags);
254 0 : pvec = this_cpu_ptr(&lru_rotate.pvec);
255 0 : if (!pagevec_add(pvec, page) || PageCompound(page))
256 0 : pagevec_lru_move_fn(pvec, pagevec_move_tail_fn);
257 0 : local_unlock_irqrestore(&lru_rotate.lock, flags);
258 : }
259 0 : }
260 :
261 0 : void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
262 : {
263 0 : do {
264 0 : unsigned long lrusize;
265 :
266 : /*
267 : * Taking lruvec->lru_lock is safe here, because we are called either
268 : * 1) with the lruvec already pinned by reclaim, or
269 : * 2) from a pre-LRU page during refault (which also holds the
270 : * rcu lock, so it would be safe even if the page was on the LRU
271 : * and could move simultaneously to a new lruvec).
272 : */
273 0 : spin_lock_irq(&lruvec->lru_lock);
274 : /* Record cost event */
275 0 : if (file)
276 0 : lruvec->file_cost += nr_pages;
277 : else
278 0 : lruvec->anon_cost += nr_pages;
279 :
280 : /*
281 : * Decay previous events
282 : *
283 : * Because workloads change over time (and to avoid
284 : * overflow) we keep these statistics as a floating
285 : * average, which ends up weighing recent refaults
286 : * more than old ones.
287 : */
288 0 : lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) +
289 0 : lruvec_page_state(lruvec, NR_ACTIVE_ANON) +
290 0 : lruvec_page_state(lruvec, NR_INACTIVE_FILE) +
291 0 : lruvec_page_state(lruvec, NR_ACTIVE_FILE);
292 :
293 0 : if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) {
294 0 : lruvec->file_cost /= 2;
295 0 : lruvec->anon_cost /= 2;
296 : }
297 0 : spin_unlock_irq(&lruvec->lru_lock);
298 0 : } while ((lruvec = parent_lruvec(lruvec)));
299 0 : }
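
As a worked example of the decay above: with an lruvec holding 1,000,000 pages, file_cost and anon_cost accumulate until their sum exceeds 250,000; each time that happens both are halved, so an event recorded before N decay steps contributes at most 1/2^N of its original weight to the current file/anon balance.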
300 :
301 0 : void lru_note_cost_page(struct page *page)
302 : {
303 0 : lru_note_cost(mem_cgroup_page_lruvec(page, page_pgdat(page)),
304 0 : page_is_file_lru(page), thp_nr_pages(page));
305 0 : }
306 :
307 17366 : static void __activate_page(struct page *page, struct lruvec *lruvec)
308 : {
309 51867 : if (!PageActive(page) && !PageUnevictable(page)) {
310 17135 : int nr_pages = thp_nr_pages(page);
311 :
312 17135 : del_page_from_lru_list(page, lruvec);
313 17135 : SetPageActive(page);
314 17135 : add_page_to_lru_list(page, lruvec);
315 17135 : trace_mm_lru_activate(page);
316 :
317 17135 : __count_vm_events(PGACTIVATE, nr_pages);
318 17135 : __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE,
319 : nr_pages);
320 : }
321 17366 : }
322 :
323 : #ifdef CONFIG_SMP
324 21436 : static void activate_page_drain(int cpu)
325 : {
326 21436 : struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu);
327 :
328 21436 : if (pagevec_count(pvec))
329 550 : pagevec_lru_move_fn(pvec, __activate_page);
330 21436 : }
331 :
332 10 : static bool need_activate_page_drain(int cpu)
333 : {
334 10 : return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
335 : }
336 :
337 17357 : static void activate_page(struct page *page)
338 : {
339 17357 : page = compound_head(page);
340 69438 : if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
341 17363 : struct pagevec *pvec;
342 :
343 17363 : local_lock(&lru_pvecs.lock);
344 17356 : pvec = this_cpu_ptr(&lru_pvecs.activate_page);
345 17356 : get_page(page);
346 33687 : if (!pagevec_add(pvec, page) || PageCompound(page))
347 1035 : pagevec_lru_move_fn(pvec, __activate_page);
348 17357 : local_unlock(&lru_pvecs.lock);
349 : }
350 17350 : }
351 :
352 : #else
353 : static inline void activate_page_drain(int cpu)
354 : {
355 : }
356 :
357 : static void activate_page(struct page *page)
358 : {
359 : struct lruvec *lruvec;
360 :
361 : page = compound_head(page);
362 : if (TestClearPageLRU(page)) {
363 : lruvec = lock_page_lruvec_irq(page);
364 : __activate_page(page, lruvec);
365 : unlock_page_lruvec_irq(lruvec);
366 : SetPageLRU(page);
367 : }
368 : }
369 : #endif
370 :
371 6673 : static void __lru_cache_activate_page(struct page *page)
372 : {
373 6673 : struct pagevec *pvec;
374 6673 : int i;
375 :
376 6673 : local_lock(&lru_pvecs.lock);
377 6673 : pvec = this_cpu_ptr(&lru_pvecs.lru_add);
378 :
379 : /*
380 : * Search backwards on the optimistic assumption that the page being
381 : * activated has just been added to this pagevec. Note that only
382 : * the local pagevec is examined as a !PageLRU page could be in the
383 : * process of being released, reclaimed, migrated or on a remote
384 : * pagevec that is currently being drained. Furthermore, marking
385 : * a remote pagevec's page PageActive potentially hits a race where
386 : * a page is marked PageActive just after it is added to the inactive
387 : * list causing accounting errors and BUG_ON checks to trigger.
388 : */
389 6720 : for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
390 6267 : struct page *pagevec_page = pvec->pages[i];
391 :
392 6267 : if (pagevec_page == page) {
393 6220 : SetPageActive(page);
394 : break;
395 : }
396 : }
397 :
398 6673 : local_unlock(&lru_pvecs.lock);
399 6673 : }
400 :
401 : /*
402 : * Mark a page as having seen activity.
403 : *
404 : * inactive,unreferenced -> inactive,referenced
405 : * inactive,referenced -> active,unreferenced
406 : * active,unreferenced -> active,referenced
407 : *
408 : * When a newly allocated page is not yet visible, so safe for non-atomic ops,
409 : * __SetPageReferenced(page) may be substituted for mark_page_accessed(page).
410 : */
411 779903 : void mark_page_accessed(struct page *page)
412 : {
413 779903 : page = compound_head(page);
414 :
415 1559805 : if (!PageReferenced(page)) {
416 33474 : SetPageReferenced(page);
417 1492808 : } else if (PageUnevictable(page)) {
418 : /*
419 : * Unevictable pages are on the "LRU_UNEVICTABLE" list. But,
420 : * this list is never rotated or maintained, so marking an
421 : * unevictable page accessed has no effect.
422 : */
423 1492691 : } else if (!PageActive(page)) {
424 : /*
425 : * If the page is on the LRU, queue it for activation via
426 : * lru_pvecs.activate_page. Otherwise, assume the page is on a
427 : * pagevec, mark it active and it'll be moved to the active
428 : * LRU on the next drain.
429 : */
430 48054 : if (PageLRU(page))
431 17356 : activate_page(page);
432 : else
433 6673 : __lru_cache_activate_page(page);
434 24024 : ClearPageReferenced(page);
435 24036 : workingset_activation(page);
436 : }
437 779774 : if (page_is_idle(page))
438 779774 : clear_page_idle(page);
439 779774 : }
440 : EXPORT_SYMBOL(mark_page_accessed);
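
A sketch of the transitions above for a page that is still sitting on this CPU's lru_add pagevec; it assumes the caller holds a reference, and the function name is hypothetical:

        static void example_touch_twice(struct page *page)
        {
                lru_cache_add(page);            /* inactive,unreferenced (queued)    */
                mark_page_accessed(page);       /* -> inactive,referenced            */
                mark_page_accessed(page);       /* -> active,unreferenced; not yet   */
                                                /* PageLRU, so PG_active is set via  */
                                                /* __lru_cache_activate_page() and   */
                                                /* honoured when the pagevec drains  */
        }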
441 :
442 : /**
443 : * lru_cache_add - add a page to a page list
444 : * @page: the page to be added to the LRU.
445 : *
446 : * Queue the page for addition to the LRU via pagevec. The decision on whether
447 : * to add the page to the [in]active [file|anon] list is deferred until the
448 : * pagevec is drained. This gives the caller of lru_cache_add() a chance to
449 : * have the page added to the active list using mark_page_accessed().
450 : */
451 96315 : void lru_cache_add(struct page *page)
452 : {
453 96315 : struct pagevec *pvec;
454 :
455 192625 : VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
456 192619 : VM_BUG_ON_PAGE(PageLRU(page), page);
457 :
458 96309 : get_page(page);
459 96320 : local_lock(&lru_pvecs.lock);
460 96314 : pvec = this_cpu_ptr(&lru_pvecs.lru_add);
461 188796 : if (!pagevec_add(pvec, page) || PageCompound(page))
462 3838 : __pagevec_lru_add(pvec);
463 96319 : local_unlock(&lru_pvecs.lock);
464 96315 : }
465 : EXPORT_SYMBOL(lru_cache_add);
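
A simplified sketch of a typical call site, loosely mirroring what add_to_page_cache_lru() does for page-cache insertions (workingset handling omitted; the helper name is made up):

        static int example_add_to_cache(struct page *page, struct address_space *mapping,
                                        pgoff_t index, gfp_t gfp)
        {
                int err;

                __SetPageLocked(page);          /* new page, not yet visible */
                err = add_to_page_cache_locked(page, mapping, index, gfp);
                if (err) {
                        __ClearPageLocked(page);
                        return err;
                }
                lru_cache_add(page);            /* queue on this CPU's lru_add pagevec */
                return 0;
        }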
466 :
467 : /**
468 : * lru_cache_add_inactive_or_unevictable
469 : * @page: the page to be added to LRU
470 : * @vma: vma in which page is mapped for determining reclaimability
471 : *
472 : * Place @page on the inactive or unevictable LRU list, depending on its
473 : * evictability.
474 : */
475 67070 : void lru_cache_add_inactive_or_unevictable(struct page *page,
476 : struct vm_area_struct *vma)
477 : {
478 67070 : bool unevictable;
479 :
480 134141 : VM_BUG_ON_PAGE(PageLRU(page), page);
481 :
482 67071 : unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED;
483 67086 : if (unlikely(unevictable) && !TestSetPageMlocked(page)) {
484 15 : int nr_pages = thp_nr_pages(page);
485 : /*
486 : * We use the irq-unsafe __mod_zone_page_state because this
487 : * counter is not modified from interrupt context, and the pte
488 : * lock is held (a spinlock), which implies preemption is disabled.
489 : */
490 15 : __mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
491 15 : count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
492 : }
493 67071 : lru_cache_add(page);
494 67067 : }
495 :
496 : /*
497 : * If the page cannot be invalidated, it is moved to the
498 : * inactive list to speed up its reclaim. It is moved to the
499 : * head of the list, rather than the tail, to give the flusher
500 : * threads some time to write it out, as this is much more
501 : * effective than the single-page writeout from reclaim.
502 : *
503 : * If the page is not mapped but is dirty or under writeback, it can be
504 : * reclaimed as soon as possible by setting PG_reclaim.
505 : *
506 : * 1. active, mapped page -> none
507 : * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
508 : * 3. inactive, mapped page -> none
509 : * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
510 : * 5. inactive, clean -> inactive, tail
511 : * 6. Others -> none
512 : *
513 : * In case 4, the page is moved to the head of the inactive list because the
514 : * VM expects flusher threads to write it out, which is much more effective
515 : * than the single-page writeout from reclaim.
516 : */
517 0 : static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
518 : {
519 0 : bool active = PageActive(page);
520 0 : int nr_pages = thp_nr_pages(page);
521 :
522 0 : if (PageUnevictable(page))
523 : return;
524 :
525 : /* Some processes are using the page */
526 0 : if (page_mapped(page))
527 : return;
528 :
529 0 : del_page_from_lru_list(page, lruvec);
530 0 : ClearPageActive(page);
531 0 : ClearPageReferenced(page);
532 :
533 0 : if (PageWriteback(page) || PageDirty(page)) {
534 : /*
535 : * PG_reclaim can race with end_page_writeback(),
536 : * which can confuse readahead. But the race window
537 : * is _really_ small and it is a non-critical problem.
538 : */
539 0 : add_page_to_lru_list(page, lruvec);
540 0 : SetPageReclaim(page);
541 : } else {
542 : /*
543 : * The page's writeback ended while it was on the pagevec.
544 : * Move the page to the tail of the inactive list.
545 : */
546 0 : add_page_to_lru_list_tail(page, lruvec);
547 0 : __count_vm_events(PGROTATED, nr_pages);
548 : }
549 :
550 0 : if (active) {
551 0 : __count_vm_events(PGDEACTIVATE, nr_pages);
552 0 : __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
553 : nr_pages);
554 : }
555 : }
556 :
557 0 : static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
558 : {
559 0 : if (PageActive(page) && !PageUnevictable(page)) {
560 0 : int nr_pages = thp_nr_pages(page);
561 :
562 0 : del_page_from_lru_list(page, lruvec);
563 0 : ClearPageActive(page);
564 0 : ClearPageReferenced(page);
565 0 : add_page_to_lru_list(page, lruvec);
566 :
567 0 : __count_vm_events(PGDEACTIVATE, nr_pages);
568 0 : __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
569 : nr_pages);
570 : }
571 0 : }
572 :
573 0 : static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec)
574 : {
575 0 : if (PageAnon(page) && PageSwapBacked(page) &&
576 0 : !PageSwapCache(page) && !PageUnevictable(page)) {
577 0 : int nr_pages = thp_nr_pages(page);
578 :
579 0 : del_page_from_lru_list(page, lruvec);
580 0 : ClearPageActive(page);
581 0 : ClearPageReferenced(page);
582 : /*
583 : * Lazyfree pages are clean anonymous pages. They have
584 : * PG_swapbacked flag cleared, to distinguish them from normal
585 : * anonymous pages
586 : */
587 0 : ClearPageSwapBacked(page);
588 0 : add_page_to_lru_list(page, lruvec);
589 :
590 0 : __count_vm_events(PGLAZYFREE, nr_pages);
591 0 : __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE,
592 : nr_pages);
593 : }
594 0 : }
595 :
596 : /*
597 : * Drain pages out of the cpu's pagevecs.
598 : * Either "cpu" is the current CPU, and preemption has already been
599 : * disabled; or "cpu" is being hot-unplugged, and is already dead.
600 : */
601 21436 : void lru_add_drain_cpu(int cpu)
602 : {
603 21436 : struct pagevec *pvec = &per_cpu(lru_pvecs.lru_add, cpu);
604 :
605 21436 : if (pagevec_count(pvec))
606 11435 : __pagevec_lru_add(pvec);
607 :
608 21436 : pvec = &per_cpu(lru_rotate.pvec, cpu);
609 : /* Disabling interrupts below acts as a compiler barrier. */
610 21436 : if (data_race(pagevec_count(pvec))) {
611 0 : unsigned long flags;
612 :
613 : /* No harm done if a racing interrupt already did this */
614 0 : local_lock_irqsave(&lru_rotate.lock, flags);
615 0 : pagevec_lru_move_fn(pvec, pagevec_move_tail_fn);
616 0 : local_unlock_irqrestore(&lru_rotate.lock, flags);
617 : }
618 :
619 21436 : pvec = &per_cpu(lru_pvecs.lru_deactivate_file, cpu);
620 21436 : if (pagevec_count(pvec))
621 0 : pagevec_lru_move_fn(pvec, lru_deactivate_file_fn);
622 :
623 21436 : pvec = &per_cpu(lru_pvecs.lru_deactivate, cpu);
624 21436 : if (pagevec_count(pvec))
625 0 : pagevec_lru_move_fn(pvec, lru_deactivate_fn);
626 :
627 21436 : pvec = &per_cpu(lru_pvecs.lru_lazyfree, cpu);
628 21436 : if (pagevec_count(pvec))
629 0 : pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
630 :
631 21436 : activate_page_drain(cpu);
632 21436 : }
633 :
634 : /**
635 : * deactivate_file_page - forcefully deactivate a file page
636 : * @page: page to deactivate
637 : *
638 : * This function hints the VM that @page is a good reclaim candidate,
639 : * for example if its invalidation fails due to the page being dirty
640 : * or under writeback.
641 : */
642 0 : void deactivate_file_page(struct page *page)
643 : {
644 : /*
645 : * In a workload with many unevictable pages, such as one using mprotect,
646 : * deactivating unevictable pages to accelerate reclaim is pointless.
647 : */
648 0 : if (PageUnevictable(page))
649 : return;
650 :
651 0 : if (likely(get_page_unless_zero(page))) {
652 0 : struct pagevec *pvec;
653 :
654 0 : local_lock(&lru_pvecs.lock);
655 0 : pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);
656 :
657 0 : if (!pagevec_add(pvec, page) || PageCompound(page))
658 0 : pagevec_lru_move_fn(pvec, lru_deactivate_file_fn);
659 0 : local_unlock(&lru_pvecs.lock);
660 : }
661 : }
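
The usual userspace trigger for this path is believed to be posix_fadvise(POSIX_FADV_DONTNEED) on a file with dirty or writeback pages: the fadvise code starts asynchronous writeback and then invalidates the range, falling back to deactivate_file_page() for pages it cannot drop immediately. A minimal userspace sketch:

        #include <fcntl.h>
        #include <unistd.h>

        int main(int argc, char **argv)
        {
                int fd;

                if (argc < 2)
                        return 1;
                fd = open(argv[1], O_RDWR);
                if (fd < 0)
                        return 1;
                /* hint: the cached pages will not be needed again soon */
                posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
                close(fd);
                return 0;
        }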
662 :
663 : /*
664 : * deactivate_page - deactivate a page
665 : * @page: page to deactivate
666 : *
667 : * deactivate_page() moves @page to the inactive list if @page was on the active
668 : * list and was not an unevictable page. This is done to accelerate the reclaim
669 : * of @page.
670 : */
671 0 : void deactivate_page(struct page *page)
672 : {
673 0 : if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
674 0 : struct pagevec *pvec;
675 :
676 0 : local_lock(&lru_pvecs.lock);
677 0 : pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate);
678 0 : get_page(page);
679 0 : if (!pagevec_add(pvec, page) || PageCompound(page))
680 0 : pagevec_lru_move_fn(pvec, lru_deactivate_fn);
681 0 : local_unlock(&lru_pvecs.lock);
682 : }
683 0 : }
684 :
685 : /**
686 : * mark_page_lazyfree - make an anon page lazyfree
687 : * @page: anonymous page to mark lazyfree
688 : *
689 : * mark_page_lazyfree() moves @page to the inactive file list.
690 : * This is done to accelerate the reclaim of @page.
691 : */
692 0 : void mark_page_lazyfree(struct page *page)
693 : {
694 0 : if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
695 0 : !PageSwapCache(page) && !PageUnevictable(page)) {
696 0 : struct pagevec *pvec;
697 :
698 0 : local_lock(&lru_pvecs.lock);
699 0 : pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree);
700 0 : get_page(page);
701 0 : if (!pagevec_add(pvec, page) || PageCompound(page))
702 0 : pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
703 0 : local_unlock(&lru_pvecs.lock);
704 : }
705 0 : }
706 :
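Both deactivate_page() and mark_page_lazyfree() are normally reached via madvise(2): MADV_COLD is understood to feed deactivate_page() and MADV_FREE to feed mark_page_lazyfree(). A minimal userspace sketch, assuming a libc that exposes both flags:

        #include <string.h>
        #include <sys/mman.h>

        int main(void)
        {
                size_t len = 1 << 20;
                char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

                if (buf == MAP_FAILED)
                        return 1;
                memset(buf, 0xaa, len);                 /* fault in anonymous pages   */
                madvise(buf, len, MADV_COLD);           /* hint: deactivate           */
                madvise(buf, len, MADV_FREE);           /* hint: free lazily if clean */
                munmap(buf, len);
                return 0;
        }
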
707 21435 : void lru_add_drain(void)
708 : {
709 21435 : local_lock(&lru_pvecs.lock);
710 21436 : lru_add_drain_cpu(smp_processor_id());
711 21436 : local_unlock(&lru_pvecs.lock);
712 21436 : }
713 :
714 0 : void lru_add_drain_cpu_zone(struct zone *zone)
715 : {
716 0 : local_lock(&lru_pvecs.lock);
717 0 : lru_add_drain_cpu(smp_processor_id());
718 0 : drain_local_pages(zone);
719 0 : local_unlock(&lru_pvecs.lock);
720 0 : }
721 :
722 : #ifdef CONFIG_SMP
723 :
724 : static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
725 :
726 11 : static void lru_add_drain_per_cpu(struct work_struct *dummy)
727 : {
728 11 : lru_add_drain();
729 11 : }
730 :
731 : /*
732 : * Doesn't need any cpu hotplug locking because we do rely on per-cpu
733 : * kworkers being shut down before our page_alloc_cpu_dead callback is
734 : * executed on the offlined cpu.
735 : * Calling this function with cpu hotplug locks held can actually lead
736 : * to obscure indirect dependencies via WQ context.
737 : */
738 5 : void lru_add_drain_all(void)
739 : {
740 : /*
741 : * lru_drain_gen - Global pages generation number
742 : *
743 : * (A) Definition: global lru_drain_gen = x implies that all generations
744 : * 0 < n <= x are already *scheduled* for draining.
745 : *
746 : * This is an optimization for the highly-contended use case where a
747 : * user space workload keeps constantly generating a flow of pages for
748 : * each CPU.
749 : */
750 5 : static unsigned int lru_drain_gen;
751 5 : static struct cpumask has_work;
752 5 : static DEFINE_MUTEX(lock);
753 5 : unsigned cpu, this_gen;
754 :
755 : /*
756 : * Make sure nobody triggers this path before mm_percpu_wq is fully
757 : * initialized.
758 : */
759 5 : if (WARN_ON(!mm_percpu_wq))
760 : return;
761 :
762 : /*
763 : * Guarantee that pagevec counter stores visible to this CPU are visible to
764 : * other CPUs before loading the current drain generation.
765 : */
766 5 : smp_mb();
767 :
768 : /*
769 : * (B) Locally cache global LRU draining generation number
770 : *
771 : * The read barrier ensures that the counter is loaded before the mutex
772 : * is taken. It pairs with smp_mb() inside the mutex critical section
773 : * at (D).
774 : */
775 5 : this_gen = smp_load_acquire(&lru_drain_gen);
776 :
777 5 : mutex_lock(&lock);
778 :
779 : /*
780 : * (C) Exit the draining operation if a newer generation, from another
781 : * lru_add_drain_all(), was already scheduled for draining. Check (A).
782 : */
783 5 : if (unlikely(this_gen != lru_drain_gen))
784 0 : goto done;
785 :
786 : /*
787 : * (D) Increment global generation number
788 : *
789 : * Pairs with smp_load_acquire() at (B), outside of the critical
790 : * section. Use a full memory barrier to guarantee that the new global
791 : * drain generation number is stored before loading pagevec counters.
792 : *
793 : * This pairing must be done here, before the for_each_online_cpu loop
794 : * below which drains the page vectors.
795 : *
796 : * Let x, y, and z represent some system CPU numbers, where x < y < z.
797 : * Assume CPU #z is in the middle of the for_each_online_cpu loop
798 : * below and has already reached CPU #y's per-cpu data. CPU #x comes
799 : * along, adds some pages to its per-cpu vectors, then calls
800 : * lru_add_drain_all().
801 : *
802 : * If the paired barrier is done at any later step, e.g. after the
803 : * loop, CPU #x will just exit at (C) and miss flushing out all of its
804 : * added pages.
805 : */
806 5 : WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1);
807 5 : smp_mb();
808 :
809 5 : cpumask_clear(&has_work);
810 30 : for_each_online_cpu(cpu) {
811 20 : struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
812 :
813 20 : if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) ||
814 10 : data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) ||
815 10 : pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) ||
816 10 : pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) ||
817 10 : pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) ||
818 10 : need_activate_page_drain(cpu)) {
819 11 : INIT_WORK(work, lru_add_drain_per_cpu);
820 11 : queue_work_on(cpu, mm_percpu_wq, work);
821 36 : __cpumask_set_cpu(cpu, &has_work);
822 : }
823 : }
824 :
825 16 : for_each_cpu(cpu, &has_work)
826 11 : flush_work(&per_cpu(lru_add_drain_work, cpu));
827 :
828 5 : done:
829 5 : mutex_unlock(&lock);
830 : }
831 : #else
832 : void lru_add_drain_all(void)
833 : {
834 : lru_add_drain();
835 : }
836 : #endif /* CONFIG_SMP */
837 :
838 : /**
839 : * release_pages - batched put_page()
840 : * @pages: array of pages to release
841 : * @nr: number of pages
842 : *
843 : * Decrement the reference count on all the pages in @pages. If it
844 : * fell to zero, remove the page from the LRU and free it.
845 : */
846 26286 : void release_pages(struct page **pages, int nr)
847 : {
848 26286 : int i;
849 26286 : LIST_HEAD(pages_to_free);
850 26286 : struct lruvec *lruvec = NULL;
851 26286 : unsigned long flags;
852 26286 : unsigned int lock_batch;
853 :
854 1000789 : for (i = 0; i < nr; i++) {
855 974502 : struct page *page = pages[i];
856 :
857 : /*
858 : * Make sure the IRQ-safe lock-holding time does not get
859 : * excessive with a continuous string of pages from the
860 : * same lruvec. The lock is held only if lruvec != NULL.
861 : */
862 974502 : if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) {
863 5675 : unlock_page_lruvec_irqrestore(lruvec, flags);
864 5675 : lruvec = NULL;
865 : }
866 :
867 974502 : page = compound_head(page);
868 974502 : if (is_huge_zero_page(page))
869 0 : continue;
870 :
871 974502 : if (is_zone_device_page(page)) {
872 : if (lruvec) {
873 : unlock_page_lruvec_irqrestore(lruvec, flags);
874 : lruvec = NULL;
875 : }
876 : /*
877 : * ZONE_DEVICE pages that return 'false' from
878 : * page_is_devmap_managed() do not require special
879 : * processing, and instead, expect a call to
880 : * put_page_testzero().
881 : */
882 : if (page_is_devmap_managed(page)) {
883 : put_devmap_managed_page(page);
884 : continue;
885 : }
886 : if (put_page_testzero(page))
887 : put_dev_pagemap(page->pgmap);
888 : continue;
889 : }
890 :
891 974502 : if (!put_page_testzero(page))
892 909578 : continue;
893 :
894 129833 : if (PageCompound(page)) {
895 17 : if (lruvec) {
896 17 : unlock_page_lruvec_irqrestore(lruvec, flags);
897 17 : lruvec = NULL;
898 : }
899 17 : __put_compound_page(page);
900 17 : continue;
901 : }
902 :
903 129816 : if (PageLRU(page)) {
904 64908 : struct lruvec *prev_lruvec = lruvec;
905 :
906 64908 : lruvec = relock_page_lruvec_irqsave(page, lruvec,
907 : &flags);
908 64908 : if (prev_lruvec != lruvec)
909 10717 : lock_batch = 0;
910 :
911 64908 : del_page_from_lru_list(page, lruvec);
912 64908 : __clear_page_lru_flags(page);
913 : }
914 :
915 64908 : __ClearPageWaiters(page);
916 :
917 974503 : list_add(&page->lru, &pages_to_free);
918 : }
919 26287 : if (lruvec)
920 5025 : unlock_page_lruvec_irqrestore(lruvec, flags);
921 :
922 26287 : mem_cgroup_uncharge_list(&pages_to_free);
923 26287 : free_unref_page_list(&pages_to_free);
924 26287 : }
925 : EXPORT_SYMBOL(release_pages);
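
A usage sketch (hypothetical helper): dropping a batch of references taken with get_user_pages_fast() without FOLL_PIN, where release_pages() replaces one put_page() call per page:

        static void example_drop_user_pages(unsigned long uaddr, int nr)
        {
                struct page **pages;
                int got;

                pages = kmalloc_array(nr, sizeof(*pages), GFP_KERNEL);
                if (!pages)
                        return;
                got = get_user_pages_fast(uaddr, nr, 0, pages);
                if (got > 0)
                        release_pages(pages, got);      /* batched put_page() */
                kfree(pages);
        }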
926 :
927 : /*
928 : * The pages which we're about to release may be in the deferred lru-addition
929 : * queues. That would prevent them from really being freed right now. That's
930 : * OK from a correctness point of view but is inefficient - those pages may be
931 : * cache-warm and we want to give them back to the page allocator ASAP.
932 : *
933 : * So __pagevec_release() will drain those queues here. __pagevec_lru_add()
934 : * and __pagevec_lru_add_active() call release_pages() directly to avoid
935 : * mutual recursion.
936 : */
937 773 : void __pagevec_release(struct pagevec *pvec)
938 : {
939 773 : if (!pvec->percpu_pvec_drained) {
940 533 : lru_add_drain();
941 533 : pvec->percpu_pvec_drained = true;
942 : }
943 773 : release_pages(pvec->pages, pagevec_count(pvec));
944 773 : pagevec_reinit(pvec);
945 773 : }
946 : EXPORT_SYMBOL(__pagevec_release);
947 :
948 96327 : static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
949 : {
950 96327 : int was_unevictable = TestClearPageUnevictable(page);
951 96327 : int nr_pages = thp_nr_pages(page);
952 :
953 192654 : VM_BUG_ON_PAGE(PageLRU(page), page);
954 :
955 : /*
956 : * Page becomes evictable in two ways:
957 : * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()].
958 : * 2) Before acquiring LRU lock to put the page to correct LRU and then
959 : * a) do PageLRU check with lock [check_move_unevictable_pages]
960 : * b) do PageLRU check before lock [clear_page_mlock]
961 : *
962 : * (1) & (2a) are ok as LRU lock will serialize them. For (2b), we need
963 : * following strict ordering:
964 : *
965 : * #0: __pagevec_lru_add_fn #1: clear_page_mlock
966 : *
967 : * SetPageLRU() TestClearPageMlocked()
968 : * smp_mb() // explicit ordering // above provides strict
969 : * // ordering
970 : * PageMlocked() PageLRU()
971 : *
972 : *
973 : * if '#1' does not observe setting of PG_lru by '#0' and fails
974 : * isolation, the explicit barrier will make sure that page_evictable
975 : * check will put the page on the correct LRU. Without smp_mb(), SetPageLRU
976 : * can be reordered after the PageMlocked check and can cause '#1' to fail
977 : * the isolation of the page whose Mlocked bit is cleared (#0 is also
978 : * looking at the same page) and the evictable page will be stranded
979 : * in an unevictable LRU.
980 : */
981 96327 : SetPageLRU(page);
982 96327 : smp_mb__after_atomic();
983 :
984 96327 : if (page_evictable(page)) {
985 96311 : if (was_unevictable)
986 0 : __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
987 : } else {
988 16 : ClearPageActive(page);
989 16 : SetPageUnevictable(page);
990 16 : if (!was_unevictable)
991 16 : __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
992 : }
993 :
994 96327 : add_page_to_lru_list(page, lruvec);
995 96327 : trace_mm_lru_insertion(page);
996 96327 : }
997 :
998 : /*
999 : * Add the passed pages to the LRU, then drop the caller's refcount
1000 : * on them. Reinitialises the caller's pagevec.
1001 : */
1002 15276 : void __pagevec_lru_add(struct pagevec *pvec)
1003 : {
1004 15276 : int i;
1005 15276 : struct lruvec *lruvec = NULL;
1006 15276 : unsigned long flags = 0;
1007 :
1008 111601 : for (i = 0; i < pagevec_count(pvec); i++) {
1009 96325 : struct page *page = pvec->pages[i];
1010 :
1011 96325 : lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags);
1012 96327 : __pagevec_lru_add_fn(page, lruvec);
1013 : }
1014 15276 : if (lruvec)
1015 15276 : unlock_page_lruvec_irqrestore(lruvec, flags);
1016 15276 : release_pages(pvec->pages, pvec->nr);
1017 15276 : pagevec_reinit(pvec);
1018 15276 : }
1019 :
1020 : /**
1021 : * pagevec_remove_exceptionals - pagevec exceptionals pruning
1022 : * @pvec: The pagevec to prune
1023 : *
1024 : * find_get_entries() fills both pages and XArray value entries (aka
1025 : * exceptional entries) into the pagevec. This function prunes all
1026 : * exceptionals from @pvec without leaving holes, so that it can be
1027 : * passed on to page-only pagevec operations.
1028 : */
1029 131 : void pagevec_remove_exceptionals(struct pagevec *pvec)
1030 : {
1031 131 : int i, j;
1032 :
1033 799 : for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
1034 668 : struct page *page = pvec->pages[i];
1035 668 : if (!xa_is_value(page))
1036 668 : pvec->pages[j++] = page;
1037 : }
1038 131 : pvec->nr = j;
1039 131 : }
1040 :
1041 : /**
1042 : * pagevec_lookup_range - gang pagecache lookup
1043 : * @pvec: Where the resulting pages are placed
1044 : * @mapping: The address_space to search
1045 : * @start: The starting page index
1046 : * @end: The final page index
1047 : *
1048 : * pagevec_lookup_range() will search for & return a group of up to PAGEVEC_SIZE
1049 : * pages in the mapping starting from index @start and up to index @end
1050 : * (inclusive). The pages are placed in @pvec. pagevec_lookup() takes a
1051 : * reference against the pages in @pvec.
1052 : *
1053 : * The search returns a group of mapping-contiguous pages with ascending
1054 : * indexes. There may be holes in the indices due to not-present pages. We
1055 : * also update @start to index the next page for the traversal.
1056 : *
1057 : * pagevec_lookup_range() returns the number of pages which were found. If this
1058 : * number is smaller than PAGEVEC_SIZE, the end of specified range has been
1059 : * reached.
1060 : */
1061 1590 : unsigned pagevec_lookup_range(struct pagevec *pvec,
1062 : struct address_space *mapping, pgoff_t *start, pgoff_t end)
1063 : {
1064 3180 : pvec->nr = find_get_pages_range(mapping, start, end, PAGEVEC_SIZE,
1065 1590 : pvec->pages);
1066 1590 : return pagevec_count(pvec);
1067 : }
1068 : EXPORT_SYMBOL(pagevec_lookup_range);
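
The usual calling pattern is a batched walk of a mapping; a sketch (hypothetical function, and the per-page action is arbitrary):

        static void example_walk_mapping(struct address_space *mapping)
        {
                struct pagevec pvec;
                pgoff_t index = 0;
                unsigned i, nr;

                pagevec_init(&pvec);
                while ((nr = pagevec_lookup_range(&pvec, mapping, &index, (pgoff_t)-1))) {
                        for (i = 0; i < nr; i++) {
                                struct page *page = pvec.pages[i];

                                /* each page carries a reference taken by the lookup */
                                mark_page_accessed(page);
                        }
                        pagevec_release(&pvec);         /* drop those references */
                        cond_resched();
                }
        }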
1069 :
1070 388 : unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
1071 : struct address_space *mapping, pgoff_t *index, pgoff_t end,
1072 : xa_mark_t tag)
1073 : {
1074 776 : pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
1075 388 : PAGEVEC_SIZE, pvec->pages);
1076 388 : return pagevec_count(pvec);
1077 : }
1078 : EXPORT_SYMBOL(pagevec_lookup_range_tag);
1079 :
1080 : /*
1081 : * Perform any setup for the swap system
1082 : */
1083 1 : void __init swap_setup(void)
1084 : {
1085 1 : unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT);
1086 :
1087 : /* Use a smaller cluster for small-memory machines */
1088 1 : if (megs < 16)
1089 0 : page_cluster = 2;
1090 : else
1091 1 : page_cluster = 3;
1092 : /*
1093 : * Right now, other parts of the system mean that we
1094 : * _really_ don't want to cluster much more
1095 : */
1096 1 : }
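
For reference, page_cluster is an order rather than a count: swapin readahead may bring in up to 1 << page_cluster pages around the faulting entry, and the value can be tuned at runtime via /proc/sys/vm/page-cluster (see the vm.rst document cited at the top of this file). A tiny illustrative expression (the variable name is made up):

        unsigned long swapin_window = 1UL << page_cluster;      /* 8 pages when page_cluster == 3 */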
1097 :
1098 : #ifdef CONFIG_DEV_PAGEMAP_OPS
1099 : void put_devmap_managed_page(struct page *page)
1100 : {
1101 : int count;
1102 :
1103 : if (WARN_ON_ONCE(!page_is_devmap_managed(page)))
1104 : return;
1105 :
1106 : count = page_ref_dec_return(page);
1107 :
1108 : /*
1109 : * devmap page refcounts are 1-based, rather than 0-based: if
1110 : * refcount is 1, then the page is free and the refcount is
1111 : * stable because nobody holds a reference on the page.
1112 : */
1113 : if (count == 1)
1114 : free_devmap_managed_page(page);
1115 : else if (!count)
1116 : __put_page(page);
1117 : }
1118 : EXPORT_SYMBOL(put_devmap_managed_page);
1119 : #endif