// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/page_reporting.h>
#include <linux/gfp.h>
#include <linux/export.h>
#include <linux/delay.h>
#include <linux/scatterlist.h>

#include "page_reporting.h"
#include "internal.h"

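/* Delay between reporting passes, so that reporting requests can batch up */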
#define PAGE_REPORTING_DELAY (2 * HZ)
static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;

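/*
 * Reporting state machine: IDLE means no work is scheduled, REQUESTED
 * means a reporting pass has been scheduled but has not yet started,
 * and ACTIVE means the worker is currently walking the zones.
 */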
enum {
	PAGE_REPORTING_IDLE = 0,
	PAGE_REPORTING_REQUESTED,
	PAGE_REPORTING_ACTIVE
};

/* request page reporting */
static void
__page_reporting_request(struct page_reporting_dev_info *prdev)
{
	unsigned int state;

	/* Check to see if we are in desired state */
	state = atomic_read(&prdev->state);
	if (state == PAGE_REPORTING_REQUESTED)
		return;

	/*
	 * If reporting is already active there is nothing we need to do.
	 * Test against PAGE_REPORTING_IDLE (0); only a transition from
	 * idle should schedule new work.
	 */
	state = atomic_xchg(&prdev->state, PAGE_REPORTING_REQUESTED);
	if (state != PAGE_REPORTING_IDLE)
		return;

	/*
	 * Delay the start of work to allow a sizable queue to build. For
	 * now we are limiting this to running no more than once every
	 * couple of seconds.
	 */
	schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
}

/* notify prdev of free page reporting request */
void __page_reporting_notify(void)
{
	struct page_reporting_dev_info *prdev;

	/*
	 * We use RCU to protect the pr_dev_info pointer. In almost all
	 * cases this should be present, however in the unlikely case of
	 * a shutdown this will be NULL and we should exit.
	 */
	rcu_read_lock();
	prdev = rcu_dereference(pr_dev_info);
	if (likely(prdev))
		__page_reporting_request(prdev);

	rcu_read_unlock();
}

static void
page_reporting_drain(struct page_reporting_dev_info *prdev,
		     struct scatterlist *sgl, unsigned int nents, bool reported)
{
	struct scatterlist *sg = sgl;

	/*
	 * Drain the now reported pages back into their respective
	 * free lists/areas. We assume at least one page is populated.
	 */
	do {
		struct page *page = sg_page(sg);
		int mt = get_pageblock_migratetype(page);
		unsigned int order = get_order(sg->length);

		__putback_isolated_page(page, order, mt);

		/* If the pages were not reported due to an error, skip flagging */
		if (!reported)
			continue;

		/*
		 * If the page was not commingled with another page we can
		 * consider the result to be "reported" since the page
		 * hasn't been modified, otherwise we will need to
		 * report on the new larger page when we make our way
		 * up to that higher order.
		 */
		if (PageBuddy(page) && buddy_order(page) == order)
			__SetPageReported(page);
	} while ((sg = sg_next(sg)));

	/* reinitialize scatterlist now that it is empty */
	sg_init_table(sgl, nents);
}

/*
 * The page reporting cycle consists of 4 stages: fill, report, drain, and
 * idle. We will cycle through the first 3 stages until we cannot obtain a
 * full scatterlist of pages; in that case we will switch to idle.
 */
static int
page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
		     unsigned int order, unsigned int mt,
		     struct scatterlist *sgl, unsigned int *offset)
{
	struct free_area *area = &zone->free_area[order];
	struct list_head *list = &area->free_list[mt];
	unsigned int page_len = PAGE_SIZE << order;
	struct page *page, *next;
	long budget;
	int err = 0;

	/*
	 * Perform an early check: if the free area is empty there is
	 * nothing to process, so we can skip this free_list.
	 */
	if (list_empty(list))
		return err;

	spin_lock_irq(&zone->lock);

	/*
	 * Limit how many calls we will be making to the page reporting
	 * device for this list. By doing this we avoid processing any
	 * given list for too long.
	 *
	 * The current value used allows us enough calls to process over a
	 * sixteenth of the current list plus one additional call to handle
	 * any pages that may have already been present from the previous
	 * list processed. This should result in us reporting all pages on
	 * an idle system in about 30 seconds.
	 *
	 * The division here should be cheap since PAGE_REPORTING_CAPACITY
	 * should always be a power of 2.
	 */
	budget = DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16);

	/* loop through free list adding unreported pages to sg list */
	list_for_each_entry_safe(page, next, list, lru) {
		/* We are going to skip over the reported pages. */
		if (PageReported(page))
			continue;

		/*
		 * If we fully consumed our budget then update our
		 * state to indicate that we are requesting additional
		 * processing and exit this list.
		 */
		if (budget < 0) {
			atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED);
			next = page;
			break;
		}

		/* Attempt to pull page from list and place in scatterlist */
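		/*
		 * The scatterlist fills from the end: *offset is the index
		 * of the next free slot, counting down from
		 * PAGE_REPORTING_CAPACITY. Once it reaches zero the list
		 * is full and we fall through below to report it.
		 */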
		if (*offset) {
			if (!__isolate_free_page(page, order)) {
				next = page;
				break;
			}

			/* Add page to scatter list */
			--(*offset);
			sg_set_page(&sgl[*offset], page, page_len, 0);

			continue;
		}

		/*
		 * Make the first non-reported page in the free list
		 * the new head of the free list before we release the
		 * zone lock.
		 */
		if (!list_is_first(&page->lru, list))
			list_rotate_to_front(&page->lru, list);

		/* release lock before waiting on report processing */
		spin_unlock_irq(&zone->lock);

		/* begin processing pages in local list */
		err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY);

		/* reset offset since the full list was reported */
		*offset = PAGE_REPORTING_CAPACITY;

		/* update budget to reflect call to report function */
		budget--;

		/* reacquire zone lock and resume processing */
		spin_lock_irq(&zone->lock);

		/* flush reported pages from the sg list */
		page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err);

		/*
		 * Reset next to the first entry; the old next isn't valid
		 * since we dropped the lock to report the pages.
		 */
		next = list_first_entry(list, struct page, lru);

		/* exit on error */
		if (err)
			break;
	}

	/* Rotate any leftover pages to the head of the freelist */
	if (!list_entry_is_head(next, list, lru) && !list_is_first(&next->lru, list))
		list_rotate_to_front(&next->lru, list);

	spin_unlock_irq(&zone->lock);

	return err;
}

static int
page_reporting_process_zone(struct page_reporting_dev_info *prdev,
			    struct scatterlist *sgl, struct zone *zone)
{
	unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY;
	unsigned long watermark;
	int err = 0;

	/* Generate minimum watermark to be able to guarantee progress */
	watermark = low_wmark_pages(zone) +
		    (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER);
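	/*
	 * That is, the low watermark plus one full scatterlist worth of
	 * minimum-order pages, so isolating a batch cannot push the zone
	 * below its reserves.
	 */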

	/* Cancel the request if there is insufficient free memory */
	if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
		return err;

	/* Process each free list starting from lowest order/mt */
	for (order = PAGE_REPORTING_MIN_ORDER; order < MAX_ORDER; order++) {
		for (mt = 0; mt < MIGRATE_TYPES; mt++) {
			/* We do not pull pages from the isolate free list */
			if (is_migrate_isolate(mt))
				continue;

			err = page_reporting_cycle(prdev, zone, order, mt,
						   sgl, &offset);
			if (err)
				return err;
		}
	}

	/* report the leftover pages before going idle */
	leftover = PAGE_REPORTING_CAPACITY - offset;
	if (leftover) {
		sgl = &sgl[offset];
		err = prdev->report(prdev, sgl, leftover);

		/* flush any remaining pages out from the last report */
		spin_lock_irq(&zone->lock);
		page_reporting_drain(prdev, sgl, leftover, !err);
		spin_unlock_irq(&zone->lock);
	}

	return err;
}

static void page_reporting_process(struct work_struct *work)
{
	struct delayed_work *d_work = to_delayed_work(work);
	struct page_reporting_dev_info *prdev =
		container_of(d_work, struct page_reporting_dev_info, work);
	int err = 0, state = PAGE_REPORTING_ACTIVE;
	struct scatterlist *sgl;
	struct zone *zone;

	/*
	 * Change the state to "Active" so that we can track whether anyone
	 * requests page reporting after we complete our pass. If the state
	 * is not altered by the end of the pass we will switch to idle and
	 * quit scheduling reporting runs.
	 */
	atomic_set(&prdev->state, state);

	/* allocate scatterlist to store pages being reported on */
	sgl = kmalloc_array(PAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL);
	if (!sgl)
		goto err_out;

	sg_init_table(sgl, PAGE_REPORTING_CAPACITY);

	for_each_zone(zone) {
		err = page_reporting_process_zone(prdev, sgl, zone);
		if (err)
			break;
	}

	kfree(sgl);
err_out:
	/*
	 * If the state has reverted back to requested then there may be
	 * additional pages to be processed. We will defer for 2s to allow
	 * more pages to accumulate.
	 */
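	/*
	 * Use cmpxchg rather than a plain store so that a request which
	 * raced in during the pass is not lost: the state only drops back
	 * to IDLE if it is still ACTIVE.
	 */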
	state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE);
	if (state == PAGE_REPORTING_REQUESTED)
		schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
}

static DEFINE_MUTEX(page_reporting_mutex);
DEFINE_STATIC_KEY_FALSE(page_reporting_enabled);

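/*
 * A minimal sketch of how a driver might hook into free page reporting;
 * the "foo" names are hypothetical, and only the report callback plus the
 * register/unregister calls below belong to this interface:
 *
 *	static int foo_report(struct page_reporting_dev_info *prdev,
 *			      struct scatterlist *sgl, unsigned int nents)
 *	{
 *		// Hand each scatterlist entry to the hypervisor here.
 *		// Return 0 on success; a nonzero error stops the pass and
 *		// leaves the pages unflagged so they are retried later.
 *		return 0;
 *	}
 *
 *	static struct page_reporting_dev_info foo_prdev = {
 *		.report = foo_report,
 *	};
 *
 *	err = page_reporting_register(&foo_prdev);
 *	...
 *	page_reporting_unregister(&foo_prdev);
 */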
int page_reporting_register(struct page_reporting_dev_info *prdev)
{
	int err = 0;

	mutex_lock(&page_reporting_mutex);

	/* nothing to do if already in use */
	if (rcu_access_pointer(pr_dev_info)) {
		err = -EBUSY;
		goto err_out;
	}

	/* initialize state and work structures */
	atomic_set(&prdev->state, PAGE_REPORTING_IDLE);
	INIT_DELAYED_WORK(&prdev->work, &page_reporting_process);

	/* Begin initial flush of zones */
	__page_reporting_request(prdev);

	/* Assign device to allow notifications */
	rcu_assign_pointer(pr_dev_info, prdev);

	/* enable page reporting notification */
	if (!static_key_enabled(&page_reporting_enabled)) {
		static_branch_enable(&page_reporting_enabled);
		pr_info("Free page reporting enabled\n");
	}
err_out:
	mutex_unlock(&page_reporting_mutex);

	return err;
}
EXPORT_SYMBOL_GPL(page_reporting_register);

void page_reporting_unregister(struct page_reporting_dev_info *prdev)
{
	mutex_lock(&page_reporting_mutex);

	if (rcu_access_pointer(pr_dev_info) == prdev) {
		/* Disable page reporting notification */
		RCU_INIT_POINTER(pr_dev_info, NULL);
		synchronize_rcu();

		/* Flush any existing work, and lock it out */
		cancel_delayed_work_sync(&prdev->work);
	}

	mutex_unlock(&page_reporting_mutex);
}
EXPORT_SYMBOL_GPL(page_reporting_unregister);