Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : #include <linux/kernel.h>
3 : #include <linux/errno.h>
4 : #include <linux/err.h>
5 : #include <linux/spinlock.h>
6 :
7 : #include <linux/mm.h>
8 : #include <linux/memremap.h>
9 : #include <linux/pagemap.h>
10 : #include <linux/rmap.h>
11 : #include <linux/swap.h>
12 : #include <linux/swapops.h>
13 :
14 : #include <linux/sched/signal.h>
15 : #include <linux/rwsem.h>
16 : #include <linux/hugetlb.h>
17 : #include <linux/migrate.h>
18 : #include <linux/mm_inline.h>
19 : #include <linux/sched/mm.h>
20 :
21 : #include <asm/mmu_context.h>
22 : #include <asm/tlbflush.h>
23 :
24 : #include "internal.h"
25 :
26 : struct follow_page_context {
27 : struct dev_pagemap *pgmap;
28 : unsigned int page_mask;
29 : };
30 :
31 0 : static void hpage_pincount_add(struct page *page, int refs)
32 : {
33 0 : VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
34 0 : VM_BUG_ON_PAGE(page != compound_head(page), page);
35 :
36 0 : atomic_add(refs, compound_pincount_ptr(page));
37 0 : }
38 :
39 0 : static void hpage_pincount_sub(struct page *page, int refs)
40 : {
41 0 : VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
42 0 : VM_BUG_ON_PAGE(page != compound_head(page), page);
43 :
44 0 : atomic_sub(refs, compound_pincount_ptr(page));
45 0 : }
46 :
47 : /*
48 : * Return the compound head page with ref appropriately incremented,
49 : * or NULL if that failed.
50 : */
51 2 : static inline struct page *try_get_compound_head(struct page *page, int refs)
52 : {
53 2 : struct page *head = compound_head(page);
54 :
55 2 : if (WARN_ON_ONCE(page_ref_count(head) < 0))
56 : return NULL;
57 2 : if (unlikely(!page_cache_add_speculative(head, refs)))
58 0 : return NULL;
59 : return head;
60 : }
61 :
62 : /*
63 : * try_grab_compound_head() - attempt to elevate a page's refcount, by a
64 : * flags-dependent amount.
65 : *
66 : * "grab" names in this file mean, "look at flags to decide whether to use
67 : * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
68 : *
69 : * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
70 : * same time. (That's true throughout the get_user_pages*() and
71 : * pin_user_pages*() APIs.) Cases:
72 : *
73 : * FOLL_GET: page's refcount will be incremented by 1.
74 : * FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS.
75 : *
76 : * Return: head page (with refcount appropriately incremented) for success, or
77 : * NULL upon failure. If neither FOLL_GET nor FOLL_PIN was set, that's
78 : * considered failure, and furthermore, a likely bug in the caller, so a warning
79 : * is also emitted.
80 : */
81 2 : __maybe_unused struct page *try_grab_compound_head(struct page *page,
82 : int refs, unsigned int flags)
83 : {
84 2 : if (flags & FOLL_GET)
85 2 : return try_get_compound_head(page, refs);
86 0 : else if (flags & FOLL_PIN) {
87 0 : int orig_refs = refs;
88 :
89 : /*
90 : * Can't do FOLL_LONGTERM + FOLL_PIN with CMA in the gup fast
91 : * path, so fail and let the caller fall back to the slow path.
92 : */
93 0 : if (unlikely(flags & FOLL_LONGTERM) &&
94 : is_migrate_cma_page(page))
95 : return NULL;
96 :
97 : /*
98 : * When pinning a compound page of order > 1 (which is what
99 : * hpage_pincount_available() checks for), use an exact count to
100 : * track it, via hpage_pincount_add/_sub().
101 : *
102 : * However, be sure to *also* increment the normal page refcount
103 : * field at least once, so that the page really is pinned.
104 : */
105 0 : if (!hpage_pincount_available(page))
106 0 : refs *= GUP_PIN_COUNTING_BIAS;
107 :
108 0 : page = try_get_compound_head(page, refs);
109 0 : if (!page)
110 : return NULL;
111 :
112 0 : if (hpage_pincount_available(page))
113 0 : hpage_pincount_add(page, refs);
114 :
115 0 : mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED,
116 : orig_refs);
117 :
118 0 : return page;
119 : }
120 :
121 0 : WARN_ON_ONCE(1);
122 0 : return NULL;
123 : }
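
/*
 * Illustrative only, not part of mm/gup.c: a minimal sketch of the calling
 * convention described above, in the style of a gup-fast pte walker. The
 * function name and the "pages"/"nr" output parameters are hypothetical,
 * and the real fast path also re-checks the pte after grabbing.
 */
static bool example_grab_from_pte(pte_t pte, unsigned int flags,
				  struct page **pages, int *nr)
{
	struct page *head, *page = pte_page(pte);

	head = try_grab_compound_head(page, 1, flags);
	if (!head)
		return false;	/* caller falls back to the slow path */

	pages[(*nr)++] = page;
	return true;
}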
124 :
125 0 : static void put_compound_head(struct page *page, int refs, unsigned int flags)
126 : {
127 0 : if (flags & FOLL_PIN) {
128 0 : mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED,
129 : refs);
130 :
131 0 : if (hpage_pincount_available(page))
132 0 : hpage_pincount_sub(page, refs);
133 : else
134 0 : refs *= GUP_PIN_COUNTING_BIAS;
135 : }
136 :
137 0 : VM_BUG_ON_PAGE(page_ref_count(page) < refs, page);
138 : /*
139 : * Calling put_page() for each ref is unnecessarily slow. Only the last
140 : * ref needs a put_page().
141 : */
142 0 : if (refs > 1)
143 0 : page_ref_sub(page, refs - 1);
144 0 : put_page(page);
145 0 : }
146 :
147 : /**
148 : * try_grab_page() - elevate a page's refcount by a flag-dependent amount
149 : *
150 : * This might not do anything at all, depending on the flags argument.
151 : *
152 : * "grab" names in this file mean, "look at flags to decide whether to use
153 : * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
154 : *
155 : * @page: pointer to page to be grabbed
156 : * @flags: gup flags: these are the FOLL_* flag values.
157 : *
158 : * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
159 : * time. Cases:
160 : *
161 : * FOLL_GET: page's refcount will be incremented by 1.
162 : * FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS.
163 : *
164 : * Return: true for success, or if no action was required (if neither FOLL_PIN
165 : * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or
166 : * FOLL_PIN was set, but the page could not be grabbed.
167 : */
168 7798 : bool __must_check try_grab_page(struct page *page, unsigned int flags)
169 : {
170 7798 : WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));
171 :
172 7798 : if (flags & FOLL_GET)
173 7772 : return try_get_page(page);
174 26 : else if (flags & FOLL_PIN) {
175 0 : int refs = 1;
176 :
177 0 : page = compound_head(page);
178 :
179 0 : if (WARN_ON_ONCE(page_ref_count(page) <= 0))
180 : return false;
181 :
182 0 : if (hpage_pincount_available(page))
183 0 : hpage_pincount_add(page, 1);
184 : else
185 : refs = GUP_PIN_COUNTING_BIAS;
186 :
187 : /*
188 : * Similar to try_grab_compound_head(): even if using the
189 : * hpage_pincount_add/_sub() routines, be sure to
190 : * *also* increment the normal page refcount field at least
191 : * once, so that the page really is pinned.
192 : */
193 0 : page_ref_add(page, refs);
194 :
195 0 : mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, 1);
196 : }
197 :
198 : return true;
199 : }
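
/*
 * Illustrative only, not part of mm/gup.c: the usual caller pattern for
 * try_grab_page(), mirroring what the follow_page_*() helpers below do.
 * The caller must hold the page table lock (or otherwise keep @page alive)
 * across the call; "example_follow_one" is a hypothetical name.
 */
static struct page *example_follow_one(struct page *page, unsigned int flags)
{
	if (unlikely(!try_grab_page(page, flags)))
		return ERR_PTR(-ENOMEM);
	return page;
}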
200 :
201 : /**
202 : * unpin_user_page() - release a dma-pinned page
203 : * @page: pointer to page to be released
204 : *
205 : * Pages that were pinned via pin_user_pages*() must be released via either
206 : * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
207 : * that such pages can be separately tracked and uniquely handled. In
208 : * particular, interactions with RDMA and filesystems need special handling.
209 : */
210 0 : void unpin_user_page(struct page *page)
211 : {
212 0 : put_compound_head(compound_head(page), 1, FOLL_PIN);
213 0 : }
214 : EXPORT_SYMBOL(unpin_user_page);
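
/*
 * Illustrative only, not part of mm/gup.c: the pin/unpin pairing that the
 * comment above requires. A page obtained through pin_user_pages*() must be
 * released with unpin_user_page()/unpin_user_pages(), never with put_page().
 * "uaddr" is a hypothetical user address supplied by the caller.
 */
static int example_pin_one_page(unsigned long uaddr)
{
	struct page *page;
	int ret;

	ret = pin_user_pages_fast(uaddr, 1, FOLL_WRITE, &page);
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	/* ... access or DMA into the pinned page here ... */

	unpin_user_page(page);
	return 0;
}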
215 :
216 : /**
217 : * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
218 : * @pages: array of pages to be maybe marked dirty, and definitely released.
219 : * @npages: number of pages in the @pages array.
220 : * @make_dirty: whether to mark the pages dirty
221 : *
222 : * "gup-pinned page" refers to a page that has had one of the get_user_pages()
223 : * variants called on that page.
224 : *
225 : * For each page in the @pages array, make that page (or its head page, if a
226 : * compound page) dirty, if @make_dirty is true, and if the page was previously
227 : * listed as clean. In any case, releases all pages using unpin_user_page(),
228 : * possibly via unpin_user_pages(), for the non-dirty case.
229 : *
230 : * Please see the unpin_user_page() documentation for details.
231 : *
232 : * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
233 : * required, then the caller should a) verify that this is really correct,
234 : * because _lock() is usually required, and b) hand code the loop itself:
235 : * set_page_dirty(), followed by unpin_user_page().
236 : *
237 : */
238 0 : void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
239 : bool make_dirty)
240 : {
241 0 : unsigned long index;
242 :
243 : /*
244 : * TODO: this can be optimized for huge pages: if a series of pages is
245 : * physically contiguous and part of the same compound page, then a
246 : * single operation to the head page should suffice.
247 : */
248 :
249 0 : if (!make_dirty) {
250 0 : unpin_user_pages(pages, npages);
251 0 : return;
252 : }
253 :
254 0 : for (index = 0; index < npages; index++) {
255 0 : struct page *page = compound_head(pages[index]);
256 : /*
257 : * Checking PageDirty at this point may race with
258 : * clear_page_dirty_for_io(), but that's OK. Two key
259 : * cases:
260 : *
261 : * 1) This code sees the page as already dirty, so it
262 : * skips the call to set_page_dirty(). That could happen
263 : * because clear_page_dirty_for_io() called
264 : * page_mkclean(), followed by set_page_dirty().
265 : * However, now the page is going to get written back,
266 : * which meets the original intention of setting it
267 : * dirty, so all is well: clear_page_dirty_for_io() goes
268 : * on to call TestClearPageDirty(), and write the page
269 : * back.
270 : *
271 : * 2) This code sees the page as clean, so it calls
272 : * set_page_dirty(). The page stays dirty, despite being
273 : * written back, so it gets written back again in the
274 : * next writeback cycle. This is harmless.
275 : */
276 0 : if (!PageDirty(page))
277 0 : set_page_dirty_lock(page);
278 0 : unpin_user_page(page);
279 : }
280 : }
281 : EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
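
/*
 * Illustrative only, not part of mm/gup.c: a sketch of the intended use of
 * unpin_user_pages_dirty_lock(). The pages are pinned writable, written to
 * (e.g. by a device), and then released and dirtied in one call. "uaddr",
 * "npages" and "pages" are hypothetical caller-supplied values.
 */
static int example_receive_into_user_buffer(unsigned long uaddr,
					    unsigned long npages,
					    struct page **pages)
{
	int pinned;

	pinned = pin_user_pages_fast(uaddr, npages, FOLL_WRITE, pages);
	if (pinned <= 0)
		return pinned;

	/* ... data is written into pages[0..pinned-1] here ... */

	unpin_user_pages_dirty_lock(pages, pinned, true);
	return pinned;
}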
282 :
283 : /**
284 : * unpin_user_pages() - release an array of gup-pinned pages.
285 : * @pages: array of pages to be released.
286 : * @npages: number of pages in the @pages array.
287 : *
288 : * For each page in the @pages array, release the page using unpin_user_page().
289 : *
290 : * Please see the unpin_user_page() documentation for details.
291 : */
292 0 : void unpin_user_pages(struct page **pages, unsigned long npages)
293 : {
294 0 : unsigned long index;
295 :
296 : /*
297 : * If this WARN_ON() fires, then the system *might* be leaking pages (by
298 : * leaving them pinned), but probably not. More likely, gup/pup returned
299 : * a hard -ERRNO error to the caller, who erroneously passed it here.
300 : */
301 0 : if (WARN_ON(IS_ERR_VALUE(npages)))
302 : return;
303 : /*
304 : * TODO: this can be optimized for huge pages: if a series of pages is
305 : * physically contiguous and part of the same compound page, then a
306 : * single operation to the head page should suffice.
307 : */
308 0 : for (index = 0; index < npages; index++)
309 0 : unpin_user_page(pages[index]);
310 : }
311 : EXPORT_SYMBOL(unpin_user_pages);
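
/*
 * Illustrative only, not part of mm/gup.c: releasing a read-only pin. Note
 * the point made by the WARN_ON() above: only a non-negative count of pages
 * that were actually pinned may be passed as @npages, never a raw -ERRNO
 * return value from pin_user_pages*().
 */
static void example_release_pins(struct page **pages, long pinned)
{
	if (pinned > 0)
		unpin_user_pages(pages, pinned);
}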
312 :
313 : #ifdef CONFIG_MMU
314 2508 : static struct page *no_page_table(struct vm_area_struct *vma,
315 : unsigned int flags)
316 : {
317 : /*
318 : * When core dumping an enormous anonymous area that nobody
319 : * has touched so far, we don't want to allocate unnecessary pages or
320 : * page tables. Return error instead of NULL to skip handle_mm_fault,
321 : * then get_dump_page() will return NULL to leave a hole in the dump.
322 : * But we can only make this optimization where a hole would surely
323 : * be zero-filled if handle_mm_fault() actually did handle it.
324 : */
325 2508 : if ((flags & FOLL_DUMP) &&
326 0 : (vma_is_anonymous(vma) || !vma->vm_ops->fault))
327 0 : return ERR_PTR(-EFAULT);
328 : return NULL;
329 : }
330 :
331 0 : static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
332 : pte_t *pte, unsigned int flags)
333 : {
334 : /* No page to get reference */
335 0 : if (flags & FOLL_GET)
336 : return -EFAULT;
337 :
338 0 : if (flags & FOLL_TOUCH) {
339 0 : pte_t entry = *pte;
340 :
341 0 : if (flags & FOLL_WRITE)
342 0 : entry = pte_mkdirty(entry);
343 0 : entry = pte_mkyoung(entry);
344 :
345 0 : if (!pte_same(*pte, entry)) {
346 0 : set_pte_at(vma->vm_mm, address, pte, entry);
347 0 : update_mmu_cache(vma, address, pte);
348 : }
349 : }
350 :
351 : /* Proper page table entry exists, but no corresponding struct page */
352 : return -EEXIST;
353 : }
354 :
355 : /*
356 : * FOLL_FORCE can write to even unwritable pte's, but only
357 : * after we've gone through a COW cycle and they are dirty.
358 : */
359 7524 : static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
360 : {
361 7524 : return pte_write(pte) ||
362 0 : ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
363 : }
364 :
365 7833 : static struct page *follow_page_pte(struct vm_area_struct *vma,
366 : unsigned long address, pmd_t *pmd, unsigned int flags,
367 : struct dev_pagemap **pgmap)
368 : {
369 7833 : struct mm_struct *mm = vma->vm_mm;
370 7833 : struct page *page;
371 7833 : spinlock_t *ptl;
372 7833 : pte_t *ptep, pte;
373 7833 : int ret;
374 :
375 : /* FOLL_GET and FOLL_PIN are mutually exclusive. */
376 7833 : if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
377 : (FOLL_PIN | FOLL_GET)))
378 7833 : return ERR_PTR(-EINVAL);
379 7833 : retry:
380 15666 : if (unlikely(pmd_bad(*pmd)))
381 0 : return no_page_table(vma, flags);
382 :
383 7833 : ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
384 7833 : pte = *ptep;
385 7833 : if (!pte_present(pte)) {
386 35 : swp_entry_t entry;
387 : /*
388 : * KSM's break_ksm() relies upon recognizing a ksm page
389 : * even while it is being migrated, so for that case we
390 : * need migration_entry_wait().
391 : */
392 35 : if (likely(!(flags & FOLL_MIGRATION)))
393 35 : goto no_page;
394 0 : if (pte_none(pte))
395 0 : goto no_page;
396 0 : entry = pte_to_swp_entry(pte);
397 0 : if (!is_migration_entry(entry))
398 0 : goto no_page;
399 0 : pte_unmap_unlock(ptep, ptl);
400 0 : migration_entry_wait(mm, pmd, address);
401 0 : goto retry;
402 : }
403 7798 : if ((flags & FOLL_NUMA) && pte_protnone(pte))
404 : goto no_page;
405 15322 : if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
406 0 : pte_unmap_unlock(ptep, ptl);
407 0 : return NULL;
408 : }
409 :
410 7798 : page = vm_normal_page(vma, address, pte);
411 7798 : if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
412 : /*
413 : * Only return device mapping pages in the FOLL_GET or FOLL_PIN
414 : * case since they are only valid while holding the pgmap
415 : * reference.
416 : */
417 0 : *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
418 0 : if (*pgmap)
419 : page = pte_page(pte);
420 : else
421 0 : goto no_page;
422 7798 : } else if (unlikely(!page)) {
423 0 : if (flags & FOLL_DUMP) {
424 : /* Avoid special (like zero) pages in core dumps */
425 0 : page = ERR_PTR(-EFAULT);
426 0 : goto out;
427 : }
428 :
429 0 : if (is_zero_pfn(pte_pfn(pte))) {
430 0 : page = pte_page(pte);
431 : } else {
432 0 : ret = follow_pfn_pte(vma, address, ptep, flags);
433 0 : page = ERR_PTR(ret);
434 0 : goto out;
435 : }
436 : }
437 :
438 7798 : if (flags & FOLL_SPLIT && PageTransCompound(page)) {
439 0 : get_page(page);
440 0 : pte_unmap_unlock(ptep, ptl);
441 0 : lock_page(page);
442 0 : ret = split_huge_page(page);
443 0 : unlock_page(page);
444 0 : put_page(page);
445 0 : if (ret)
446 0 : return ERR_PTR(ret);
447 0 : goto retry;
448 : }
449 :
450 : /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
451 7798 : if (unlikely(!try_grab_page(page, flags))) {
452 0 : page = ERR_PTR(-ENOMEM);
453 0 : goto out;
454 : }
455 : /*
456 : * We need to make the page accessible if and only if we are going
457 : * to access its content (the FOLL_PIN case). Please see
458 : * Documentation/core-api/pin_user_pages.rst for details.
459 : */
460 7798 : if (flags & FOLL_PIN) {
461 7798 : ret = arch_make_page_accessible(page);
462 : if (ret) {
463 : unpin_user_page(page);
464 : page = ERR_PTR(ret);
465 : goto out;
466 : }
467 : }
468 7798 : if (flags & FOLL_TOUCH) {
469 7796 : if ((flags & FOLL_WRITE) &&
470 7524 : !pte_dirty(pte) && !PageDirty(page))
471 0 : set_page_dirty(page);
472 : /*
473 : * pte_mkyoung() would be more correct here, but atomic care
474 : * is needed to avoid losing the dirty bit: it is easier to use
475 : * mark_page_accessed().
476 : */
477 7796 : mark_page_accessed(page);
478 : }
479 7798 : if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
480 : /* Do not mlock pte-mapped THP */
481 16 : if (PageTransCompound(page))
482 0 : goto out;
483 :
484 : /*
485 : * The preliminary mapping check is mainly to avoid the
486 : * pointless overhead of lock_page on the ZERO_PAGE
487 : * which might bounce very badly if there is contention.
488 : *
489 : * If the page is already locked, we don't need to
490 : * handle it now - vmscan will handle it later if and
491 : * when it attempts to reclaim the page.
492 : */
493 16 : if (page->mapping && trylock_page(page)) {
494 16 : lru_add_drain(); /* push cached pages to LRU */
495 : /*
496 : * Because we lock page here, and migration is
497 : * blocked by the pte's page reference, and we
498 : * know the page is still mapped, we don't even
499 : * need to check for file-cache page truncation.
500 : */
501 16 : mlock_vma_page(page);
502 16 : unlock_page(page);
503 : }
504 : }
505 7782 : out:
506 7798 : pte_unmap_unlock(ptep, ptl);
507 7798 : return page;
508 35 : no_page:
509 35 : pte_unmap_unlock(ptep, ptl);
510 35 : if (!pte_none(pte))
511 : return NULL;
512 35 : return no_page_table(vma, flags);
513 : }
514 :
515 7833 : static struct page *follow_pmd_mask(struct vm_area_struct *vma,
516 : unsigned long address, pud_t *pudp,
517 : unsigned int flags,
518 : struct follow_page_context *ctx)
519 : {
520 7833 : pmd_t *pmd, pmdval;
521 7833 : spinlock_t *ptl;
522 7833 : struct page *page;
523 7833 : struct mm_struct *mm = vma->vm_mm;
524 :
525 7833 : pmd = pmd_offset(pudp, address);
526 : /*
527 : * The READ_ONCE() will stabilize the pmdval in a register or
528 : * on the stack so that it will stop changing under the code.
529 : */
530 7833 : pmdval = READ_ONCE(*pmd);
531 7833 : if (pmd_none(pmdval))
532 0 : return no_page_table(vma, flags);
533 : if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
534 : page = follow_huge_pmd(mm, address, pmd, flags);
535 : if (page)
536 : return page;
537 : return no_page_table(vma, flags);
538 : }
539 : if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
540 : page = follow_huge_pd(vma, address,
541 : __hugepd(pmd_val(pmdval)), flags,
542 : PMD_SHIFT);
543 : if (page)
544 : return page;
545 : return no_page_table(vma, flags);
546 : }
547 7833 : retry:
548 15666 : if (!pmd_present(pmdval)) {
549 0 : if (likely(!(flags & FOLL_MIGRATION)))
550 0 : return no_page_table(vma, flags);
551 0 : VM_BUG_ON(thp_migration_supported() &&
552 : !is_pmd_migration_entry(pmdval));
553 0 : if (is_pmd_migration_entry(pmdval))
554 0 : pmd_migration_entry_wait(mm, pmd);
555 0 : pmdval = READ_ONCE(*pmd);
556 : /*
557 : * MADV_DONTNEED may convert the pmd to null because
558 : * mmap_lock is held in read mode
559 : */
560 0 : if (pmd_none(pmdval))
561 0 : return no_page_table(vma, flags);
562 0 : goto retry;
563 : }
564 7833 : if (pmd_devmap(pmdval)) {
565 0 : ptl = pmd_lock(mm, pmd);
566 0 : page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
567 0 : spin_unlock(ptl);
568 0 : if (page)
569 : return page;
570 : }
571 7833 : if (likely(!pmd_trans_huge(pmdval)))
572 7833 : return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
573 :
574 0 : if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
575 : return no_page_table(vma, flags);
576 :
577 : retry_locked:
578 0 : ptl = pmd_lock(mm, pmd);
579 0 : if (unlikely(pmd_none(*pmd))) {
580 0 : spin_unlock(ptl);
581 0 : return no_page_table(vma, flags);
582 : }
583 0 : if (unlikely(!pmd_present(*pmd))) {
584 0 : spin_unlock(ptl);
585 0 : if (likely(!(flags & FOLL_MIGRATION)))
586 0 : return no_page_table(vma, flags);
587 0 : pmd_migration_entry_wait(mm, pmd);
588 0 : goto retry_locked;
589 : }
590 0 : if (unlikely(!pmd_trans_huge(*pmd))) {
591 0 : spin_unlock(ptl);
592 0 : return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
593 : }
594 0 : if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
595 0 : int ret;
596 0 : page = pmd_page(*pmd);
597 0 : if (is_huge_zero_page(page)) {
598 0 : spin_unlock(ptl);
599 0 : ret = 0;
600 0 : split_huge_pmd(vma, pmd, address);
601 0 : if (pmd_trans_unstable(pmd))
602 : ret = -EBUSY;
603 0 : } else if (flags & FOLL_SPLIT) {
604 0 : if (unlikely(!try_get_page(page))) {
605 0 : spin_unlock(ptl);
606 0 : return ERR_PTR(-ENOMEM);
607 : }
608 0 : spin_unlock(ptl);
609 0 : lock_page(page);
610 0 : ret = split_huge_page(page);
611 0 : unlock_page(page);
612 0 : put_page(page);
613 0 : if (pmd_none(*pmd))
614 0 : return no_page_table(vma, flags);
615 : } else { /* flags & FOLL_SPLIT_PMD */
616 0 : spin_unlock(ptl);
617 0 : split_huge_pmd(vma, pmd, address);
618 0 : ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
619 : }
620 :
621 0 : return ret ? ERR_PTR(ret) :
622 0 : follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
623 : }
624 0 : page = follow_trans_huge_pmd(vma, address, pmd, flags);
625 0 : spin_unlock(ptl);
626 0 : ctx->page_mask = HPAGE_PMD_NR - 1;
627 0 : return page;
628 : }
629 :
630 7833 : static struct page *follow_pud_mask(struct vm_area_struct *vma,
631 : unsigned long address, p4d_t *p4dp,
632 : unsigned int flags,
633 : struct follow_page_context *ctx)
634 : {
635 7833 : pud_t *pud;
636 7833 : spinlock_t *ptl;
637 7833 : struct page *page;
638 7833 : struct mm_struct *mm = vma->vm_mm;
639 :
640 7833 : pud = pud_offset(p4dp, address);
641 7833 : if (pud_none(*pud))
642 0 : return no_page_table(vma, flags);
643 7833 : if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
644 : page = follow_huge_pud(mm, address, pud, flags);
645 : if (page)
646 : return page;
647 : return no_page_table(vma, flags);
648 : }
649 7833 : if (is_hugepd(__hugepd(pud_val(*pud)))) {
650 : page = follow_huge_pd(vma, address,
651 : __hugepd(pud_val(*pud)), flags,
652 : PUD_SHIFT);
653 : if (page)
654 : return page;
655 : return no_page_table(vma, flags);
656 : }
657 7833 : if (pud_devmap(*pud)) {
658 0 : ptl = pud_lock(mm, pud);
659 0 : page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
660 0 : spin_unlock(ptl);
661 0 : if (page)
662 : return page;
663 : }
664 15666 : if (unlikely(pud_bad(*pud)))
665 0 : return no_page_table(vma, flags);
666 :
667 7833 : return follow_pmd_mask(vma, address, pud, flags, ctx);
668 : }
669 :
670 10306 : static struct page *follow_p4d_mask(struct vm_area_struct *vma,
671 : unsigned long address, pgd_t *pgdp,
672 : unsigned int flags,
673 : struct follow_page_context *ctx)
674 : {
675 10306 : p4d_t *p4d;
676 10306 : struct page *page;
677 :
678 10306 : p4d = p4d_offset(pgdp, address);
679 10306 : if (p4d_none(*p4d))
680 2473 : return no_page_table(vma, flags);
681 7833 : BUILD_BUG_ON(p4d_huge(*p4d));
682 7833 : if (unlikely(p4d_bad(*p4d)))
683 0 : return no_page_table(vma, flags);
684 :
685 7833 : if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
686 : page = follow_huge_pd(vma, address,
687 : __hugepd(p4d_val(*p4d)), flags,
688 : P4D_SHIFT);
689 : if (page)
690 : return page;
691 : return no_page_table(vma, flags);
692 : }
693 7833 : return follow_pud_mask(vma, address, p4d, flags, ctx);
694 : }
695 :
696 : /**
697 : * follow_page_mask - look up a page descriptor from a user-virtual address
698 : * @vma: vm_area_struct mapping @address
699 : * @address: virtual address to look up
700 : * @flags: flags modifying lookup behaviour
701 : * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
702 : * pointer to output page_mask
703 : *
704 : * @flags can have FOLL_ flags set, defined in <linux/mm.h>
705 : *
706 : * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
707 : * the device's dev_pagemap metadata to avoid repeating expensive lookups.
708 : *
709 : * On output, the @ctx->page_mask is set according to the size of the page.
710 : *
711 : * Return: the mapped (struct page *), %NULL if no mapping exists, or
712 : * an error pointer if there is a mapping to something not represented
713 : * by a page descriptor (see also vm_normal_page()).
714 : */
715 10306 : static struct page *follow_page_mask(struct vm_area_struct *vma,
716 : unsigned long address, unsigned int flags,
717 : struct follow_page_context *ctx)
718 : {
719 10306 : pgd_t *pgd;
720 10306 : struct page *page;
721 10306 : struct mm_struct *mm = vma->vm_mm;
722 :
723 10306 : ctx->page_mask = 0;
724 :
725 : /* make this handle hugepd */
726 10306 : page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
727 10306 : if (!IS_ERR(page)) {
728 : WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
729 : return page;
730 : }
731 :
732 10306 : pgd = pgd_offset(mm, address);
733 :
734 10306 : if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
735 : return no_page_table(vma, flags);
736 :
737 10306 : if (pgd_huge(*pgd)) {
738 : page = follow_huge_pgd(mm, address, pgd, flags);
739 : if (page)
740 : return page;
741 : return no_page_table(vma, flags);
742 : }
743 10306 : if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
744 : page = follow_huge_pd(vma, address,
745 : __hugepd(pgd_val(*pgd)), flags,
746 : PGDIR_SHIFT);
747 : if (page)
748 : return page;
749 : return no_page_table(vma, flags);
750 : }
751 :
752 10306 : return follow_p4d_mask(vma, address, pgd, flags, ctx);
753 : }
754 :
755 2 : struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
756 : unsigned int foll_flags)
757 : {
758 2 : struct follow_page_context ctx = { NULL };
759 2 : struct page *page;
760 :
761 2 : page = follow_page_mask(vma, address, foll_flags, &ctx);
762 2 : if (ctx.pgmap)
763 0 : put_dev_pagemap(ctx.pgmap);
764 2 : return page;
765 : }
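
/*
 * Illustrative only, not part of mm/gup.c: a minimal follow_page() caller.
 * The mmap_lock must be held, and the result may be NULL (nothing mapped),
 * an ERR_PTR() (e.g. a pfn mapping without a struct page), or a page whose
 * reference must be dropped because FOLL_GET was passed.
 */
static bool example_addr_is_mapped(struct vm_area_struct *vma,
				   unsigned long addr)
{
	struct page *page;

	mmap_assert_locked(vma->vm_mm);
	page = follow_page(vma, addr, FOLL_GET);
	if (IS_ERR_OR_NULL(page))
		return false;

	put_page(page);		/* balances FOLL_GET */
	return true;
}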
766 :
767 : static int get_gate_page(struct mm_struct *mm, unsigned long address,
768 : unsigned int gup_flags, struct vm_area_struct **vma,
769 : struct page **page)
770 : {
771 : pgd_t *pgd;
772 : p4d_t *p4d;
773 : pud_t *pud;
774 : pmd_t *pmd;
775 : pte_t *pte;
776 : int ret = -EFAULT;
777 :
778 : /* user gate pages are read-only */
779 : if (gup_flags & FOLL_WRITE)
780 : return -EFAULT;
781 : if (address > TASK_SIZE)
782 : pgd = pgd_offset_k(address);
783 : else
784 : pgd = pgd_offset_gate(mm, address);
785 : if (pgd_none(*pgd))
786 : return -EFAULT;
787 : p4d = p4d_offset(pgd, address);
788 : if (p4d_none(*p4d))
789 : return -EFAULT;
790 : pud = pud_offset(p4d, address);
791 : if (pud_none(*pud))
792 : return -EFAULT;
793 : pmd = pmd_offset(pud, address);
794 : if (!pmd_present(*pmd))
795 : return -EFAULT;
796 : VM_BUG_ON(pmd_trans_huge(*pmd));
797 : pte = pte_offset_map(pmd, address);
798 : if (pte_none(*pte))
799 : goto unmap;
800 : *vma = get_gate_vma(mm);
801 : if (!page)
802 : goto out;
803 : *page = vm_normal_page(*vma, address, *pte);
804 : if (!*page) {
805 : if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
806 : goto unmap;
807 : *page = pte_page(*pte);
808 : }
809 : if (unlikely(!try_grab_page(*page, gup_flags))) {
810 : ret = -ENOMEM;
811 : goto unmap;
812 : }
813 : out:
814 : ret = 0;
815 : unmap:
816 : pte_unmap(pte);
817 : return ret;
818 : }
819 :
820 : /*
821 : * mmap_lock must be held on entry. If @locked != NULL and *@flags
822 : * does not include FOLL_NOWAIT, the mmap_lock may be released. If it
823 : * is, *@locked will be set to 0 and -EBUSY returned.
824 : */
825 2508 : static int faultin_page(struct vm_area_struct *vma,
826 : unsigned long address, unsigned int *flags, int *locked)
827 : {
828 2508 : unsigned int fault_flags = 0;
829 2508 : vm_fault_t ret;
830 :
831 : /* mlock all present pages, but do not fault in new pages */
832 2508 : if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
833 : return -ENOENT;
834 2508 : if (*flags & FOLL_WRITE)
835 : fault_flags |= FAULT_FLAG_WRITE;
836 2508 : if (*flags & FOLL_REMOTE)
837 2473 : fault_flags |= FAULT_FLAG_REMOTE;
838 2508 : if (locked)
839 35 : fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
840 2508 : if (*flags & FOLL_NOWAIT)
841 0 : fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
842 2508 : if (*flags & FOLL_TRIED) {
843 : /*
844 : * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED
845 : * can co-exist
846 : */
847 0 : fault_flags |= FAULT_FLAG_TRIED;
848 : }
849 :
850 2508 : ret = handle_mm_fault(vma, address, fault_flags, NULL);
851 2508 : if (ret & VM_FAULT_ERROR) {
852 0 : int err = vm_fault_to_errno(ret, *flags);
853 :
854 0 : if (err)
855 : return err;
856 0 : BUG();
857 : }
858 :
859 2508 : if (ret & VM_FAULT_RETRY) {
860 10 : if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
861 10 : *locked = 0;
862 10 : return -EBUSY;
863 : }
864 :
865 : /*
866 : * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
867 : * necessary, even if maybe_mkwrite decided not to set pte_write. We
868 : * can thus safely do subsequent page lookups as if they were reads.
869 : * But only do so when looping for pte_write is futile: in some cases
870 : * userspace may also be wanting to write to the gotten user page,
871 : * which a read fault here might prevent (a readonly page might get
872 : * reCOWed by userspace write).
873 : */
874 2498 : if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
875 0 : *flags |= FOLL_COW;
876 : return 0;
877 : }
878 :
879 7791 : static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
880 : {
881 7791 : vm_flags_t vm_flags = vma->vm_flags;
882 7791 : int write = (gup_flags & FOLL_WRITE);
883 7791 : int foreign = (gup_flags & FOLL_REMOTE);
884 :
885 7791 : if (vm_flags & (VM_IO | VM_PFNMAP))
886 : return -EFAULT;
887 :
888 7791 : if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
889 : return -EFAULT;
890 :
891 7791 : if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
892 : return -EOPNOTSUPP;
893 :
894 7791 : if (write) {
895 7509 : if (!(vm_flags & VM_WRITE)) {
896 0 : if (!(gup_flags & FOLL_FORCE))
897 : return -EFAULT;
898 : /*
899 : * We used to let the write,force case do COW in a
900 : * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
901 : * set a breakpoint in a read-only mapping of an
902 : * executable, without corrupting the file (yet only
903 : * when that file had been opened for writing!).
904 : * Anon pages in shared mappings are surprising: now
905 : * just reject it.
906 : */
907 0 : if (!is_cow_mapping(vm_flags))
908 : return -EFAULT;
909 : }
910 282 : } else if (!(vm_flags & VM_READ)) {
911 0 : if (!(gup_flags & FOLL_FORCE))
912 : return -EFAULT;
913 : /*
914 : * Is there actually any vma we can reach here which does not
915 : * have VM_MAYREAD set?
916 : */
917 0 : if (!(vm_flags & VM_MAYREAD))
918 : return -EFAULT;
919 : }
920 : /*
921 : * gups are always data accesses, not instruction
922 : * fetches, so execute=false here
923 : */
924 7791 : if (!arch_vma_access_permitted(vma, write, false, foreign))
925 0 : return -EFAULT;
926 : return 0;
927 : }
928 :
929 : /**
930 : * __get_user_pages() - pin user pages in memory
931 : * @mm: mm_struct of target mm
932 : * @start: starting user address
933 : * @nr_pages: number of pages from start to pin
934 : * @gup_flags: flags modifying pin behaviour
935 : * @pages: array that receives pointers to the pages pinned.
936 : * Should be at least nr_pages long. Or NULL, if caller
937 : * only intends to ensure the pages are faulted in.
938 : * @vmas: array of pointers to vmas corresponding to each page.
939 : * Or NULL if the caller does not require them.
940 : * @locked: whether we're still with the mmap_lock held
941 : *
942 : * Returns either number of pages pinned (which may be less than the
943 : * number requested), or an error. Details about the return value:
944 : *
945 : * -- If nr_pages is 0, returns 0.
946 : * -- If nr_pages is >0, but no pages were pinned, returns -errno.
947 : * -- If nr_pages is >0, and some pages were pinned, returns the number of
948 : * pages pinned. Again, this may be less than nr_pages.
949 : * -- 0 return value is possible when the fault would need to be retried.
950 : *
951 : * The caller is responsible for releasing returned @pages, via put_page().
952 : *
953 : * @vmas are valid only as long as mmap_lock is held.
954 : *
955 : * Must be called with mmap_lock held. It may be released. See below.
956 : *
957 : * __get_user_pages walks a process's page tables and takes a reference to
958 : * each struct page that each user address corresponds to at a given
959 : * instant. That is, it takes the page that would be accessed if a user
960 : * thread accesses the given user virtual address at that instant.
961 : *
962 : * This does not guarantee that the page exists in the user mappings when
963 : * __get_user_pages returns, and there may even be a completely different
964 : * page there in some cases (eg. if mmapped pagecache has been invalidated
965 : * and subsequently re-faulted). However it does guarantee that the page
966 : * won't be freed completely. And mostly callers simply care that the page
967 : * contains data that was valid *at some point in time*. Typically, an IO
968 : * or similar operation cannot guarantee anything stronger anyway because
969 : * locks can't be held over the syscall boundary.
970 : *
971 : * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
972 : * the page is written to, set_page_dirty (or set_page_dirty_lock, as
973 : * appropriate) must be called after the page is finished with, and
974 : * before put_page is called.
975 : *
976 : * If @locked != NULL, *@locked will be set to 0 when mmap_lock is
977 : * released by an up_read(). That can happen if @gup_flags does not
978 : * have FOLL_NOWAIT.
979 : *
980 : * A caller using such a combination of @locked and @gup_flags
981 : * must therefore hold the mmap_lock for reading only, and recognize
982 : * when it's been released. Otherwise, it must be held for either
983 : * reading or writing and will not be released.
984 : *
985 : * In most cases, get_user_pages or get_user_pages_fast should be used
986 : * instead of __get_user_pages. __get_user_pages should be used only if
987 : * you need some special @gup_flags.
988 : */
989 7791 : static long __get_user_pages(struct mm_struct *mm,
990 : unsigned long start, unsigned long nr_pages,
991 : unsigned int gup_flags, struct page **pages,
992 : struct vm_area_struct **vmas, int *locked)
993 : {
994 7791 : long ret = 0, i = 0;
995 7791 : struct vm_area_struct *vma = NULL;
996 7791 : struct follow_page_context ctx = { NULL };
997 :
998 7791 : if (!nr_pages)
999 : return 0;
1000 :
1001 7791 : start = untagged_addr(start);
1002 :
1003 7791 : VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
1004 :
1005 : /*
1006 : * If FOLL_FORCE is set then do not force a full fault as the hinting
1007 : * fault information is unrelated to the reference behaviour of a task
1008 : * using the address space
1009 : */
1010 7791 : if (!(gup_flags & FOLL_FORCE))
1011 220 : gup_flags |= FOLL_NUMA;
1012 :
1013 7806 : do {
1014 7806 : struct page *page;
1015 7806 : unsigned int foll_flags = gup_flags;
1016 7806 : unsigned int page_increm;
1017 :
1018 : /* first iteration or cross vma bound */
1019 7806 : if (!vma || start >= vma->vm_end) {
1020 7791 : vma = find_extend_vma(mm, start);
1021 7791 : if (!vma && in_gate_area(mm, start)) {
1022 : ret = get_gate_page(mm, start & PAGE_MASK,
1023 : gup_flags, &vma,
1024 : pages ? &pages[i] : NULL);
1025 : if (ret)
1026 10 : goto out;
1027 : ctx.page_mask = 0;
1028 : goto next_page;
1029 : }
1030 :
1031 7791 : if (!vma) {
1032 0 : ret = -EFAULT;
1033 0 : goto out;
1034 : }
1035 7791 : ret = check_vma_flags(vma, gup_flags);
1036 7791 : if (ret)
1037 0 : goto out;
1038 :
1039 7806 : if (is_vm_hugetlb_page(vma)) {
1040 : i = follow_hugetlb_page(mm, vma, pages, vmas,
1041 : &start, &nr_pages, i,
1042 : gup_flags, locked);
1043 : if (locked && *locked == 0) {
1044 : /*
1045 : * We've got a VM_FAULT_RETRY
1046 : * and we've lost mmap_lock.
1047 : * We must stop here.
1048 : */
1049 : BUG_ON(gup_flags & FOLL_NOWAIT);
1050 : BUG_ON(ret != 0);
1051 : goto out;
1052 : }
1053 : continue;
1054 : }
1055 : }
1056 7806 : retry:
1057 : /*
1058 : * If we have a pending SIGKILL, don't keep faulting pages and
1059 : * potentially allocating memory.
1060 : */
1061 10304 : if (fatal_signal_pending(current)) {
1062 0 : ret = -EINTR;
1063 0 : goto out;
1064 : }
1065 10304 : cond_resched();
1066 :
1067 10304 : page = follow_page_mask(vma, start, foll_flags, &ctx);
1068 10304 : if (!page) {
1069 2508 : ret = faultin_page(vma, start, &foll_flags, locked);
1070 2508 : switch (ret) {
1071 2498 : case 0:
1072 2498 : goto retry;
1073 : case -EBUSY:
1074 : ret = 0;
1075 10 : fallthrough;
1076 10 : case -EFAULT:
1077 : case -ENOMEM:
1078 : case -EHWPOISON:
1079 10 : goto out;
1080 0 : case -ENOENT:
1081 0 : goto next_page;
1082 : }
1083 0 : BUG();
1084 7796 : } else if (PTR_ERR(page) == -EEXIST) {
1085 : /*
1086 : * Proper page table entry exists, but no corresponding
1087 : * struct page.
1088 : */
1089 0 : goto next_page;
1090 7796 : } else if (IS_ERR(page)) {
1091 0 : ret = PTR_ERR(page);
1092 0 : goto out;
1093 : }
1094 7796 : if (pages) {
1095 7770 : pages[i] = page;
1096 7770 : flush_anon_page(vma, page, start);
1097 7770 : flush_dcache_page(page);
1098 7770 : ctx.page_mask = 0;
1099 : }
1100 26 : next_page:
1101 7796 : if (vmas) {
1102 220 : vmas[i] = vma;
1103 220 : ctx.page_mask = 0;
1104 : }
1105 7796 : page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
1106 7796 : if (page_increm > nr_pages)
1107 0 : page_increm = nr_pages;
1108 7796 : i += page_increm;
1109 7796 : start += page_increm * PAGE_SIZE;
1110 7796 : nr_pages -= page_increm;
1111 7796 : } while (nr_pages);
1112 7781 : out:
1113 7791 : if (ctx.pgmap)
1114 0 : put_dev_pagemap(ctx.pgmap);
1115 7791 : return i ? i : ret;
1116 : }
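
/*
 * Illustrative only, not part of mm/gup.c: a caller honouring the return
 * value contract documented above for the GUP family. A short count (fewer
 * pages than requested) is a normal outcome; only pages[0..got-1] may be
 * used and must be released. "uaddr", "nr" and "pages" are hypothetical.
 */
static long example_get_range(unsigned long uaddr, unsigned long nr,
			      struct page **pages)
{
	long got, i;

	mmap_read_lock(current->mm);
	got = get_user_pages(uaddr, nr, FOLL_WRITE, pages, NULL);
	mmap_read_unlock(current->mm);

	if (got <= 0)
		return got;

	/* ... use pages[0..got-1] ... */

	for (i = 0; i < got; i++)
		put_page(pages[i]);
	return got;
}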
1117 :
1118 0 : static bool vma_permits_fault(struct vm_area_struct *vma,
1119 : unsigned int fault_flags)
1120 : {
1121 0 : bool write = !!(fault_flags & FAULT_FLAG_WRITE);
1122 0 : bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
1123 0 : vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;
1124 :
1125 0 : if (!(vm_flags & vma->vm_flags))
1126 : return false;
1127 :
1128 : /*
1129 : * The architecture might have a hardware protection
1130 : * mechanism other than read/write that can deny access.
1131 : *
1132 : * gup always represents data access, not instruction
1133 : * fetches, so execute=false here:
1134 : */
1135 0 : if (!arch_vma_access_permitted(vma, write, false, foreign))
1136 0 : return false;
1137 :
1138 : return true;
1139 : }
1140 :
1141 : /**
1142 : * fixup_user_fault() - manually resolve a user page fault
1143 : * @mm: mm_struct of target mm
1144 : * @address: user address
1145 : * @fault_flags:flags to pass down to handle_mm_fault()
1146 : * @unlocked: did we unlock the mmap_lock while retrying, maybe NULL if caller
1147 : * does not allow retry. If NULL, the caller must guarantee
1148 : * that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY.
1149 : *
1150 : * This is meant to be called in the specific scenario where for locking reasons
1151 : * we try to access user memory in atomic context (within a pagefault_disable()
1152 : * section), this returns -EFAULT, and we want to resolve the user fault before
1153 : * trying again.
1154 : *
1155 : * Typically this is meant to be used by the futex code.
1156 : *
1157 : * The main difference with get_user_pages() is that this function will
1158 : * unconditionally call handle_mm_fault() which will in turn perform all the
1159 : * necessary SW fixup of the dirty and young bits in the PTE, while
1160 : * get_user_pages() only guarantees to update these in the struct page.
1161 : *
1162 : * This is important for some architectures where those bits also gate the
1163 : * access permission to the page because they are maintained in software. On
1164 : * such architectures, gup() will not be enough to make a subsequent access
1165 : * succeed.
1166 : *
1167 : * This function will not return with an unlocked mmap_lock. So it has not the
1168 : * same semantics wrt the @mm->mmap_lock as does filemap_fault().
1169 : */
1170 0 : int fixup_user_fault(struct mm_struct *mm,
1171 : unsigned long address, unsigned int fault_flags,
1172 : bool *unlocked)
1173 : {
1174 0 : struct vm_area_struct *vma;
1175 0 : vm_fault_t ret, major = 0;
1176 :
1177 0 : address = untagged_addr(address);
1178 :
1179 0 : if (unlocked)
1180 0 : fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
1181 :
1182 0 : retry:
1183 0 : vma = find_extend_vma(mm, address);
1184 0 : if (!vma || address < vma->vm_start)
1185 : return -EFAULT;
1186 :
1187 0 : if (!vma_permits_fault(vma, fault_flags))
1188 : return -EFAULT;
1189 :
1190 0 : if ((fault_flags & FAULT_FLAG_KILLABLE) &&
1191 0 : fatal_signal_pending(current))
1192 : return -EINTR;
1193 :
1194 0 : ret = handle_mm_fault(vma, address, fault_flags, NULL);
1195 0 : major |= ret & VM_FAULT_MAJOR;
1196 0 : if (ret & VM_FAULT_ERROR) {
1197 0 : int err = vm_fault_to_errno(ret, 0);
1198 :
1199 0 : if (err)
1200 0 : return err;
1201 0 : BUG();
1202 : }
1203 :
1204 0 : if (ret & VM_FAULT_RETRY) {
1205 0 : mmap_read_lock(mm);
1206 0 : *unlocked = true;
1207 0 : fault_flags |= FAULT_FLAG_TRIED;
1208 0 : goto retry;
1209 : }
1210 :
1211 : return 0;
1212 : }
1213 : EXPORT_SYMBOL_GPL(fixup_user_fault);
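
/*
 * Illustrative only, not part of mm/gup.c: the futex-style pattern described
 * above. An access made under pagefault_disable() returned -EFAULT, so the
 * fault is resolved explicitly before the caller retries the access. The
 * function name is hypothetical.
 */
static int example_fault_in_writeable(unsigned long uaddr)
{
	struct mm_struct *mm = current->mm;
	bool unlocked = false;
	int ret;

	mmap_read_lock(mm);
	ret = fixup_user_fault(mm, uaddr, FAULT_FLAG_WRITE, &unlocked);
	mmap_read_unlock(mm);

	return ret < 0 ? ret : 0;
}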
1214 :
1215 : /*
1216 : * Please note that this function, unlike __get_user_pages, will not
1217 : * return 0 for nr_pages > 0 without FOLL_NOWAIT.
1218 : */
1219 7770 : static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
1220 : unsigned long start,
1221 : unsigned long nr_pages,
1222 : struct page **pages,
1223 : struct vm_area_struct **vmas,
1224 : int *locked,
1225 : unsigned int flags)
1226 : {
1227 7770 : long ret, pages_done;
1228 7770 : bool lock_dropped;
1229 :
1230 0 : if (locked) {
1231 : /* if VM_FAULT_RETRY can be returned, vmas become invalid */
1232 0 : BUG_ON(vmas);
1233 : /* check caller initialized locked */
1234 0 : BUG_ON(*locked != 1);
1235 : }
1236 :
1237 7770 : if (flags & FOLL_PIN)
1238 0 : atomic_set(&mm->has_pinned, 1);
1239 :
1240 : /*
1241 : * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
1242 : * is to set FOLL_GET if the caller wants pages[] filled in (but has
1243 : * carelessly failed to specify FOLL_GET), so keep doing that, but only
1244 : * for FOLL_GET, not for the newer FOLL_PIN.
1245 : *
1246 : * FOLL_PIN always expects pages to be non-null, but no need to assert
1247 : * that here, as any failures will be obvious enough.
1248 : */
1249 7770 : if (pages && !(flags & FOLL_PIN))
1250 7770 : flags |= FOLL_GET;
1251 :
1252 7770 : pages_done = 0;
1253 7770 : lock_dropped = false;
1254 7770 : for (;;) {
1255 7770 : ret = __get_user_pages(mm, start, nr_pages, flags, pages,
1256 : vmas, locked);
1257 7770 : if (!locked)
1258 : /* VM_FAULT_RETRY couldn't trigger, bypass */
1259 0 : return ret;
1260 :
1261 : /* VM_FAULT_RETRY cannot return errors */
1262 0 : if (!*locked) {
1263 0 : BUG_ON(ret < 0);
1264 0 : BUG_ON(ret >= nr_pages);
1265 : }
1266 :
1267 0 : if (ret > 0) {
1268 0 : nr_pages -= ret;
1269 0 : pages_done += ret;
1270 0 : if (!nr_pages)
1271 : break;
1272 : }
1273 0 : if (*locked) {
1274 : /*
1275 : * VM_FAULT_RETRY didn't trigger or it was a
1276 : * FOLL_NOWAIT.
1277 : */
1278 0 : if (!pages_done)
1279 0 : pages_done = ret;
1280 : break;
1281 : }
1282 : /*
1283 : * VM_FAULT_RETRY triggered, so seek to the faulting offset.
1284 : * For the prefault case (!pages) we only update counts.
1285 : */
1286 0 : if (likely(pages))
1287 0 : pages += ret;
1288 0 : start += ret << PAGE_SHIFT;
1289 0 : lock_dropped = true;
1290 :
1291 0 : retry:
1292 : /*
1293 : * Repeat on the address that fired VM_FAULT_RETRY
1294 : * with both FAULT_FLAG_ALLOW_RETRY and
1295 : * FAULT_FLAG_TRIED. Note that GUP can be interrupted
1296 : * by fatal signals, so we need to check it before we
1297 : * start trying again otherwise it can loop forever.
1298 : */
1299 :
1300 0 : if (fatal_signal_pending(current)) {
1301 0 : if (!pages_done)
1302 : pages_done = -EINTR;
1303 : break;
1304 : }
1305 :
1306 0 : ret = mmap_read_lock_killable(mm);
1307 0 : if (ret) {
1308 0 : BUG_ON(ret > 0);
1309 0 : if (!pages_done)
1310 : pages_done = ret;
1311 : break;
1312 : }
1313 :
1314 0 : *locked = 1;
1315 0 : ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
1316 : pages, NULL, locked);
1317 0 : if (!*locked) {
1318 : /* Continue to retry until we succeeded */
1319 0 : BUG_ON(ret != 0);
1320 0 : goto retry;
1321 : }
1322 0 : if (ret != 1) {
1323 0 : BUG_ON(ret > 1);
1324 0 : if (!pages_done)
1325 : pages_done = ret;
1326 : break;
1327 : }
1328 0 : nr_pages--;
1329 0 : pages_done++;
1330 0 : if (!nr_pages)
1331 : break;
1332 0 : if (likely(pages))
1333 0 : pages++;
1334 0 : start += PAGE_SIZE;
1335 : }
1336 0 : if (lock_dropped && *locked) {
1337 : /*
1338 : * We must let the caller know we temporarily dropped the lock
1339 : * and so the critical section protected by it was lost.
1340 : */
1341 0 : mmap_read_unlock(mm);
1342 0 : *locked = 0;
1343 : }
1344 : return pages_done;
1345 : }
1346 :
1347 : /**
1348 : * populate_vma_page_range() - populate a range of pages in the vma.
1349 : * @vma: target vma
1350 : * @start: start address
1351 : * @end: end address
1352 : * @locked: whether the mmap_lock is still held
1353 : *
1354 : * This takes care of mlocking the pages too if VM_LOCKED is set.
1355 : *
1356 : * Return either number of pages pinned in the vma, or a negative error
1357 : * code on error.
1358 : *
1359 : * vma->vm_mm->mmap_lock must be held.
1360 : *
1361 : * If @locked is NULL, it may be held for read or write and will
1362 : * be unperturbed.
1363 : *
1364 : * If @locked is non-NULL, it must be held for read only and may be
1365 : * released. If it's released, *@locked will be set to 0.
1366 : */
1367 21 : long populate_vma_page_range(struct vm_area_struct *vma,
1368 : unsigned long start, unsigned long end, int *locked)
1369 : {
1370 21 : struct mm_struct *mm = vma->vm_mm;
1371 21 : unsigned long nr_pages = (end - start) / PAGE_SIZE;
1372 21 : int gup_flags;
1373 :
1374 21 : VM_BUG_ON(start & ~PAGE_MASK);
1375 21 : VM_BUG_ON(end & ~PAGE_MASK);
1376 21 : VM_BUG_ON_VMA(start < vma->vm_start, vma);
1377 21 : VM_BUG_ON_VMA(end > vma->vm_end, vma);
1378 21 : mmap_assert_locked(mm);
1379 :
1380 21 : gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
1381 21 : if (vma->vm_flags & VM_LOCKONFAULT)
1382 0 : gup_flags &= ~FOLL_POPULATE;
1383 : /*
1384 : * We want to touch writable mappings with a write fault in order
1385 : * to break COW, except for shared mappings because these don't COW
1386 : * and we would not want to dirty them for nothing.
1387 : */
1388 21 : if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
1389 1 : gup_flags |= FOLL_WRITE;
1390 :
1391 : /*
1392 : * We want mlock to succeed for regions that have any permissions
1393 : * other than PROT_NONE.
1394 : */
1395 21 : if (vma_is_accessible(vma))
1396 21 : gup_flags |= FOLL_FORCE;
1397 :
1398 : /*
1399 : * We made sure addr is within a VMA, so the following will
1400 : * not result in a stack expansion that recurses back here.
1401 : */
1402 21 : return __get_user_pages(mm, start, nr_pages, gup_flags,
1403 : NULL, NULL, locked);
1404 : }
1405 :
1406 : /*
1407 : * __mm_populate - populate and/or mlock pages within a range of address space.
1408 : *
1409 : * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
1410 : * flags. VMAs must be already marked with the desired vm_flags, and
1411 : * mmap_lock must not be held.
1412 : */
1413 11 : int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
1414 : {
1415 11 : struct mm_struct *mm = current->mm;
1416 11 : unsigned long end, nstart, nend;
1417 11 : struct vm_area_struct *vma = NULL;
1418 11 : int locked = 0;
1419 11 : long ret = 0;
1420 :
1421 11 : end = start + len;
1422 :
1423 32 : for (nstart = start; nstart < end; nstart = nend) {
1424 : /*
1425 : * We want to fault in pages for [nstart; end) address range.
1426 : * Find first corresponding VMA.
1427 : */
1428 21 : if (!locked) {
1429 21 : locked = 1;
1430 21 : mmap_read_lock(mm);
1431 21 : vma = find_vma(mm, nstart);
1432 0 : } else if (nstart >= vma->vm_end)
1433 0 : vma = vma->vm_next;
1434 21 : if (!vma || vma->vm_start >= end)
1435 : break;
1436 : /*
1437 : * Set [nstart; nend) to intersection of desired address
1438 : * range with the first VMA. Also, skip undesirable VMA types.
1439 : */
1440 21 : nend = min(end, vma->vm_end);
1441 21 : if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1442 0 : continue;
1443 21 : if (nstart < vma->vm_start)
1444 : nstart = vma->vm_start;
1445 : /*
1446 : * Now fault in a range of pages. populate_vma_page_range()
1447 : * double checks the vma flags, so that it won't mlock pages
1448 : * if the vma was already munlocked.
1449 : */
1450 21 : ret = populate_vma_page_range(vma, nstart, nend, &locked);
1451 21 : if (ret < 0) {
1452 0 : if (ignore_errors) {
1453 0 : ret = 0;
1454 0 : continue; /* continue at next VMA */
1455 : }
1456 : break;
1457 : }
1458 21 : nend = nstart + ret * PAGE_SIZE;
1459 21 : ret = 0;
1460 : }
1461 11 : if (locked)
1462 11 : mmap_read_unlock(mm);
1463 11 : return ret; /* 0 or negative error code */
1464 : }
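
/*
 * Illustrative only, not part of mm/gup.c: how __mm_populate() is typically
 * reached. mmap(MAP_POPULATE) and mlockall() go through the mm_populate()
 * wrapper in <linux/mm.h>, which ignores errors, while mlock() calls
 * __mm_populate() directly so failures can be reported to userspace.
 */
static void example_populate(unsigned long addr, unsigned long len)
{
	/* equivalent to mm_populate(addr, len): fault in, ignoring errors */
	(void)__mm_populate(addr, len, 1);
}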
1465 : #else /* CONFIG_MMU */
1466 : static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
1467 : unsigned long nr_pages, struct page **pages,
1468 : struct vm_area_struct **vmas, int *locked,
1469 : unsigned int foll_flags)
1470 : {
1471 : struct vm_area_struct *vma;
1472 : unsigned long vm_flags;
1473 : int i;
1474 :
1475 : /* calculate required read or write permissions.
1476 : * If FOLL_FORCE is set, we only require the "MAY" flags.
1477 : */
1478 : vm_flags = (foll_flags & FOLL_WRITE) ?
1479 : (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1480 : vm_flags &= (foll_flags & FOLL_FORCE) ?
1481 : (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1482 :
1483 : for (i = 0; i < nr_pages; i++) {
1484 : vma = find_vma(mm, start);
1485 : if (!vma)
1486 : goto finish_or_fault;
1487 :
1488 : /* protect what we can, including chardevs */
1489 : if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1490 : !(vm_flags & vma->vm_flags))
1491 : goto finish_or_fault;
1492 :
1493 : if (pages) {
1494 : pages[i] = virt_to_page(start);
1495 : if (pages[i])
1496 : get_page(pages[i]);
1497 : }
1498 : if (vmas)
1499 : vmas[i] = vma;
1500 : start = (start + PAGE_SIZE) & PAGE_MASK;
1501 : }
1502 :
1503 : return i;
1504 :
1505 : finish_or_fault:
1506 : return i ? : -EFAULT;
1507 : }
1508 : #endif /* !CONFIG_MMU */
1509 :
1510 : /**
1511 : * get_dump_page() - pin user page in memory while writing it to core dump
1512 : * @addr: user address
1513 : *
1514 : * Returns struct page pointer of user page pinned for dump,
1515 : * to be freed afterwards by put_page().
1516 : *
1517 : * Returns NULL on any kind of failure - a hole must then be inserted into
1518 : * the corefile, to preserve alignment with its headers; and also returns
1519 : * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
1520 : * allowing a hole to be left in the corefile to save diskspace.
1521 : *
1522 : * Called without mmap_lock (takes and releases the mmap_lock by itself).
1523 : */
1524 : #ifdef CONFIG_ELF_CORE
1525 : struct page *get_dump_page(unsigned long addr)
1526 : {
1527 : struct mm_struct *mm = current->mm;
1528 : struct page *page;
1529 : int locked = 1;
1530 : int ret;
1531 :
1532 : if (mmap_read_lock_killable(mm))
1533 : return NULL;
1534 : ret = __get_user_pages_locked(mm, addr, 1, &page, NULL, &locked,
1535 : FOLL_FORCE | FOLL_DUMP | FOLL_GET);
1536 : if (locked)
1537 : mmap_read_unlock(mm);
1538 : return (ret == 1) ? page : NULL;
1539 : }
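
/*
 * Illustrative only, not part of mm/gup.c: the coredump loop sketched by the
 * get_dump_page() comment above. A NULL return is not an error; it tells the
 * dumper to emit a zero-filled hole instead of page contents. The function
 * name is hypothetical.
 */
static int example_dump_range(unsigned long start, unsigned long end)
{
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		struct page *page = get_dump_page(addr);

		if (page) {
			/* write the page contents to the core file ... */
			put_page(page);
		} else {
			/* ... or leave a zero-filled hole of PAGE_SIZE */
		}
	}
	return 0;
}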
1540 : #endif /* CONFIG_ELF_CORE */
1541 :
1542 : #ifdef CONFIG_CMA
1543 : static long check_and_migrate_cma_pages(struct mm_struct *mm,
1544 : unsigned long start,
1545 : unsigned long nr_pages,
1546 : struct page **pages,
1547 : struct vm_area_struct **vmas,
1548 : unsigned int gup_flags)
1549 : {
1550 : unsigned long i;
1551 : unsigned long step;
1552 : bool drain_allow = true;
1553 : bool migrate_allow = true;
1554 : LIST_HEAD(cma_page_list);
1555 : long ret = nr_pages;
1556 : struct migration_target_control mtc = {
1557 : .nid = NUMA_NO_NODE,
1558 : .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_NOWARN,
1559 : };
1560 :
1561 : check_again:
1562 : for (i = 0; i < nr_pages;) {
1563 :
1564 : struct page *head = compound_head(pages[i]);
1565 :
1566 : /*
1567 : * gup may start from a tail page. Advance step by the left
1568 : * part.
1569 : */
1570 : step = compound_nr(head) - (pages[i] - head);
1571 : /*
1572 : * If we get a page from the CMA zone, since we are going to
1573 : * be pinning these entries, we might as well move them out
1574 : * of the CMA zone if possible.
1575 : */
1576 : if (is_migrate_cma_page(head)) {
1577 : if (PageHuge(head))
1578 : isolate_huge_page(head, &cma_page_list);
1579 : else {
1580 : if (!PageLRU(head) && drain_allow) {
1581 : lru_add_drain_all();
1582 : drain_allow = false;
1583 : }
1584 :
1585 : if (!isolate_lru_page(head)) {
1586 : list_add_tail(&head->lru, &cma_page_list);
1587 : mod_node_page_state(page_pgdat(head),
1588 : NR_ISOLATED_ANON +
1589 : page_is_file_lru(head),
1590 : thp_nr_pages(head));
1591 : }
1592 : }
1593 : }
1594 :
1595 : i += step;
1596 : }
1597 :
1598 : if (!list_empty(&cma_page_list)) {
1599 : /*
1600 : * drop the above get_user_pages reference.
1601 : */
1602 : if (gup_flags & FOLL_PIN)
1603 : unpin_user_pages(pages, nr_pages);
1604 : else
1605 : for (i = 0; i < nr_pages; i++)
1606 : put_page(pages[i]);
1607 :
1608 : if (migrate_pages(&cma_page_list, alloc_migration_target, NULL,
1609 : (unsigned long)&mtc, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
1610 : /*
1611 : * some of the pages failed migration. Do get_user_pages
1612 : * without migration.
1613 : */
1614 : migrate_allow = false;
1615 :
1616 : if (!list_empty(&cma_page_list))
1617 : putback_movable_pages(&cma_page_list);
1618 : }
1619 : /*
1620 : * We did migrate all the pages, Try to get the page references
1621 : * again migrating any new CMA pages which we failed to isolate
1622 : * earlier.
1623 : */
1624 : ret = __get_user_pages_locked(mm, start, nr_pages,
1625 : pages, vmas, NULL,
1626 : gup_flags);
1627 :
1628 : if ((ret > 0) && migrate_allow) {
1629 : nr_pages = ret;
1630 : drain_allow = true;
1631 : goto check_again;
1632 : }
1633 : }
1634 :
1635 : return ret;
1636 : }
1637 : #else
1638 : static long check_and_migrate_cma_pages(struct mm_struct *mm,
1639 : unsigned long start,
1640 : unsigned long nr_pages,
1641 : struct page **pages,
1642 : struct vm_area_struct **vmas,
1643 : unsigned int gup_flags)
1644 : {
1645 : return nr_pages;
1646 : }
1647 : #endif /* CONFIG_CMA */
1648 :
1649 : /*
1650 : * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
1651 : * allows us to process the FOLL_LONGTERM flag.
1652 : */
1653 0 : static long __gup_longterm_locked(struct mm_struct *mm,
1654 : unsigned long start,
1655 : unsigned long nr_pages,
1656 : struct page **pages,
1657 : struct vm_area_struct **vmas,
1658 : unsigned int gup_flags)
1659 : {
1660 0 : unsigned long flags = 0;
1661 0 : long rc;
1662 :
1663 0 : if (gup_flags & FOLL_LONGTERM)
1664 : flags = memalloc_nocma_save();
1665 :
1666 0 : rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL,
1667 : gup_flags);
1668 :
1669 0 : if (gup_flags & FOLL_LONGTERM) {
1670 : if (rc > 0)
1671 : rc = check_and_migrate_cma_pages(mm, start, rc, pages,
1672 : vmas, gup_flags);
1673 0 : memalloc_nocma_restore(flags);
1674 : }
1675 0 : return rc;
1676 : }
1677 :
1678 7772 : static bool is_valid_gup_flags(unsigned int gup_flags)
1679 : {
1680 : /*
1681 : * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
1682 : * never directly by the caller, so enforce that with an assertion:
1683 : */
1684 7772 : if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
1685 : return false;
1686 : /*
1687 : * FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying
1688 : * that is, FOLL_LONGTERM is a specific case, more restrictive case of
1689 : * FOLL_PIN.
1690 : */
1691 7772 : if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
1692 0 : return false;
1693 :
1694 : return true;
1695 : }
1696 :
1697 : #ifdef CONFIG_MMU
1698 7770 : static long __get_user_pages_remote(struct mm_struct *mm,
1699 : unsigned long start, unsigned long nr_pages,
1700 : unsigned int gup_flags, struct page **pages,
1701 : struct vm_area_struct **vmas, int *locked)
1702 : {
1703 : /*
1704 : * Parts of FOLL_LONGTERM behavior are incompatible with
1705 : * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
1706 : * vmas. However, this only comes up if locked is set, and there are
1707 : * callers that do request FOLL_LONGTERM, but do not set locked. So,
1708 : * allow what we can.
1709 : */
1710 7770 : if (gup_flags & FOLL_LONGTERM) {
1711 0 : if (WARN_ON_ONCE(locked))
1712 : return -EINVAL;
1713 : /*
1714 : * This will check the vmas (even if our vmas arg is NULL)
1715 : * and return -ENOTSUPP if DAX isn't allowed in this case:
1716 : */
1717 0 : return __gup_longterm_locked(mm, start, nr_pages, pages,
1718 : vmas, gup_flags | FOLL_TOUCH |
1719 : FOLL_REMOTE);
1720 : }
1721 :
1722 7770 : return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
1723 : locked,
1724 : gup_flags | FOLL_TOUCH | FOLL_REMOTE);
1725 : }
1726 :
1727 : /**
1728 : * get_user_pages_remote() - pin user pages in memory
1729 : * @mm: mm_struct of target mm
1730 : * @start: starting user address
1731 : * @nr_pages: number of pages from start to pin
1732 : * @gup_flags: flags modifying lookup behaviour
1733 : * @pages: array that receives pointers to the pages pinned.
1734 : * Should be at least nr_pages long. Or NULL, if caller
1735 : * only intends to ensure the pages are faulted in.
1736 : * @vmas: array of pointers to vmas corresponding to each page.
1737 : * Or NULL if the caller does not require them.
1738 : * @locked: pointer to lock flag indicating whether lock is held and
1739 : * subsequently whether VM_FAULT_RETRY functionality can be
1740 : * utilised. Lock must initially be held.
1741 : *
1742 : * Returns either the number of pages pinned (which may be less than the
1743 : * number requested), or an error. Details about the return value:
1744 : *
1745 : * -- If nr_pages is 0, returns 0.
1746 : * -- If nr_pages is >0, but no pages were pinned, returns -errno.
1747 : * -- If nr_pages is >0, and some pages were pinned, returns the number of
1748 : * pages pinned. Again, this may be less than nr_pages.
1749 : *
1750 : * The caller is responsible for releasing returned @pages, via put_page().
1751 : *
1752 : * @vmas are valid only as long as mmap_lock is held.
1753 : *
1754 : * Must be called with mmap_lock held for read or write.
1755 : *
1756 : * get_user_pages_remote walks a process's page tables and takes a reference
1757 : * to each struct page that each user address corresponds to at a given
1758 : * instant. That is, it takes the page that would be accessed if a user
1759 : * thread accesses the given user virtual address at that instant.
1760 : *
1761 : * This does not guarantee that the page exists in the user mappings when
1762 : * get_user_pages_remote returns, and there may even be a completely different
1763 : * page there in some cases (e.g. if mmapped pagecache has been invalidated
1764 : * and subsequently re-faulted). However, it does guarantee that the page
1765 : * won't be freed completely. And mostly callers simply care that the page
1766 : * contains data that was valid *at some point in time*. Typically, an IO
1767 : * or similar operation cannot guarantee anything stronger anyway because
1768 : * locks can't be held over the syscall boundary.
1769 : *
1770 : * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
1771 : * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
1772 : * be called after the page is finished with, and before put_page is called.
1773 : *
1774 : * get_user_pages_remote is typically used for fewer-copy IO operations,
1775 : * to get a handle on the memory by some means other than accesses
1776 : * via the user virtual addresses. The pages may be submitted for
1777 : * DMA to devices or accessed via their kernel linear mapping (via the
1778 : * kmap APIs). Care should be taken to use the correct cache flushing APIs.
1779 : *
1780 : * See also get_user_pages_fast, for performance critical applications.
1781 : *
1782 : * get_user_pages_remote should be phased out in favor of
1783 : * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
1784 : * should use get_user_pages_remote because it cannot pass
1785 : * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
1786 : */
1787 7769 : long get_user_pages_remote(struct mm_struct *mm,
1788 : unsigned long start, unsigned long nr_pages,
1789 : unsigned int gup_flags, struct page **pages,
1790 : struct vm_area_struct **vmas, int *locked)
1791 : {
1792 7769 : if (!is_valid_gup_flags(gup_flags))
1793 : return -EINVAL;
1794 :
1795 7769 : return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
1796 : pages, vmas, locked);
1797 : }
1798 : EXPORT_SYMBOL(get_user_pages_remote);
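/*
 * Illustrative sketch, not part of gup.c: how a caller might pin one page of
 * a foreign mm with get_user_pages_remote(). The mm is assumed to have been
 * obtained and referenced elsewhere (e.g. via get_task_mm()), the usual
 * kernel headers (<linux/mm.h> and friends) are assumed available, and the
 * example_* name is hypothetical. Error handling is reduced to the minimum.
 */
static int example_grab_remote_page(struct mm_struct *mm, unsigned long addr)
{
	struct page *page;
	long pinned;

	mmap_read_lock(mm);
	pinned = get_user_pages_remote(mm, addr & PAGE_MASK, 1, FOLL_FORCE,
				       &page, NULL, NULL);
	mmap_read_unlock(mm);

	if (pinned != 1)
		return pinned < 0 ? (int)pinned : -EFAULT;

	/* ... access the page contents, e.g. via kmap()/kunmap() ... */

	put_page(page);		/* release the FOLL_GET reference */
	return 0;
}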
1799 :
1800 : #else /* CONFIG_MMU */
1801 : long get_user_pages_remote(struct mm_struct *mm,
1802 : unsigned long start, unsigned long nr_pages,
1803 : unsigned int gup_flags, struct page **pages,
1804 : struct vm_area_struct **vmas, int *locked)
1805 : {
1806 : return 0;
1807 : }
1808 :
1809 : static long __get_user_pages_remote(struct mm_struct *mm,
1810 : unsigned long start, unsigned long nr_pages,
1811 : unsigned int gup_flags, struct page **pages,
1812 : struct vm_area_struct **vmas, int *locked)
1813 : {
1814 : return 0;
1815 : }
1816 : #endif /* !CONFIG_MMU */
1817 :
1818 : /**
1819 : * get_user_pages() - pin user pages in memory
1820 : * @start: starting user address
1821 : * @nr_pages: number of pages from start to pin
1822 : * @gup_flags: flags modifying lookup behaviour
1823 : * @pages: array that receives pointers to the pages pinned.
1824 : * Should be at least nr_pages long. Or NULL, if caller
1825 : * only intends to ensure the pages are faulted in.
1826 : * @vmas: array of pointers to vmas corresponding to each page.
1827 : * Or NULL if the caller does not require them.
1828 : *
1829 : * This is the same as get_user_pages_remote(), just with a less-flexible
1830 : * calling convention where we assume that the mm being operated on belongs to
1831 : * the current task, and doesn't allow passing of a locked parameter. We also
1832 : * obviously don't pass FOLL_REMOTE in here.
1833 : */
1834 0 : long get_user_pages(unsigned long start, unsigned long nr_pages,
1835 : unsigned int gup_flags, struct page **pages,
1836 : struct vm_area_struct **vmas)
1837 : {
1838 0 : if (!is_valid_gup_flags(gup_flags))
1839 : return -EINVAL;
1840 :
1841 0 : return __gup_longterm_locked(current->mm, start, nr_pages,
1842 : pages, vmas, gup_flags | FOLL_TOUCH);
1843 : }
1844 : EXPORT_SYMBOL(get_user_pages);
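/*
 * Illustrative sketch, not part of gup.c: pinning a buffer in the current
 * task's address space with get_user_pages(). The caller must hold
 * current->mm's mmap_lock around the call; the example_* name is
 * hypothetical.
 */
static long example_pin_current_buffer(unsigned long uaddr, unsigned long nr,
				       struct page **pages)
{
	long pinned, i;

	mmap_read_lock(current->mm);
	pinned = get_user_pages(uaddr & PAGE_MASK, nr, FOLL_WRITE, pages, NULL);
	mmap_read_unlock(current->mm);
	if (pinned <= 0)
		return pinned;

	/* ... let a device or kernel thread write into the pages ... */

	for (i = 0; i < pinned; i++) {
		set_page_dirty_lock(pages[i]);	/* the pages were written to */
		put_page(pages[i]);
	}
	return pinned;
}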
1845 :
1846 : /**
1847 : * get_user_pages_locked() - variant of get_user_pages()
1848 : *
1849 : * @start: starting user address
1850 : * @nr_pages: number of pages from start to pin
1851 : * @gup_flags: flags modifying lookup behaviour
1852 : * @pages: array that receives pointers to the pages pinned.
1853 : * Should be at least nr_pages long. Or NULL, if caller
1854 : * only intends to ensure the pages are faulted in.
1855 : * @locked: pointer to lock flag indicating whether lock is held and
1856 : * subsequently whether VM_FAULT_RETRY functionality can be
1857 : * utilised. Lock must initially be held.
1858 : *
1859 : * It is suitable to replace the form:
1860 : *
1861 : * mmap_read_lock(mm);
1862 : * do_something()
1863 : * get_user_pages(mm, ..., pages, NULL);
1864 : * mmap_read_unlock(mm);
1865 : *
1866 : * to:
1867 : *
1868 : * int locked = 1;
1869 : * mmap_read_lock(mm);
1870 : * do_something()
1871 : * get_user_pages_locked(mm, ..., pages, &locked);
1872 : * if (locked)
1873 : * mmap_read_unlock(mm);
1874 : *
1875 : * We can leverage the VM_FAULT_RETRY functionality in the page fault
1876 : * paths better by using either get_user_pages_locked() or
1877 : * get_user_pages_unlocked().
1878 : *
1879 : */
1880 0 : long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
1881 : unsigned int gup_flags, struct page **pages,
1882 : int *locked)
1883 : {
1884 : /*
1885 : * FIXME: Current FOLL_LONGTERM behavior is incompatible with
1886 : * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
1887 : * vmas. As there are no users of this flag in this call we simply
1888 : * disallow this option for now.
1889 : */
1890 0 : if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
1891 : return -EINVAL;
1892 : /*
1893 : * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
1894 : * never directly by the caller, so enforce that:
1895 : */
1896 0 : if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
1897 : return -EINVAL;
1898 :
1899 0 : return __get_user_pages_locked(current->mm, start, nr_pages,
1900 : pages, NULL, locked,
1901 : gup_flags | FOLL_TOUCH);
1902 : }
1903 : EXPORT_SYMBOL(get_user_pages_locked);
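/*
 * Illustrative sketch, not part of gup.c: the "locked" convention described
 * above, written out as a compilable caller. If GUP had to drop mmap_lock to
 * service a fault, *locked comes back as 0 and the caller must not unlock
 * again. The example_* name is hypothetical.
 */
static long example_gup_locked(unsigned long uaddr, unsigned long nr,
			       struct page **pages)
{
	int locked = 1;
	long pinned;

	mmap_read_lock(current->mm);
	pinned = get_user_pages_locked(uaddr & PAGE_MASK, nr, FOLL_WRITE,
				       pages, &locked);
	if (locked)
		mmap_read_unlock(current->mm);
	return pinned;	/* caller releases the pages with put_page() */
}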
1904 :
1905 : /*
1906 : * get_user_pages_unlocked() is suitable to replace the form:
1907 : *
1908 : * mmap_read_lock(mm);
1909 : * get_user_pages(mm, ..., pages, NULL);
1910 : * mmap_read_unlock(mm);
1911 : *
1912 : * with:
1913 : *
1914 : * get_user_pages_unlocked(mm, ..., pages);
1915 : *
1916 : * It is functionally equivalent to get_user_pages_fast so
1917 : * get_user_pages_fast should be used instead if specific gup_flags
1918 : * (e.g. FOLL_FORCE) are not required.
1919 : */
1920 0 : long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
1921 : struct page **pages, unsigned int gup_flags)
1922 : {
1923 0 : struct mm_struct *mm = current->mm;
1924 0 : int locked = 1;
1925 0 : long ret;
1926 :
1927 : /*
1928 : * FIXME: Current FOLL_LONGTERM behavior is incompatible with
1929 : * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
1930 : * vmas. As there are no users of this flag in this call we simply
1931 : * disallow this option for now.
1932 : */
1933 0 : if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
1934 : return -EINVAL;
1935 :
1936 0 : mmap_read_lock(mm);
1937 0 : ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL,
1938 : &locked, gup_flags | FOLL_TOUCH);
1939 0 : if (locked)
1940 0 : mmap_read_unlock(mm);
1941 : return ret;
1942 : }
1943 : EXPORT_SYMBOL(get_user_pages_unlocked);
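/*
 * Illustrative sketch, not part of gup.c: get_user_pages_unlocked() takes and
 * drops mmap_lock internally, so the caller needs no locking; it is the
 * variant to reach for when flags such as FOLL_FORCE are required. The
 * example_* name is hypothetical.
 */
static long example_gup_unlocked(unsigned long uaddr, unsigned long nr,
				 struct page **pages)
{
	long pinned, i;

	pinned = get_user_pages_unlocked(uaddr & PAGE_MASK, nr, pages,
					 FOLL_FORCE);

	/* ... read the pinned pages ... */

	for (i = 0; i < pinned; i++)
		put_page(pages[i]);
	return pinned;
}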
1944 :
1945 : /*
1946 : * Fast GUP
1947 : *
1948 : * get_user_pages_fast attempts to pin user pages by walking the page
1949 : * tables directly and avoids taking locks. Thus the walker needs to be
1950 : * protected from page table pages being freed from under it, and should
1951 : * block any THP splits.
1952 : *
1953 : * One way to achieve this is to have the walker disable interrupts, and
1954 : * rely on IPIs from the TLB flushing code blocking before the page table
1955 : * pages are freed. This is unsuitable for architectures that do not need
1956 : * to broadcast an IPI when invalidating TLBs.
1957 : *
1958 : * Another way to achieve this is to batch up page table containing pages
1959 : * belonging to more than one mm_user, then rcu_sched a callback to free those
1960 : * pages. Disabling interrupts will allow the fast_gup walker to block both
1961 : * the rcu_sched callback and an IPI that we broadcast for splitting THPs
1962 : * (which is a relatively rare event). The code below adopts this strategy.
1963 : *
1964 : * Before activating this code, please be aware that the following assumptions
1965 : * are currently made:
1966 : *
1967 : * *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
1968 : * free pages containing page tables, or TLB flushing requires IPI broadcast.
1969 : *
1970 : * *) ptes can be read atomically by the architecture.
1971 : *
1972 : * *) access_ok is sufficient to validate userspace address ranges.
1973 : *
1974 : * The last two assumptions can be relaxed by the addition of helper functions.
1975 : *
1976 : * This code is based heavily on the PowerPC implementation by Nick Piggin.
1977 : */
1978 : #ifdef CONFIG_HAVE_FAST_GUP
1979 :
1980 0 : static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
1981 : unsigned int flags,
1982 : struct page **pages)
1983 : {
1984 0 : while ((*nr) - nr_start) {
1985 0 : struct page *page = pages[--(*nr)];
1986 :
1987 0 : ClearPageReferenced(page);
1988 0 : if (flags & FOLL_PIN)
1989 0 : unpin_user_page(page);
1990 : else
1991 0 : put_page(page);
1992 : }
1993 0 : }
1994 :
1995 : #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
1996 2 : static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1997 : unsigned int flags, struct page **pages, int *nr)
1998 : {
1999 2 : struct dev_pagemap *pgmap = NULL;
2000 2 : int nr_start = *nr, ret = 0;
2001 2 : pte_t *ptep, *ptem;
2002 :
2003 4 : ptem = ptep = pte_offset_map(&pmd, addr);
2004 2 : do {
2005 2 : pte_t pte = ptep_get_lockless(ptep);
2006 2 : struct page *head, *page;
2007 :
2008 : /*
2009 : * Similar to the PMD case below, NUMA hinting must take slow
2010 : * path using the pte_protnone check.
2011 : */
2012 2 : if (pte_protnone(pte))
2013 2 : goto pte_unmap;
2014 :
2015 2 : if (!pte_access_permitted(pte, flags & FOLL_WRITE))
2016 0 : goto pte_unmap;
2017 :
2018 2 : if (pte_devmap(pte)) {
2019 0 : if (unlikely(flags & FOLL_LONGTERM))
2020 0 : goto pte_unmap;
2021 :
2022 0 : pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
2023 0 : if (unlikely(!pgmap)) {
2024 0 : undo_dev_pagemap(nr, nr_start, flags, pages);
2025 0 : goto pte_unmap;
2026 : }
2027 2 : } else if (pte_special(pte))
2028 0 : goto pte_unmap;
2029 :
2030 2 : VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
2031 2 : page = pte_page(pte);
2032 :
2033 2 : head = try_grab_compound_head(page, 1, flags);
2034 2 : if (!head)
2035 0 : goto pte_unmap;
2036 :
2037 2 : if (unlikely(pte_val(pte) != pte_val(*ptep))) {
2038 0 : put_compound_head(head, 1, flags);
2039 0 : goto pte_unmap;
2040 : }
2041 :
2042 2 : VM_BUG_ON_PAGE(compound_head(page) != head, page);
2043 :
2044 : /*
2045 : * We need to make the page accessible if and only if we are
2046 : * going to access its content (the FOLL_PIN case). Please
2047 : * see Documentation/core-api/pin_user_pages.rst for
2048 : * details.
2049 : */
2050 2 : if (flags & FOLL_PIN) {
2051 2 : ret = arch_make_page_accessible(page);
2052 : if (ret) {
2053 : unpin_user_page(page);
2054 : goto pte_unmap;
2055 : }
2056 : }
2057 2 : SetPageReferenced(page);
2058 2 : pages[*nr] = page;
2059 2 : (*nr)++;
2060 :
2061 2 : } while (ptep++, addr += PAGE_SIZE, addr != end);
2062 :
2063 : ret = 1;
2064 :
2065 2 : pte_unmap:
2066 2 : if (pgmap)
2067 : put_dev_pagemap(pgmap);
2068 2 : pte_unmap(ptem);
2069 2 : return ret;
2070 : }
2071 : #else
2072 :
2073 : /*
2074 : * If we can't determine whether or not a pte is special, then fail immediately
2075 : * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
2076 : * to be special.
2077 : *
2078 : * For a futex to be placed on a THP tail page, get_futex_key requires a
2079 : * get_user_pages_fast_only implementation that can pin pages. Thus it's still
2080 : * useful to have gup_huge_pmd even if we can't operate on ptes.
2081 : */
2082 : static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
2083 : unsigned int flags, struct page **pages, int *nr)
2084 : {
2085 : return 0;
2086 : }
2087 : #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
2088 :
2089 : #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
2090 0 : static int __gup_device_huge(unsigned long pfn, unsigned long addr,
2091 : unsigned long end, unsigned int flags,
2092 : struct page **pages, int *nr)
2093 : {
2094 0 : int nr_start = *nr;
2095 0 : struct dev_pagemap *pgmap = NULL;
2096 :
2097 0 : do {
2098 0 : struct page *page = pfn_to_page(pfn);
2099 :
2100 0 : pgmap = get_dev_pagemap(pfn, pgmap);
2101 0 : if (unlikely(!pgmap)) {
2102 0 : undo_dev_pagemap(nr, nr_start, flags, pages);
2103 0 : return 0;
2104 : }
2105 : SetPageReferenced(page);
2106 : pages[*nr] = page;
2107 : if (unlikely(!try_grab_page(page, flags))) {
2108 : undo_dev_pagemap(nr, nr_start, flags, pages);
2109 : return 0;
2110 : }
2111 : (*nr)++;
2112 : pfn++;
2113 : } while (addr += PAGE_SIZE, addr != end);
2114 :
2115 : if (pgmap)
2116 : put_dev_pagemap(pgmap);
2117 : return 1;
2118 : }
2119 :
2120 0 : static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
2121 : unsigned long end, unsigned int flags,
2122 : struct page **pages, int *nr)
2123 : {
2124 0 : unsigned long fault_pfn;
2125 0 : int nr_start = *nr;
2126 :
2127 0 : fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
2128 0 : if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
2129 0 : return 0;
2130 :
2131 : if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
2132 : undo_dev_pagemap(nr, nr_start, flags, pages);
2133 : return 0;
2134 : }
2135 : return 1;
2136 : }
2137 :
2138 : static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
2139 : unsigned long end, unsigned int flags,
2140 : struct page **pages, int *nr)
2141 : {
2142 : unsigned long fault_pfn;
2143 : int nr_start = *nr;
2144 :
2145 : fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
2146 : if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
2147 : return 0;
2148 :
2149 : if (unlikely(pud_val(orig) != pud_val(*pudp))) {
2150 : undo_dev_pagemap(nr, nr_start, flags, pages);
2151 : return 0;
2152 : }
2153 : return 1;
2154 : }
2155 : #else
2156 : static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
2157 : unsigned long end, unsigned int flags,
2158 : struct page **pages, int *nr)
2159 : {
2160 : BUILD_BUG();
2161 : return 0;
2162 : }
2163 :
2164 : static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
2165 : unsigned long end, unsigned int flags,
2166 : struct page **pages, int *nr)
2167 : {
2168 : BUILD_BUG();
2169 : return 0;
2170 : }
2171 : #endif
2172 :
2173 0 : static int record_subpages(struct page *page, unsigned long addr,
2174 : unsigned long end, struct page **pages)
2175 : {
2176 : int nr;
2177 :
2178 0 : for (nr = 0; addr != end; addr += PAGE_SIZE)
2179 0 : pages[nr++] = page++;
2180 :
2181 0 : return nr;
2182 : }
2183 :
2184 : #ifdef CONFIG_ARCH_HAS_HUGEPD
2185 : static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
2186 : unsigned long sz)
2187 : {
2188 : unsigned long __boundary = (addr + sz) & ~(sz-1);
2189 : return (__boundary - 1 < end - 1) ? __boundary : end;
2190 : }
2191 :
2192 : static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
2193 : unsigned long end, unsigned int flags,
2194 : struct page **pages, int *nr)
2195 : {
2196 : unsigned long pte_end;
2197 : struct page *head, *page;
2198 : pte_t pte;
2199 : int refs;
2200 :
2201 : pte_end = (addr + sz) & ~(sz-1);
2202 : if (pte_end < end)
2203 : end = pte_end;
2204 :
2205 : pte = huge_ptep_get(ptep);
2206 :
2207 : if (!pte_access_permitted(pte, flags & FOLL_WRITE))
2208 : return 0;
2209 :
2210 : /* hugepages are never "special" */
2211 : VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
2212 :
2213 : head = pte_page(pte);
2214 : page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
2215 : refs = record_subpages(page, addr, end, pages + *nr);
2216 :
2217 : head = try_grab_compound_head(head, refs, flags);
2218 : if (!head)
2219 : return 0;
2220 :
2221 : if (unlikely(pte_val(pte) != pte_val(*ptep))) {
2222 : put_compound_head(head, refs, flags);
2223 : return 0;
2224 : }
2225 :
2226 : *nr += refs;
2227 : SetPageReferenced(head);
2228 : return 1;
2229 : }
2230 :
2231 : static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
2232 : unsigned int pdshift, unsigned long end, unsigned int flags,
2233 : struct page **pages, int *nr)
2234 : {
2235 : pte_t *ptep;
2236 : unsigned long sz = 1UL << hugepd_shift(hugepd);
2237 : unsigned long next;
2238 :
2239 : ptep = hugepte_offset(hugepd, addr, pdshift);
2240 : do {
2241 : next = hugepte_addr_end(addr, end, sz);
2242 : if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
2243 : return 0;
2244 : } while (ptep++, addr = next, addr != end);
2245 :
2246 : return 1;
2247 : }
2248 : #else
2249 : static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
2250 : unsigned int pdshift, unsigned long end, unsigned int flags,
2251 : struct page **pages, int *nr)
2252 : {
2253 : return 0;
2254 : }
2255 : #endif /* CONFIG_ARCH_HAS_HUGEPD */
2256 :
2257 0 : static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
2258 : unsigned long end, unsigned int flags,
2259 : struct page **pages, int *nr)
2260 : {
2261 0 : struct page *head, *page;
2262 0 : int refs;
2263 :
2264 0 : if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
2265 : return 0;
2266 :
2267 0 : if (pmd_devmap(orig)) {
2268 0 : if (unlikely(flags & FOLL_LONGTERM))
2269 : return 0;
2270 0 : return __gup_device_huge_pmd(orig, pmdp, addr, end, flags,
2271 : pages, nr);
2272 : }
2273 :
2274 0 : page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
2275 0 : refs = record_subpages(page, addr, end, pages + *nr);
2276 :
2277 0 : head = try_grab_compound_head(pmd_page(orig), refs, flags);
2278 0 : if (!head)
2279 : return 0;
2280 :
2281 0 : if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
2282 0 : put_compound_head(head, refs, flags);
2283 0 : return 0;
2284 : }
2285 :
2286 0 : *nr += refs;
2287 0 : SetPageReferenced(head);
2288 0 : return 1;
2289 : }
2290 :
2291 : static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
2292 : unsigned long end, unsigned int flags,
2293 : struct page **pages, int *nr)
2294 : {
2295 : struct page *head, *page;
2296 : int refs;
2297 :
2298 : if (!pud_access_permitted(orig, flags & FOLL_WRITE))
2299 : return 0;
2300 :
2301 : if (pud_devmap(orig)) {
2302 : if (unlikely(flags & FOLL_LONGTERM))
2303 : return 0;
2304 : return __gup_device_huge_pud(orig, pudp, addr, end, flags,
2305 : pages, nr);
2306 : }
2307 :
2308 : page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
2309 : refs = record_subpages(page, addr, end, pages + *nr);
2310 :
2311 : head = try_grab_compound_head(pud_page(orig), refs, flags);
2312 : if (!head)
2313 : return 0;
2314 :
2315 : if (unlikely(pud_val(orig) != pud_val(*pudp))) {
2316 : put_compound_head(head, refs, flags);
2317 : return 0;
2318 : }
2319 :
2320 : *nr += refs;
2321 : SetPageReferenced(head);
2322 : return 1;
2323 : }
2324 :
2325 : static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
2326 : unsigned long end, unsigned int flags,
2327 : struct page **pages, int *nr)
2328 : {
2329 : int refs;
2330 : struct page *head, *page;
2331 :
2332 : if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
2333 : return 0;
2334 :
2335 : BUILD_BUG_ON(pgd_devmap(orig));
2336 :
2337 : page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
2338 : refs = record_subpages(page, addr, end, pages + *nr);
2339 :
2340 : head = try_grab_compound_head(pgd_page(orig), refs, flags);
2341 : if (!head)
2342 : return 0;
2343 :
2344 : if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
2345 : put_compound_head(head, refs, flags);
2346 : return 0;
2347 : }
2348 :
2349 : *nr += refs;
2350 : SetPageReferenced(head);
2351 : return 1;
2352 : }
2353 :
2354 2 : static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end,
2355 : unsigned int flags, struct page **pages, int *nr)
2356 : {
2357 2 : unsigned long next;
2358 2 : pmd_t *pmdp;
2359 :
2360 4 : pmdp = pmd_offset_lockless(pudp, pud, addr);
2361 2 : do {
2362 2 : pmd_t pmd = READ_ONCE(*pmdp);
2363 :
2364 2 : next = pmd_addr_end(addr, end);
2365 4 : if (!pmd_present(pmd))
2366 0 : return 0;
2367 :
2368 2 : if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
2369 : pmd_devmap(pmd))) {
2370 : /*
2371 : * NUMA hinting faults need to be handled in the GUP
2372 : * slowpath for accounting purposes and so that they
2373 : * can be serialised against THP migration.
2374 : */
2375 0 : if (pmd_protnone(pmd))
2376 : return 0;
2377 :
2378 0 : if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
2379 : pages, nr))
2380 : return 0;
2381 :
2382 2 : } else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
2383 : /*
2384 : * architectures have different formats for the hugetlbfs
2385 : * pmd and the THP pmd
2386 : */
2387 : if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
2388 : PMD_SHIFT, next, flags, pages, nr))
2389 : return 0;
2390 2 : } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
2391 : return 0;
2392 2 : } while (pmdp++, addr = next, addr != end);
2393 :
2394 : return 1;
2395 : }
2396 :
2397 2 : static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end,
2398 : unsigned int flags, struct page **pages, int *nr)
2399 : {
2400 2 : unsigned long next;
2401 2 : pud_t *pudp;
2402 :
2403 2 : pudp = pud_offset_lockless(p4dp, p4d, addr);
2404 2 : do {
2405 2 : pud_t pud = READ_ONCE(*pudp);
2406 :
2407 2 : next = pud_addr_end(addr, end);
2408 4 : if (unlikely(!pud_present(pud)))
2409 0 : return 0;
2410 2 : if (unlikely(pud_huge(pud))) {
2411 : if (!gup_huge_pud(pud, pudp, addr, next, flags,
2412 : pages, nr))
2413 : return 0;
2414 2 : } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
2415 : if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
2416 : PUD_SHIFT, next, flags, pages, nr))
2417 : return 0;
2418 2 : } else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr))
2419 : return 0;
2420 2 : } while (pudp++, addr = next, addr != end);
2421 :
2422 : return 1;
2423 : }
2424 :
2425 2 : static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end,
2426 : unsigned int flags, struct page **pages, int *nr)
2427 : {
2428 2 : unsigned long next;
2429 2 : p4d_t *p4dp;
2430 :
2431 2 : p4dp = p4d_offset_lockless(pgdp, pgd, addr);
2432 2 : do {
2433 2 : p4d_t p4d = READ_ONCE(*p4dp);
2434 :
2435 2 : next = p4d_addr_end(addr, end);
2436 2 : if (p4d_none(p4d))
2437 0 : return 0;
2438 2 : BUILD_BUG_ON(p4d_huge(p4d));
2439 2 : if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
2440 : if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
2441 : P4D_SHIFT, next, flags, pages, nr))
2442 : return 0;
2443 2 : } else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr))
2444 : return 0;
2445 2 : } while (p4dp++, addr = next, addr != end);
2446 :
2447 2 : return 1;
2448 : }
2449 :
2450 2 : static void gup_pgd_range(unsigned long addr, unsigned long end,
2451 : unsigned int flags, struct page **pages, int *nr)
2452 : {
2453 2 : unsigned long next;
2454 2 : pgd_t *pgdp;
2455 :
2456 2 : pgdp = pgd_offset(current->mm, addr);
2457 2 : do {
2458 2 : pgd_t pgd = READ_ONCE(*pgdp);
2459 :
2460 2 : next = pgd_addr_end(addr, end);
2461 2 : if (pgd_none(pgd))
2462 0 : return;
2463 2 : if (unlikely(pgd_huge(pgd))) {
2464 : if (!gup_huge_pgd(pgd, pgdp, addr, next, flags,
2465 : pages, nr))
2466 : return;
2467 2 : } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
2468 : if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
2469 : PGDIR_SHIFT, next, flags, pages, nr))
2470 : return;
2471 2 : } else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr))
2472 0 : return;
2473 2 : } while (pgdp++, addr = next, addr != end);
2474 : }
2475 : #else
2476 : static inline void gup_pgd_range(unsigned long addr, unsigned long end,
2477 : unsigned int flags, struct page **pages, int *nr)
2478 : {
2479 : }
2480 : #endif /* CONFIG_HAVE_FAST_GUP */
2481 :
2482 : #ifndef gup_fast_permitted
2483 : /*
2484 : * Check if it's allowed to use get_user_pages_fast_only() for the range, or
2485 : * we need to fall back to the slow version:
2486 : */
2487 : static bool gup_fast_permitted(unsigned long start, unsigned long end)
2488 : {
2489 : return true;
2490 : }
2491 : #endif
2492 :
2493 0 : static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
2494 : unsigned int gup_flags, struct page **pages)
2495 : {
2496 0 : int ret;
2497 :
2498 : /*
2499 : * FIXME: FOLL_LONGTERM does not work with
2500 : * get_user_pages_unlocked() (see comments in that function)
2501 : */
2502 0 : if (gup_flags & FOLL_LONGTERM) {
2503 0 : mmap_read_lock(current->mm);
2504 0 : ret = __gup_longterm_locked(current->mm,
2505 : start, nr_pages,
2506 : pages, NULL, gup_flags);
2507 0 : mmap_read_unlock(current->mm);
2508 : } else {
2509 0 : ret = get_user_pages_unlocked(start, nr_pages,
2510 : pages, gup_flags);
2511 : }
2512 :
2513 0 : return ret;
2514 : }
2515 :
2516 2 : static unsigned long lockless_pages_from_mm(unsigned long start,
2517 : unsigned long end,
2518 : unsigned int gup_flags,
2519 : struct page **pages)
2520 : {
2521 2 : unsigned long flags;
2522 2 : int nr_pinned = 0;
2523 2 : unsigned seq;
2524 :
2525 2 : if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) ||
2526 4 : !gup_fast_permitted(start, end))
2527 : return 0;
2528 :
2529 2 : if (gup_flags & FOLL_PIN) {
2530 0 : seq = raw_read_seqcount(&current->mm->write_protect_seq);
2531 0 : if (seq & 1)
2532 : return 0;
2533 : }
2534 :
2535 : /*
2536 : * Disable interrupts. The nested form is used, in order to allow full,
2537 : * general purpose use of this routine.
2538 : *
2539 : * With interrupts disabled, we block page table pages from being freed
2540 : * from under us. See struct mmu_table_batch comments in
2541 : * include/asm-generic/tlb.h for more details.
2542 : *
2543 : * We do not adopt an rcu_read_lock() here as we also want to block IPIs
2544 : * that come from THPs splitting.
2545 : */
2546 4 : local_irq_save(flags);
2547 2 : gup_pgd_range(start, end, gup_flags, pages, &nr_pinned);
2548 2 : local_irq_restore(flags);
2549 :
2550 : /*
2551 : * When pinning pages for DMA there could be a concurrent write protect
2552 : * from fork() via copy_page_range(), in this case always fail fast GUP.
2553 : */
2554 2 : if (gup_flags & FOLL_PIN) {
2555 0 : if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) {
2556 0 : unpin_user_pages(pages, nr_pinned);
2557 0 : return 0;
2558 : }
2559 : }
2560 2 : return nr_pinned;
2561 : }
2562 :
2563 2 : static int internal_get_user_pages_fast(unsigned long start,
2564 : unsigned long nr_pages,
2565 : unsigned int gup_flags,
2566 : struct page **pages)
2567 : {
2568 2 : unsigned long len, end;
2569 2 : unsigned long nr_pinned;
2570 2 : int ret;
2571 :
2572 2 : if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
2573 : FOLL_FORCE | FOLL_PIN | FOLL_GET |
2574 : FOLL_FAST_ONLY)))
2575 : return -EINVAL;
2576 :
2577 2 : if (gup_flags & FOLL_PIN)
2578 0 : atomic_set(&current->mm->has_pinned, 1);
2579 :
2580 2 : if (!(gup_flags & FOLL_FAST_ONLY))
2581 2 : might_lock_read(&current->mm->mmap_lock);
2582 :
2583 2 : start = untagged_addr(start) & PAGE_MASK;
2584 2 : len = nr_pages << PAGE_SHIFT;
2585 2 : if (check_add_overflow(start, len, &end))
2586 : return 0;
2587 4 : if (unlikely(!access_ok((void __user *)start, len)))
2588 : return -EFAULT;
2589 :
2590 2 : nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages);
2591 2 : if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
2592 2 : return nr_pinned;
2593 :
2594 : /* Slow path: try to get the remaining pages with get_user_pages */
2595 0 : start += nr_pinned << PAGE_SHIFT;
2596 0 : pages += nr_pinned;
2597 0 : ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, gup_flags,
2598 : pages);
2599 0 : if (ret < 0) {
2600 : /*
2601 : * The caller has to unpin the pages we already pinned so
2602 : * returning -errno is not an option
2603 : */
2604 0 : if (nr_pinned)
2605 0 : return nr_pinned;
2606 : return ret;
2607 : }
2608 0 : return ret + nr_pinned;
2609 : }
2610 :
2611 : /**
2612 : * get_user_pages_fast_only() - pin user pages in memory
2613 : * @start: starting user address
2614 : * @nr_pages: number of pages from start to pin
2615 : * @gup_flags: flags modifying pin behaviour
2616 : * @pages: array that receives pointers to the pages pinned.
2617 : * Should be at least nr_pages long.
2618 : *
2619 : * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
2620 : * the regular GUP.
2621 : * Note a difference with get_user_pages_fast: this always returns the
2622 : * number of pages pinned, or 0 if no pages were pinned.
2623 : *
2624 : * If the architecture does not support this function, simply return with no
2625 : * pages pinned.
2626 : *
2627 : * Careful, careful! COW breaking can go either way, so a non-write
2628 : * access can get ambiguous page results. If you call this function without
2629 : * 'write' set, you'd better be sure that you're ok with that ambiguity.
2630 : */
2631 0 : int get_user_pages_fast_only(unsigned long start, int nr_pages,
2632 : unsigned int gup_flags, struct page **pages)
2633 : {
2634 0 : int nr_pinned;
2635 : /*
2636 : * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
2637 : * because gup fast is always a "pin with a +1 page refcount" request.
2638 : *
2639 : * FOLL_FAST_ONLY is required in order to match the API description of
2640 : * this routine: no fall back to regular ("slow") GUP.
2641 : */
2642 0 : gup_flags |= FOLL_GET | FOLL_FAST_ONLY;
2643 :
2644 0 : nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
2645 : pages);
2646 :
2647 : /*
2648 : * As specified in the API description above, this routine is not
2649 : * allowed to return negative values. However, the common core
2650 : * routine internal_get_user_pages_fast() *can* return -errno.
2651 : * Therefore, correct for that here:
2652 : */
2653 0 : if (nr_pinned < 0)
2654 : nr_pinned = 0;
2655 :
2656 0 : return nr_pinned;
2657 : }
2658 : EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
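/*
 * Illustrative sketch, not part of gup.c: get_user_pages_fast_only() never
 * sleeps and never takes mmap_lock, so it can be attempted from contexts
 * that must not block; a return of 0 simply means "use a slower path
 * instead". The example_* name is hypothetical.
 */
static struct page *example_try_fast_only(unsigned long uaddr)
{
	struct page *page;

	if (get_user_pages_fast_only(uaddr & PAGE_MASK, 1, FOLL_WRITE,
				     &page) != 1)
		return NULL;	/* caller falls back to regular GUP */

	return page;		/* release with put_page() when done */
}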
2659 :
2660 : /**
2661 : * get_user_pages_fast() - pin user pages in memory
2662 : * @start: starting user address
2663 : * @nr_pages: number of pages from start to pin
2664 : * @gup_flags: flags modifying pin behaviour
2665 : * @pages: array that receives pointers to the pages pinned.
2666 : * Should be at least nr_pages long.
2667 : *
2668 : * Attempt to pin user pages in memory without taking mm->mmap_lock.
2669 : * If not successful, it will fall back to taking the lock and
2670 : * calling get_user_pages().
2671 : *
2672 : * Returns number of pages pinned. This may be fewer than the number requested.
2673 : * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
2674 : * -errno.
2675 : */
2676 2 : int get_user_pages_fast(unsigned long start, int nr_pages,
2677 : unsigned int gup_flags, struct page **pages)
2678 : {
2679 2 : if (!is_valid_gup_flags(gup_flags))
2680 : return -EINVAL;
2681 :
2682 : /*
2683 : * The caller may or may not have explicitly set FOLL_GET; either way is
2684 : * OK. However, internally (within mm/gup.c), gup fast variants must set
2685 : * FOLL_GET, because gup fast is always a "pin with a +1 page refcount"
2686 : * request.
2687 : */
2688 2 : gup_flags |= FOLL_GET;
2689 2 : return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
2690 : }
2691 : EXPORT_SYMBOL_GPL(get_user_pages_fast);
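/*
 * Illustrative sketch, not part of gup.c: the common fast-GUP pattern. No
 * locking is required of the caller; the function falls back to the slow
 * path internally when the lockless walk cannot pin everything. The
 * example_* name is hypothetical.
 */
static int example_gup_fast(unsigned long uaddr, int nr, struct page **pages)
{
	int pinned, i;

	pinned = get_user_pages_fast(uaddr & PAGE_MASK, nr, 0, pages);
	if (pinned <= 0)
		return pinned;

	/* ... read-only use of the pinned pages ... */

	for (i = 0; i < pinned; i++)
		put_page(pages[i]);
	return pinned;
}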
2692 :
2693 : /**
2694 : * pin_user_pages_fast() - pin user pages in memory without taking locks
2695 : *
2696 : * @start: starting user address
2697 : * @nr_pages: number of pages from start to pin
2698 : * @gup_flags: flags modifying pin behaviour
2699 : * @pages: array that receives pointers to the pages pinned.
2700 : * Should be at least nr_pages long.
2701 : *
2702 : * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See
2703 : * get_user_pages_fast() for documentation on the function arguments, because
2704 : * the arguments here are identical.
2705 : *
2706 : * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
2707 : * see Documentation/core-api/pin_user_pages.rst for further details.
2708 : */
2709 0 : int pin_user_pages_fast(unsigned long start, int nr_pages,
2710 : unsigned int gup_flags, struct page **pages)
2711 : {
2712 : /* FOLL_GET and FOLL_PIN are mutually exclusive. */
2713 0 : if (WARN_ON_ONCE(gup_flags & FOLL_GET))
2714 : return -EINVAL;
2715 :
2716 0 : gup_flags |= FOLL_PIN;
2717 0 : return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
2718 : }
2719 : EXPORT_SYMBOL_GPL(pin_user_pages_fast);
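/*
 * Illustrative sketch, not part of gup.c: the FOLL_PIN fast path, as a driver
 * preparing a buffer for DMA might use it. Pages pinned this way must be
 * released with unpin_user_page()/unpin_user_pages(), never put_page(). The
 * example_* name is hypothetical.
 */
static int example_pin_fast_for_dma(unsigned long uaddr, int nr,
				    struct page **pages)
{
	int pinned;

	pinned = pin_user_pages_fast(uaddr & PAGE_MASK, nr, FOLL_WRITE, pages);
	if (pinned <= 0)
		return pinned;

	/* ... program the device to DMA into these pages and wait ... */

	unpin_user_pages_dirty_lock(pages, pinned, true);
	return pinned;
}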
2720 :
2721 : /*
2722 : * This is the FOLL_PIN equivalent of get_user_pages_fast_only(). Behavior
2723 : * is the same, except that this one sets FOLL_PIN instead of FOLL_GET.
2724 : *
2725 : * The API rules are the same, too: no negative values may be returned.
2726 : */
2727 0 : int pin_user_pages_fast_only(unsigned long start, int nr_pages,
2728 : unsigned int gup_flags, struct page **pages)
2729 : {
2730 0 : int nr_pinned;
2731 :
2732 : /*
2733 : * FOLL_GET and FOLL_PIN are mutually exclusive. Note that the API
2734 : * rules require returning 0, rather than -errno:
2735 : */
2736 0 : if (WARN_ON_ONCE(gup_flags & FOLL_GET))
2737 : return 0;
2738 : /*
2739 : * FOLL_FAST_ONLY is required in order to match the API description of
2740 : * this routine: no fall back to regular ("slow") GUP.
2741 : */
2742 0 : gup_flags |= (FOLL_PIN | FOLL_FAST_ONLY);
2743 0 : nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
2744 : pages);
2745 : /*
2746 : * This routine is not allowed to return negative values. However,
2747 : * internal_get_user_pages_fast() *can* return -errno. Therefore,
2748 : * correct for that here:
2749 : */
2750 0 : if (nr_pinned < 0)
2751 : nr_pinned = 0;
2752 :
2753 : return nr_pinned;
2754 : }
2755 : EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
2756 :
2757 : /**
2758 : * pin_user_pages_remote() - pin pages of a remote process
2759 : *
2760 : * @mm: mm_struct of target mm
2761 : * @start: starting user address
2762 : * @nr_pages: number of pages from start to pin
2763 : * @gup_flags: flags modifying lookup behaviour
2764 : * @pages: array that receives pointers to the pages pinned.
2765 : * Should be at least nr_pages long. Or NULL, if caller
2766 : * only intends to ensure the pages are faulted in.
2767 : * @vmas: array of pointers to vmas corresponding to each page.
2768 : * Or NULL if the caller does not require them.
2769 : * @locked: pointer to lock flag indicating whether lock is held and
2770 : * subsequently whether VM_FAULT_RETRY functionality can be
2771 : * utilised. Lock must initially be held.
2772 : *
2773 : * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See
2774 : * get_user_pages_remote() for documentation on the function arguments, because
2775 : * the arguments here are identical.
2776 : *
2777 : * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
2778 : * see Documentation/core-api/pin_user_pages.rst for details.
2779 : */
2780 0 : long pin_user_pages_remote(struct mm_struct *mm,
2781 : unsigned long start, unsigned long nr_pages,
2782 : unsigned int gup_flags, struct page **pages,
2783 : struct vm_area_struct **vmas, int *locked)
2784 : {
2785 : /* FOLL_GET and FOLL_PIN are mutually exclusive. */
2786 0 : if (WARN_ON_ONCE(gup_flags & FOLL_GET))
2787 : return -EINVAL;
2788 :
2789 0 : gup_flags |= FOLL_PIN;
2790 0 : return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
2791 : pages, vmas, locked);
2792 : }
2793 : EXPORT_SYMBOL(pin_user_pages_remote);
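/*
 * Illustrative sketch, not part of gup.c: the FOLL_PIN variant for a foreign
 * mm. The target mm is assumed to be referenced by the caller and its
 * mmap_lock is held for read around the call; pages are released with
 * unpin_user_pages(). The example_* name is hypothetical.
 */
static long example_pin_remote(struct mm_struct *mm, unsigned long uaddr,
			       unsigned long nr, struct page **pages)
{
	long pinned;

	mmap_read_lock(mm);
	pinned = pin_user_pages_remote(mm, uaddr & PAGE_MASK, nr, FOLL_WRITE,
				       pages, NULL, NULL);
	mmap_read_unlock(mm);

	if (pinned > 0)
		unpin_user_pages(pages, pinned);	/* once the access is done */
	return pinned;
}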
2794 :
2795 : /**
2796 : * pin_user_pages() - pin user pages in memory for use by other devices
2797 : *
2798 : * @start: starting user address
2799 : * @nr_pages: number of pages from start to pin
2800 : * @gup_flags: flags modifying lookup behaviour
2801 : * @pages: array that receives pointers to the pages pinned.
2802 : * Should be at least nr_pages long. Or NULL, if caller
2803 : * only intends to ensure the pages are faulted in.
2804 : * @vmas: array of pointers to vmas corresponding to each page.
2805 : * Or NULL if the caller does not require them.
2806 : *
2807 : * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
2808 : * FOLL_PIN is set.
2809 : *
2810 : * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
2811 : * see Documentation/core-api/pin_user_pages.rst for details.
2812 : */
2813 0 : long pin_user_pages(unsigned long start, unsigned long nr_pages,
2814 : unsigned int gup_flags, struct page **pages,
2815 : struct vm_area_struct **vmas)
2816 : {
2817 : /* FOLL_GET and FOLL_PIN are mutually exclusive. */
2818 0 : if (WARN_ON_ONCE(gup_flags & FOLL_GET))
2819 : return -EINVAL;
2820 :
2821 0 : gup_flags |= FOLL_PIN;
2822 0 : return __gup_longterm_locked(current->mm, start, nr_pages,
2823 : pages, vmas, gup_flags);
2824 : }
2825 : EXPORT_SYMBOL(pin_user_pages);
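/*
 * Illustrative sketch, not part of gup.c: a long-lived pin of a user buffer,
 * in the style of an RDMA driver. FOLL_LONGTERM asks GUP to migrate the pages
 * out of CMA/movable regions before pinning, and mmap_lock must be held for
 * read around the call. The example_* name is hypothetical.
 */
static long example_pin_longterm(unsigned long uaddr, unsigned long nr,
				 struct page **pages)
{
	long pinned;

	mmap_read_lock(current->mm);
	pinned = pin_user_pages(uaddr & PAGE_MASK, nr,
				FOLL_WRITE | FOLL_LONGTERM, pages, NULL);
	mmap_read_unlock(current->mm);
	if (pinned <= 0)
		return pinned;

	/* ... register the pages with the hardware; they stay pinned ... */

	/* much later, at teardown: */
	unpin_user_pages_dirty_lock(pages, pinned, true);
	return pinned;
}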
2826 :
2827 : /*
2828 : * pin_user_pages_unlocked() is the FOLL_PIN variant of
2829 : * get_user_pages_unlocked(). Behavior is the same, except that this one sets
2830 : * FOLL_PIN and rejects FOLL_GET.
2831 : */
2832 0 : long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
2833 : struct page **pages, unsigned int gup_flags)
2834 : {
2835 : /* FOLL_GET and FOLL_PIN are mutually exclusive. */
2836 0 : if (WARN_ON_ONCE(gup_flags & FOLL_GET))
2837 : return -EINVAL;
2838 :
2839 0 : gup_flags |= FOLL_PIN;
2840 0 : return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
2841 : }
2842 : EXPORT_SYMBOL(pin_user_pages_unlocked);
2843 :
2844 : /*
2845 : * pin_user_pages_locked() is the FOLL_PIN variant of get_user_pages_locked().
2846 : * Behavior is the same, except that this one sets FOLL_PIN and rejects
2847 : * FOLL_GET.
2848 : */
2849 0 : long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
2850 : unsigned int gup_flags, struct page **pages,
2851 : int *locked)
2852 : {
2853 : /*
2854 : * FIXME: Current FOLL_LONGTERM behavior is incompatible with
2855 : * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
2856 : * vmas. As there are no users of this flag in this call we simply
2857 : * disallow this option for now.
2858 : */
2859 0 : if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
2860 : return -EINVAL;
2861 :
2862 : /* FOLL_GET and FOLL_PIN are mutually exclusive. */
2863 0 : if (WARN_ON_ONCE(gup_flags & FOLL_GET))
2864 : return -EINVAL;
2865 :
2866 0 : gup_flags |= FOLL_PIN;
2867 0 : return __get_user_pages_locked(current->mm, start, nr_pages,
2868 : pages, NULL, locked,
2869 : gup_flags | FOLL_TOUCH);
2870 : }
2871 : EXPORT_SYMBOL(pin_user_pages_locked);
|