Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * linux/mm/mlock.c
4 : *
5 : * (C) Copyright 1995 Linus Torvalds
6 : * (C) Copyright 2002 Christoph Hellwig
7 : */
8 :
9 : #include <linux/capability.h>
10 : #include <linux/mman.h>
11 : #include <linux/mm.h>
12 : #include <linux/sched/user.h>
13 : #include <linux/swap.h>
14 : #include <linux/swapops.h>
15 : #include <linux/pagemap.h>
16 : #include <linux/pagevec.h>
17 : #include <linux/mempolicy.h>
18 : #include <linux/syscalls.h>
19 : #include <linux/sched.h>
20 : #include <linux/export.h>
21 : #include <linux/rmap.h>
22 : #include <linux/mmzone.h>
23 : #include <linux/hugetlb.h>
24 : #include <linux/memcontrol.h>
25 : #include <linux/mm_inline.h>
26 :
27 : #include "internal.h"
28 :
29 1 : bool can_do_mlock(void)
30 : {
31 1 : if (rlimit(RLIMIT_MEMLOCK) != 0)
32 : return true;
33 0 : if (capable(CAP_IPC_LOCK))
34 0 : return true;
35 : return false;
36 : }
37 : EXPORT_SYMBOL(can_do_mlock);
38 :
39 : /*
40 : * Mlocked pages are marked with PageMlocked() flag for efficient testing
41 : * in vmscan and, possibly, the fault path; and to support semi-accurate
42 : * statistics.
43 : *
44 : * An mlocked page [PageMlocked(page)] is unevictable. As such, it will
45 : * be placed on the LRU "unevictable" list, rather than the [in]active lists.
46 : * The unevictable list is an LRU sibling list to the [in]active lists.
47 : * PageUnevictable is set to indicate the unevictable state.
48 : *
49 : * When mlocking lazily via vmscan, it is important to ensure that the
50 : * vma's VM_LOCKED status is not concurrently being modified, otherwise we
51 : * may have mlocked a page that is being munlocked. So lazy mlock must take
52 : * the mmap_lock for read, and verify that the vma really is locked
53 : * (see mm/rmap.c).
54 : */
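
The NR_MLOCK counter and UNEVICTABLE_PG* events maintained in this file surface as the "Mlocked:" and "Unevictable:" fields of /proc/meminfo and as the unevictable_pgs_* counters in /proc/vmstat. The following is an illustrative userspace sketch, not part of this file, that locks a small buffer and dumps those (semi-accurate, possibly lagging) statistics:

	/* Illustrative sketch: observe the counters fed by NR_MLOCK and the
	 * unevictable LRU. The accounting is lazy, so the numbers may trail
	 * the actual state. */
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/mman.h>

	static void dump_meminfo(const char *tag)
	{
		char line[256];
		FILE *f = fopen("/proc/meminfo", "r");

		if (!f)
			return;
		printf("--- %s ---\n", tag);
		while (fgets(line, sizeof(line), f))
			if (!strncmp(line, "Mlocked:", 8) ||
			    !strncmp(line, "Unevictable:", 12))
				fputs(line, stdout);
		fclose(f);
	}

	int main(void)
	{
		size_t len = 1 << 20;
		char *buf = malloc(len);

		if (!buf)
			return 1;
		memset(buf, 0, len);		/* fault the pages in */
		dump_meminfo("before mlock");
		if (mlock(buf, len))		/* may fail with ENOMEM/EPERM */
			perror("mlock");
		dump_meminfo("after mlock");
		munlock(buf, len);
		free(buf);
		return 0;
	}
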
55 :
56 : /*
57 : * LRU accounting for clear_page_mlock()
58 : */
59 0 : void clear_page_mlock(struct page *page)
60 : {
61 0 : int nr_pages;
62 :
63 0 : if (!TestClearPageMlocked(page))
64 : return;
65 :
66 0 : nr_pages = thp_nr_pages(page);
67 0 : mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
68 0 : count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
69 : /*
70 : * The previous TestClearPageMlocked() corresponds to the smp_mb()
71 : * in __pagevec_lru_add_fn().
72 : *
73 : * See __pagevec_lru_add_fn for more explanation.
74 : */
75 0 : if (!isolate_lru_page(page)) {
76 0 : putback_lru_page(page);
77 : } else {
78 : /*
79 : * We lost the race. The page has already been moved to the evictable list.
80 : */
81 0 : if (PageUnevictable(page))
82 0 : count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
83 : }
84 : }
85 :
86 : /*
87 : * Mark page as mlocked if not already.
88 : * If page on LRU, isolate and putback to move to unevictable list.
89 : */
90 16 : void mlock_vma_page(struct page *page)
91 : {
92 : /* Serialize with page migration */
93 32 : BUG_ON(!PageLocked(page));
94 :
95 16 : VM_BUG_ON_PAGE(PageTail(page), page);
96 32 : VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
97 :
98 32 : if (!TestSetPageMlocked(page)) {
99 1 : int nr_pages = thp_nr_pages(page);
100 :
101 1 : mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
102 1 : count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
103 1 : if (!isolate_lru_page(page))
104 1 : putback_lru_page(page);
105 : }
106 16 : }
107 :
108 : /*
109 : * Finish munlock after successful page isolation
110 : *
111 : * Page must be locked. This is a wrapper for try_to_munlock()
112 : * and putback_lru_page() with munlock accounting.
113 : */
114 0 : static void __munlock_isolated_page(struct page *page)
115 : {
116 : /*
117 : * Optimization: if the page was mapped just once, that's our mapping
118 : * and we don't need to check all the other vmas.
119 : */
120 0 : if (page_mapcount(page) > 1)
121 0 : try_to_munlock(page);
122 :
123 : /* Did try_to_munlock() succeed or punt? */
124 0 : if (!PageMlocked(page))
125 0 : count_vm_events(UNEVICTABLE_PGMUNLOCKED, thp_nr_pages(page));
126 :
127 0 : putback_lru_page(page);
128 0 : }
129 :
130 : /*
131 : * Accounting for a page isolation failure during munlock
132 : *
133 : * Performs accounting when page isolation fails in munlock. There is nothing
134 : * else to do because it means some other task has already removed the page
135 : * from the LRU. putback_lru_page() will take care of removing the page from
136 : * the unevictable list, if necessary. vmscan [page_referenced()] will move
137 : * the page back to the unevictable list if some other vma has it mlocked.
138 : */
139 0 : static void __munlock_isolation_failed(struct page *page)
140 : {
141 0 : int nr_pages = thp_nr_pages(page);
142 :
143 0 : if (PageUnevictable(page))
144 0 : __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
145 : else
146 0 : __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
147 0 : }
148 :
149 : /**
150 : * munlock_vma_page - munlock a vma page
151 : * @page: page to be unlocked, either a normal page or THP page head
152 : *
153 : * returns the size of the page as a page mask (0 for normal page,
154 : * HPAGE_PMD_NR - 1 for THP head page)
155 : *
156 : * called from munlock()/munmap() path with page supposedly on the LRU.
157 : * When we munlock a page, because the vma where we found the page is being
158 : * munlock()ed or munmap()ed, we want to check whether other vmas hold the
159 : * page mlocked so that we can leave it on the unevictable lru list and not
160 : * bother vmscan with it. However, to walk the page's rmap list in
161 : * try_to_munlock() we must isolate the page from the LRU. If some other
162 : * task has removed the page from the LRU, we won't be able to do that.
163 : * So we clear the PageMlocked as we might not get another chance. If we
164 : * can't isolate the page, we leave it for putback_lru_page() and vmscan
165 : * [page_referenced()/try_to_unmap()] to deal with.
166 : */
167 0 : unsigned int munlock_vma_page(struct page *page)
168 : {
169 0 : int nr_pages;
170 :
171 : /* For try_to_munlock() and to serialize with page migration */
172 0 : BUG_ON(!PageLocked(page));
173 0 : VM_BUG_ON_PAGE(PageTail(page), page);
174 :
175 0 : if (!TestClearPageMlocked(page)) {
176 : /* Potentially, PTE-mapped THP: do not skip the rest PTEs */
177 : return 0;
178 : }
179 :
180 0 : nr_pages = thp_nr_pages(page);
181 0 : mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
182 :
183 0 : if (!isolate_lru_page(page))
184 0 : __munlock_isolated_page(page);
185 : else
186 0 : __munlock_isolation_failed(page);
187 :
188 0 : return nr_pages - 1;
189 : }
190 :
191 : /*
192 : * Convert a get_user_pages() return value to a POSIX mlock() error.
193 : */
194 0 : static int __mlock_posix_error_return(long retval)
195 : {
196 0 : if (retval == -EFAULT)
197 : retval = -ENOMEM;
198 0 : else if (retval == -ENOMEM)
199 0 : retval = -EAGAIN;
200 0 : return retval;
201 : }
202 :
203 : /*
204 : * Prepare page for fast batched LRU putback via __putback_lru_fast()
205 : *
206 : * The fast path is available only for evictable pages with a single mapping;
207 : * then we can bypass the per-cpu pvec and get better performance.
208 : * When mapcount > 1 we need try_to_munlock(), which can fail.
209 : * When !page_evictable(), we need the full redo logic of putback_lru_page()
210 : * to avoid leaving an evictable page on the unevictable list.
211 : *
212 : * On success, @page is added to @pvec, @pgrescued is incremented if the page
213 : * was previously unevictable, and @page is unlocked.
214 : */
215 16 : static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec,
216 : int *pgrescued)
217 : {
218 32 : VM_BUG_ON_PAGE(PageLRU(page), page);
219 32 : VM_BUG_ON_PAGE(!PageLocked(page), page);
220 :
221 16 : if (page_mapcount(page) <= 1 && page_evictable(page)) {
222 16 : pagevec_add(pvec, page);
223 32 : if (TestClearPageUnevictable(page))
224 16 : (*pgrescued)++;
225 16 : unlock_page(page);
226 16 : return true;
227 : }
228 :
229 : return false;
230 : }
231 :
232 : /*
233 : * Putback multiple evictable pages to the LRU
234 : *
235 : * Batched putback of evictable pages that bypasses the per-cpu pvec. Some of
236 : * the pages might have meanwhile become unevictable but that is OK.
237 : */
238 2 : static void __putback_lru_fast(struct pagevec *pvec, int pgrescued)
239 : {
240 2 : count_vm_events(UNEVICTABLE_PGMUNLOCKED, pagevec_count(pvec));
241 : /*
242 : *__pagevec_lru_add() calls release_pages() so we don't call
243 : * put_page() explicitly
244 : */
245 2 : __pagevec_lru_add(pvec);
246 2 : count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
247 2 : }
248 :
249 : /*
250 : * Munlock a batch of pages from the same zone
251 : *
252 : * The work is split into two main phases. The first phase clears the Mlocked flag
253 : * and attempts to isolate the pages, all under a single zone lru lock.
254 : * The second phase finishes the munlock only for pages where isolation
255 : * succeeded.
256 : *
257 : * Note that the pagevec may be modified during the process.
258 : */
259 2 : static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
260 : {
261 2 : int i;
262 2 : int nr = pagevec_count(pvec);
263 2 : int delta_munlocked = -nr;
264 2 : struct pagevec pvec_putback;
265 2 : struct lruvec *lruvec = NULL;
266 2 : int pgrescued = 0;
267 :
268 2 : pagevec_init(&pvec_putback);
269 :
270 : /* Phase 1: page isolation */
271 18 : for (i = 0; i < nr; i++) {
272 16 : struct page *page = pvec->pages[i];
273 :
274 32 : if (TestClearPageMlocked(page)) {
275 : /*
276 : * We already have pin from follow_page_mask()
277 : * so we can spare the get_page() here.
278 : */
279 32 : if (TestClearPageLRU(page)) {
280 16 : lruvec = relock_page_lruvec_irq(page, lruvec);
281 16 : del_page_from_lru_list(page, lruvec);
282 16 : continue;
283 : } else
284 0 : __munlock_isolation_failed(page);
285 : } else {
286 0 : delta_munlocked++;
287 : }
288 :
289 : /*
290 : * We won't be munlocking this page in the next phase
291 : * but we still need to release the follow_page_mask()
292 : * pin. We cannot do it under lru_lock however. If it's
293 : * the last pin, __page_cache_release() would deadlock.
294 : */
295 0 : pagevec_add(&pvec_putback, pvec->pages[i]);
296 0 : pvec->pages[i] = NULL;
297 : }
298 2 : if (lruvec) {
299 2 : __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
300 2 : unlock_page_lruvec_irq(lruvec);
301 0 : } else if (delta_munlocked) {
302 0 : mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
303 : }
304 :
305 : /* Now we can release pins of pages that we are not munlocking */
306 2 : pagevec_release(&pvec_putback);
307 :
308 : /* Phase 2: page munlock */
309 18 : for (i = 0; i < nr; i++) {
310 16 : struct page *page = pvec->pages[i];
311 :
312 16 : if (page) {
313 16 : lock_page(page);
314 16 : if (!__putback_lru_fast_prepare(page, &pvec_putback,
315 : &pgrescued)) {
316 : /*
317 : * Slow path. We don't want to lose the last
318 : * pin before unlock_page()
319 : */
320 0 : get_page(page); /* for putback_lru_page() */
321 0 : __munlock_isolated_page(page);
322 0 : unlock_page(page);
323 0 : put_page(page); /* from follow_page_mask() */
324 : }
325 : }
326 : }
327 :
328 : /*
329 : * Phase 3: page putback for pages that qualified for the fast path
330 : * This will also call put_page() to return pin from follow_page_mask()
331 : */
332 2 : if (pagevec_count(&pvec_putback))
333 2 : __putback_lru_fast(&pvec_putback, pgrescued);
334 2 : }
335 :
336 : /*
337 : * Fill up pagevec for __munlock_pagevec using pte walk
338 : *
339 : * The function expects that the struct page corresponding to the @start address
340 : * is a non-THP page, already pinned and in @pvec, and that it belongs to @zone.
341 : *
342 : * The rest of @pvec is filled by subsequent pages within the same pmd and same
343 : * zone, as long as the ptes are present and vm_normal_page() succeeds. These
344 : * pages also get pinned.
345 : *
346 : * Returns the address of the next page that should be scanned. This equals
347 : * @start + PAGE_SIZE when no page could be added by the pte walk.
348 : */
349 2 : static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
350 : struct vm_area_struct *vma, struct zone *zone,
351 : unsigned long start, unsigned long end)
352 : {
353 2 : pte_t *pte;
354 2 : spinlock_t *ptl;
355 :
356 : /*
357 : * Initialize pte walk starting at the already pinned page where we
358 : * are sure that there is a pte, as it was pinned under the same
359 : * mmap_lock write op.
360 : */
361 2 : pte = get_locked_pte(vma->vm_mm, start, &ptl);
362 : /* Make sure we do not cross the page table boundary */
363 2 : end = pgd_addr_end(start, end);
364 2 : end = p4d_addr_end(start, end);
365 2 : end = pud_addr_end(start, end);
366 2 : end = pmd_addr_end(start, end);
367 :
368 : /* The page next to the pinned page is the first we will try to get */
369 2 : start += PAGE_SIZE;
370 15 : while (start < end) {
371 14 : struct page *page = NULL;
372 14 : pte++;
373 14 : if (pte_present(*pte))
374 14 : page = vm_normal_page(vma, start, *pte);
375 : /*
376 : * Break if page could not be obtained or the page's node+zone does not
377 : * match
378 : */
379 14 : if (!page || page_zone(page) != zone)
380 : break;
381 :
382 : /*
383 : * Do not use pagevec for PTE-mapped THP,
384 : * munlock_vma_pages_range() will handle them.
385 : */
386 14 : if (PageTransCompound(page))
387 : break;
388 :
389 14 : get_page(page);
390 : /*
391 : * Increase the address that will be returned *before* the
392 : * eventual break due to pvec becoming full by adding the page
393 : */
394 14 : start += PAGE_SIZE;
395 14 : if (pagevec_add(pvec, page) == 0)
396 : break;
397 : }
398 2 : pte_unmap_unlock(pte, ptl);
399 2 : return start;
400 : }
401 :
402 : /*
403 : * munlock_vma_pages_range() - munlock all pages in the vma range.
404 : * @vma - vma containing range to be munlock()ed.
405 : * @start - start address in @vma of the range
406 : * @end - end of range in @vma.
407 : *
408 : * For mremap(), munmap() and exit().
409 : *
410 : * Called with @vma VM_LOCKED.
411 : *
412 : * Returns with VM_LOCKED cleared. Callers must be prepared to
413 : * deal with this.
414 : *
415 : * We don't save and restore VM_LOCKED here because pages are
416 : * still on the LRU. In the unmap path, pages might be scanned by reclaim
417 : * and re-mlocked by try_to_{munlock|unmap} before we unmap and
418 : * free them. This will result in freeing mlocked pages.
419 : */
420 1 : void munlock_vma_pages_range(struct vm_area_struct *vma,
421 : unsigned long start, unsigned long end)
422 : {
423 1 : vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
424 :
425 3 : while (start < end) {
426 2 : struct page *page;
427 2 : unsigned int page_mask = 0;
428 2 : unsigned long page_increm;
429 2 : struct pagevec pvec;
430 2 : struct zone *zone;
431 :
432 2 : pagevec_init(&pvec);
433 : /*
434 : * Although FOLL_DUMP is intended for get_dump_page(),
435 : * it just so happens that its special treatment of the
436 : * ZERO_PAGE (returning an error instead of doing get_page)
437 : * suits munlock very well (and if somehow an abnormal page
438 : * has sneaked into the range, we won't oops here: great).
439 : */
440 2 : page = follow_page(vma, start, FOLL_GET | FOLL_DUMP);
441 :
442 2 : if (page && !IS_ERR(page)) {
443 2 : if (PageTransTail(page)) {
444 0 : VM_BUG_ON_PAGE(PageMlocked(page), page);
445 0 : put_page(page); /* follow_page_mask() */
446 2 : } else if (PageTransHuge(page)) {
447 0 : lock_page(page);
448 : /*
449 : * Any THP page found by follow_page_mask() may
450 : * have gotten split before reaching
451 : * munlock_vma_page(), so we need to compute
452 : * the page_mask here instead.
453 : */
454 0 : page_mask = munlock_vma_page(page);
455 0 : unlock_page(page);
456 0 : put_page(page); /* follow_page_mask() */
457 : } else {
458 : /*
459 : * Non-huge pages are handled in batches via
460 : * pagevec. The pin from follow_page_mask()
461 : * prevents them from being collapsed into a THP.
462 : */
463 2 : pagevec_add(&pvec, page);
464 2 : zone = page_zone(page);
465 :
466 : /*
467 : * Try to fill the rest of pagevec using fast
468 : * pte walk. This will also update start to
469 : * the next page to process. Then munlock the
470 : * pagevec.
471 : */
472 2 : start = __munlock_pagevec_fill(&pvec, vma,
473 : zone, start, end);
474 2 : __munlock_pagevec(&pvec, zone);
475 2 : goto next;
476 : }
477 : }
478 0 : page_increm = 1 + page_mask;
479 0 : start += page_increm * PAGE_SIZE;
480 2 : next:
481 2 : cond_resched();
482 : }
483 1 : }
484 :
485 : /*
486 : * mlock_fixup - handle mlock[all]/munlock[all] requests.
487 : *
488 : * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
489 : * munlock is a no-op. However, for some special vmas, we go ahead and
490 : * populate the ptes.
491 : *
492 : * For vmas that pass the filters, merge/split as appropriate.
493 : */
494 1 : static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
495 : unsigned long start, unsigned long end, vm_flags_t newflags)
496 : {
497 1 : struct mm_struct *mm = vma->vm_mm;
498 1 : pgoff_t pgoff;
499 1 : int nr_pages;
500 1 : int ret = 0;
501 1 : int lock = !!(newflags & VM_LOCKED);
502 1 : vm_flags_t old_flags = vma->vm_flags;
503 :
504 1 : if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
505 1 : is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
506 1 : vma_is_dax(vma))
507 : /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
508 0 : goto out;
509 :
510 1 : pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
511 1 : *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
512 : vma->vm_file, pgoff, vma_policy(vma),
513 : vma->vm_userfaultfd_ctx);
514 1 : if (*prev) {
515 0 : vma = *prev;
516 0 : goto success;
517 : }
518 :
519 1 : if (start != vma->vm_start) {
520 0 : ret = split_vma(mm, vma, start, 1);
521 0 : if (ret)
522 0 : goto out;
523 : }
524 :
525 1 : if (end != vma->vm_end) {
526 0 : ret = split_vma(mm, vma, end, 0);
527 0 : if (ret)
528 0 : goto out;
529 : }
530 :
531 1 : success:
532 : /*
533 : * Keep track of amount of locked VM.
534 : */
535 1 : nr_pages = (end - start) >> PAGE_SHIFT;
536 1 : if (!lock)
537 0 : nr_pages = -nr_pages;
538 1 : else if (old_flags & VM_LOCKED)
539 0 : nr_pages = 0;
540 1 : mm->locked_vm += nr_pages;
541 :
542 : /*
543 : * vm_flags is protected by the mmap_lock held in write mode.
544 : * It's okay if try_to_unmap_one unmaps a page just after we
545 : * set VM_LOCKED, populate_vma_page_range will bring it back.
546 : */
547 :
548 1 : if (lock)
549 1 : vma->vm_flags = newflags;
550 : else
551 0 : munlock_vma_pages_range(vma, start, end);
552 :
553 1 : out:
554 1 : *prev = vma;
555 1 : return ret;
556 : }
557 :
558 1 : static int apply_vma_lock_flags(unsigned long start, size_t len,
559 : vm_flags_t flags)
560 : {
561 1 : unsigned long nstart, end, tmp;
562 1 : struct vm_area_struct * vma, * prev;
563 1 : int error;
564 :
565 1 : VM_BUG_ON(offset_in_page(start));
566 1 : VM_BUG_ON(len != PAGE_ALIGN(len));
567 1 : end = start + len;
568 1 : if (end < start)
569 : return -EINVAL;
570 1 : if (end == start)
571 : return 0;
572 1 : vma = find_vma(current->mm, start);
573 1 : if (!vma || vma->vm_start > start)
574 : return -ENOMEM;
575 :
576 1 : prev = vma->vm_prev;
577 1 : if (start > vma->vm_start)
578 0 : prev = vma;
579 :
580 : for (nstart = start ; ; ) {
581 1 : vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
582 :
583 1 : newflags |= flags;
584 :
585 : /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
586 1 : tmp = vma->vm_end;
587 1 : if (tmp > end)
588 : tmp = end;
589 1 : error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
590 1 : if (error)
591 : break;
592 1 : nstart = tmp;
593 1 : if (nstart < prev->vm_end)
594 : nstart = prev->vm_end;
595 1 : if (nstart >= end)
596 : break;
597 :
598 0 : vma = prev->vm_next;
599 0 : if (!vma || vma->vm_start != nstart) {
600 : error = -ENOMEM;
601 : break;
602 : }
603 : }
604 : return error;
605 : }
606 :
607 : /*
608 : * Go through the vma areas and sum the size of the mlocked
609 : * vma pages, as the return value.
610 : * Note that the deferred memory locking case (mlock2(,,MLOCK_ONFAULT))
611 : * is also counted.
612 : * Return value: the number of previously mlocked pages
613 : */
614 0 : static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
615 : unsigned long start, size_t len)
616 : {
617 0 : struct vm_area_struct *vma;
618 0 : unsigned long count = 0;
619 :
620 0 : if (mm == NULL)
621 0 : mm = current->mm;
622 :
623 0 : vma = find_vma(mm, start);
624 0 : if (vma == NULL)
625 : return 0;
626 :
627 0 : for (; vma ; vma = vma->vm_next) {
628 0 : if (start >= vma->vm_end)
629 0 : continue;
630 0 : if (start + len <= vma->vm_start)
631 : break;
632 0 : if (vma->vm_flags & VM_LOCKED) {
633 0 : if (start > vma->vm_start)
634 0 : count -= (start - vma->vm_start);
635 0 : if (start + len < vma->vm_end) {
636 0 : count += start + len - vma->vm_start;
637 0 : break;
638 : }
639 0 : count += vma->vm_end - vma->vm_start;
640 : }
641 : }
642 :
643 0 : return count >> PAGE_SHIFT;
644 : }
645 :
646 1 : static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
647 : {
648 1 : unsigned long locked;
649 1 : unsigned long lock_limit;
650 1 : int error = -ENOMEM;
651 :
652 1 : start = untagged_addr(start);
653 :
654 1 : if (!can_do_mlock())
655 : return -EPERM;
656 :
657 1 : len = PAGE_ALIGN(len + (offset_in_page(start)));
658 1 : start &= PAGE_MASK;
659 :
660 1 : lock_limit = rlimit(RLIMIT_MEMLOCK);
661 1 : lock_limit >>= PAGE_SHIFT;
662 1 : locked = len >> PAGE_SHIFT;
663 :
664 1 : if (mmap_write_lock_killable(current->mm))
665 : return -EINTR;
666 :
667 1 : locked += current->mm->locked_vm;
668 1 : if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) {
669 : /*
670 : * It is possible that the requested region intersects with previously
671 : * mlocked areas; that part is already accounted for in "mm->locked_vm"
672 : * and should not be counted again toward the new mlock increment.
673 : * So check and adjust the locked count if necessary.
674 : */
675 0 : locked -= count_mm_mlocked_page_nr(current->mm,
676 : start, len);
677 : }
678 :
679 : /* check against resource limits */
680 1 : if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
681 1 : error = apply_vma_lock_flags(start, len, flags);
682 :
683 1 : mmap_write_unlock(current->mm);
684 1 : if (error)
685 : return error;
686 :
687 1 : error = __mm_populate(start, len, 0);
688 1 : if (error)
689 0 : return __mlock_posix_error_return(error);
690 : return 0;
691 : }
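
do_mlock() converts the request to pages, adds mm->locked_vm, and rejects it against RLIMIT_MEMLOCK unless the caller has CAP_IPC_LOCK. A hedged userspace sketch, not part of this file, showing how to inspect and (where the hard limit allows) raise that limit before calling mlock():

	/* Illustrative sketch: query/raise RLIMIT_MEMLOCK, the limit that
	 * do_mlock() enforces in pages. Raising the hard limit itself would
	 * additionally require CAP_SYS_RESOURCE. */
	#include <stdio.h>
	#include <sys/resource.h>

	int main(void)
	{
		struct rlimit rl;

		if (getrlimit(RLIMIT_MEMLOCK, &rl))
			return 1;
		printf("RLIMIT_MEMLOCK: soft %llu, hard %llu (bytes)\n",
		       (unsigned long long)rl.rlim_cur,
		       (unsigned long long)rl.rlim_max);

		/* Use the full hard limit for subsequent mlock() calls. */
		rl.rlim_cur = rl.rlim_max;
		if (setrlimit(RLIMIT_MEMLOCK, &rl))
			perror("setrlimit");
		return 0;
	}
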
692 :
693 2 : SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
694 : {
695 1 : return do_mlock(start, len, VM_LOCKED);
696 : }
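
As do_mlock() shows above, the kernel rounds start down to a page boundary and rounds len up, so locking any byte of a page locks (and later unlocks) the whole page. A small illustrative sketch, not part of this file:

	/* Illustrative sketch: mlock()/munlock() operate on whole pages even
	 * when the arguments are not page aligned. */
	#include <stdio.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		long page = sysconf(_SC_PAGESIZE);
		char *map = mmap(NULL, 4 * page, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (map == MAP_FAILED)
			return 1;

		/* Lock 100 bytes in the middle of the second page ... */
		if (mlock(map + page + page / 2, 100))
			perror("mlock");

		/* ... the whole second page is now mlocked; undo it. */
		if (munlock(map + page + page / 2, 100))
			perror("munlock");

		munmap(map, 4 * page);
		return 0;
	}
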
697 :
698 0 : SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags)
699 : {
700 0 : vm_flags_t vm_flags = VM_LOCKED;
701 :
702 0 : if (flags & ~MLOCK_ONFAULT)
703 : return -EINVAL;
704 :
705 0 : if (flags & MLOCK_ONFAULT)
706 0 : vm_flags |= VM_LOCKONFAULT;
707 :
708 0 : return do_mlock(start, len, vm_flags);
709 : }
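
A hedged userspace sketch of the MLOCK_ONFAULT path, assuming a libc that provides the mlock2() wrapper (glibc 2.27 or later); on older libcs the raw syscall(__NR_mlock2, ...) form would be needed instead:

	/* Illustrative sketch: mlock2(MLOCK_ONFAULT) sets VM_LOCKONFAULT, so
	 * pages are not populated up front and become mlocked only as they
	 * are first faulted in. */
	#define _GNU_SOURCE
	#include <stdio.h>
	#include <sys/mman.h>

	int main(void)
	{
		size_t len = 16 * 1024 * 1024;
		char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (buf == MAP_FAILED)
			return 1;

		if (mlock2(buf, len, MLOCK_ONFAULT)) {
			perror("mlock2");
			return 1;
		}

		buf[0] = 1;	/* this page is now resident and mlocked */

		munlock(buf, len);
		munmap(buf, len);
		return 0;
	}
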
710 :
711 0 : SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
712 : {
713 0 : int ret;
714 :
715 0 : start = untagged_addr(start);
716 :
717 0 : len = PAGE_ALIGN(len + (offset_in_page(start)));
718 0 : start &= PAGE_MASK;
719 :
720 0 : if (mmap_write_lock_killable(current->mm))
721 : return -EINTR;
722 0 : ret = apply_vma_lock_flags(start, len, 0);
723 0 : mmap_write_unlock(current->mm);
724 :
725 0 : return ret;
726 : }
727 :
728 : /*
729 : * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall)
730 : * and translate into the appropriate modifications to mm->def_flags and/or the
731 : * flags for all current VMAs.
732 : *
733 : * There are a couple of subtleties with this. If mlockall() is called multiple
734 : * times with different flags, the values do not necessarily stack. If mlockall
735 : * is called once including the MCL_FUTURE flag and then a second time without
736 : * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags.
737 : */
738 0 : static int apply_mlockall_flags(int flags)
739 : {
740 0 : struct vm_area_struct * vma, * prev = NULL;
741 0 : vm_flags_t to_add = 0;
742 :
743 0 : current->mm->def_flags &= VM_LOCKED_CLEAR_MASK;
744 0 : if (flags & MCL_FUTURE) {
745 0 : current->mm->def_flags |= VM_LOCKED;
746 :
747 0 : if (flags & MCL_ONFAULT)
748 0 : current->mm->def_flags |= VM_LOCKONFAULT;
749 :
750 0 : if (!(flags & MCL_CURRENT))
751 0 : goto out;
752 : }
753 :
754 0 : if (flags & MCL_CURRENT) {
755 0 : to_add |= VM_LOCKED;
756 0 : if (flags & MCL_ONFAULT)
757 0 : to_add |= VM_LOCKONFAULT;
758 : }
759 :
760 0 : for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
761 0 : vm_flags_t newflags;
762 :
763 0 : newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
764 0 : newflags |= to_add;
765 :
766 : /* Ignore errors */
767 0 : mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
768 0 : cond_resched();
769 : }
770 0 : out:
771 0 : return 0;
772 : }
773 :
774 0 : SYSCALL_DEFINE1(mlockall, int, flags)
775 : {
776 0 : unsigned long lock_limit;
777 0 : int ret;
778 :
779 0 : if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) ||
780 : flags == MCL_ONFAULT)
781 : return -EINVAL;
782 :
783 0 : if (!can_do_mlock())
784 : return -EPERM;
785 :
786 0 : lock_limit = rlimit(RLIMIT_MEMLOCK);
787 0 : lock_limit >>= PAGE_SHIFT;
788 :
789 0 : if (mmap_write_lock_killable(current->mm))
790 : return -EINTR;
791 :
792 0 : ret = -ENOMEM;
793 0 : if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
794 0 : capable(CAP_IPC_LOCK))
795 0 : ret = apply_mlockall_flags(flags);
796 0 : mmap_write_unlock(current->mm);
797 0 : if (!ret && (flags & MCL_CURRENT))
798 0 : mm_populate(0, TASK_SIZE);
799 :
800 0 : return ret;
801 : }
802 :
803 0 : SYSCALL_DEFINE0(munlockall)
804 : {
805 0 : int ret;
806 :
807 0 : if (mmap_write_lock_killable(current->mm))
808 : return -EINTR;
809 0 : ret = apply_mlockall_flags(0);
810 0 : mmap_write_unlock(current->mm);
811 0 : return ret;
812 : }
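
An illustrative userspace sketch, not part of this file, of the mlockall()/munlockall() pair; as the flag check above enforces, MCL_ONFAULT alone is rejected and must be combined with MCL_CURRENT and/or MCL_FUTURE:

	/* Illustrative sketch: MCL_CURRENT locks existing mappings,
	 * MCL_FUTURE sets mm->def_flags for new ones, and MCL_ONFAULT
	 * defers the actual faulting. */
	#define _GNU_SOURCE		/* for MCL_ONFAULT on some libcs */
	#include <stdio.h>
	#include <sys/mman.h>

	int main(void)
	{
		if (mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) {
			perror("mlockall");
			return 1;
		}

		/* ... latency-sensitive work that must not page ... */

		/* Clears VM_LOCKED/VM_LOCKONFAULT from every vma and from
		 * mm->def_flags, via apply_mlockall_flags(0) above. */
		if (munlockall()) {
			perror("munlockall");
			return 1;
		}
		return 0;
	}
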
813 :
814 : /*
815 : * Objects with a lifetime different from that of processes (SHM_LOCK and
816 : * SHM_HUGETLB shm segments) get accounted against the user_struct instead.
817 : */
818 : static DEFINE_SPINLOCK(shmlock_user_lock);
819 :
820 0 : int user_shm_lock(size_t size, struct user_struct *user)
821 : {
822 0 : unsigned long lock_limit, locked;
823 0 : int allowed = 0;
824 :
825 0 : locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
826 0 : lock_limit = rlimit(RLIMIT_MEMLOCK);
827 0 : if (lock_limit == RLIM_INFINITY)
828 0 : allowed = 1;
829 0 : lock_limit >>= PAGE_SHIFT;
830 0 : spin_lock(&shmlock_user_lock);
831 0 : if (!allowed &&
832 0 : locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
833 0 : goto out;
834 0 : get_uid(user);
835 0 : user->locked_shm += locked;
836 0 : allowed = 1;
837 0 : out:
838 0 : spin_unlock(&shmlock_user_lock);
839 0 : return allowed;
840 : }
841 :
842 0 : void user_shm_unlock(size_t size, struct user_struct *user)
843 : {
844 0 : spin_lock(&shmlock_user_lock);
845 0 : user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
846 0 : spin_unlock(&shmlock_user_lock);
847 0 : free_uid(user);
848 0 : }
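
user_shm_lock()/user_shm_unlock() are reached from the SysV shared memory code rather than from the mlock syscalls above. A hedged userspace sketch, not part of this file, that exercises them via shmctl():

	/* Illustrative sketch: SHM_LOCK pins a SysV segment and is accounted
	 * against the owning user_struct (user->locked_shm), not against the
	 * calling process's mm->locked_vm. */
	#include <stdio.h>
	#include <sys/ipc.h>
	#include <sys/shm.h>

	int main(void)
	{
		int shmid = shmget(IPC_PRIVATE, 1 << 20, IPC_CREAT | 0600);

		if (shmid < 0)
			return 1;

		if (shmctl(shmid, SHM_LOCK, NULL))	/* -> user_shm_lock() */
			perror("shmctl(SHM_LOCK)");

		if (shmctl(shmid, SHM_UNLOCK, NULL))	/* -> user_shm_unlock() */
			perror("shmctl(SHM_UNLOCK)");

		shmctl(shmid, IPC_RMID, NULL);		/* remove the segment */
		return 0;
	}
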
|