Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : #include <linux/mm.h>
3 : #include <linux/gfp.h>
4 : #include <linux/hugetlb.h>
5 : #include <asm/pgalloc.h>
6 : #include <asm/tlb.h>
7 : #include <asm/fixmap.h>
8 : #include <asm/mtrr.h>
9 :
10 : #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
11 : phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
12 : EXPORT_SYMBOL(physical_mask);
13 : #endif
14 :
15 : #ifdef CONFIG_HIGHPTE
16 : #define PGTABLE_HIGHMEM __GFP_HIGHMEM
17 : #else
18 : #define PGTABLE_HIGHMEM 0
19 : #endif
20 :
21 : #ifndef CONFIG_PARAVIRT
22 : static inline
23 : void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
24 : {
25 : tlb_remove_page(tlb, table);
26 : }
27 : #endif
28 :
29 : gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;
30 :
31 45080 : pgtable_t pte_alloc_one(struct mm_struct *mm)
32 : {
33 45080 : return __pte_alloc_one(mm, __userpte_alloc_gfp);
34 : }
35 :
36 0 : static int __init setup_userpte(char *arg)
37 : {
38 0 : if (!arg)
39 : return -EINVAL;
40 :
41 : /*
42 : * "userpte=nohigh" disables allocation of user pagetables in
43 : * high memory.
44 : */
45 0 : if (strcmp(arg, "nohigh") == 0)
46 0 : __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
47 : else
48 : return -EINVAL;
49 0 : return 0;
50 : }
51 : early_param("userpte", setup_userpte);
52 :
53 39889 : void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
54 : {
55 39889 : pgtable_pte_page_dtor(pte);
56 39889 : paravirt_release_pte(page_to_pfn(pte));
57 39889 : paravirt_tlb_remove_table(tlb, pte);
58 39889 : }
59 :
60 : #if CONFIG_PGTABLE_LEVELS > 2
61 22771 : void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
62 : {
63 22771 : struct page *page = virt_to_page(pmd);
64 22771 : paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
65 : /*
66 : * NOTE! For PAE, any changes to the top page-directory-pointer-table
67 : * entries need a full cr3 reload to flush.
68 : */
69 : #ifdef CONFIG_X86_PAE
70 : tlb->need_flush_all = 1;
71 : #endif
72 22771 : pgtable_pmd_page_dtor(page);
73 22771 : paravirt_tlb_remove_table(tlb, page);
74 22771 : }
75 :
76 : #if CONFIG_PGTABLE_LEVELS > 3
77 18410 : void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
78 : {
79 18410 : paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
80 18410 : paravirt_tlb_remove_table(tlb, virt_to_page(pud));
81 18410 : }
82 :
83 : #if CONFIG_PGTABLE_LEVELS > 4
84 : void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
85 : {
86 : paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
87 : paravirt_tlb_remove_table(tlb, virt_to_page(p4d));
88 : }
89 : #endif /* CONFIG_PGTABLE_LEVELS > 4 */
90 : #endif /* CONFIG_PGTABLE_LEVELS > 3 */
91 : #endif /* CONFIG_PGTABLE_LEVELS > 2 */
92 :
93 10325 : static inline void pgd_list_add(pgd_t *pgd)
94 : {
95 10325 : struct page *page = virt_to_page(pgd);
96 :
97 10325 : list_add(&page->lru, &pgd_list);
98 10325 : }
99 :
100 10301 : static inline void pgd_list_del(pgd_t *pgd)
101 : {
102 10301 : struct page *page = virt_to_page(pgd);
103 :
104 10301 : list_del(&page->lru);
105 10301 : }
106 :
107 : #define UNSHARED_PTRS_PER_PGD \
108 : (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
109 : #define MAX_UNSHARED_PTRS_PER_PGD \
110 : max_t(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD)
111 :
112 :
113 10325 : static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
114 : {
115 10325 : virt_to_page(pgd)->pt_mm = mm;
116 10325 : }
117 :
118 0 : struct mm_struct *pgd_page_get_mm(struct page *page)
119 : {
120 0 : return page->pt_mm;
121 : }
122 :
123 10325 : static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
124 : {
125 : /* If the pgd points to a shared pagetable level (either the
126 : ptes in non-PAE, or shared PMD in PAE), then just copy the
127 : references from swapper_pg_dir. */
128 10325 : if (CONFIG_PGTABLE_LEVELS == 2 ||
129 : (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
130 : CONFIG_PGTABLE_LEVELS >= 4) {
131 10325 : clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
132 : swapper_pg_dir + KERNEL_PGD_BOUNDARY,
133 : KERNEL_PGD_PTRS);
134 : }
135 :
136 : /* list required to sync kernel mapping updates */
137 10325 : if (!SHARED_KERNEL_PMD) {
138 10325 : pgd_set_mm(pgd, mm);
139 10325 : pgd_list_add(pgd);
140 : }
141 10325 : }
142 :
143 10301 : static void pgd_dtor(pgd_t *pgd)
144 : {
145 10301 : if (SHARED_KERNEL_PMD)
146 : return;
147 :
148 10301 : spin_lock(&pgd_lock);
149 10301 : pgd_list_del(pgd);
150 10301 : spin_unlock(&pgd_lock);
151 : }
152 :
153 : /*
154 : * List of all pgd's needed for non-PAE so it can invalidate entries
155 : * in both cached and uncached pgd's; not needed for PAE since the
156 : * kernel pmd is shared. If PAE were not to share the pmd a similar
157 : * tactic would be needed. This is essentially codepath-based locking
158 : * against pageattr.c; it is the unique case in which a valid change
159 : * of kernel pagetables can't be lazily synchronized by vmalloc faults.
160 : * vmalloc faults work because attached pagetables are never freed.
161 : * -- nyc
162 : */
163 :
164 : #ifdef CONFIG_X86_PAE
165 : /*
166 : * In PAE mode, we need to do a cr3 reload (=tlb flush) when
167 : * updating the top-level pagetable entries to guarantee the
168 : * processor notices the update. Since this is expensive, and
169 : * all 4 top-level entries are used almost immediately in a
170 : * new process's life, we just pre-populate them here.
171 : *
172 : * Also, if we're in a paravirt environment where the kernel pmd is
173 : * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate
174 : * and initialize the kernel pmds here.
175 : */
176 : #define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
177 : #define MAX_PREALLOCATED_PMDS MAX_UNSHARED_PTRS_PER_PGD
178 :
179 : /*
180 : * We allocate separate PMDs for the kernel part of the user page-table
181 : * when PTI is enabled. We need them to map the per-process LDT into the
182 : * user-space page-table.
183 : */
184 : #define PREALLOCATED_USER_PMDS (boot_cpu_has(X86_FEATURE_PTI) ? \
185 : KERNEL_PGD_PTRS : 0)
186 : #define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS
187 :
188 : void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
189 : {
190 : paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
191 :
192 : /* Note: almost everything apart from _PAGE_PRESENT is
193 : reserved at the pmd (PDPT) level. */
194 : set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
195 :
196 : /*
197 : * According to Intel App note "TLBs, Paging-Structure Caches,
198 : * and Their Invalidation", April 2007, document 317080-001,
199 : * section 8.1: in PAE mode we explicitly have to flush the
200 : * TLB via cr3 if the top-level pgd is changed...
201 : */
202 : flush_tlb_mm(mm);
203 : }
204 : #else /* !CONFIG_X86_PAE */
205 :
206 : /* No need to prepopulate any pagetable entries in non-PAE modes. */
207 : #define PREALLOCATED_PMDS 0
208 : #define MAX_PREALLOCATED_PMDS 0
209 : #define PREALLOCATED_USER_PMDS 0
210 : #define MAX_PREALLOCATED_USER_PMDS 0
211 : #endif /* CONFIG_X86_PAE */
212 :
213 0 : static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
214 : {
215 0 : int i;
216 :
217 0 : for (i = 0; i < count; i++)
218 0 : if (pmds[i]) {
219 0 : pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
220 0 : free_page((unsigned long)pmds[i]);
221 0 : mm_dec_nr_pmds(mm);
222 : }
223 0 : }
224 :
225 20650 : static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
226 : {
227 20650 : int i;
228 20650 : bool failed = false;
229 20650 : gfp_t gfp = GFP_PGTABLE_USER;
230 :
231 20650 : if (mm == &init_mm)
232 0 : gfp &= ~__GFP_ACCOUNT;
233 :
234 20650 : for (i = 0; i < count; i++) {
235 0 : pmd_t *pmd = (pmd_t *)__get_free_page(gfp);
236 0 : if (!pmd)
237 0 : failed = true;
238 0 : if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
239 0 : free_page((unsigned long)pmd);
240 0 : pmd = NULL;
241 0 : failed = true;
242 : }
243 0 : if (pmd)
244 0 : mm_inc_nr_pmds(mm);
245 0 : pmds[i] = pmd;
246 : }
247 :
248 20650 : if (failed) {
249 0 : free_pmds(mm, pmds, count);
250 0 : return -ENOMEM;
251 : }
252 :
253 : return 0;
254 : }
255 :
256 : /*
257 : * Mop up any pmd pages which may still be attached to the pgd.
258 : * Normally they will be freed by munmap/exit_mmap, but any pmd we
259 : * preallocate which never got a corresponding vma will need to be
260 : * freed manually.
261 : */
262 : static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
263 : {
264 : pgd_t pgd = *pgdp;
265 :
266 : if (pgd_val(pgd) != 0) {
267 : pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
268 :
269 : pgd_clear(pgdp);
270 :
271 : paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
272 : pmd_free(mm, pmd);
273 : mm_dec_nr_pmds(mm);
274 : }
275 : }
276 :
277 10301 : static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
278 : {
279 10301 : int i;
280 :
281 10301 : for (i = 0; i < PREALLOCATED_PMDS; i++)
282 : mop_up_one_pmd(mm, &pgdp[i]);
283 :
284 : #ifdef CONFIG_PAGE_TABLE_ISOLATION
285 :
286 : if (!boot_cpu_has(X86_FEATURE_PTI))
287 : return;
288 :
289 : pgdp = kernel_to_user_pgdp(pgdp);
290 :
291 : for (i = 0; i < PREALLOCATED_USER_PMDS; i++)
292 : mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]);
293 : #endif
294 : }
295 :
296 10325 : static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
297 : {
298 10325 : p4d_t *p4d;
299 10325 : pud_t *pud;
300 10325 : int i;
301 :
302 10325 : if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
303 10325 : return;
304 :
305 : p4d = p4d_offset(pgd, 0);
306 : pud = pud_offset(p4d, 0);
307 :
308 : for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
309 : pmd_t *pmd = pmds[i];
310 :
311 : if (i >= KERNEL_PGD_BOUNDARY)
312 : memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
313 : sizeof(pmd_t) * PTRS_PER_PMD);
314 :
315 : pud_populate(mm, pud, pmd);
316 : }
317 : }
318 :
319 : #ifdef CONFIG_PAGE_TABLE_ISOLATION
320 : static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
321 : pgd_t *k_pgd, pmd_t *pmds[])
322 : {
323 : pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir);
324 : pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
325 : p4d_t *u_p4d;
326 : pud_t *u_pud;
327 : int i;
328 :
329 : u_p4d = p4d_offset(u_pgd, 0);
330 : u_pud = pud_offset(u_p4d, 0);
331 :
332 : s_pgd += KERNEL_PGD_BOUNDARY;
333 : u_pud += KERNEL_PGD_BOUNDARY;
334 :
335 : for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) {
336 : pmd_t *pmd = pmds[i];
337 :
338 : memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd),
339 : sizeof(pmd_t) * PTRS_PER_PMD);
340 :
341 : pud_populate(mm, u_pud, pmd);
342 : }
343 :
344 : }
345 : #else
346 10325 : static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
347 : pgd_t *k_pgd, pmd_t *pmds[])
348 : {
349 10325 : }
350 : #endif
351 : /*
352 : * Xen paravirt assumes that the pgd table occupies one whole page, and the
353 : * 64-bit kernel makes the same assumption.
354 : *
355 : * But a PAE kernel that is not running as a Xen domain only needs to
356 : * allocate 32 bytes for the pgd instead of one page.
357 : */
358 : #ifdef CONFIG_X86_PAE
359 :
360 : #include <linux/slab.h>
361 :
362 : #define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t))
363 : #define PGD_ALIGN 32
364 :
365 : static struct kmem_cache *pgd_cache;
366 :
367 : void __init pgtable_cache_init(void)
368 : {
369 : /*
370 : * When a PAE kernel runs as a Xen domain, it does not use a shared
371 : * kernel pmd, which requires a whole page for the pgd.
372 : */
373 : if (!SHARED_KERNEL_PMD)
374 : return;
375 :
376 : /*
377 : * When a PAE kernel is not running as a Xen domain, it uses a shared
378 : * kernel pmd, which does not require a whole page for the pgd; 32
379 : * bytes are enough. At boot time, we create a 32-byte slab cache
380 : * for pgd table allocations.
381 : */
382 : pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
383 : SLAB_PANIC, NULL);
384 : }
385 :
386 : static inline pgd_t *_pgd_alloc(void)
387 : {
388 : /*
389 : * If SHARED_KERNEL_PMD is not set, the PAE kernel is running as a Xen
390 : * domain and we must allocate a whole page for the pgd.
391 : */
392 : if (!SHARED_KERNEL_PMD)
393 : return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER,
394 : PGD_ALLOCATION_ORDER);
395 :
396 : /*
397 : * Otherwise the PAE kernel is not running as a Xen domain, and we can
398 : * allocate the pgd from the 32-byte slab cache to save memory.
399 : */
400 : return kmem_cache_alloc(pgd_cache, GFP_PGTABLE_USER);
401 : }
402 :
403 : static inline void _pgd_free(pgd_t *pgd)
404 : {
405 : if (!SHARED_KERNEL_PMD)
406 : free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
407 : else
408 : kmem_cache_free(pgd_cache, pgd);
409 : }
410 : #else
411 :
412 10325 : static inline pgd_t *_pgd_alloc(void)
413 : {
414 20650 : return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER,
415 : PGD_ALLOCATION_ORDER);
416 : }
417 :
418 10301 : static inline void _pgd_free(pgd_t *pgd)
419 : {
420 10301 : free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
421 0 : }
422 : #endif /* CONFIG_X86_PAE */
423 :
424 10325 : pgd_t *pgd_alloc(struct mm_struct *mm)
425 : {
426 10325 : pgd_t *pgd;
427 10325 : pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS];
428 10325 : pmd_t *pmds[MAX_PREALLOCATED_PMDS];
429 :
430 10325 : pgd = _pgd_alloc();
431 :
432 10325 : if (pgd == NULL)
433 0 : goto out;
434 :
435 10325 : mm->pgd = pgd;
436 :
437 10325 : if (preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
438 0 : goto out_free_pgd;
439 :
440 10325 : if (preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
441 0 : goto out_free_pmds;
442 :
443 10325 : if (paravirt_pgd_alloc(mm) != 0)
444 : goto out_free_user_pmds;
445 :
446 : /*
447 : * Make sure that pre-populating the pmds is atomic with
448 : * respect to anything walking the pgd_list, so that they
449 : * never see a partially populated pgd.
450 : */
451 10325 : spin_lock(&pgd_lock);
452 :
453 10325 : pgd_ctor(mm, pgd);
454 10325 : pgd_prepopulate_pmd(mm, pgd, pmds);
455 10325 : pgd_prepopulate_user_pmd(mm, pgd, u_pmds);
456 :
457 10325 : spin_unlock(&pgd_lock);
458 :
459 10325 : return pgd;
460 :
461 : out_free_user_pmds:
462 : free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
463 0 : out_free_pmds:
464 0 : free_pmds(mm, pmds, PREALLOCATED_PMDS);
465 0 : out_free_pgd:
466 0 : _pgd_free(pgd);
467 : out:
468 : return NULL;
469 : }
470 :
471 10301 : void pgd_free(struct mm_struct *mm, pgd_t *pgd)
472 : {
473 10301 : pgd_mop_up_pmds(mm, pgd);
474 10301 : pgd_dtor(pgd);
475 10301 : paravirt_pgd_free(mm, pgd);
476 10301 : _pgd_free(pgd);
477 10301 : }
478 :
479 : /*
480 : * Used to set accessed or dirty bits in the page table entries
481 : * on other architectures. On x86, the accessed and dirty bits
482 : * are tracked by hardware. However, do_wp_page calls this function
483 : * to also make the pte writeable at the same time the dirty bit is
484 : * set. In that case we do actually need to write the PTE.
485 : */
486 32324 : int ptep_set_access_flags(struct vm_area_struct *vma,
487 : unsigned long address, pte_t *ptep,
488 : pte_t entry, int dirty)
489 : {
490 32324 : int changed = !pte_same(*ptep, entry);
491 :
492 32324 : if (changed && dirty)
493 32017 : set_pte(ptep, entry);
494 :
495 32324 : return changed;
496 : }
497 :
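For context on the comment above ptep_set_access_flags(): a minimal, paraphrased sketch of how a generic write-protect fault path is expected to use this helper, modeled loosely on mm/memory.c. The function name example_wp_reuse() is invented for illustration, the page-table locking and fault plumbing are omitted, and exact call sites vary between kernel versions.

	/* Illustrative sketch only -- not part of pgtable.c. */
	static void example_wp_reuse(struct vm_area_struct *vma,
				     unsigned long address, pte_t *ptep)
	{
		pte_t entry;

		/* Build a young, dirty and (if the VMA allows) writable entry ... */
		entry = pte_mkyoung(*ptep);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);

		/* ... and only write it back (and update caches) if it changed. */
		if (ptep_set_access_flags(vma, address, ptep, entry, 1))
			update_mmu_cache(vma, address, ptep);
	}
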
498 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
499 0 : int pmdp_set_access_flags(struct vm_area_struct *vma,
500 : unsigned long address, pmd_t *pmdp,
501 : pmd_t entry, int dirty)
502 : {
503 0 : int changed = !pmd_same(*pmdp, entry);
504 :
505 0 : VM_BUG_ON(address & ~HPAGE_PMD_MASK);
506 :
507 0 : if (changed && dirty) {
508 0 : set_pmd(pmdp, entry);
509 : /*
510 : * We had a write-protection fault here and changed the pmd
511 : * to be more permissive. No need to flush the TLB for that,
512 : * #PF is architecturally guaranteed to do that and in the
513 : * worst-case we'll generate a spurious fault.
514 : */
515 : }
516 :
517 0 : return changed;
518 : }
519 :
520 0 : int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
521 : pud_t *pudp, pud_t entry, int dirty)
522 : {
523 0 : int changed = !pud_same(*pudp, entry);
524 :
525 0 : VM_BUG_ON(address & ~HPAGE_PUD_MASK);
526 :
527 0 : if (changed && dirty) {
528 0 : set_pud(pudp, entry);
529 : /*
530 : * We had a write-protection fault here and changed the pud
531 : * to be more permissive. No need to flush the TLB for that,
532 : * #PF is architecturally guaranteed to do that and in the
533 : * worst-case we'll generate a spurious fault.
534 : */
535 : }
536 :
537 0 : return changed;
538 : }
539 : #endif
540 :
541 0 : int ptep_test_and_clear_young(struct vm_area_struct *vma,
542 : unsigned long addr, pte_t *ptep)
543 : {
544 0 : int ret = 0;
545 :
546 0 : if (pte_young(*ptep))
547 0 : ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
548 0 : (unsigned long *) &ptep->pte);
549 :
550 0 : return ret;
551 : }
552 :
553 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
554 0 : int pmdp_test_and_clear_young(struct vm_area_struct *vma,
555 : unsigned long addr, pmd_t *pmdp)
556 : {
557 0 : int ret = 0;
558 :
559 0 : if (pmd_young(*pmdp))
560 0 : ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
561 : (unsigned long *)pmdp);
562 :
563 0 : return ret;
564 : }
565 0 : int pudp_test_and_clear_young(struct vm_area_struct *vma,
566 : unsigned long addr, pud_t *pudp)
567 : {
568 0 : int ret = 0;
569 :
570 0 : if (pud_young(*pudp))
571 0 : ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
572 : (unsigned long *)pudp);
573 :
574 0 : return ret;
575 : }
576 : #endif
577 :
578 0 : int ptep_clear_flush_young(struct vm_area_struct *vma,
579 : unsigned long address, pte_t *ptep)
580 : {
581 : /*
582 : * On x86 CPUs, clearing the accessed bit without a TLB flush
583 : * doesn't cause data corruption. [ It could cause incorrect
584 : * page aging and the (mistaken) reclaim of hot pages, but the
585 : * chance of that should be relatively low. ]
586 : *
587 : * So as a performance optimization don't flush the TLB when
588 : * clearing the accessed bit, it will eventually be flushed by
589 : * a context switch or a VM operation anyway. [ In the rare
590 : * event of it not getting flushed for a long time the delay
591 : * shouldn't really matter because there's no real memory
592 : * pressure for swapout to react to. ]
593 : */
594 0 : return ptep_test_and_clear_young(vma, address, ptep);
595 : }
596 :
597 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
598 0 : int pmdp_clear_flush_young(struct vm_area_struct *vma,
599 : unsigned long address, pmd_t *pmdp)
600 : {
601 0 : int young;
602 :
603 0 : VM_BUG_ON(address & ~HPAGE_PMD_MASK);
604 :
605 0 : young = pmdp_test_and_clear_young(vma, address, pmdp);
606 0 : if (young)
607 0 : flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
608 :
609 0 : return young;
610 : }
611 : #endif
612 :
613 : /**
614 : * reserve_top_address - reserves a hole in the top of kernel address space
615 : * @reserve: size of hole to reserve
616 : *
617 : * Can be used to relocate the fixmap area and poke a hole in the top
618 : * of kernel address space to make room for a hypervisor.
619 : */
620 0 : void __init reserve_top_address(unsigned long reserve)
621 : {
622 : #ifdef CONFIG_X86_32
623 : BUG_ON(fixmaps_set > 0);
624 : __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
625 : printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
626 : -reserve, __FIXADDR_TOP + PAGE_SIZE);
627 : #endif
628 0 : }
629 :
630 : int fixmaps_set;
631 :
632 3 : void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
633 : {
634 3 : unsigned long address = __fix_to_virt(idx);
635 :
636 : #ifdef CONFIG_X86_64
637 : /*
638 : * Ensure that the static initial page tables are covering the
639 : * fixmap completely.
640 : */
641 3 : BUILD_BUG_ON(__end_of_permanent_fixed_addresses >
642 : (FIXMAP_PMD_NUM * PTRS_PER_PTE));
643 : #endif
644 :
645 3 : if (idx >= __end_of_fixed_addresses) {
646 0 : BUG();
647 : return;
648 : }
649 3 : set_pte_vaddr(address, pte);
650 3 : fixmaps_set++;
651 : }
652 :
653 3 : void native_set_fixmap(unsigned /* enum fixed_addresses */ idx,
654 : phys_addr_t phys, pgprot_t flags)
655 : {
656 : /* Sanitize 'prot' against any unsupported bits: */
657 3 : pgprot_val(flags) &= __default_kernel_pte_mask;
658 :
659 3 : __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
660 3 : }
661 :
662 : #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
663 : #ifdef CONFIG_X86_5LEVEL
664 : /**
665 : * p4d_set_huge - setup kernel P4D mapping
666 : *
667 : * No 512GB pages yet -- always return 0
668 : */
669 : int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
670 : {
671 : return 0;
672 : }
673 :
674 : /**
675 : * p4d_clear_huge - clear kernel P4D mapping when it is set
676 : *
677 : * No 512GB pages yet -- always return 0
678 : */
679 : int p4d_clear_huge(p4d_t *p4d)
680 : {
681 : return 0;
682 : }
683 : #endif
684 :
685 : /**
686 : * pud_set_huge - setup kernel PUD mapping
687 : *
688 : * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
689 : * function sets up a huge page only if any of the following conditions are met:
690 : *
691 : * - MTRRs are disabled, or
692 : *
693 : * - MTRRs are enabled and the range is completely covered by a single MTRR, or
694 : *
695 : * - MTRRs are enabled and the corresponding MTRR memory type is WB, which
696 : * has no effect on the requested PAT memory type.
697 : *
698 : * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
699 : * page mapping attempt fails.
700 : *
701 : * Returns 1 on success and 0 on failure.
702 : */
703 0 : int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
704 : {
705 0 : u8 mtrr, uniform;
706 :
707 0 : mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
708 0 : if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
709 : (mtrr != MTRR_TYPE_WRBACK))
710 : return 0;
711 :
712 : /* Bail out if we are on a populated non-leaf entry: */
713 0 : if (pud_present(*pud) && !pud_huge(*pud))
714 0 : return 0;
715 :
716 0 : set_pte((pte_t *)pud, pfn_pte(
717 : (u64)addr >> PAGE_SHIFT,
718 : __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));
719 :
720 0 : return 1;
721 : }
722 :
723 : /**
724 : * pmd_set_huge - setup kernel PMD mapping
725 : *
726 : * See text over pud_set_huge() above.
727 : *
728 : * Returns 1 on success and 0 on failure.
729 : */
730 68 : int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
731 : {
732 68 : u8 mtrr, uniform;
733 :
734 68 : mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
735 68 : if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
736 : (mtrr != MTRR_TYPE_WRBACK)) {
737 0 : pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
738 : __func__, addr, addr + PMD_SIZE);
739 0 : return 0;
740 : }
741 :
742 : /* Bail out if we are on a populated non-leaf entry: */
743 136 : if (pmd_present(*pmd) && !pmd_huge(*pmd))
744 0 : return 0;
745 :
746 68 : set_pte((pte_t *)pmd, pfn_pte(
747 : (u64)addr >> PAGE_SHIFT,
748 : __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));
749 :
750 68 : return 1;
751 : }
752 :
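The kernel-doc above notes that callers should retry with a smaller page size (1GB -> 2MB -> 4K) when a huge mapping attempt fails. Below is a minimal, hypothetical sketch of that fallback pattern; the helper names example_map_pmd_range() and example_map_pte_range() are invented for illustration, and the real logic lives in the generic vmap/ioremap code rather than in this file.

	/* Illustrative sketch only -- not part of pgtable.c. */
	static int example_map_pmd_range(pmd_t *pmd, unsigned long addr,
					 unsigned long end, phys_addr_t phys,
					 pgprot_t prot)
	{
		/* Try a single 2MB entry if the range is large enough and aligned. */
		if (end - addr >= PMD_SIZE &&
		    IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(phys, PMD_SIZE) &&
		    pmd_set_huge(pmd, phys, prot))
			return 0;

		/* Otherwise fall back to mapping the range with 4K PTEs. */
		return example_map_pte_range(pmd, addr, end, phys, prot);
	}
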
753 : /**
754 : * pud_clear_huge - clear kernel PUD mapping when it is set
755 : *
756 : * Returns 1 on success and 0 on failure (no PUD map is found).
757 : */
758 36352 : int pud_clear_huge(pud_t *pud)
759 : {
760 36352 : if (pud_large(*pud)) {
761 0 : pud_clear(pud);
762 0 : return 1;
763 : }
764 :
765 : return 0;
766 : }
767 :
768 : /**
769 : * pmd_clear_huge - clear kernel PMD mapping when it is set
770 : *
771 : * Returns 1 on success and 0 on failure (no PMD map is found).
772 : */
773 36414 : int pmd_clear_huge(pmd_t *pmd)
774 : {
775 72828 : if (pmd_large(*pmd)) {
776 0 : pmd_clear(pmd);
777 0 : return 1;
778 : }
779 :
780 : return 0;
781 : }
782 :
783 : /*
784 : * Until we support 512GB pages, skip them in the vmap area.
785 : */
786 0 : int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
787 : {
788 0 : return 0;
789 : }
790 :
791 : #ifdef CONFIG_X86_64
792 : /**
793 : * pud_free_pmd_page - Clear pud entry and free pmd page.
794 : * @pud: Pointer to a PUD.
795 : * @addr: Virtual address associated with pud.
796 : *
797 : * Context: The pud range has been unmapped and TLB purged.
798 : * Return: 1 if clearing the entry succeeded. 0 otherwise.
799 : *
800 : * NOTE: Callers must allow a single page allocation.
801 : */
802 0 : int pud_free_pmd_page(pud_t *pud, unsigned long addr)
803 : {
804 0 : pmd_t *pmd, *pmd_sv;
805 0 : pte_t *pte;
806 0 : int i;
807 :
808 0 : pmd = (pmd_t *)pud_page_vaddr(*pud);
809 0 : pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
810 0 : if (!pmd_sv)
811 : return 0;
812 :
813 0 : for (i = 0; i < PTRS_PER_PMD; i++) {
814 0 : pmd_sv[i] = pmd[i];
815 0 : if (!pmd_none(pmd[i]))
816 0 : pmd_clear(&pmd[i]);
817 : }
818 :
819 0 : pud_clear(pud);
820 :
821 : /* INVLPG to clear all paging-structure caches */
822 0 : flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
823 :
824 0 : for (i = 0; i < PTRS_PER_PMD; i++) {
825 0 : if (!pmd_none(pmd_sv[i])) {
826 0 : pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
827 0 : free_page((unsigned long)pte);
828 : }
829 : }
830 :
831 0 : free_page((unsigned long)pmd_sv);
832 :
833 0 : pgtable_pmd_page_dtor(virt_to_page(pmd));
834 0 : free_page((unsigned long)pmd);
835 :
836 0 : return 1;
837 : }
838 :
839 : /**
840 : * pmd_free_pte_page - Clear pmd entry and free pte page.
841 : * @pmd: Pointer to a PMD.
842 : * @addr: Virtual address associated with pmd.
843 : *
844 : * Context: The pmd range has been unmapped and TLB purged.
845 : * Return: 1 if clearing the entry succeeded. 0 otherwise.
846 : */
847 0 : int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
848 : {
849 0 : pte_t *pte;
850 :
851 0 : pte = (pte_t *)pmd_page_vaddr(*pmd);
852 0 : pmd_clear(pmd);
853 :
854 : /* INVLPG to clear all paging-structure caches */
855 0 : flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
856 :
857 0 : free_page((unsigned long)pte);
858 :
859 0 : return 1;
860 : }
861 :
862 : #else /* !CONFIG_X86_64 */
863 :
864 : int pud_free_pmd_page(pud_t *pud, unsigned long addr)
865 : {
866 : return pud_none(*pud);
867 : }
868 :
869 : /*
870 : * Disable free page handling on x86-PAE. This assures that ioremap()
871 : * does not update sync'd pmd entries. See vmalloc_sync_one().
872 : */
873 : int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
874 : {
875 : return pmd_none(*pmd);
876 : }
877 :
878 : #endif /* CONFIG_X86_64 */
879 : #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */