LCOV - code coverage report
Current view: top level - arch/x86/mm - fault.c (source / functions)
Test:         landlock.info
Date:         2021-04-22 12:43:58
Coverage:     Lines: 106 / 411 (25.8 %)    Functions: 10 / 25 (40.0 %)

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  *  Copyright (C) 1995  Linus Torvalds
       4             :  *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
       5             :  *  Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
       6             :  */
       7             : #include <linux/sched.h>          /* test_thread_flag(), ...      */
       8             : #include <linux/sched/task_stack.h>       /* task_stack_*(), ...          */
       9             : #include <linux/kdebug.h>         /* oops_begin/end, ...          */
      10             : #include <linux/extable.h>                /* search_exception_tables      */
      11             : #include <linux/memblock.h>               /* max_low_pfn                  */
      12             : #include <linux/kfence.h>         /* kfence_handle_page_fault     */
      13             : #include <linux/kprobes.h>                /* NOKPROBE_SYMBOL, ...         */
      14             : #include <linux/mmiotrace.h>              /* kmmio_handler, ...           */
      15             : #include <linux/perf_event.h>             /* perf_sw_event                */
      16             : #include <linux/hugetlb.h>                /* hstate_index_to_shift        */
      17             : #include <linux/prefetch.h>               /* prefetchw                    */
      18             : #include <linux/context_tracking.h>       /* exception_enter(), ...       */
      19             : #include <linux/uaccess.h>                /* faulthandler_disabled()      */
      20             : #include <linux/efi.h>                    /* efi_crash_gracefully_on_page_fault()*/
      21             : #include <linux/mm_types.h>
      22             : 
      23             : #include <asm/cpufeature.h>               /* boot_cpu_has, ...            */
      24             : #include <asm/traps.h>                    /* dotraplinkage, ...           */
      25             : #include <asm/fixmap.h>                   /* VSYSCALL_ADDR                */
      26             : #include <asm/vsyscall.h>         /* emulate_vsyscall             */
      27             : #include <asm/vm86.h>                     /* struct vm86                  */
      28             : #include <asm/mmu_context.h>              /* vma_pkey()                   */
      29             : #include <asm/efi.h>                      /* efi_crash_gracefully_on_page_fault()*/
      30             : #include <asm/desc.h>                     /* store_idt(), ...             */
      31             : #include <asm/cpu_entry_area.h>           /* exception stack              */
      32             : #include <asm/pgtable_areas.h>            /* VMALLOC_START, ...           */
      33             : #include <asm/kvm_para.h>         /* kvm_handle_async_pf          */
      34             : #include <asm/vdso.h>                     /* fixup_vdso_exception()       */
      35             : 
      36             : #define CREATE_TRACE_POINTS
      37             : #include <asm/trace/exceptions.h>
      38             : 
      39             : /*
      40             :  * Returns 0 if mmiotrace is disabled, or if the fault is not
      41             :  * handled by mmiotrace:
      42             :  */
      43             : static nokprobe_inline int
      44      295857 : kmmio_fault(struct pt_regs *regs, unsigned long addr)
      45             : {
      46      295857 :         if (unlikely(is_kmmio_active()))
      47             :                 if (kmmio_handler(regs, addr) == 1)
      48             :                         return -1;
      49      295857 :         return 0;
      50             : }
      51             : 
      52             : /*
      53             :  * Prefetch quirks:
      54             :  *
      55             :  * 32-bit mode:
      56             :  *
      57             :  *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
      58             :  *   Check that here and ignore it.  This is AMD erratum #91.
      59             :  *
      60             :  * 64-bit mode:
      61             :  *
      62             :  *   Sometimes the CPU reports invalid exceptions on prefetch.
      63             :  *   Check that here and ignore it.
      64             :  *
      65             :  * Opcode checker based on code by Richard Brunner.
      66             :  */
      67             : static inline int
      68           0 : check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
      69             :                       unsigned char opcode, int *prefetch)
      70             : {
      71           0 :         unsigned char instr_hi = opcode & 0xf0;
      72           0 :         unsigned char instr_lo = opcode & 0x0f;
      73             : 
      74           0 :         switch (instr_hi) {
      75           0 :         case 0x20:
      76             :         case 0x30:
      77             :                 /*
      78             :                  * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
      79             :                  * In X86_64 long mode, the CPU will signal invalid
       80             :                  * opcode if some of these prefixes are present, so
       81             :                  * X86_64 will never get here anyway.
      82             :                  */
      83           0 :                 return ((instr_lo & 7) == 0x6);
      84             : #ifdef CONFIG_X86_64
      85             :         case 0x40:
      86             :                 /*
      87             :                  * In 64-bit mode 0x40..0x4F are valid REX prefixes
      88             :                  */
      89           0 :                 return (!user_mode(regs) || user_64bit_mode(regs));
      90             : #endif
      91           0 :         case 0x60:
      92             :                 /* 0x64 thru 0x67 are valid prefixes in all modes. */
      93           0 :                 return (instr_lo & 0xC) == 0x4;
      94           0 :         case 0xF0:
      95             :                 /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
      96           0 :                 return !instr_lo || (instr_lo>>1) == 1;
      97           0 :         case 0x00:
      98             :                 /* Prefetch instruction is 0x0F0D or 0x0F18 */
      99           0 :                 if (get_kernel_nofault(opcode, instr))
     100             :                         return 0;
     101             : 
     102           0 :                 *prefetch = (instr_lo == 0xF) &&
     103           0 :                         (opcode == 0x0D || opcode == 0x18);
     104           0 :                 return 0;
     105             :         default:
     106             :                 return 0;
     107             :         }
     108             : }
     109             : 
     110           0 : static bool is_amd_k8_pre_npt(void)
     111             : {
     112           0 :         struct cpuinfo_x86 *c = &boot_cpu_data;
     113             : 
     114           0 :         return unlikely(IS_ENABLED(CONFIG_CPU_SUP_AMD) &&
     115             :                         c->x86_vendor == X86_VENDOR_AMD &&
     116             :                         c->x86 == 0xf && c->x86_model < 0x40);
     117             : }
     118             : 
     119             : static int
     120           0 : is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
     121             : {
     122           0 :         unsigned char *max_instr;
     123           0 :         unsigned char *instr;
     124           0 :         int prefetch = 0;
     125             : 
     126             :         /* Erratum #91 affects AMD K8, pre-NPT CPUs */
     127           0 :         if (!is_amd_k8_pre_npt())
     128             :                 return 0;
     129             : 
     130             :         /*
      131             :          * If it was an exec (instruction fetch) fault on an NX page, then
     132             :          * do not ignore the fault:
     133             :          */
     134           0 :         if (error_code & X86_PF_INSTR)
     135             :                 return 0;
     136             : 
     137           0 :         instr = (void *)convert_ip_to_linear(current, regs);
     138           0 :         max_instr = instr + 15;
     139             : 
     140             :         /*
     141             :          * This code has historically always bailed out if IP points to a
     142             :          * not-present page (e.g. due to a race).  No one has ever
     143             :          * complained about this.
     144             :          */
     145           0 :         pagefault_disable();
     146             : 
     147           0 :         while (instr < max_instr) {
     148           0 :                 unsigned char opcode;
     149             : 
     150           0 :                 if (user_mode(regs)) {
     151           0 :                         if (get_user(opcode, instr))
     152             :                                 break;
     153             :                 } else {
     154           0 :                         if (get_kernel_nofault(opcode, instr))
     155             :                                 break;
     156             :                 }
     157             : 
     158           0 :                 instr++;
     159             : 
     160           0 :                 if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
     161             :                         break;
     162             :         }
     163             : 
     164           0 :         pagefault_enable();
     165           0 :         return prefetch;
     166             : }
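
To make the opcode walk above easier to follow outside the kernel, here is a minimal user-space sketch that applies the same simplified prefix rules to a byte buffer. It is not part of fault.c: the helper name and sample byte sequences are invented for illustration, and REX bytes (0x40..0x4F) are simply skipped, whereas check_prefetch_opcode() additionally consults user_64bit_mode().

/* Illustrative user-space sketch, not part of fault.c. */
#include <stdio.h>
#include <stddef.h>

static int is_prefetch_insn(const unsigned char *instr, size_t len)
{
	size_t i;

	for (i = 0; i + 1 < len; i++) {
		unsigned char op = instr[i];
		unsigned char hi = op & 0xf0, lo = op & 0x0f;

		if (hi == 0x20 || hi == 0x30) {	/* 0x26/0x2E/0x36/0x3E segment overrides */
			if ((lo & 7) == 0x6)
				continue;
			return 0;
		}
		if (hi == 0x40)			/* REX prefixes (64-bit mode) */
			continue;
		if (hi == 0x60) {		/* 0x64..0x67 prefixes */
			if ((lo & 0xC) == 0x4)
				continue;
			return 0;
		}
		if (hi == 0xF0) {		/* 0xF0/0xF2/0xF3 prefixes */
			if (!lo || (lo >> 1) == 1)
				continue;
			return 0;
		}
		/* PREFETCH is encoded as 0x0F 0x0D or 0x0F 0x18 */
		return op == 0x0F && (instr[i + 1] == 0x0D || instr[i + 1] == 0x18);
	}
	return 0;
}

int main(void)
{
	unsigned char a[] = { 0x0F, 0x18, 0x00 };	/* prefetchnta (%rax) */
	unsigned char b[] = { 0x48, 0x0F, 0x18, 0x00 };	/* same, behind a REX byte */
	unsigned char c[] = { 0x8B, 0x00 };		/* mov (%rax),%eax: not a prefetch */

	printf("%d %d %d\n",				/* prints: 1 1 0 */
	       is_prefetch_insn(a, sizeof(a)),
	       is_prefetch_insn(b, sizeof(b)),
	       is_prefetch_insn(c, sizeof(c)));
	return 0;
}
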
     167             : 
     168             : DEFINE_SPINLOCK(pgd_lock);
     169             : LIST_HEAD(pgd_list);
     170             : 
     171             : #ifdef CONFIG_X86_32
     172             : static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
     173             : {
     174             :         unsigned index = pgd_index(address);
     175             :         pgd_t *pgd_k;
     176             :         p4d_t *p4d, *p4d_k;
     177             :         pud_t *pud, *pud_k;
     178             :         pmd_t *pmd, *pmd_k;
     179             : 
     180             :         pgd += index;
     181             :         pgd_k = init_mm.pgd + index;
     182             : 
     183             :         if (!pgd_present(*pgd_k))
     184             :                 return NULL;
     185             : 
     186             :         /*
     187             :          * set_pgd(pgd, *pgd_k); here would be useless on PAE
     188             :          * and redundant with the set_pmd() on non-PAE. As would
     189             :          * set_p4d/set_pud.
     190             :          */
     191             :         p4d = p4d_offset(pgd, address);
     192             :         p4d_k = p4d_offset(pgd_k, address);
     193             :         if (!p4d_present(*p4d_k))
     194             :                 return NULL;
     195             : 
     196             :         pud = pud_offset(p4d, address);
     197             :         pud_k = pud_offset(p4d_k, address);
     198             :         if (!pud_present(*pud_k))
     199             :                 return NULL;
     200             : 
     201             :         pmd = pmd_offset(pud, address);
     202             :         pmd_k = pmd_offset(pud_k, address);
     203             : 
     204             :         if (pmd_present(*pmd) != pmd_present(*pmd_k))
     205             :                 set_pmd(pmd, *pmd_k);
     206             : 
     207             :         if (!pmd_present(*pmd_k))
     208             :                 return NULL;
     209             :         else
     210             :                 BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k));
     211             : 
     212             :         return pmd_k;
     213             : }
     214             : 
     215             : /*
     216             :  *   Handle a fault on the vmalloc or module mapping area
     217             :  *
     218             :  *   This is needed because there is a race condition between the time
      219             :  *   when the vmalloc mapping code updates the PMD and the point in time
      220             :  *   where it synchronizes this update with the other page-tables in the
      221             :  *   system.
      222             :  *
      223             :  *   In this race window another thread/CPU can map an area on the same
      224             :  *   PMD, find it already present, and not synchronize it with the
     225             :  *   rest of the system yet. As a result v[mz]alloc might return areas
     226             :  *   which are not mapped in every page-table in the system, causing an
     227             :  *   unhandled page-fault when they are accessed.
     228             :  */
     229             : static noinline int vmalloc_fault(unsigned long address)
     230             : {
     231             :         unsigned long pgd_paddr;
     232             :         pmd_t *pmd_k;
     233             :         pte_t *pte_k;
     234             : 
     235             :         /* Make sure we are in vmalloc area: */
     236             :         if (!(address >= VMALLOC_START && address < VMALLOC_END))
     237             :                 return -1;
     238             : 
     239             :         /*
     240             :          * Synchronize this task's top level page-table
     241             :          * with the 'reference' page table.
     242             :          *
     243             :          * Do _not_ use "current" here. We might be inside
     244             :          * an interrupt in the middle of a task switch..
     245             :          */
     246             :         pgd_paddr = read_cr3_pa();
     247             :         pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
     248             :         if (!pmd_k)
     249             :                 return -1;
     250             : 
     251             :         if (pmd_large(*pmd_k))
     252             :                 return 0;
     253             : 
     254             :         pte_k = pte_offset_kernel(pmd_k, address);
     255             :         if (!pte_present(*pte_k))
     256             :                 return -1;
     257             : 
     258             :         return 0;
     259             : }
     260             : NOKPROBE_SYMBOL(vmalloc_fault);
     261             : 
     262             : void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
     263             : {
     264             :         unsigned long addr;
     265             : 
     266             :         for (addr = start & PMD_MASK;
     267             :              addr >= TASK_SIZE_MAX && addr < VMALLOC_END;
     268             :              addr += PMD_SIZE) {
     269             :                 struct page *page;
     270             : 
     271             :                 spin_lock(&pgd_lock);
     272             :                 list_for_each_entry(page, &pgd_list, lru) {
     273             :                         spinlock_t *pgt_lock;
     274             : 
     275             :                         /* the pgt_lock only for Xen */
     276             :                         pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
     277             : 
     278             :                         spin_lock(pgt_lock);
     279             :                         vmalloc_sync_one(page_address(page), addr);
     280             :                         spin_unlock(pgt_lock);
     281             :                 }
     282             :                 spin_unlock(&pgd_lock);
     283             :         }
     284             : }
     285             : 
     286             : static bool low_pfn(unsigned long pfn)
     287             : {
     288             :         return pfn < max_low_pfn;
     289             : }
     290             : 
     291             : static void dump_pagetable(unsigned long address)
     292             : {
     293             :         pgd_t *base = __va(read_cr3_pa());
     294             :         pgd_t *pgd = &base[pgd_index(address)];
     295             :         p4d_t *p4d;
     296             :         pud_t *pud;
     297             :         pmd_t *pmd;
     298             :         pte_t *pte;
     299             : 
     300             : #ifdef CONFIG_X86_PAE
     301             :         pr_info("*pdpt = %016Lx ", pgd_val(*pgd));
     302             :         if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
     303             :                 goto out;
     304             : #define pr_pde pr_cont
     305             : #else
     306             : #define pr_pde pr_info
     307             : #endif
     308             :         p4d = p4d_offset(pgd, address);
     309             :         pud = pud_offset(p4d, address);
     310             :         pmd = pmd_offset(pud, address);
     311             :         pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
     312             : #undef pr_pde
     313             : 
     314             :         /*
     315             :          * We must not directly access the pte in the highpte
     316             :          * case if the page table is located in highmem.
     317             :          * And let's rather not kmap-atomic the pte, just in case
     318             :          * it's allocated already:
     319             :          */
     320             :         if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
     321             :                 goto out;
     322             : 
     323             :         pte = pte_offset_kernel(pmd, address);
     324             :         pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
     325             : out:
     326             :         pr_cont("\n");
     327             : }
     328             : 
     329             : #else /* CONFIG_X86_64: */
     330             : 
     331             : #ifdef CONFIG_CPU_SUP_AMD
     332             : static const char errata93_warning[] =
     333             : KERN_ERR 
     334             : "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
     335             : "******* Working around it, but it may cause SEGVs or burn power.\n"
     336             : "******* Please consider a BIOS update.\n"
     337             : "******* Disabling USB legacy in the BIOS may also help.\n";
     338             : #endif
     339             : 
     340           0 : static int bad_address(void *p)
     341             : {
     342           0 :         unsigned long dummy;
     343             : 
     344           0 :         return get_kernel_nofault(dummy, (unsigned long *)p);
     345             : }
     346             : 
     347           0 : static void dump_pagetable(unsigned long address)
     348             : {
     349           0 :         pgd_t *base = __va(read_cr3_pa());
     350           0 :         pgd_t *pgd = base + pgd_index(address);
     351           0 :         p4d_t *p4d;
     352           0 :         pud_t *pud;
     353           0 :         pmd_t *pmd;
     354           0 :         pte_t *pte;
     355             : 
     356           0 :         if (bad_address(pgd))
     357           0 :                 goto bad;
     358             : 
     359           0 :         pr_info("PGD %lx ", pgd_val(*pgd));
     360             : 
     361           0 :         if (!pgd_present(*pgd))
     362             :                 goto out;
     363             : 
     364           0 :         p4d = p4d_offset(pgd, address);
     365           0 :         if (bad_address(p4d))
     366           0 :                 goto bad;
     367             : 
     368           0 :         pr_cont("P4D %lx ", p4d_val(*p4d));
     369           0 :         if (!p4d_present(*p4d) || p4d_large(*p4d))
     370           0 :                 goto out;
     371             : 
     372           0 :         pud = pud_offset(p4d, address);
     373           0 :         if (bad_address(pud))
     374           0 :                 goto bad;
     375             : 
     376           0 :         pr_cont("PUD %lx ", pud_val(*pud));
     377           0 :         if (!pud_present(*pud) || pud_large(*pud))
     378           0 :                 goto out;
     379             : 
     380           0 :         pmd = pmd_offset(pud, address);
     381           0 :         if (bad_address(pmd))
     382           0 :                 goto bad;
     383             : 
     384           0 :         pr_cont("PMD %lx ", pmd_val(*pmd));
     385           0 :         if (!pmd_present(*pmd) || pmd_large(*pmd))
     386           0 :                 goto out;
     387             : 
     388           0 :         pte = pte_offset_kernel(pmd, address);
     389           0 :         if (bad_address(pte))
     390           0 :                 goto bad;
     391             : 
     392           0 :         pr_cont("PTE %lx", pte_val(*pte));
     393           0 : out:
     394           0 :         pr_cont("\n");
     395           0 :         return;
     396           0 : bad:
     397           0 :         pr_info("BAD\n");
     398             : }
     399             : 
     400             : #endif /* CONFIG_X86_64 */
     401             : 
     402             : /*
     403             :  * Workaround for K8 erratum #93 & buggy BIOS.
     404             :  *
     405             :  * BIOS SMM functions are required to use a specific workaround
     406             :  * to avoid corruption of the 64bit RIP register on C stepping K8.
     407             :  *
      408             :  * A lot of BIOSes that didn't get tested properly miss this.
      409             :  *
      410             :  * The OS sees this as a page fault with the upper 32 bits of RIP cleared.
      411             :  * Try to work around it here.
      412             :  *
      413             :  * Note that we only handle faults in the kernel here.
     414             :  * Does nothing on 32-bit.
     415             :  */
     416           0 : static int is_errata93(struct pt_regs *regs, unsigned long address)
     417             : {
     418             : #if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
     419           0 :         if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
     420           0 :             || boot_cpu_data.x86 != 0xf)
     421             :                 return 0;
     422             : 
     423           0 :         if (user_mode(regs))
     424             :                 return 0;
     425             : 
     426           0 :         if (address != regs->ip)
     427             :                 return 0;
     428             : 
     429           0 :         if ((address >> 32) != 0)
     430             :                 return 0;
     431             : 
     432           0 :         address |= 0xffffffffUL << 32;
     433           0 :         if ((address >= (u64)_stext && address <= (u64)_etext) ||
     434           0 :             (address >= MODULES_VADDR && address <= MODULES_END)) {
     435           0 :                 printk_once(errata93_warning);
     436           0 :                 regs->ip = address;
     437           0 :                 return 1;
     438             :         }
     439             : #endif
     440             :         return 0;
     441             : }
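
As a worked example of the fix-up above (illustrative only, not from fault.c): the erratum leaves only the low 32 bits of RIP, so is_errata93() ORs 0xffffffff back into the upper half and then checks whether the result lands in kernel text or module space. The RIP value used below is made up.

/* Illustrative sketch, not part of fault.c. */
#include <stdio.h>

int main(void)
{
	unsigned long long bad_rip = 0x81234567ULL;			/* upper 32 bits lost */
	unsigned long long fixed   = bad_rip | (0xffffffffULL << 32);	/* as in is_errata93() */

	printf("0x%llx -> 0x%llx\n", bad_rip, fixed);	/* 0x81234567 -> 0xffffffff81234567 */
	return 0;
}
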
     442             : 
     443             : /*
      444             :  * Work around K8 erratum #100: K8 in compat mode occasionally jumps
      445             :  * to illegal addresses >4GB.
      446             :  *
      447             :  * We catch this in the page fault handler because these addresses
      448             :  * are not reachable. Just detect this case and return.  Any code
      449             :  * segment in the LDT is compatibility mode.
     450             :  */
     451           0 : static int is_errata100(struct pt_regs *regs, unsigned long address)
     452             : {
     453             : #ifdef CONFIG_X86_64
     454           0 :         if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
     455             :                 return 1;
     456             : #endif
     457             :         return 0;
     458             : }
     459             : 
     460             : /* Pentium F0 0F C7 C8 bug workaround: */
     461           0 : static int is_f00f_bug(struct pt_regs *regs, unsigned long error_code,
     462             :                        unsigned long address)
     463             : {
     464             : #ifdef CONFIG_X86_F00F_BUG
     465             :         if (boot_cpu_has_bug(X86_BUG_F00F) && !(error_code & X86_PF_USER) &&
     466             :             idt_is_f00f_address(address)) {
     467             :                 handle_invalid_op(regs);
     468             :                 return 1;
     469             :         }
     470             : #endif
     471           0 :         return 0;
     472             : }
     473             : 
     474           0 : static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
     475             : {
     476           0 :         u32 offset = (index >> 3) * sizeof(struct desc_struct);
     477           0 :         unsigned long addr;
     478           0 :         struct ldttss_desc desc;
     479             : 
     480           0 :         if (index == 0) {
     481           0 :                 pr_alert("%s: NULL\n", name);
     482           0 :                 return;
     483             :         }
     484             : 
     485           0 :         if (offset + sizeof(struct ldttss_desc) >= gdt->size) {
     486           0 :                 pr_alert("%s: 0x%hx -- out of bounds\n", name, index);
     487           0 :                 return;
     488             :         }
     489             : 
     490           0 :         if (copy_from_kernel_nofault(&desc, (void *)(gdt->address + offset),
     491             :                               sizeof(struct ldttss_desc))) {
     492           0 :                 pr_alert("%s: 0x%hx -- GDT entry is not readable\n",
     493             :                          name, index);
     494           0 :                 return;
     495             :         }
     496             : 
     497           0 :         addr = desc.base0 | (desc.base1 << 16) | ((unsigned long)desc.base2 << 24);
     498             : #ifdef CONFIG_X86_64
     499           0 :         addr |= ((u64)desc.base3 << 32);
     500             : #endif
     501           0 :         pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n",
     502             :                  name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
     503             : }
     504             : 
     505             : static void
     506           0 : show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
     507             : {
     508           0 :         if (!oops_may_print())
     509             :                 return;
     510             : 
     511           0 :         if (error_code & X86_PF_INSTR) {
     512           0 :                 unsigned int level;
     513           0 :                 pgd_t *pgd;
     514           0 :                 pte_t *pte;
     515             : 
     516           0 :                 pgd = __va(read_cr3_pa());
     517           0 :                 pgd += pgd_index(address);
     518             : 
     519           0 :                 pte = lookup_address_in_pgd(pgd, address, &level);
     520             : 
     521           0 :                 if (pte && pte_present(*pte) && !pte_exec(*pte))
     522           0 :                         pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n",
     523             :                                 from_kuid(&init_user_ns, current_uid()));
     524           0 :                 if (pte && pte_present(*pte) && pte_exec(*pte) &&
     525           0 :                                 (pgd_flags(*pgd) & _PAGE_USER) &&
     526           0 :                                 (__read_cr4() & X86_CR4_SMEP))
     527           0 :                         pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n",
     528             :                                 from_kuid(&init_user_ns, current_uid()));
     529             :         }
     530             : 
     531           0 :         if (address < PAGE_SIZE && !user_mode(regs))
     532           0 :                 pr_alert("BUG: kernel NULL pointer dereference, address: %px\n",
     533             :                         (void *)address);
     534             :         else
     535           0 :                 pr_alert("BUG: unable to handle page fault for address: %px\n",
     536             :                         (void *)address);
     537             : 
     538           0 :         pr_alert("#PF: %s %s in %s mode\n",
     539             :                  (error_code & X86_PF_USER)  ? "user" : "supervisor",
     540             :                  (error_code & X86_PF_INSTR) ? "instruction fetch" :
     541             :                  (error_code & X86_PF_WRITE) ? "write access" :
     542             :                                                "read access",
     543             :                              user_mode(regs) ? "user" : "kernel");
     544           0 :         pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code,
     545             :                  !(error_code & X86_PF_PROT) ? "not-present page" :
     546             :                  (error_code & X86_PF_RSVD)  ? "reserved bit violation" :
     547             :                  (error_code & X86_PF_PK)    ? "protection keys violation" :
     548             :                                                "permissions violation");
     549             : 
     550           0 :         if (!(error_code & X86_PF_USER) && user_mode(regs)) {
     551           0 :                 struct desc_ptr idt, gdt;
     552           0 :                 u16 ldtr, tr;
     553             : 
     554             :                 /*
     555             :                  * This can happen for quite a few reasons.  The more obvious
     556             :                  * ones are faults accessing the GDT, or LDT.  Perhaps
     557             :                  * surprisingly, if the CPU tries to deliver a benign or
     558             :                  * contributory exception from user code and gets a page fault
     559             :                  * during delivery, the page fault can be delivered as though
     560             :                  * it originated directly from user code.  This could happen
     561             :                  * due to wrong permissions on the IDT, GDT, LDT, TSS, or
     562             :                  * kernel or IST stack.
     563             :                  */
     564           0 :                 store_idt(&idt);
     565             : 
     566             :                 /* Usable even on Xen PV -- it's just slow. */
     567           0 :                 native_store_gdt(&gdt);
     568             : 
     569           0 :                 pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n",
     570             :                          idt.address, idt.size, gdt.address, gdt.size);
     571             : 
     572           0 :                 store_ldt(ldtr);
     573           0 :                 show_ldttss(&gdt, "LDTR", ldtr);
     574             : 
     575           0 :                 store_tr(tr);
     576           0 :                 show_ldttss(&gdt, "TR", tr);
     577             :         }
     578             : 
     579           0 :         dump_pagetable(address);
     580             : }
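
A small stand-alone decoder can reproduce the error-code-driven parts of the two "#PF:" summary lines printed above. This is an illustrative sketch, not part of fault.c; the EX_ constants restate the X86_PF_* bit values purely for the example.

/* Illustrative sketch, not part of fault.c. */
#include <stdio.h>

#define EX_PF_PROT   0x01UL
#define EX_PF_WRITE  0x02UL
#define EX_PF_USER   0x04UL
#define EX_PF_RSVD   0x08UL
#define EX_PF_INSTR  0x10UL
#define EX_PF_PK     0x20UL

static void ex_decode_pf(unsigned long ec)
{
	printf("#PF: %s %s\n",
	       (ec & EX_PF_USER)  ? "user" : "supervisor",
	       (ec & EX_PF_INSTR) ? "instruction fetch" :
	       (ec & EX_PF_WRITE) ? "write access" : "read access");
	printf("#PF: error_code(0x%04lx) - %s\n", ec,
	       !(ec & EX_PF_PROT) ? "not-present page" :
	       (ec & EX_PF_RSVD)  ? "reserved bit violation" :
	       (ec & EX_PF_PK)    ? "protection keys violation" :
				    "permissions violation");
}

int main(void)
{
	ex_decode_pf(0x0002UL);	/* supervisor write, not-present page */
	ex_decode_pf(0x0015UL);	/* user instruction fetch, permissions violation */
	return 0;
}
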
     581             : 
     582             : static noinline void
     583           0 : pgtable_bad(struct pt_regs *regs, unsigned long error_code,
     584             :             unsigned long address)
     585             : {
     586           0 :         struct task_struct *tsk;
     587           0 :         unsigned long flags;
     588           0 :         int sig;
     589             : 
     590           0 :         flags = oops_begin();
     591           0 :         tsk = current;
     592           0 :         sig = SIGKILL;
     593             : 
     594           0 :         printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
     595           0 :                tsk->comm, address);
     596           0 :         dump_pagetable(address);
     597             : 
     598           0 :         if (__die("Bad pagetable", regs, error_code))
     599           0 :                 sig = 0;
     600             : 
     601           0 :         oops_end(flags, regs, sig);
     602           0 : }
     603             : 
     604           0 : static void sanitize_error_code(unsigned long address,
     605             :                                 unsigned long *error_code)
     606             : {
     607             :         /*
     608             :          * To avoid leaking information about the kernel page
     609             :          * table layout, pretend that user-mode accesses to
     610             :          * kernel addresses are always protection faults.
     611             :          *
     612             :          * NB: This means that failed vsyscalls with vsyscall=none
     613             :          * will have the PROT bit.  This doesn't leak any
     614             :          * information and does not appear to cause any problems.
     615             :          */
     616           0 :         if (address >= TASK_SIZE_MAX)
     617           0 :                 *error_code |= X86_PF_PROT;
     618             : }
     619             : 
     620           0 : static void set_signal_archinfo(unsigned long address,
     621             :                                 unsigned long error_code)
     622             : {
     623           0 :         struct task_struct *tsk = current;
     624             : 
     625           0 :         tsk->thread.trap_nr = X86_TRAP_PF;
     626           0 :         tsk->thread.error_code = error_code | X86_PF_USER;
     627           0 :         tsk->thread.cr2 = address;
     628             : }
     629             : 
     630             : static noinline void
     631           0 : page_fault_oops(struct pt_regs *regs, unsigned long error_code,
     632             :                 unsigned long address)
     633             : {
     634           0 :         unsigned long flags;
     635           0 :         int sig;
     636             : 
     637           0 :         if (user_mode(regs)) {
     638             :                 /*
     639             :                  * Implicit kernel access from user mode?  Skip the stack
     640             :                  * overflow and EFI special cases.
     641             :                  */
     642             :                 goto oops;
     643             :         }
     644             : 
     645             : #ifdef CONFIG_VMAP_STACK
     646             :         /*
     647             :          * Stack overflow?  During boot, we can fault near the initial
     648             :          * stack in the direct map, but that's not an overflow -- check
     649             :          * that we're in vmalloc space to avoid this.
     650             :          */
     651             :         if (is_vmalloc_addr((void *)address) &&
     652             :             (((unsigned long)current->stack - 1 - address < PAGE_SIZE) ||
     653             :              address - ((unsigned long)current->stack + THREAD_SIZE) < PAGE_SIZE)) {
     654             :                 unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *);
     655             :                 /*
     656             :                  * We're likely to be running with very little stack space
     657             :                  * left.  It's plausible that we'd hit this condition but
     658             :                  * double-fault even before we get this far, in which case
     659             :                  * we're fine: the double-fault handler will deal with it.
     660             :                  *
     661             :                  * We don't want to make it all the way into the oops code
     662             :                  * and then double-fault, though, because we're likely to
     663             :                  * break the console driver and lose most of the stack dump.
     664             :                  */
     665             :                 asm volatile ("movq %[stack], %%rsp\n\t"
     666             :                               "call handle_stack_overflow\n\t"
     667             :                               "1: jmp 1b"
     668             :                               : ASM_CALL_CONSTRAINT
     669             :                               : "D" ("kernel stack overflow (page fault)"),
     670             :                                 "S" (regs), "d" (address),
     671             :                                 [stack] "rm" (stack));
     672             :                 unreachable();
     673             :         }
     674             : #endif
     675             : 
     676             :         /*
     677             :          * Buggy firmware could access regions which might page fault.  If
     678             :          * this happens, EFI has a special OOPS path that will try to
     679             :          * avoid hanging the system.
     680             :          */
     681           0 :         if (IS_ENABLED(CONFIG_EFI))
     682             :                 efi_crash_gracefully_on_page_fault(address);
     683             : 
     684             :         /* Only not-present faults should be handled by KFENCE. */
     685           0 :         if (!(error_code & X86_PF_PROT) &&
     686             :             kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
     687             :                 return;
     688             : 
     689           0 : oops:
     690             :         /*
     691             :          * Oops. The kernel tried to access some bad page. We'll have to
     692             :          * terminate things with extreme prejudice:
     693             :          */
     694           0 :         flags = oops_begin();
     695             : 
     696           0 :         show_fault_oops(regs, error_code, address);
     697             : 
     698           0 :         if (task_stack_end_corrupted(current))
     699           0 :                 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
     700             : 
     701           0 :         sig = SIGKILL;
     702           0 :         if (__die("Oops", regs, error_code))
     703           0 :                 sig = 0;
     704             : 
     705             :         /* Executive summary in case the body of the oops scrolled away */
     706           0 :         printk(KERN_DEFAULT "CR2: %016lx\n", address);
     707             : 
     708           0 :         oops_end(flags, regs, sig);
     709             : }
     710             : 
     711             : static noinline void
     712         715 : kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
     713             :                          unsigned long address, int signal, int si_code)
     714             : {
     715         715 :         WARN_ON_ONCE(user_mode(regs));
     716             : 
     717             :         /* Are we prepared to handle this kernel fault? */
     718         715 :         if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
     719             :                 /*
     720             :                  * Any interrupt that takes a fault gets the fixup. This makes
      721             :                  * the below recursive fault logic only apply to faults from
     722             :                  * task context.
     723             :                  */
     724         715 :                 if (in_interrupt())
     725             :                         return;
     726             : 
     727             :                 /*
     728             :                  * Per the above we're !in_interrupt(), aka. task context.
     729             :                  *
     730             :                  * In this case we need to make sure we're not recursively
     731             :                  * faulting through the emulate_vsyscall() logic.
     732             :                  */
     733           3 :                 if (current->thread.sig_on_uaccess_err && signal) {
     734           0 :                         sanitize_error_code(address, &error_code);
     735             : 
     736           0 :                         set_signal_archinfo(address, error_code);
     737             : 
     738             :                         /* XXX: hwpoison faults will set the wrong code. */
     739           0 :                         force_sig_fault(signal, si_code, (void __user *)address);
     740             :                 }
     741             : 
     742             :                 /*
     743             :                  * Barring that, we can do the fixup and be happy.
     744             :                  */
     745           3 :                 return;
     746             :         }
     747             : 
     748             :         /*
     749             :          * AMD erratum #91 manifests as a spurious page fault on a PREFETCH
     750             :          * instruction.
     751             :          */
     752           0 :         if (is_prefetch(regs, error_code, address))
     753             :                 return;
     754             : 
     755           0 :         page_fault_oops(regs, error_code, address);
     756             : }
     757             : 
     758             : /*
     759             :  * Print out info about fatal segfaults, if the show_unhandled_signals
     760             :  * sysctl is set:
     761             :  */
     762             : static inline void
     763           0 : show_signal_msg(struct pt_regs *regs, unsigned long error_code,
     764             :                 unsigned long address, struct task_struct *tsk)
     765             : {
     766           0 :         const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG;
     767             : 
     768           0 :         if (!unhandled_signal(tsk, SIGSEGV))
     769             :                 return;
     770             : 
     771           0 :         if (!printk_ratelimit())
     772             :                 return;
     773             : 
     774           0 :         printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
     775           0 :                 loglvl, tsk->comm, task_pid_nr(tsk), address,
     776           0 :                 (void *)regs->ip, (void *)regs->sp, error_code);
     777             : 
     778           0 :         print_vma_addr(KERN_CONT " in ", regs->ip);
     779             : 
     780           0 :         printk(KERN_CONT "\n");
     781             : 
     782           0 :         show_opcodes(regs, loglvl);
     783             : }
     784             : 
     785             : /*
      786             :  * The (legacy) vsyscall page is the lone page in the kernel portion
     787             :  * of the address space that has user-accessible permissions.
     788             :  */
     789      591007 : static bool is_vsyscall_vaddr(unsigned long vaddr)
     790             : {
     791      591007 :         return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
     792             : }
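
For concreteness (an illustrative sketch, not part of fault.c): on x86-64 VSYSCALL_ADDR is the fixed address 0xffffffffff600000, so the masked comparison above matches any address inside that single 4 KiB page. The EX_ constants below restate PAGE_MASK and VSYSCALL_ADDR only for this example.

/* Illustrative sketch, not part of fault.c. */
#include <stdbool.h>
#include <stdio.h>

#define EX_PAGE_MASK     (~0xfffULL)			/* 4 KiB pages */
#define EX_VSYSCALL_ADDR 0xffffffffff600000ULL

static bool ex_is_vsyscall_vaddr(unsigned long long vaddr)
{
	return (vaddr & EX_PAGE_MASK) == EX_VSYSCALL_ADDR;
}

int main(void)
{
	printf("%d %d\n",	/* prints: 1 0 */
	       ex_is_vsyscall_vaddr(0xffffffffff600400ULL),	/* inside the page */
	       ex_is_vsyscall_vaddr(0xffffffffff601000ULL));	/* one page past it */
	return 0;
}
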
     793             : 
     794             : static void
     795         715 : __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
     796             :                        unsigned long address, u32 pkey, int si_code)
     797             : {
     798         715 :         struct task_struct *tsk = current;
     799             : 
     800         715 :         if (!user_mode(regs)) {
     801         715 :                 kernelmode_fixup_or_oops(regs, error_code, address, pkey, si_code);
     802         715 :                 return;
     803             :         }
     804             : 
     805           0 :         if (!(error_code & X86_PF_USER)) {
     806             :                 /* Implicit user access to kernel memory -- just oops */
     807           0 :                 page_fault_oops(regs, error_code, address);
     808           0 :                 return;
     809             :         }
     810             : 
     811             :         /*
     812             :          * User mode accesses just cause a SIGSEGV.
     813             :          * It's possible to have interrupts off here:
     814             :          */
     815           0 :         local_irq_enable();
     816             : 
     817             :         /*
     818             :          * Valid to do another page fault here because this one came
     819             :          * from user space:
     820             :          */
     821           0 :         if (is_prefetch(regs, error_code, address))
     822             :                 return;
     823             : 
     824           0 :         if (is_errata100(regs, address))
     825             :                 return;
     826             : 
     827           0 :         sanitize_error_code(address, &error_code);
     828             : 
     829           0 :         if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
     830             :                 return;
     831             : 
     832           0 :         if (likely(show_unhandled_signals))
     833           0 :                 show_signal_msg(regs, error_code, address, tsk);
     834             : 
     835           0 :         set_signal_archinfo(address, error_code);
     836             : 
     837           0 :         if (si_code == SEGV_PKUERR)
     838           0 :                 force_sig_pkuerr((void __user *)address, pkey);
     839             : 
     840           0 :         force_sig_fault(SIGSEGV, si_code, (void __user *)address);
     841             : 
     842           0 :         local_irq_disable();
     843             : }
     844             : 
     845             : static noinline void
     846         713 : bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
     847             :                      unsigned long address)
     848             : {
     849         713 :         __bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR);
     850         713 : }
     851             : 
     852             : static void
     853           2 : __bad_area(struct pt_regs *regs, unsigned long error_code,
     854             :            unsigned long address, u32 pkey, int si_code)
     855             : {
     856           2 :         struct mm_struct *mm = current->mm;
     857             :         /*
     858             :          * Something tried to access memory that isn't in our memory map..
     859             :          * Fix it, but check if it's kernel or user first..
     860             :          */
     861           2 :         mmap_read_unlock(mm);
     862             : 
     863           2 :         __bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
     864           2 : }
     865             : 
     866             : static noinline void
     867           2 : bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
     868             : {
     869           2 :         __bad_area(regs, error_code, address, 0, SEGV_MAPERR);
     870           2 : }
     871             : 
     872           0 : static inline bool bad_area_access_from_pkeys(unsigned long error_code,
     873             :                 struct vm_area_struct *vma)
     874             : {
     875             :         /* This code is always called on the current mm */
     876           0 :         bool foreign = false;
     877             : 
     878           0 :         if (!boot_cpu_has(X86_FEATURE_OSPKE))
     879             :                 return false;
     880           0 :         if (error_code & X86_PF_PK)
     881             :                 return true;
     882             :         /* this checks permission keys on the VMA: */
     883           0 :         if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
     884           0 :                                        (error_code & X86_PF_INSTR), foreign))
     885           0 :                 return true;
     886             :         return false;
     887             : }
     888             : 
     889             : static noinline void
     890           0 : bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
     891             :                       unsigned long address, struct vm_area_struct *vma)
     892             : {
     893             :         /*
     894             :          * This OSPKE check is not strictly necessary at runtime.
     895             :          * But, doing it this way allows compiler optimizations
     896             :          * if pkeys are compiled out.
     897             :          */
     898           0 :         if (bad_area_access_from_pkeys(error_code, vma)) {
     899             :                 /*
     900             :                  * A protection key fault means that the PKRU value did not allow
     901             :                  * access to some PTE.  Userspace can figure out what PKRU was
     902             :                  * from the XSAVE state.  This function captures the pkey from
     903             :                  * the vma and passes it to userspace so userspace can discover
     904             :                  * which protection key was set on the PTE.
     905             :                  *
     906             :                  * If we get here, we know that the hardware signaled a X86_PF_PK
     907             :                  * fault and that there was a VMA once we got in the fault
     908             :                  * handler.  It does *not* guarantee that the VMA we find here
     909             :                  * was the one that we faulted on.
     910             :                  *
     911             :                  * 1. T1   : mprotect_key(foo, PAGE_SIZE, pkey=4);
     912             :                  * 2. T1   : set PKRU to deny access to pkey=4, touches page
     913             :                  * 3. T1   : faults...
     914             :                  * 4.    T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
     915             :                  * 5. T1   : enters fault handler, takes mmap_lock, etc...
     916             :                  * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
     917             :                  *           faulted on a pte with its pkey=4.
     918             :                  */
     919           0 :                 u32 pkey = vma_pkey(vma);
     920             : 
     921           0 :                 __bad_area(regs, error_code, address, pkey, SEGV_PKUERR);
     922             :         } else {
     923           0 :                 __bad_area(regs, error_code, address, 0, SEGV_ACCERR);
     924             :         }
     925           0 : }
     926             : 
     927             : static void
     928           0 : do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
     929             :           vm_fault_t fault)
     930             : {
     931             :         /* Kernel mode? Handle exceptions or die: */
     932           0 :         if (!user_mode(regs)) {
     933           0 :                 kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR);
     934           0 :                 return;
     935             :         }
     936             : 
     937             :         /* User-space => ok to do another page fault: */
     938           0 :         if (is_prefetch(regs, error_code, address))
     939             :                 return;
     940             : 
     941           0 :         sanitize_error_code(address, &error_code);
     942             : 
     943           0 :         if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
     944             :                 return;
     945             : 
     946           0 :         set_signal_archinfo(address, error_code);
     947             : 
     948             : #ifdef CONFIG_MEMORY_FAILURE
     949             :         if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
     950             :                 struct task_struct *tsk = current;
     951             :                 unsigned lsb = 0;
     952             : 
     953             :                 pr_err(
     954             :         "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
     955             :                         tsk->comm, tsk->pid, address);
     956             :                 if (fault & VM_FAULT_HWPOISON_LARGE)
     957             :                         lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
     958             :                 if (fault & VM_FAULT_HWPOISON)
     959             :                         lsb = PAGE_SHIFT;
     960             :                 force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb);
     961             :                 return;
     962             :         }
     963             : #endif
     964           0 :         force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
     965             : }
     966             : 
     967           0 : static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
     968             : {
     969           0 :         if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
     970             :                 return 0;
     971             : 
     972           0 :         if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
     973           0 :                 return 0;
     974             : 
     975             :         return 1;
     976             : }
     977             : 
     978             : /*
     979             :  * Handle a spurious fault caused by a stale TLB entry.
     980             :  *
     981             :  * This allows us to lazily refresh the TLB when increasing the
     982             :  * permissions of a kernel page (RO -> RW or NX -> X).  Doing it
     983             :  * eagerly is very expensive since that implies doing a full
     984             :  * cross-processor TLB flush, even if no stale TLB entries exist
     985             :  * on other processors.
     986             :  *
     987             :  * Spurious faults may only occur if the TLB contains an entry with
      988             :  * fewer permissions than the page table entry.  Non-present (P = 0)
     989             :  * and reserved bit (R = 1) faults are never spurious.
     990             :  *
     991             :  * There are no security implications to leaving a stale TLB when
     992             :  * increasing the permissions on a page.
     993             :  *
     994             :  * Returns non-zero if a spurious fault was handled, zero otherwise.
     995             :  *
     996             :  * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
     997             :  * (Optional Invalidation).
     998             :  */
     999             : static noinline int
    1000           0 : spurious_kernel_fault(unsigned long error_code, unsigned long address)
    1001             : {
    1002           0 :         pgd_t *pgd;
    1003           0 :         p4d_t *p4d;
    1004           0 :         pud_t *pud;
    1005           0 :         pmd_t *pmd;
    1006           0 :         pte_t *pte;
    1007           0 :         int ret;
    1008             : 
    1009             :         /*
    1010             :          * Only writes to RO or instruction fetches from NX may cause
    1011             :          * spurious faults.
    1012             :          *
    1013             :          * These could be from user or supervisor accesses but the TLB
    1014             :          * is only lazily flushed after a kernel mapping protection
    1015             :          * change, so user accesses are not expected to cause spurious
    1016             :          * faults.
    1017             :          */
    1018           0 :         if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
    1019           0 :             error_code != (X86_PF_INSTR | X86_PF_PROT))
    1020             :                 return 0;
    1021             : 
    1022           0 :         pgd = init_mm.pgd + pgd_index(address);
    1023           0 :         if (!pgd_present(*pgd))
    1024             :                 return 0;
    1025             : 
    1026           0 :         p4d = p4d_offset(pgd, address);
    1027           0 :         if (!p4d_present(*p4d))
    1028             :                 return 0;
    1029             : 
    1030           0 :         if (p4d_large(*p4d))
    1031             :                 return spurious_kernel_fault_check(error_code, (pte_t *) p4d);
    1032             : 
    1033           0 :         pud = pud_offset(p4d, address);
    1034           0 :         if (!pud_present(*pud))
    1035             :                 return 0;
    1036             : 
    1037           0 :         if (pud_large(*pud))
    1038           0 :                 return spurious_kernel_fault_check(error_code, (pte_t *) pud);
    1039             : 
    1040           0 :         pmd = pmd_offset(pud, address);
    1041           0 :         if (!pmd_present(*pmd))
    1042             :                 return 0;
    1043             : 
    1044           0 :         if (pmd_large(*pmd))
    1045           0 :                 return spurious_kernel_fault_check(error_code, (pte_t *) pmd);
    1046             : 
    1047           0 :         pte = pte_offset_kernel(pmd, address);
    1048           0 :         if (!pte_present(*pte))
    1049             :                 return 0;
    1050             : 
    1051           0 :         ret = spurious_kernel_fault_check(error_code, pte);
    1052           0 :         if (!ret)
    1053             :                 return 0;
    1054             : 
    1055             :         /*
    1056             :          * Make sure we have permissions in PMD.
    1057             :          * If not, then there's a bug in the page tables:
    1058             :          */
    1059           0 :         ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
    1060           0 :         WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
    1061             : 
    1062             :         return ret;
    1063             : }
    1064             : NOKPROBE_SYMBOL(spurious_kernel_fault);
    1065             : 
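
The walk above descends init_mm's page tables level by level and applies spurious_kernel_fault_check() to whichever entry actually maps the address (a large p4d/pud/pmd or the final pte). A very rough userspace model of that leaf test follows, using the architectural x86 entry bits (present = bit 0, writable = bit 1, no-execute = bit 63); the entry values and the helper name are illustrative assumptions, not kernel code.

    /* Illustrative model of the leaf-level test in spurious_kernel_fault_check():
     * the fault can only be spurious if the page tables already grant the
     * permission the faulting access needed. */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define ENT_PRESENT  (1ull << 0)
    #define ENT_WRITE    (1ull << 1)
    #define ENT_NX       (1ull << 63)

    #define PF_PROT      (1u << 0)
    #define PF_WRITE     (1u << 1)
    #define PF_INSTR     (1u << 4)

    static bool maybe_spurious(unsigned long error_code, uint64_t entry)
    {
            if ((error_code & PF_WRITE) && !(entry & ENT_WRITE))
                    return false;            /* table really forbids writes    */
            if ((error_code & PF_INSTR) && (entry & ENT_NX))
                    return false;            /* table really forbids execution */
            return true;                     /* a stale TLB entry is plausible */
    }

    int main(void)
    {
            uint64_t entry = ENT_PRESENT | ENT_WRITE | ENT_NX;   /* RW, NX mapping */

            printf("write fault on RW mapping: %s\n",
                   maybe_spurious(PF_PROT | PF_WRITE, entry) ? "spurious" : "real");
            printf("fetch fault on NX mapping: %s\n",
                   maybe_spurious(PF_PROT | PF_INSTR, entry) ? "spurious" : "real");
            return 0;
    }
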
    1066             : int show_unhandled_signals = 1;
    1067             : 
    1068             : static inline int
    1069      296352 : access_error(unsigned long error_code, struct vm_area_struct *vma)
    1070             : {
    1071             :         /* This is only called for the current mm, so: */
    1072      296352 :         bool foreign = false;
    1073             : 
    1074             :         /*
    1075             :          * Read or write was blocked by protection keys.  This is
    1076             :          * always an unconditional error and can never result in
    1077             :          * a follow-up action to resolve the fault, like a COW.
    1078             :          */
    1079      296352 :         if (error_code & X86_PF_PK)
    1080             :                 return 1;
    1081             : 
    1082             :         /*
    1083             :          * SGX hardware blocked the access.  This usually happens
    1084             :          * when the enclave memory contents have been destroyed, like
    1085             :          * after a suspend/resume cycle. In any case, the kernel can't
    1086             :          * fix the cause of the fault.  Handle the fault as an access
    1087             :          * error even in cases where no actual access violation
    1088             :          * occurred.  This allows userspace to rebuild the enclave in
    1089             :          * response to the signal.
    1090             :          */
    1091      296352 :         if (unlikely(error_code & X86_PF_SGX))
    1092             :                 return 1;
    1093             : 
    1094             :         /*
    1095             :          * Make sure to check the VMA so that we do not perform
     1096             :          * faults just to hit an X86_PF_PK as soon as we fill in a
    1097             :          * page.
    1098             :          */
    1099      296350 :         if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
    1100      296352 :                                        (error_code & X86_PF_INSTR), foreign))
    1101             :                 return 1;
    1102             : 
    1103      296350 :         if (error_code & X86_PF_WRITE) {
    1104             :                 /* write, present and write, not present: */
    1105      141752 :                 if (unlikely(!(vma->vm_flags & VM_WRITE)))
    1106             :                         return 1;
    1107      141752 :                 return 0;
    1108             :         }
    1109             : 
    1110             :         /* read, present: */
    1111      154598 :         if (unlikely(error_code & X86_PF_PROT))
    1112             :                 return 1;
    1113             : 
    1114             :         /* read, not present: */
    1115      154598 :         if (unlikely(!vma_is_accessible(vma)))
    1116           0 :                 return 1;
    1117             : 
    1118             :         return 0;
    1119             : }
    1120             : 
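
Stripped of the pkey, SGX and arch_vma_access_permitted() cases, access_error() reduces to comparing the error-code bits with the VMA's permission flags. The condensed userspace restatement below is a sketch of that decision only; the VM_* values mirror include/linux/mm.h, everything else is illustrative.

    /* Illustrative restatement of the core of access_error(): does the access
     * type conflict with what the VMA allows? */
    #include <stdbool.h>
    #include <stdio.h>

    #define VM_READ   0x1ul
    #define VM_WRITE  0x2ul
    #define VM_EXEC   0x4ul

    #define PF_PROT   (1u << 0)
    #define PF_WRITE  (1u << 1)

    static bool access_would_error(unsigned long error_code, unsigned long vm_flags)
    {
            if (error_code & PF_WRITE)
                    return !(vm_flags & VM_WRITE);   /* write needs VM_WRITE          */
            if (error_code & PF_PROT)
                    return true;                     /* read protection fault: error  */
            return !(vm_flags & (VM_READ | VM_WRITE | VM_EXEC)); /* PROT_NONE VMA     */
    }

    int main(void)
    {
            printf("write to read-only VMA -> %s\n",
                   access_would_error(PF_WRITE, VM_READ) ? "SIGSEGV" : "handled");
            printf("read from readable VMA -> %s\n",
                   access_would_error(0, VM_READ) ? "SIGSEGV" : "handled");
            return 0;
    }
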
    1121      295858 : bool fault_in_kernel_space(unsigned long address)
    1122             : {
    1123             :         /*
    1124             :          * On 64-bit systems, the vsyscall page is at an address above
    1125             :          * TASK_SIZE_MAX, but is not considered part of the kernel
    1126             :          * address space.
    1127             :          */
    1128      295858 :         if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
    1129             :                 return false;
    1130             : 
    1131      295858 :         return address >= TASK_SIZE_MAX;
    1132             : }
    1133             : 
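
For orientation: on x86-64 with 4-level paging, TASK_SIZE_MAX works out to 0x00007ffffffff000 and the legacy vsyscall page sits at 0xffffffffff600000. The sketch below hard-codes those constants purely for illustration (with 5-level paging the user/kernel split differs) to show the same classification fault_in_kernel_space() performs.

    /* Illustrative sketch of fault_in_kernel_space()'s classification on
     * x86-64 with 4-level paging; constants hard-coded for illustration. */
    #include <stdbool.h>
    #include <stdio.h>

    #define TASK_SIZE_MAX_4LVL 0x00007ffffffff000ull
    #define VSYSCALL_PAGE      0xffffffffff600000ull
    #define PAGE_MASK_4K       (~0xfffull)

    static bool in_kernel_space(unsigned long long addr)
    {
            /* The vsyscall page lies above TASK_SIZE_MAX but counts as user space. */
            if ((addr & PAGE_MASK_4K) == VSYSCALL_PAGE)
                    return false;
            return addr >= TASK_SIZE_MAX_4LVL;
    }

    int main(void)
    {
            printf("0x0000000000400000 -> %s\n",
                   in_kernel_space(0x400000ull) ? "kernel" : "user");
            printf("vsyscall page      -> %s\n",
                   in_kernel_space(VSYSCALL_PAGE + 0x800) ? "kernel" : "user");
            printf("0xffff888000000000 -> %s\n",
                   in_kernel_space(0xffff888000000000ull) ? "kernel" : "user");
            return 0;
    }
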
    1134             : /*
    1135             :  * Called for all faults where 'address' is part of the kernel address
    1136             :  * space.  Might get called for faults that originate from *code* that
    1137             :  * ran in userspace or the kernel.
    1138             :  */
    1139             : static void
    1140           0 : do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
    1141             :                    unsigned long address)
    1142             : {
    1143             :         /*
    1144             :          * Protection keys exceptions only happen on user pages.  We
    1145             :          * have no user pages in the kernel portion of the address
    1146             :          * space, so do not expect them here.
    1147             :          */
    1148           0 :         WARN_ON_ONCE(hw_error_code & X86_PF_PK);
    1149             : 
    1150             : #ifdef CONFIG_X86_32
    1151             :         /*
    1152             :          * We can fault-in kernel-space virtual memory on-demand. The
    1153             :          * 'reference' page table is init_mm.pgd.
    1154             :          *
    1155             :          * NOTE! We MUST NOT take any locks for this case. We may
    1156             :          * be in an interrupt or a critical region, and should
    1157             :          * only copy the information from the master page table,
    1158             :          * nothing more.
    1159             :          *
    1160             :          * Before doing this on-demand faulting, ensure that the
    1161             :          * fault is not any of the following:
    1162             :          * 1. A fault on a PTE with a reserved bit set.
    1163             :          * 2. A fault caused by a user-mode access.  (Do not demand-
    1164             :          *    fault kernel memory due to user-mode accesses).
    1165             :          * 3. A fault caused by a page-level protection violation.
    1166             :          *    (A demand fault would be on a non-present page which
    1167             :          *     would have X86_PF_PROT==0).
    1168             :          *
    1169             :          * This is only needed to close a race condition on x86-32 in
    1170             :          * the vmalloc mapping/unmapping code. See the comment above
    1171             :          * vmalloc_fault() for details. On x86-64 the race does not
    1172             :          * exist as the vmalloc mappings don't need to be synchronized
    1173             :          * there.
    1174             :          */
    1175             :         if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
    1176             :                 if (vmalloc_fault(address) >= 0)
    1177             :                         return;
    1178             :         }
    1179             : #endif
    1180             : 
    1181           0 :         if (is_f00f_bug(regs, hw_error_code, address))
    1182             :                 return;
    1183             : 
    1184             :         /* Was the fault spurious, caused by lazy TLB invalidation? */
    1185           0 :         if (spurious_kernel_fault(hw_error_code, address))
    1186             :                 return;
    1187             : 
    1188             :         /* kprobes don't want to hook the spurious faults: */
    1189           0 :         if (kprobe_page_fault(regs, X86_TRAP_PF))
    1190             :                 return;
    1191             : 
    1192             :         /*
    1193             :          * Note, despite being a "bad area", there are quite a few
    1194             :          * acceptable reasons to get here, such as erratum fixups
    1195             :          * and handling kernel code that can fault, like get_user().
    1196             :          *
    1197             :          * Don't take the mm semaphore here. If we fixup a prefetch
    1198             :          * fault we could otherwise deadlock:
    1199             :          */
    1200           0 :         bad_area_nosemaphore(regs, hw_error_code, address);
    1201             : }
    1202             : NOKPROBE_SYMBOL(do_kern_addr_fault);
    1203             : 
    1204             : /*
    1205             :  * Handle faults in the user portion of the address space.  Nothing in here
    1206             :  * should check X86_PF_USER without a specific justification: for almost
    1207             :  * all purposes, we should treat a normal kernel access to user memory
    1208             :  * (e.g. get_user(), put_user(), etc.) the same as the WRUSS instruction.
    1209             :  * The one exception is AC flag handling, which is, per the x86
    1210             :  * architecture, special for WRUSS.
    1211             :  */
    1212             : static inline
    1213      295861 : void do_user_addr_fault(struct pt_regs *regs,
    1214             :                         unsigned long error_code,
    1215             :                         unsigned long address)
    1216             : {
    1217      295861 :         struct vm_area_struct *vma;
    1218      295861 :         struct task_struct *tsk;
    1219      295861 :         struct mm_struct *mm;
    1220      295861 :         vm_fault_t fault;
    1221      295861 :         unsigned int flags = FAULT_FLAG_DEFAULT;
    1222             : 
    1223      295861 :         tsk = current;
    1224      295861 :         mm = tsk->mm;
    1225             : 
    1226      295861 :         if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) {
    1227             :                 /*
    1228             :                  * Whoops, this is kernel mode code trying to execute from
    1229             :                  * user memory.  Unless this is AMD erratum #93, which
    1230             :                  * corrupts RIP such that it looks like a user address,
    1231             :                  * this is unrecoverable.  Don't even try to look up the
    1232             :                  * VMA or look for extable entries.
    1233             :                  */
    1234           0 :                 if (is_errata93(regs, address))
    1235             :                         return;
    1236             : 
    1237           0 :                 page_fault_oops(regs, error_code, address);
    1238           0 :                 return;
    1239             :         }
    1240             : 
    1241             :         /* kprobes don't want to hook the spurious faults: */
    1242      295861 :         if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF)))
    1243             :                 return;
    1244             : 
    1245             :         /*
    1246             :          * Reserved bits are never expected to be set on
    1247             :          * entries in the user portion of the page tables.
    1248             :          */
    1249      295861 :         if (unlikely(error_code & X86_PF_RSVD))
    1250           0 :                 pgtable_bad(regs, error_code, address);
    1251             : 
    1252             :         /*
    1253             :          * If SMAP is on, check for invalid kernel (supervisor) access to user
    1254             :          * pages in the user address space.  The odd case here is WRUSS,
    1255             :          * which, according to the preliminary documentation, does not respect
    1256             :          * SMAP and will have the USER bit set so, in all cases, SMAP
    1257             :          * enforcement appears to be consistent with the USER bit.
    1258             :          */
    1259      295861 :         if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
    1260             :                      !(error_code & X86_PF_USER) &&
    1261             :                      !(regs->flags & X86_EFLAGS_AC))) {
    1262             :                 /*
    1263             :                  * No extable entry here.  This was a kernel access to an
    1264             :                  * invalid pointer.  get_kernel_nofault() will not get here.
    1265             :                  */
    1266             :                 page_fault_oops(regs, error_code, address);
    1267             :                 return;
    1268             :         }
    1269             : 
    1270             :         /*
    1271             :          * If we're in an interrupt, have no user context or are running
    1272             :          * in a region with pagefaults disabled then we must not take the fault
    1273             :          */
    1274      295861 :         if (unlikely(faulthandler_disabled() || !mm)) {
    1275         713 :                 bad_area_nosemaphore(regs, error_code, address);
    1276         713 :                 return;
    1277             :         }
    1278             : 
    1279             :         /*
     1280             :          * It's safe to allow IRQs after cr2 has been saved and the
    1281             :          * vmalloc fault has been handled.
    1282             :          *
    1283             :          * User-mode registers count as a user access even for any
    1284             :          * potential system fault or CPU buglet:
    1285             :          */
    1286      295148 :         if (user_mode(regs)) {
    1287      280725 :                 local_irq_enable();
    1288      280730 :                 flags |= FAULT_FLAG_USER;
    1289             :         } else {
    1290       14423 :                 if (regs->flags & X86_EFLAGS_IF)
    1291       14423 :                         local_irq_enable();
    1292             :         }
    1293             : 
    1294      295153 :         perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
    1295             : 
    1296      295149 :         if (error_code & X86_PF_WRITE)
    1297      141293 :                 flags |= FAULT_FLAG_WRITE;
    1298      295149 :         if (error_code & X86_PF_INSTR)
    1299       77605 :                 flags |= FAULT_FLAG_INSTRUCTION;
    1300             : 
    1301             : #ifdef CONFIG_X86_64
    1302             :         /*
    1303             :          * Faults in the vsyscall page might need emulation.  The
    1304             :          * vsyscall page is at a high address (>PAGE_OFFSET), but is
    1305             :          * considered to be part of the user address space.
    1306             :          *
    1307             :          * The vsyscall page does not have a "real" VMA, so do this
    1308             :          * emulation before we go searching for VMAs.
    1309             :          *
    1310             :          * PKRU never rejects instruction fetches, so we don't need
    1311             :          * to consider the PF_PK bit.
    1312             :          */
    1313      295149 :         if (is_vsyscall_vaddr(address)) {
    1314      295149 :                 if (emulate_vsyscall(error_code, regs, address))
    1315             :                         return;
    1316             :         }
    1317             : #endif
    1318             : 
    1319             :         /*
    1320             :          * Kernel-mode access to the user address space should only occur
    1321             :          * on well-defined single instructions listed in the exception
    1322             :          * tables.  But, an erroneous kernel fault occurring outside one of
    1323             :          * those areas which also holds mmap_lock might deadlock attempting
    1324             :          * to validate the fault against the address space.
    1325             :          *
    1326             :          * Only do the expensive exception table search when we might be at
    1327             :          * risk of a deadlock.  This happens if we
    1328             :          * 1. Failed to acquire mmap_lock, and
    1329             :          * 2. The access did not originate in userspace.
    1330             :          */
    1331      295149 :         if (unlikely(!mmap_read_trylock(mm))) {
    1332           0 :                 if (!user_mode(regs) && !search_exception_tables(regs->ip)) {
    1333             :                         /*
    1334             :                          * Fault from code in kernel from
    1335             :                          * which we do not expect faults.
    1336             :                          */
    1337           0 :                         bad_area_nosemaphore(regs, error_code, address);
    1338           0 :                         return;
    1339             :                 }
    1340           0 : retry:
    1341        1195 :                 mmap_read_lock(mm);
    1342             :         } else {
    1343             :                 /*
    1344             :                  * The above down_read_trylock() might have succeeded in
    1345             :                  * which case we'll have missed the might_sleep() from
    1346             :                  * down_read():
    1347             :                  */
    1348      295138 :                 might_sleep();
    1349             :         }
    1350             : 
    1351      296348 :         vma = find_vma(mm, address);
    1352      296353 :         if (unlikely(!vma)) {
    1353           0 :                 bad_area(regs, error_code, address);
    1354           0 :                 return;
    1355             :         }
    1356      296353 :         if (likely(vma->vm_start <= address))
    1357      296335 :                 goto good_area;
    1358          18 :         if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
    1359           2 :                 bad_area(regs, error_code, address);
    1360           2 :                 return;
    1361             :         }
    1362          16 :         if (unlikely(expand_stack(vma, address))) {
    1363           0 :                 bad_area(regs, error_code, address);
    1364           0 :                 return;
    1365             :         }
    1366             : 
    1367             :         /*
    1368             :          * Ok, we have a good vm_area for this memory access, so
     1369             :          * we can handle it.
    1370             :          */
    1371          16 : good_area:
    1372      296351 :         if (unlikely(access_error(error_code, vma))) {
    1373           0 :                 bad_area_access_error(regs, error_code, address, vma);
    1374           0 :                 return;
    1375             :         }
    1376             : 
    1377             :         /*
    1378             :          * If for any reason at all we couldn't handle the fault,
    1379             :          * make sure we exit gracefully rather than endlessly redo
    1380             :          * the fault.  Since we never set FAULT_FLAG_RETRY_NOWAIT, if
    1381             :          * we get VM_FAULT_RETRY back, the mmap_lock has been unlocked.
    1382             :          *
    1383             :          * Note that handle_userfault() may also release and reacquire mmap_lock
    1384             :          * (and not return with VM_FAULT_RETRY), when returning to userland to
    1385             :          * repeat the page fault later with a VM_FAULT_NOPAGE retval
    1386             :          * (potentially after handling any pending signal during the return to
    1387             :          * userland). The return to userland is identified whenever
    1388             :          * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags.
    1389             :          */
    1390      296344 :         fault = handle_mm_fault(vma, address, flags, regs);
    1391             : 
    1392      296338 :         if (fault_signal_pending(fault, regs)) {
    1393             :                 /*
    1394             :                  * Quick path to respond to signals.  The core mm code
    1395             :                  * has unlocked the mm for us if we get here.
    1396             :                  */
    1397           0 :                 if (!user_mode(regs))
    1398           0 :                         kernelmode_fixup_or_oops(regs, error_code, address,
    1399             :                                                  SIGBUS, BUS_ADRERR);
    1400           0 :                 return;
    1401             :         }
    1402             : 
    1403             :         /*
     1404             :          * If we need to retry, the mmap_lock has already been released,
    1405             :          * and if there is a fatal signal pending there is no guarantee
    1406             :          * that we made any progress. Handle this case first.
    1407             :          */
    1408      296341 :         if (unlikely((fault & VM_FAULT_RETRY) &&
    1409             :                      (flags & FAULT_FLAG_ALLOW_RETRY))) {
    1410        1195 :                 flags |= FAULT_FLAG_TRIED;
    1411        1195 :                 goto retry;
    1412             :         }
    1413             : 
    1414      295146 :         mmap_read_unlock(mm);
    1415      295146 :         if (likely(!(fault & VM_FAULT_ERROR)))
    1416             :                 return;
    1417             : 
    1418           0 :         if (fatal_signal_pending(current) && !user_mode(regs)) {
    1419           0 :                 kernelmode_fixup_or_oops(regs, error_code, address, 0, 0);
    1420           0 :                 return;
    1421             :         }
    1422             : 
    1423           0 :         if (fault & VM_FAULT_OOM) {
    1424             :                 /* Kernel mode? Handle exceptions or die: */
    1425           0 :                 if (!user_mode(regs)) {
    1426           0 :                         kernelmode_fixup_or_oops(regs, error_code, address,
    1427             :                                                  SIGSEGV, SEGV_MAPERR);
    1428           0 :                         return;
    1429             :                 }
    1430             : 
    1431             :                 /*
    1432             :                  * We ran out of memory, call the OOM killer, and return the
    1433             :                  * userspace (which will retry the fault, or kill us if we got
    1434             :                  * oom-killed):
    1435             :                  */
    1436           0 :                 pagefault_out_of_memory();
    1437             :         } else {
    1438           0 :                 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
    1439             :                              VM_FAULT_HWPOISON_LARGE))
    1440           0 :                         do_sigbus(regs, error_code, address, fault);
    1441           0 :                 else if (fault & VM_FAULT_SIGSEGV)
    1442           0 :                         bad_area_nosemaphore(regs, error_code, address);
    1443             :                 else
    1444           0 :                         BUG();
    1445             :         }
    1446             : }
    1447             : NOKPROBE_SYMBOL(do_user_addr_fault);
    1448             : 
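
Before consulting the VMA, do_user_addr_fault() bumps PERF_COUNT_SW_PAGE_FAULTS via perf_sw_event(), so the count is observable from userspace. The sketch below reads that counter for the current thread with perf_event_open(2); error handling is deliberately minimal and the 1 MiB allocation is just an arbitrary way to provoke some faults.

    /* Userspace sketch: count the page faults this thread takes, i.e. the
     * events recorded by perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, ...). */
    #include <linux/perf_event.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            struct perf_event_attr attr;
            long long count = 0;
            int fd;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_SOFTWARE;
            attr.config = PERF_COUNT_SW_PAGE_FAULTS;
            attr.disabled = 1;

            fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
            if (fd < 0) {
                    perror("perf_event_open");
                    return 1;
            }

            ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
            volatile char *p = malloc(1 << 20);  /* first touch of each page faults */
            for (size_t i = 0; p && i < (1u << 20); i += 4096)
                    p[i] = 1;
            ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

            if (read(fd, &count, sizeof(count)) == sizeof(count))
                    printf("page faults: %lld\n", count);
            return 0;
    }
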
    1449             : static __always_inline void
    1450      295863 : trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code,
    1451             :                          unsigned long address)
    1452             : {
    1453      295860 :         if (!trace_pagefault_enabled())
    1454             :                 return;
    1455             : 
    1456           0 :         if (user_mode(regs))
    1457           0 :                 trace_page_fault_user(address, regs, error_code);
    1458             :         else
    1459           0 :                 trace_page_fault_kernel(address, regs, error_code);
    1460             : }
    1461             : 
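
trace_page_fault_entries() feeds the page_fault_user/page_fault_kernel tracepoints in the "exceptions" event group (created via CREATE_TRACE_POINTS at the top of this file). A small userspace sketch of turning the user-space tracepoint on and reading a few events follows; it assumes tracefs is mounted at /sys/kernel/tracing (it is also commonly found under /sys/kernel/debug/tracing) and requires root.

    /* Userspace sketch: enable exceptions:page_fault_user and print a few
     * trace lines. Paths are assumptions about where tracefs is mounted. */
    #include <stdio.h>

    int main(void)
    {
            FILE *en = fopen("/sys/kernel/tracing/events/exceptions/page_fault_user/enable", "w");
            if (!en) {
                    perror("enable tracepoint");
                    return 1;
            }
            fputs("1", en);
            fclose(en);

            FILE *pipe = fopen("/sys/kernel/tracing/trace_pipe", "r");
            if (!pipe) {
                    perror("trace_pipe");
                    return 1;
            }

            char line[512];
            for (int i = 0; i < 5 && fgets(line, sizeof(line), pipe); i++)
                    fputs(line, stdout);
            fclose(pipe);
            return 0;
    }
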
    1462             : static __always_inline void
    1463      295863 : handle_page_fault(struct pt_regs *regs, unsigned long error_code,
    1464             :                               unsigned long address)
    1465             : {
    1466      591723 :         trace_page_fault_entries(regs, error_code, address);
    1467             : 
    1468      295860 :         if (unlikely(kmmio_fault(regs, address)))
    1469             :                 return;
    1470             : 
    1471             :         /* Was the fault on kernel-controlled part of the address space? */
    1472      295859 :         if (unlikely(fault_in_kernel_space(address))) {
    1473           0 :                 do_kern_addr_fault(regs, error_code, address);
    1474             :         } else {
    1475      295858 :                 do_user_addr_fault(regs, error_code, address);
    1476             :                 /*
    1477             :                  * User address page fault handling might have reenabled
    1478             :                  * interrupts. Fixing up all potential exit points of
    1479             :                  * do_user_addr_fault() and its leaf functions is just not
    1480             :                  * doable w/o creating an unholy mess or turning the code
    1481             :                  * upside down.
    1482             :                  */
    1483      295860 :                 local_irq_disable();
    1484             :         }
    1485             : }
    1486             : 
    1487      295868 : DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
    1488             : {
    1489      295868 :         unsigned long address = read_cr2();
    1490      295869 :         irqentry_state_t state;
    1491             : 
    1492      295869 :         prefetchw(&current->mm->mmap_lock);
    1493             : 
    1494             :         /*
    1495             :          * KVM uses #PF vector to deliver 'page not present' events to guests
    1496             :          * (asynchronous page fault mechanism). The event happens when a
     1497             :          * userspace task is trying to access some valid (from the guest's point of
    1498             :          * view) memory which is not currently mapped by the host (e.g. the
    1499             :          * memory is swapped out). Note, the corresponding "page ready" event
    1500             :          * which is injected when the memory becomes available, is delived via
     1501             :          * which is injected when the memory becomes available, is delivered via
    1502             :          * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()).
    1503             :          *
    1504             :          * We are relying on the interrupted context being sane (valid RSP,
    1505             :          * relevant locks not held, etc.), which is fine as long as the
    1506             :          * interrupted context had IF=1.  We are also relying on the KVM
    1507             :          * async pf type field and CR2 being read consistently instead of
    1508             :          * getting values from real and async page faults mixed up.
    1509             :          *
    1510             :          * Fingers crossed.
    1511             :          *
    1512             :          * The async #PF handling code takes care of idtentry handling
    1513             :          * itself.
    1514             :          */
    1515      295871 :         if (kvm_handle_async_pf(regs, (u32)address))
    1516           0 :                 return;
    1517             : 
    1518             :         /*
    1519             :          * Entry handling for valid #PF from kernel mode is slightly
    1520             :          * different: RCU is already watching and rcu_irq_enter() must not
    1521             :          * be invoked because a kernel fault on a user space address might
    1522             :          * sleep.
    1523             :          *
     1524             :          * In case the fault hit an RCU idle region, the conditional entry
     1525             :          * code reenabled RCU to avoid subsequent wreckage, which helps
     1526             :          * debuggability.
    1527             :          */
    1528      295862 :         state = irqentry_enter(regs);
    1529             : 
    1530      295863 :         instrumentation_begin();
    1531      295863 :         handle_page_fault(regs, error_code, address);
    1532      295844 :         instrumentation_end();
    1533             : 
    1534      295844 :         irqentry_exit(regs, state);
    1535             : }
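
When do_user_addr_fault() cannot resolve a fault, the bad_area*() helpers ultimately deliver SIGSEGV, with si_code SEGV_MAPERR for an unmapped address and SEGV_ACCERR for a permission problem on an existing VMA. The userspace sketch below provokes the SEGV_ACCERR case with a PROT_NONE mapping and labels whatever code arrives; the mapping size and messages are illustrative.

    /* Userspace sketch: observe the si_code chosen by the user-address fault
     * path. Touching an unmapped address (e.g. a wild pointer) would report
     * SEGV_MAPERR instead. fprintf() in a handler is tolerated only because
     * the handler exits immediately. */
    #include <signal.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    static void segv_handler(int sig, siginfo_t *info, void *ctx)
    {
            const char *kind = info->si_code == SEGV_MAPERR ? "SEGV_MAPERR (no mapping)" :
                               info->si_code == SEGV_ACCERR ? "SEGV_ACCERR (bad permissions)" :
                                                              "other";
            fprintf(stderr, "SIGSEGV at %p: %s\n", info->si_addr, kind);
            _exit(0);
    }

    int main(void)
    {
            struct sigaction sa = { .sa_sigaction = segv_handler,
                                    .sa_flags = SA_SIGINFO };
            sigaction(SIGSEGV, &sa, NULL);

            /* A PROT_NONE anonymous mapping: the VMA exists, but access_error()
             * rejects the write, so the kernel sends SEGV_ACCERR. */
            char *p = mmap(NULL, 4096, PROT_NONE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (p != MAP_FAILED)
                    *p = 1;
            return 0;
    }
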

Generated by: LCOV version 1.14