LCOV - code coverage report
Current view: top level - arch/x86/kernel - alternative.c (source / functions) Hit Total Coverage
Test: landlock.info Lines: 268 445 60.2 %
Date: 2021-04-22 12:43:58 Functions: 20 33 60.6 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : #define pr_fmt(fmt) "SMP alternatives: " fmt
       3             : 
       4             : #include <linux/module.h>
       5             : #include <linux/sched.h>
       6             : #include <linux/perf_event.h>
       7             : #include <linux/mutex.h>
       8             : #include <linux/list.h>
       9             : #include <linux/stringify.h>
      10             : #include <linux/highmem.h>
      11             : #include <linux/mm.h>
      12             : #include <linux/vmalloc.h>
      13             : #include <linux/memory.h>
      14             : #include <linux/stop_machine.h>
      15             : #include <linux/slab.h>
      16             : #include <linux/kdebug.h>
      17             : #include <linux/kprobes.h>
      18             : #include <linux/mmu_context.h>
      19             : #include <linux/bsearch.h>
      20             : #include <linux/sync_core.h>
      21             : #include <asm/text-patching.h>
      22             : #include <asm/alternative.h>
      23             : #include <asm/sections.h>
      24             : #include <asm/mce.h>
      25             : #include <asm/nmi.h>
      26             : #include <asm/cacheflush.h>
      27             : #include <asm/tlbflush.h>
      28             : #include <asm/insn.h>
      29             : #include <asm/io.h>
      30             : #include <asm/fixmap.h>
      31             : 
                       /* Exported (GPL-only) flag; nothing in the visible part of this file assigns it. */
      32             : int __read_mostly alternatives_patched;
      33             : 
      34             : EXPORT_SYMBOL_GPL(alternatives_patched);
      35             : 
                       /*
                        * Size, in bytes, of the on-stack insn_buff scratch arrays in which
                        * apply_alternatives() and apply_paravirt() assemble their patches.
                        */
      36             : #define MAX_PATCH_LEN (255-1)
      37             : 
                       /* Non-zero enables the DPRINTK() and DUMP_BYTES() debug output. */
      38             : static int __initdata_or_module debug_alternative;
      39             : 
                       /*
                        * Handler registered for the "debug-alternative" boot parameter.
                        * Sets debug_alternative; @str is ignored. Always returns 1.
                        */
      40           0 : static int __init debug_alt(char *str)
      41             : {
      42           0 :         debug_alternative = 1;
      43           0 :         return 1;
      44             : }
      45             : __setup("debug-alternative", debug_alt);
      46             : 
                       /* Set by "noreplace-smp"; not read anywhere in the visible part of this file. */
      47             : static int noreplace_smp;
      48             : 
                       /*
                        * Handler registered for the "noreplace-smp" boot parameter.
                        * Sets noreplace_smp; @str is ignored. Always returns 1.
                        */
      49           0 : static int __init setup_noreplace_smp(char *str)
      50             : {
      51           0 :         noreplace_smp = 1;
      52           0 :         return 1;
      53             : }
      54             : __setup("noreplace-smp", setup_noreplace_smp);
      55             : 
                       /*
                        * Debug printk, emitted only when debug_alternative is set. The format
                        * is wrapped in pr_fmt() and a "\n" is appended here, so callers should
                        * pass the message without a trailing newline.
                        */
      56             : #define DPRINTK(fmt, args...)                                           \
      57             : do {                                                                    \
      58             :         if (debug_alternative)                                          \
      59             :                 printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args);          \
      60             : } while (0)
      61             : 
                       /*
                        * Debug hex dump, active only when debug_alternative is set: prints the
                        * formatted prefix, then @len bytes of @buf as two-digit hex values
                        * separated by spaces, and ends the line with a newline. A zero @len
                        * prints nothing (the "break" leaves the enclosing do/while).
                        *
                        * @buf and @len are expanded more than once and @buf is not
                        * parenthesized, so pass simple, side-effect-free expressions.
                        */
      62             : #define DUMP_BYTES(buf, len, fmt, args...)                              \
      63             : do {                                                                    \
      64             :         if (unlikely(debug_alternative)) {                              \
      65             :                 int j;                                                  \
      66             :                                                                         \
      67             :                 if (!(len))                                             \
      68             :                         break;                                          \
      69             :                                                                         \
      70             :                 printk(KERN_DEBUG pr_fmt(fmt), ##args);                 \
      71             :                 for (j = 0; j < (len) - 1; j++)                              \
      72             :                         printk(KERN_CONT "%02hhx ", buf[j]);          \
      73             :                 printk(KERN_CONT "%02hhx\n", buf[j]);                 \
      74             :         }                                                               \
      75             : } while (0)
      76             : 
      77             : /*
      78             :  * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
      79             :  * that correspond to that nop. Getting from one nop to the next, we
      80             :  * add to the array the offset that is equal to the sum of all sizes of
      81             :  * nops preceding the one we are after.
      82             :  *
      83             :  * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
      84             :  * nice symmetry of sizes of the previous nops.
      85             :  */
      86             : #if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
                       /* The generic NOP encodings, sizes 1..8, laid out back to back. */
      87             : static const unsigned char intelnops[] =
      88             : {
      89             :         GENERIC_NOP1,
      90             :         GENERIC_NOP2,
      91             :         GENERIC_NOP3,
      92             :         GENERIC_NOP4,
      93             :         GENERIC_NOP5,
      94             :         GENERIC_NOP6,
      95             :         GENERIC_NOP7,
      96             :         GENERIC_NOP8,
      97             :         GENERIC_NOP5_ATOMIC
      98             : };
                       /*
                        * Lookup table into intelnops: entry 0 is NULL, entry n (n >= 1) points
                        * at the n-byte NOP, and the final entry points at GENERIC_NOP5_ATOMIC.
                        * The ten initializers match ASM_NOP_MAX+2 only if ASM_NOP_MAX is 8
                        * (its definition is not visible here).
                        */
      99             : static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
     100             : {
     101             :         NULL,
     102             :         intelnops,
     103             :         intelnops + 1,
     104             :         intelnops + 1 + 2,
     105             :         intelnops + 1 + 2 + 3,
     106             :         intelnops + 1 + 2 + 3 + 4,
     107             :         intelnops + 1 + 2 + 3 + 4 + 5,
     108             :         intelnops + 1 + 2 + 3 + 4 + 5 + 6,
     109             :         intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
     110             :         intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
     111             : };
     112             : #endif
     113             : 
                       /*
                        * K8 NOP encodings (k8nops) and their by-length lookup table (k8_nops):
                        * entry 0 is NULL, entry n (n >= 1) points at the n-byte NOP, and the
                        * final entry points at K8_NOP5_ATOMIC.
                        */
     114             : #ifdef K8_NOP1
     115             : static const unsigned char k8nops[] =
     116             : {
     117             :         K8_NOP1,
     118             :         K8_NOP2,
     119             :         K8_NOP3,
     120             :         K8_NOP4,
     121             :         K8_NOP5,
     122             :         K8_NOP6,
     123             :         K8_NOP7,
     124             :         K8_NOP8,
     125             :         K8_NOP5_ATOMIC
     126             : };
     127             : static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
     128             : {
     129             :         NULL,
     130             :         k8nops,
     131             :         k8nops + 1,
     132             :         k8nops + 1 + 2,
     133             :         k8nops + 1 + 2 + 3,
     134             :         k8nops + 1 + 2 + 3 + 4,
     135             :         k8nops + 1 + 2 + 3 + 4 + 5,
     136             :         k8nops + 1 + 2 + 3 + 4 + 5 + 6,
     137             :         k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
     138             :         k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
     139             : };
     140             : #endif
     141             : 
                       /*
                        * K7 NOP encodings (k7nops) and their by-length lookup table (k7_nops),
                        * built only when CONFIG_X86_64 is not defined: entry 0 is NULL, entry n
                        * (n >= 1) points at the n-byte NOP, and the final entry points at
                        * K7_NOP5_ATOMIC.
                        */
     142             : #if defined(K7_NOP1) && !defined(CONFIG_X86_64)
     143             : static const unsigned char k7nops[] =
     144             : {
     145             :         K7_NOP1,
     146             :         K7_NOP2,
     147             :         K7_NOP3,
     148             :         K7_NOP4,
     149             :         K7_NOP5,
     150             :         K7_NOP6,
     151             :         K7_NOP7,
     152             :         K7_NOP8,
     153             :         K7_NOP5_ATOMIC
     154             : };
     155             : static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
     156             : {
     157             :         NULL,
     158             :         k7nops,
     159             :         k7nops + 1,
     160             :         k7nops + 1 + 2,
     161             :         k7nops + 1 + 2 + 3,
     162             :         k7nops + 1 + 2 + 3 + 4,
     163             :         k7nops + 1 + 2 + 3 + 4 + 5,
     164             :         k7nops + 1 + 2 + 3 + 4 + 5 + 6,
     165             :         k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
     166             :         k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
     167             : };
     168             : #endif
     169             : 
                       /*
                        * P6 NOP encodings (p6nops) and their by-length lookup table (p6_nops):
                        * entry 0 is NULL, entry n (n >= 1) points at the n-byte NOP, and the
                        * final entry points at P6_NOP5_ATOMIC.
                        */
     170             : #ifdef P6_NOP1
     171             : static const unsigned char p6nops[] =
     172             : {
     173             :         P6_NOP1,
     174             :         P6_NOP2,
     175             :         P6_NOP3,
     176             :         P6_NOP4,
     177             :         P6_NOP5,
     178             :         P6_NOP6,
     179             :         P6_NOP7,
     180             :         P6_NOP8,
     181             :         P6_NOP5_ATOMIC
     182             : };
     183             : static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
     184             : {
     185             :         NULL,
     186             :         p6nops,
     187             :         p6nops + 1,
     188             :         p6nops + 1 + 2,
     189             :         p6nops + 1 + 2 + 3,
     190             :         p6nops + 1 + 2 + 3 + 4,
     191             :         p6nops + 1 + 2 + 3 + 4 + 5,
     192             :         p6nops + 1 + 2 + 3 + 4 + 5 + 6,
     193             :         p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
     194             :         p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
     195             : };
     196             : #endif
     197             : 
     198             : /*
                        * Initialize these to a safe default: p6_nops on 64-bit builds,
                        * intel_nops otherwise. arch_init_ideal_nops() may repoint ideal_nops at
                        * another table; add_nops() indexes it by NOP length in bytes.
                        */
     199             : #ifdef CONFIG_X86_64
     200             : const unsigned char * const *ideal_nops = p6_nops;
     201             : #else
     202             : const unsigned char * const *ideal_nops = intel_nops;
     203             : #endif
     204             : 
                       /*
                        * Choose the ideal_nops table from boot_cpu_data (vendor, family, model)
                        * and CPU feature bits:
                        *
                        *  - Intel: k8_nops for family 6 models 0x0f..0x2f other than 0x1c, 0x26
                        *    and 0x27; otherwise p6_nops when X86_FEATURE_NOPL is present;
                        *    otherwise k8_nops (64-bit) or intel_nops (32-bit).
                        *  - Hygon: p6_nops.
                        *  - AMD: p6_nops for families above 0xf, else same as default.
                        *  - default: k8_nops on 64-bit; on 32-bit k8_nops, k7_nops or
                        *    intel_nops depending on X86_FEATURE_K8 / X86_FEATURE_K7.
                        */
     205           1 : void __init arch_init_ideal_nops(void)
     206             : {
     207           1 :         switch (boot_cpu_data.x86_vendor) {
     208           1 :         case X86_VENDOR_INTEL:
     209             :                 /*
     210             :                  * Due to a decoder implementation quirk, some
     211             :                  * specific Intel CPUs actually perform better with
     212             :                  * the "k8_nops" than with the SDM-recommended NOPs.
     213             :                  */
     214           1 :                 if (boot_cpu_data.x86 == 6 &&
     215           1 :                     boot_cpu_data.x86_model >= 0x0f &&
     216           1 :                     boot_cpu_data.x86_model != 0x1c &&
     217           1 :                     boot_cpu_data.x86_model != 0x26 &&
     218           1 :                     boot_cpu_data.x86_model != 0x27 &&
     219             :                     boot_cpu_data.x86_model < 0x30) {
     220           0 :                         ideal_nops = k8_nops;
     221           1 :                 } else if (boot_cpu_has(X86_FEATURE_NOPL)) {
     222           1 :                            ideal_nops = p6_nops;
     223             :                 } else {
     224             : #ifdef CONFIG_X86_64
     225             :                         ideal_nops = k8_nops;
     226             : #else
     227             :                         ideal_nops = intel_nops;
     228             : #endif
     229             :                 }
     230             :                 break;
     231             : 
     232           0 :         case X86_VENDOR_HYGON:
     233           0 :                 ideal_nops = p6_nops;
     234           0 :                 return;
     235             : 
     236           0 :         case X86_VENDOR_AMD:
     237           0 :                 if (boot_cpu_data.x86 > 0xf) {
     238           0 :                         ideal_nops = p6_nops;
     239           0 :                         return;
     240             :                 }
     241             : 
                                       /* Family 0xf and older AMD parts share the default selection. */
     242           0 :                 fallthrough;
     243             : 
     244             :         default:
     245             : #ifdef CONFIG_X86_64
     246           0 :                 ideal_nops = k8_nops;
     247             : #else
     248             :                 if (boot_cpu_has(X86_FEATURE_K8))
     249             :                         ideal_nops = k8_nops;
     250             :                 else if (boot_cpu_has(X86_FEATURE_K7))
     251             :                         ideal_nops = k7_nops;
     252             :                 else
     253             :                         ideal_nops = intel_nops;
     254             : #endif
     255             :         }
     256             : }
     257             : 
     258             : /*
                        * Use this to add nops to a buffer, then text_poke the whole buffer.
                        *
                        * Fills @len bytes starting at @insns with NOPs copied from the current
                        * ideal_nops table, one NOP of at most ASM_NOP_MAX bytes per iteration.
                        * A zero @len writes nothing.
                        */
     259         201 : static void __init_or_module add_nops(void *insns, unsigned int len)
     260             : {
     261         402 :         while (len > 0) {
     262         201 :                 unsigned int noplen = len;
     263         201 :                 if (noplen > ASM_NOP_MAX)
     264             :                         noplen = ASM_NOP_MAX;
                                       /* ideal_nops[noplen] is the noplen-byte NOP encoding. */
     265         201 :                 memcpy(insns, ideal_nops[noplen], noplen);
                                       /* Arithmetic on void * (byte-sized steps) is a GNU C extension. */
     266         201 :                 insns += noplen;
     267         201 :                 len -= noplen;
     268             :         }
     269         201 : }
     270             : 
     271             : extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
     272             : extern s32 __smp_locks[], __smp_locks_end[];
     273             : void text_poke_early(void *addr, const void *opcode, size_t len);
     274             : 
     275             : /*
     276             :  * Are we looking at a near JMP with a 1 or 4-byte displacement.
                        *
                        * Returns true when @opcode is 0xeb (1-byte displacement form) or 0xe9
                        * (4-byte displacement form), false for anything else.
     277             :  */
     278         376 : static inline bool is_jmp(const u8 opcode)
     279             : {
     280         376 :         return opcode == 0xeb || opcode == 0xe9;
     281             : }
     282             : 
                       /*
                        * Re-encode a replacement JMP so that it reaches the same target when
                        * it is executed from @orig_insn instead of from @repl_insn.
                        *
                        * @a:         alternative entry; only entries with replacementlen == 5
                        *             are handled, anything else returns immediately
                        * @orig_insn: address the patched instruction will execute from
                        * @repl_insn: address of the replacement instruction bytes
                        * @insn_buff: copy of the replacement bytes; rewritten in place
                        *
                        * The target is derived from the 32-bit displacement stored at
                        * insn_buff + 1. The buffer ends up holding either a 2-byte 0xeb JMP
                        * followed by 3 bytes of NOPs, or a 5-byte 0xe9 JMP.
                        */
     283             : static void __init_or_module
     284         118 : recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff)
     285             : {
     286         118 :         u8 *next_rip, *tgt_rip;
     287         118 :         s32 n_dspl, o_dspl;
     288         118 :         int repl_len;
     289             : 
     290         118 :         if (a->replacementlen != 5)
     291             :                 return;
     292             : 
                               /* Original displacement, relative to the end of the replacement JMP. */
     293         118 :         o_dspl = *(s32 *)(insn_buff + 1);
     294             : 
     295             :         /* next_rip of the replacement JMP */
     296         118 :         next_rip = repl_insn + a->replacementlen;
     297             :         /* target rip of the replacement JMP */
     298         118 :         tgt_rip  = next_rip + o_dspl;
                               /* Distance from the start of the original instruction to the target. */
     299         118 :         n_dspl = tgt_rip - orig_insn;
     300             : 
     301         118 :         DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
     302             : 
                               /* Forward target: the short form is used when n_dspl - 2 is at most 127. */
     303         118 :         if (tgt_rip - orig_insn >= 0) {
     304         118 :                 if (n_dspl - 2 <= 127)
     305          40 :                         goto two_byte_jmp;
     306             :                 else
     307          78 :                         goto five_byte_jmp;
     308             :         /* negative offset */
     309             :         } else {
                                       /*
                                        * NOTE(review): for a negative n_dspl the masked value is
                                        * non-negative and so cannot equal n_dspl - 2; backward
                                        * targets therefore appear to always take the 5-byte form.
                                        * Confirm whether a signed 8-bit range check was intended.
                                        */
     310           0 :                 if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
     311           0 :                         goto two_byte_jmp;
     312             :                 else
     313           0 :                         goto five_byte_jmp;
     314             :         }
     315             : 
     316          40 : two_byte_jmp:
                               /* Make the displacement relative to the end of the 2-byte JMP. */
     317          40 :         n_dspl -= 2;
     318             : 
     319          40 :         insn_buff[0] = 0xeb;
     320          40 :         insn_buff[1] = (s8)n_dspl;
                               /* Fill the remaining 3 of the 5 replacement bytes with NOPs. */
     321          40 :         add_nops(insn_buff + 2, 3);
     322             : 
     323          40 :         repl_len = 2;
     324          40 :         goto done;
     325             : 
     326          78 : five_byte_jmp:
                               /* Make the displacement relative to the end of the 5-byte JMP. */
     327          78 :         n_dspl -= 5;
     328             : 
     329          78 :         insn_buff[0] = 0xe9;
     330          78 :         *(s32 *)&insn_buff[1] = n_dspl;
     331             : 
     332          78 :         repl_len = 5;
     333             : 
     334         118 : done:
     335             : 
                               /* repl_len is only used for this debug message. */
     336         118 :         DPRINTK("final displ: 0x%08x, JMP 0x%lx",
     337             :                 n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
     338             : }
     339             : 
     340             : /*
     341             :  * "noinline" to cause control flow change and thus invalidate I$ and
     342             :  * cause refetch after modification.
                        *
                        * Rewrites the a->padlen bytes at @instr + (a->instrlen - a->padlen)
                        * with NOPs produced by add_nops(), with local interrupts disabled
                        * around the write. Returns without changing anything if any of the
                        * first a->padlen bytes at @instr is not 0x90.
                        *
                        * NOTE(review): the 0x90 scan starts at instr[0] while the rewrite
                        * starts at instr + instrlen - padlen; the two ranges coincide only
                        * when instrlen == padlen - confirm this is intended.
     343             :  */
     344           7 : static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
     345             : {
     346           7 :         unsigned long flags;
     347           7 :         int i;
     348             : 
     349          31 :         for (i = 0; i < a->padlen; i++) {
     350          25 :                 if (instr[i] != 0x90)
     351             :                         return;
     352             :         }
     353             : 
     354          12 :         local_irq_save(flags);
     355           6 :         add_nops(instr + (a->instrlen - a->padlen), a->padlen);
     356           6 :         local_irq_restore(flags);
     357             : 
     358           6 :         DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ",
     359             :                    instr, a->instrlen - a->padlen, a->padlen);
     360             : }
     361             : 
     362             : /*
     363             :  * Replace instructions with better alternatives for this CPU type. This runs
     364             :  * before SMP is initialized to avoid SMP problems with self modifying code.
     365             :  * This implies that asymmetric systems where APs have less capabilities than
     366             :  * the boot processor are not handled. Tough. Make sure you disable such
     367             :  * features by hand.
     368             :  *
     369             :  * Marked "noinline" to cause control flow change and thus insn cache
     370             :  * to refetch changed I$ lines.
                        *
                        * @start: first entry of the alt_instr table to process
                        * @end:   one past the last entry
                        *
                        * For every entry whose feature bit (a->cpuid) is set on the boot CPU,
                        * the replacement bytes are copied into a local buffer, relative
                        * CALL/JMP displacements are fixed up, the tail is padded with NOPs up
                        * to a->instrlen, and the result is written with text_poke_early().
                        * Entries whose feature is absent are left alone, except that their
                        * padding is passed to optimize_nops() when a->padlen > 1.
     371             :  */
     372           3 : void __init_or_module noinline apply_alternatives(struct alt_instr *start,
     373             :                                                   struct alt_instr *end)
     374             : {
     375           3 :         struct alt_instr *a;
     376           3 :         u8 *instr, *replacement;
     377           3 :         u8 insn_buff[MAX_PATCH_LEN];
     378             : 
     379           3 :         DPRINTK("alt table %px, -> %px", start, end);
     380             :         /*
     381             :          * The scan order should be from start to end. A later scanned
     382             :          * alternative code can overwrite previously scanned alternative code.
     383             :          * Some kernel functions (e.g. memcpy, memset, etc) use this order to
     384             :          * patch code.
     385             :          *
     386             :          * So be careful if you want to change the scan order to any other
     387             :          * order.
     388             :          */
     389         548 :         for (a = start; a < end; a++) {
     390         545 :                 int insn_buff_sz = 0;
     391             : 
                                       /* Both offsets are stored relative to the address of their own field. */
     392         545 :                 instr = (u8 *)&a->instr_offset + a->instr_offset;
     393         545 :                 replacement = (u8 *)&a->repl_offset + a->repl_offset;
     394         545 :                 BUG_ON(a->instrlen > sizeof(insn_buff));
     395         545 :                 BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
     396         545 :                 if (!boot_cpu_has(a->cpuid)) {
     397         111 :                         if (a->padlen > 1)
     398           7 :                                 optimize_nops(a, instr);
     399             : 
     400         111 :                         continue;
     401             :                 }
     402             : 
     403         434 :                 DPRINTK("feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d), pad: %d",
     404             :                         a->cpuid >> 5,
     405             :                         a->cpuid & 0x1f,
     406             :                         instr, instr, a->instrlen,
     407             :                         replacement, a->replacementlen, a->padlen);
     408             : 
     409         434 :                 DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
     410         434 :                 DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
     411             : 
     412         434 :                 memcpy(insn_buff, replacement, a->replacementlen);
     413         434 :                 insn_buff_sz = a->replacementlen;
     414             : 
     415             :                 /*
     416             :                  * 0xe8 is a relative CALL; fix the offset.
     417             :                  *
     418             :                  * Instruction length is checked before the opcode to avoid
     419             :                  * accessing uninitialized bytes for zero-length replacements.
     420             :                  */
     421         434 :                 if (a->replacementlen == 5 && *insn_buff == 0xe8) {
     422         110 :                         *(s32 *)(insn_buff + 1) += replacement - instr;
     423         110 :                         DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
     424             :                                 *(s32 *)(insn_buff + 1),
     425             :                                 (unsigned long)instr + *(s32 *)(insn_buff + 1) + 5);
     426             :                 }
     427             : 
     428         434 :                 if (a->replacementlen && is_jmp(replacement[0]))
     429         118 :                         recompute_jump(a, instr, replacement, insn_buff);
     430             : 
                                       /* Pad a shorter replacement with NOPs up to the original length. */
     431         434 :                 if (a->instrlen > a->replacementlen) {
     432          83 :                         add_nops(insn_buff + a->replacementlen,
     433          83 :                                  a->instrlen - a->replacementlen);
     434          83 :                         insn_buff_sz += a->instrlen - a->replacementlen;
     435             :                 }
     436         434 :                 DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
     437             : 
     438         434 :                 text_poke_early(instr, insn_buff, insn_buff_sz);
     439             :         }
     440           3 : }
     441             : 
     442             : #ifdef CONFIG_SMP
                       /*
                        * Walk the table of self-relative offsets [@start, @end). For every
                        * entry that is non-zero and resolves to an address inside
                        * [@text, @text_end), a 0x3e byte (DS segment override prefix) found at
                        * that address is replaced with 0xf0 (lock prefix) via text_poke().
                        */
     443           0 : static void alternatives_smp_lock(const s32 *start, const s32 *end,
     444             :                                   u8 *text, u8 *text_end)
     445             : {
     446           0 :         const s32 *poff;
     447             : 
     448           0 :         for (poff = start; poff < end; poff++) {
                                       /* Each entry is an offset relative to its own address. */
     449           0 :                 u8 *ptr = (u8 *)poff + *poff;
     450             : 
     451           0 :                 if (!*poff || ptr < text || ptr >= text_end)
     452           0 :                         continue;
     453             :                 /* turn DS segment override prefix into lock prefix */
     454           0 :                 if (*ptr == 0x3e)
     455           0 :                         text_poke(ptr, ((unsigned char []){0xf0}), 1);
     456             :         }
     457           0 : }
     458             : 
                       /*
                        * Inverse of alternatives_smp_lock(): walk the table of self-relative
                        * offsets [@start, @end) and, for every entry that is non-zero and
                        * resolves to an address inside [@text, @text_end), replace a 0xf0 byte
                        * (lock prefix) at that address with 0x3e (DS segment override prefix)
                        * via text_poke().
                        */
     459           0 : static void alternatives_smp_unlock(const s32 *start, const s32 *end,
     460             :                                     u8 *text, u8 *text_end)
     461             : {
     462           0 :         const s32 *poff;
     463             : 
     464           0 :         for (poff = start; poff < end; poff++) {
                                       /* Each entry is an offset relative to its own address. */
     465           0 :                 u8 *ptr = (u8 *)poff + *poff;
     466             : 
     467           0 :                 if (!*poff || ptr < text || ptr >= text_end)
     468           0 :                         continue;
     469             :                 /* turn lock prefix into DS segment override prefix */
     470           0 :                 if (*ptr == 0xf0)
     471           0 :                         text_poke(ptr, ((unsigned char []){0x3E}), 1);
     472             :         }
     473           0 : }
     474             : 
                       /*
                        * One record per registered lock-prefix table, linked on
                        * smp_alt_modules by alternatives_smp_module_add().
                        */
     475             : struct smp_alt_module {
     476             :         /* owner, matched by alternatives_smp_module_del(); name is only printed in debug output */
     477             :         struct module   *mod;
     478             :         char            *name;
     479             : 
     480             :         /* ptrs to lock prefixes */
     481             :         const s32       *locks;
     482             :         const s32       *locks_end;
     483             : 
     484             :         /* .text segment, needed to avoid patching init code ;) */
     485             :         u8              *text;
     486             :         u8              *text_end;
     487             : 
                               /* linkage on the smp_alt_modules list */
     488             :         struct list_head next;
     489             : };
     490             : static LIST_HEAD(smp_alt_modules);
     491             : static bool uniproc_patched = false;    /* protected by text_mutex */
     492             : 
                       /*
                        * Register a lock-prefix table and switch its code to the
                        * uniprocessor form.
                        *
                        * @mod:       owning module, stored for alternatives_smp_module_del()
                        * @name:      name, used only in debug output
                        * @locks:     start of the table of self-relative lock-prefix offsets
                        * @locks_end: end of that table
                        * @text:      start of the text range that may be patched
                        * @text_end:  end of that text range
                        *
                        * Runs under text_mutex. Nothing happens unless uniproc_patched is set.
                        * With a single possible CPU the lock prefixes are removed without
                        * recording anything. Otherwise a record is allocated and queued on
                        * smp_alt_modules before the prefixes are removed; if the allocation
                        * fails the code is left in its SMP form.
                        */
     493           0 : void __init_or_module alternatives_smp_module_add(struct module *mod,
     494             :                                                   char *name,
     495             :                                                   void *locks, void *locks_end,
     496             :                                                   void *text,  void *text_end)
     497             : {
     498           0 :         struct smp_alt_module *smp;
     499             : 
     500           0 :         mutex_lock(&text_mutex);
     501           0 :         if (!uniproc_patched)
     502           0 :                 goto unlock;
     503             : 
     504           0 :         if (num_possible_cpus() == 1)
     505             :                 /* Don't bother remembering, we'll never have to undo it. */
     506           0 :                 goto smp_unlock;
     507             : 
     508           0 :         smp = kzalloc(sizeof(*smp), GFP_KERNEL);
     509           0 :         if (NULL == smp)
     510             :                 /* we'll run the (safe but slow) SMP code then ... */
     511           0 :                 goto unlock;
     512             : 
     513           0 :         smp->mod     = mod;
     514           0 :         smp->name    = name;
     515           0 :         smp->locks   = locks;
     516           0 :         smp->locks_end       = locks_end;
     517           0 :         smp->text    = text;
     518           0 :         smp->text_end        = text_end;
                               /* No trailing "\n" here: DPRINTK() appends one itself. */
     519           0 :         DPRINTK("locks %p -> %p, text %p -> %p, name %s",
     520             :                 smp->locks, smp->locks_end,
     521             :                 smp->text, smp->text_end, smp->name);
     522             : 
     523           0 :         list_add_tail(&smp->next, &smp_alt_modules);
     524           0 : smp_unlock:
     525           0 :         alternatives_smp_unlock(locks, locks_end, text, text_end);
     526           0 : unlock:
     527           0 :         mutex_unlock(&text_mutex);
     528           0 : }
     529             : 
                       /*
                        * Remove and free the first smp_alt_modules record whose owner is @mod,
                        * if there is one. Runs under text_mutex.
                        */
     530           0 : void __init_or_module alternatives_smp_module_del(struct module *mod)
     531             : {
     532           0 :         struct smp_alt_module *item;
     533             : 
     534           0 :         mutex_lock(&text_mutex);
     535           0 :         list_for_each_entry(item, &smp_alt_modules, next) {
     536           0 :                 if (mod != item->mod)
     537           0 :                         continue;
     538           0 :                 list_del(&item->next);
     539           0 :                 kfree(item);
                                       /* Leave the loop at once: item has just been freed. */
     540           0 :                 break;
     541             :         }
     542           0 :         mutex_unlock(&text_mutex);
     543           0 : }
     544             : 
                       /*
                        * Switch patched code back to its SMP form. BUGs if only one CPU is
                        * possible. Under text_mutex, and only when uniproc_patched is set:
                        * BUGs unless exactly one CPU is online, clears X86_FEATURE_UP on
                        * boot_cpu_data and cpu_data(0), restores the lock prefixes of every
                        * record on smp_alt_modules, and clears uniproc_patched.
                        */
     545           3 : void alternatives_enable_smp(void)
     546             : {
     547           3 :         struct smp_alt_module *mod;
     548             : 
     549             :         /* Why bother if there are no other CPUs? */
     550           3 :         BUG_ON(num_possible_cpus() == 1);
     551             : 
     552           3 :         mutex_lock(&text_mutex);
     553             : 
     554           3 :         if (uniproc_patched) {
     555           0 :                 pr_info("switching to SMP code\n");
     556           0 :                 BUG_ON(num_online_cpus() != 1);
     557           0 :                 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
     558           0 :                 clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
     559           0 :                 list_for_each_entry(mod, &smp_alt_modules, next)
     560           0 :                         alternatives_smp_lock(mod->locks, mod->locks_end,
     561             :                                               mod->text, mod->text_end);
     562           0 :                 uniproc_patched = false;
     563             :         }
     564           3 :         mutex_unlock(&text_mutex);
     565           3 : }
     566             : 
     567             : /*
     568             :  * Return 1 if the address range is reserved for SMP-alternatives.
     569             :  * Must hold text_mutex.
                        *
                        * @start: first address of the range
                        * @end:   end of the range
                        *
                        * Returns 1 when a lock-prefix location recorded for any module on
                        * smp_alt_modules satisfies start <= location < end, 0 otherwise.
     570             :  */
     571           0 : int alternatives_text_reserved(void *start, void *end)
     572             : {
     573           0 :         struct smp_alt_module *mod;
     574           0 :         const s32 *poff;
     575           0 :         u8 *text_start = start;
     576           0 :         u8 *text_end = end;
     577             : 
     578           0 :         lockdep_assert_held(&text_mutex);
     579             : 
     580           0 :         list_for_each_entry(mod, &smp_alt_modules, next) {
                                       /* Skip modules whose text cannot overlap the queried range. */
     581           0 :                 if (mod->text > text_end || mod->text_end < text_start)
     582           0 :                         continue;
     583           0 :                 for (poff = mod->locks; poff < mod->locks_end; poff++) {
                                               /* Each entry is an offset relative to its own address. */
     584           0 :                         const u8 *ptr = (const u8 *)poff + *poff;
     585             : 
     586           0 :                         if (text_start <= ptr && text_end > ptr)
     587             :                                 return 1;
     588             :                 }
     589             :         }
     590             : 
     591             :         return 0;
     592             : }
     593             : #endif /* CONFIG_SMP */
     594             : 
     595             : #ifdef CONFIG_PARAVIRT
     596           1 : void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
     597             :                                      struct paravirt_patch_site *end)
     598             : {
     599           1 :         struct paravirt_patch_site *p;
     600           1 :         char insn_buff[MAX_PATCH_LEN];
     601             : 
     602          73 :         for (p = start; p < end; p++) {
     603          72 :                 unsigned int used;
     604             : 
     605          72 :                 BUG_ON(p->len > MAX_PATCH_LEN);
     606             :                 /* prep the buffer with the original instructions */
     607          72 :                 memcpy(insn_buff, p->instr, p->len);
     608          72 :                 used = pv_ops.init.patch(p->type, insn_buff, (unsigned long)p->instr, p->len);
     609             : 
     610          72 :                 BUG_ON(used > p->len);
     611             : 
     612             :                 /* Pad the rest with nops */
     613          72 :                 add_nops(insn_buff + used, p->len - used);
     614          72 :                 text_poke_early(p->instr, insn_buff, p->len);
     615             :         }
     616           1 : }
     617             : extern struct paravirt_patch_site __start_parainstructions[],
     618             :         __stop_parainstructions[];
     619             : #endif  /* CONFIG_PARAVIRT */
     620             : 
     621             : /*
     622             :  * Self-test for the INT3 based CALL emulation code.
     623             :  *
     624             :  * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
     625             :  * properly and that there is a stack gap between the INT3 frame and the
     626             :  * previous context. Without this gap doing a virtual PUSH on the interrupted
     627             :  * stack would corrupt the INT3 IRET frame.
     628             :  *
     629             :  * See entry_{32,64}.S for more details.
     630             :  */
     631             : 
     632             : /*
     633             :  * We define the int3_magic() function in assembly to control the calling
     634             :  * convention such that we can 'call' it from assembly.
     635             :  */
     636             : 
     637             : extern void int3_magic(unsigned int *ptr); /* defined in asm */
     638             : 
     639             : asm (
     640             : "  .pushsection    .init.text, \"ax\", @progbits\n"
     641             : "  .type           int3_magic, @function\n"
     642             : "int3_magic:\n"
     643             : "  movl    $1, (%" _ASM_ARG1 ")\n"
     644             : "  ret\n"
     645             : "  .size           int3_magic, .-int3_magic\n"
     646             : "  .popsection\n"
     647             : );
     648             : 
     649             : extern __initdata unsigned long int3_selftest_ip; /* defined in asm below */
     650             : 
     651             : static int __init
     652           1 : int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
     653             : {
     654           1 :         struct die_args *args = data;
     655           1 :         struct pt_regs *regs = args->regs;
     656             : 
     657           1 :         if (!regs || user_mode(regs))
     658             :                 return NOTIFY_DONE;
     659             : 
     660           1 :         if (val != DIE_INT3)
     661             :                 return NOTIFY_DONE;
     662             : 
     663           1 :         if (regs->ip - INT3_INSN_SIZE != int3_selftest_ip)
     664             :                 return NOTIFY_DONE;
     665             : 
     666           1 :         int3_emulate_call(regs, (unsigned long)&int3_magic);
     667           1 :         return NOTIFY_STOP;
     668             : }
     669             : 
     670           1 : static void __init int3_selftest(void)
     671             : {
     672           1 :         static __initdata struct notifier_block int3_exception_nb = {
     673             :                 .notifier_call  = int3_exception_notify,
     674             :                 .priority       = INT_MAX-1, /* last */
     675             :         };
     676           1 :         unsigned int val = 0;
     677             : 
     678           1 :         BUG_ON(register_die_notifier(&int3_exception_nb));
     679             : 
     680             :         /*
     681             :          * Basically: int3_magic(&val); but really complicated :-)
     682             :          *
     683             :          * Stick the address of the INT3 instruction into int3_selftest_ip,
     684             :          * then trigger the INT3, padded with NOPs to match a CALL instruction
     685             :          * length.
     686             :          */
     687           1 :         asm volatile ("1: int3; nop; nop; nop; nop\n\t"
     688             :                       ".pushsection .init.data,\"aw\"\n\t"
     689             :                       ".align " __ASM_SEL(4, 8) "\n\t"
     690             :                       ".type int3_selftest_ip, @object\n\t"
     691             :                       ".size int3_selftest_ip, " __ASM_SEL(4, 8) "\n\t"
     692             :                       "int3_selftest_ip:\n\t"
     693             :                       __ASM_SEL(.long, .quad) " 1b\n\t"
     694             :                       ".popsection\n\t"
     695             :                       : ASM_CALL_CONSTRAINT
     696             :                       : __ASM_SEL_RAW(a, D) (&val)
     697             :                       : "memory");
     698             : 
     699           1 :         BUG_ON(val != 1);
     700             : 
     701           1 :         unregister_die_notifier(&int3_exception_nb);
     702           1 : }
     703             : 
     704           1 : void __init alternative_instructions(void)
     705             : {
     706           1 :         int3_selftest();
     707             : 
     708             :         /*
     709             :          * The patching is not fully atomic, so try to avoid local
     710             :          * interruptions that might execute the to be patched code.
     711             :          * Other CPUs are not running.
     712             :          */
     713           1 :         stop_nmi();
     714             : 
     715             :         /*
     716             :          * Don't stop machine check exceptions while patching.
     717             :          * MCEs only happen when something got corrupted and in this
     718             :          * case we must do something about the corruption.
     719             :          * Ignoring it is worse than an unlikely patching race.
     720             :          * Also machine checks tend to be broadcast and if one CPU
     721             :          * goes into machine check the others follow quickly, so we don't
     722             :          * expect a machine check to cause undue problems during code
     723             :          * patching.
     724             :          */
     725             : 
     726           1 :         apply_alternatives(__alt_instructions, __alt_instructions_end);
     727             : 
     728             : #ifdef CONFIG_SMP
     729             :         /* Patch to UP if other cpus not imminent. */
     730           1 :         if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
     731           0 :                 uniproc_patched = true;
     732           0 :                 alternatives_smp_module_add(NULL, "core kernel",
     733             :                                             __smp_locks, __smp_locks_end,
     734             :                                             _text, _etext);
     735             :         }
     736             : 
     737           1 :         if (!uniproc_patched || num_possible_cpus() == 1) {
     738           1 :                 free_init_pages("SMP alternatives",
     739             :                                 (unsigned long)__smp_locks,
     740             :                                 (unsigned long)__smp_locks_end);
     741             :         }
     742             : #endif
     743             : 
     744           1 :         apply_paravirt(__parainstructions, __parainstructions_end);
     745             : 
     746           1 :         restart_nmi();
     747           1 :         alternatives_patched = 1;
     748           1 : }
     749             : 
     750             : /**
     751             :  * text_poke_early - Update instructions on a live kernel at boot time
     752             :  * @addr: address to modify
     753             :  * @opcode: source of the copy
     754             :  * @len: length to copy
     755             :  *
     756             :  * When you use this code to patch more than one byte of an instruction
     757             :  * you need to make sure that other CPUs cannot execute this code in parallel.
     758             :  * Also no thread must be currently preempted in the middle of these
     759             :  * instructions. And on the local CPU you need to be protected against NMI or
     760             :  * MCE handlers seeing an inconsistent instruction while you patch.
     761             :  */
     762        1037 : void __init_or_module text_poke_early(void *addr, const void *opcode,
     763             :                                       size_t len)
     764             : {
     765        1037 :         unsigned long flags;
     766             : 
     767        1037 :         if (boot_cpu_has(X86_FEATURE_NX) &&
     768             :             is_module_text_address((unsigned long)addr)) {
     769             :                 /*
     770             :                  * Modules text is marked initially as non-executable, so the
     771             :                  * code cannot be running and speculative code-fetches are
     772             :                  * prevented. Just change the code.
     773             :                  */
     774             :                 memcpy(addr, opcode, len);
     775             :         } else {
     776        2074 :                 local_irq_save(flags);
     777        1037 :                 memcpy(addr, opcode, len);
     778        1037 :                 local_irq_restore(flags);
     779        1037 :                 sync_core();
     780             : 
     781             :                 /*
     782             :                  * Could also do a CLFLUSH here to speed up CPU recovery; but
     783             :                  * that causes hangs on some VIA CPUs.
     784             :                  */
     785             :         }
     786        1037 : }
     787             : 
     788             : typedef struct {
     789             :         struct mm_struct *mm;
     790             : } temp_mm_state_t;
     791             : 
     792             : /*
     793             :  * Using a temporary mm allows to set temporary mappings that are not accessible
     794             :  * by other CPUs. Such mappings are needed to perform sensitive memory writes
     795             :  * that override the kernel memory protections (e.g., W^X), without exposing the
     796             :  * temporary page-table mappings that are required for these write operations to
     797             :  * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the
     798             :  * mapping is torn down.
     799             :  *
     800             :  * Context: The temporary mm needs to be used exclusively by a single core. To
     801             :  *          harden security IRQs must be disabled while the temporary mm is
     802             :  *          loaded, thereby preventing interrupt handler bugs from overriding
     803             :  *          the kernel memory protection.
     804             :  */
     805         114 : static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
     806             : {
     807         114 :         temp_mm_state_t temp_state;
     808             : 
     809         228 :         lockdep_assert_irqs_disabled();
     810             : 
     811             :         /*
     812             :          * Make sure not to be in TLB lazy mode, as otherwise we'll end up
     813             :          * with a stale address space WITHOUT being in lazy mode after
     814             :          * restoring the previous mm.
     815             :          */
     816         114 :         if (this_cpu_read(cpu_tlbstate.is_lazy))
     817           0 :                 leave_mm(smp_processor_id());
     818             : 
     819         114 :         temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
     820         114 :         switch_mm_irqs_off(NULL, mm, current);
     821             : 
     822             :         /*
     823             :          * If breakpoints are enabled, disable them while the temporary mm is
     824             :          * used. Userspace might set up watchpoints on addresses that are used
     825             :          * in the temporary mm, which would lead to wrong signals being sent or
     826             :          * crashes.
     827             :          *
     828             :          * Note that breakpoints are not disabled selectively, which also causes
     829             :          * kernel breakpoints (e.g., perf's) to be disabled. This might be
     830             :          * undesirable, but still seems reasonable as the code that runs in the
     831             :          * temporary mm should be short.
     832             :          */
     833         114 :         if (hw_breakpoint_active())
     834           0 :                 hw_breakpoint_disable();
     835             : 
     836         114 :         return temp_state;
     837             : }
     838             : 
     839         114 : static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
     840             : {
     841         228 :         lockdep_assert_irqs_disabled();
     842         114 :         switch_mm_irqs_off(NULL, prev_state.mm, current);
     843             : 
     844             :         /*
     845             :          * Restore the breakpoints if they were disabled before the temporary mm
     846             :          * was loaded.
     847             :          */
     848         114 :         if (hw_breakpoint_active())
     849           0 :                 hw_breakpoint_restore();
     850         114 : }
     851             : 
     852             : __ro_after_init struct mm_struct *poking_mm;
     853             : __ro_after_init unsigned long poking_addr;
     854             : 
     855         114 : static void *__text_poke(void *addr, const void *opcode, size_t len)
     856             : {
     857         114 :         bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
     858         114 :         struct page *pages[2] = {NULL};
     859         114 :         temp_mm_state_t prev;
     860         114 :         unsigned long flags;
     861         114 :         pte_t pte, *ptep;
     862         114 :         spinlock_t *ptl;
     863         114 :         pgprot_t pgprot;
     864             : 
     865             :         /*
     866             :          * While boot memory allocator is running we cannot use struct pages as
     867             :          * they are not yet initialized. There is no way to recover.
     868             :          */
     869         114 :         BUG_ON(!after_bootmem);
     870             : 
     871         114 :         if (!core_kernel_text((unsigned long)addr)) {
     872           0 :                 pages[0] = vmalloc_to_page(addr);
     873           0 :                 if (cross_page_boundary)
     874           0 :                         pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
     875             :         } else {
     876         228 :                 pages[0] = virt_to_page(addr);
     877         114 :                 WARN_ON(!PageReserved(pages[0]));
     878         114 :                 if (cross_page_boundary)
     879           0 :                         pages[1] = virt_to_page(addr + PAGE_SIZE);
     880             :         }
     881             :         /*
     882             :          * If something went wrong, crash and burn since recovery paths are not
     883             :          * implemented.
     884             :          */
     885         114 :         BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));
     886             : 
     887             :         /*
     888             :          * Map the page without the global bit, as TLB flushing is done with
     889             :          * flush_tlb_mm_range(), which is intended for non-global PTEs.
     890             :          */
     891         114 :         pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
     892             : 
     893             :         /*
     894             :          * The lock is not really needed, but this allows to avoid open-coding.
     895             :          */
     896         114 :         ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
     897             : 
     898             :         /*
     899             :          * This must not fail; preallocated in poking_init().
     900             :          */
     901         114 :         VM_BUG_ON(!ptep);
     902             : 
     903         228 :         local_irq_save(flags);
     904             : 
     905         114 :         pte = mk_pte(pages[0], pgprot);
     906         114 :         set_pte_at(poking_mm, poking_addr, ptep, pte);
     907             : 
     908         114 :         if (cross_page_boundary) {
     909           0 :                 pte = mk_pte(pages[1], pgprot);
     910           0 :                 set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
     911             :         }
     912             : 
     913             :         /*
     914             :          * Loading the temporary mm behaves as a compiler barrier, which
     915             :          * guarantees that the PTE will be set at the time memcpy() is done.
     916             :          */
     917         114 :         prev = use_temporary_mm(poking_mm);
     918             : 
     919         114 :         kasan_disable_current();
     920         114 :         memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len);
     921         114 :         kasan_enable_current();
     922             : 
     923             :         /*
     924             :          * Ensure that the PTE is only cleared after the instructions of memcpy
     925             :          * were issued by using a compiler barrier.
     926             :          */
     927         114 :         barrier();
     928             : 
     929         114 :         pte_clear(poking_mm, poking_addr, ptep);
     930         114 :         if (cross_page_boundary)
     931           0 :                 pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);
     932             : 
     933             :         /*
     934             :          * Loading the previous page-table hierarchy requires a serializing
     935             :          * instruction that already allows the core to see the updated version.
     936             :          * Xen-PV is assumed to serialize execution in a similar manner.
     937             :          */
     938         114 :         unuse_temporary_mm(prev);
     939             : 
     940             :         /*
     941             :          * Flushing the TLB might involve IPIs, which would require enabled
     942             :          * IRQs, but not if the mm is not used, as it is at this point.
     943             :          */
     944         114 :         flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
     945         114 :                            (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
     946             :                            PAGE_SHIFT, false);
     947             : 
     948             :         /*
     949             :          * If the text does not match what we just wrote then something is
     950             :          * fundamentally screwy; there's nothing we can really do about that.
     951             :          */
     952         114 :         BUG_ON(memcmp(addr, opcode, len));
     953             : 
     954         114 :         local_irq_restore(flags);
     955         114 :         pte_unmap_unlock(ptep, ptl);
     956         114 :         return addr;
     957             : }
     958             : 
     959             : /**
     960             :  * text_poke - Update instructions on a live kernel
     961             :  * @addr: address to modify
     962             :  * @opcode: source of the copy
     963             :  * @len: length to copy
     964             :  *
     965             :  * Only atomic text poke/set should be allowed when not doing early patching.
     966             :  * It means the size must be writable atomically and the address must be aligned
     967             :  * in a way that permits an atomic write. It also makes sure we fit on a single
     968             :  * page.
     969             :  *
     970             :  * Note that the caller must ensure that if the modified code is part of a
     971             :  * module, the module would not be removed during poking. This can be achieved
     972             :  * by registering a module notifier, and ordering module removal and patching
     973             :  * through a mutex.
     974             :  */
     975         114 : void *text_poke(void *addr, const void *opcode, size_t len)
     976             : {
     977         342 :         lockdep_assert_held(&text_mutex);
     978             : 
     979         114 :         return __text_poke(addr, opcode, len);
     980             : }
     981             : 
     982             : /**
     983             :  * text_poke_kgdb - Update instructions on a live kernel by kgdb
     984             :  * @addr: address to modify
     985             :  * @opcode: source of the copy
     986             :  * @len: length to copy
     987             :  *
     988             :  * Only atomic text poke/set should be allowed when not doing early patching.
     989             :  * It means the size must be writable atomically and the address must be aligned
     990             :  * in a way that permits an atomic write. It also makes sure we fit on a single
     991             :  * page.
     992             :  *
     993             :  * Context: should only be used by kgdb, which ensures no other core is running,
     994             :  *          despite the fact it does not hold the text_mutex.
     995             :  */
     996           0 : void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
     997             : {
     998           0 :         return __text_poke(addr, opcode, len);
     999             : }
    1000             : 
    1001         114 : static void do_sync_core(void *info)
    1002             : {
    1003         114 :         sync_core();
    1004         114 : }
    1005             : 
    1006         114 : void text_poke_sync(void)
    1007             : {
    1008           0 :         on_each_cpu(do_sync_core, NULL, 1);
    1009          76 : }
    1010             : 
    1011             : struct text_poke_loc {
    1012             :         s32 rel_addr; /* addr := _stext + rel_addr */
    1013             :         s32 rel32;
    1014             :         u8 opcode;
    1015             :         const u8 text[POKE_MAX_OPCODE_SIZE];
    1016             :         u8 old;
    1017             : };
    1018             : 
    1019             : struct bp_patching_desc {
    1020             :         struct text_poke_loc *vec;
    1021             :         int nr_entries;
    1022             :         atomic_t refs;
    1023             : };
    1024             : 
    1025             : static struct bp_patching_desc *bp_desc;
    1026             : 
    1027             : static __always_inline
    1028           1 : struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp)
    1029             : {
    1030           1 :         struct bp_patching_desc *desc = __READ_ONCE(*descp); /* rcu_dereference */
    1031             : 
    1032           0 :         if (!desc || !arch_atomic_inc_not_zero(&desc->refs))
    1033           1 :                 return NULL;
    1034             : 
    1035             :         return desc;
    1036             : }
    1037             : 
    1038           0 : static __always_inline void put_desc(struct bp_patching_desc *desc)
    1039             : {
    1040           0 :         smp_mb__before_atomic();
    1041           0 :         arch_atomic_dec(&desc->refs);
    1042             : }
    1043             : 
    1044         152 : static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
    1045             : {
    1046         152 :         return _stext + tp->rel_addr;
    1047             : }
    1048             : 
    1049           0 : static __always_inline int patch_cmp(const void *key, const void *elt)
    1050             : {
    1051           0 :         struct text_poke_loc *tp = (struct text_poke_loc *) elt;
    1052             : 
    1053           0 :         if (key < text_poke_addr(tp))
    1054             :                 return -1;
    1055           0 :         if (key > text_poke_addr(tp))
    1056             :                 return 1;
    1057             :         return 0;
    1058             : }
    1059             : 
    1060           1 : noinstr int poke_int3_handler(struct pt_regs *regs)
    1061             : {
    1062           1 :         struct bp_patching_desc *desc;
    1063           1 :         struct text_poke_loc *tp;
    1064           1 :         int len, ret = 0;
    1065           1 :         void *ip;
    1066             : 
    1067           1 :         if (user_mode(regs))
    1068             :                 return 0;
    1069             : 
    1070             :         /*
    1071             :          * Having observed our INT3 instruction, we now must observe
    1072             :          * bp_desc:
    1073             :          *
    1074             :          *      bp_desc = desc                  INT3
    1075             :          *      WMB                             RMB
    1076             :          *      write INT3                      if (desc)
    1077             :          */
    1078           1 :         smp_rmb();
    1079             : 
    1080           1 :         desc = try_get_desc(&bp_desc);
    1081           1 :         if (!desc)
    1082           1 :                 return 0;
    1083             : 
    1084             :         /*
    1085             :          * Discount the INT3. See text_poke_bp_batch().
    1086             :          */
    1087           0 :         ip = (void *) regs->ip - INT3_INSN_SIZE;
    1088             : 
    1089             :         /*
    1090             :          * Skip the binary search if there is a single member in the vector.
    1091             :          */
    1092           0 :         if (unlikely(desc->nr_entries > 1)) {
    1093           0 :                 tp = __inline_bsearch(ip, desc->vec, desc->nr_entries,
    1094             :                                       sizeof(struct text_poke_loc),
    1095             :                                       patch_cmp);
    1096           0 :                 if (!tp)
    1097           0 :                         goto out_put;
    1098             :         } else {
    1099           0 :                 tp = desc->vec;
    1100           0 :                 if (text_poke_addr(tp) != ip)
    1101           0 :                         goto out_put;
    1102             :         }
    1103             : 
    1104           0 :         len = text_opcode_size(tp->opcode);
    1105           0 :         ip += len;
    1106             : 
    1107           0 :         switch (tp->opcode) {
    1108           0 :         case INT3_INSN_OPCODE:
    1109             :                 /*
    1110             :                  * Someone poked an explicit INT3, they'll want to handle it,
    1111             :                  * do not consume.
    1112             :                  */
    1113           0 :                 goto out_put;
    1114             : 
    1115             :         case RET_INSN_OPCODE:
    1116           0 :                 int3_emulate_ret(regs);
    1117             :                 break;
    1118             : 
    1119           0 :         case CALL_INSN_OPCODE:
    1120           0 :                 int3_emulate_call(regs, (long)ip + tp->rel32);
    1121             :                 break;
    1122             : 
    1123           0 :         case JMP32_INSN_OPCODE:
    1124             :         case JMP8_INSN_OPCODE:
    1125           0 :                 int3_emulate_jmp(regs, (long)ip + tp->rel32);
    1126             :                 break;
    1127             : 
    1128           0 :         default:
    1129           0 :                 BUG();
    1130             :         }
    1131             : 
    1132             :         ret = 1;
    1133             : 
    1134           0 : out_put:
    1135           0 :         put_desc(desc);
    1136           0 :         return ret;
    1137             : }
    1138             : 
    1139             : #define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
    1140             : static struct text_poke_loc tp_vec[TP_VEC_MAX];
    1141             : static int tp_vec_nr;
    1142             : 
    1143             : /**
    1144             :  * text_poke_bp_batch() -- update instructions on live kernel on SMP
    1145             :  * @tp:                 vector of instructions to patch
    1146             :  * @nr_entries:         number of entries in the vector
    1147             :  *
    1148             :  * Modify multi-byte instruction by using int3 breakpoint on SMP.
    1149             :  * We completely avoid stop_machine() here, and achieve the
    1150             :  * synchronization using int3 breakpoint.
    1151             :  *
    1152             :  * The way it is done:
    1153             :  *      - For each entry in the vector:
    1154             :  *              - add an int3 trap to the address that will be patched
    1155             :  *      - sync cores
    1156             :  *      - For each entry in the vector:
    1157             :  *              - update all but the first byte of the patched range
    1158             :  *      - sync cores
    1159             :  *      - For each entry in the vector:
    1160             :  *              - replace the first byte (int3) by the first byte of
    1161             :  *                replacing opcode
    1162             :  *      - sync cores
    1163             :  */
    1164          38 : static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
    1165             : {
    1166          38 :         struct bp_patching_desc desc = {
    1167             :                 .vec = tp,
    1168             :                 .nr_entries = nr_entries,
    1169             :                 .refs = ATOMIC_INIT(1),
    1170             :         };
    1171          38 :         unsigned char int3 = INT3_INSN_OPCODE;
    1172          38 :         unsigned int i;
    1173          38 :         int do_sync;
    1174             : 
    1175         114 :         lockdep_assert_held(&text_mutex);
    1176             : 
    1177          38 :         smp_store_release(&bp_desc, &desc); /* rcu_assign_pointer */
    1178             : 
    1179             :         /*
    1180             :          * Corresponding read barrier in int3 notifier for making sure the
    1181             :          * nr_entries and handler are correctly ordered wrt. patching.
    1182             :          */
    1183          38 :         smp_wmb();
    1184             : 
    1185             :         /*
    1186             :          * First step: add a int3 trap to the address that will be patched.
    1187             :          */
    1188         114 :         for (i = 0; i < nr_entries; i++) {
    1189          38 :                 tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
    1190          38 :                 text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
    1191             :         }
    1192             : 
    1193          38 :         text_poke_sync();
    1194             : 
    1195             :         /*
    1196             :          * Second step: update all but the first byte of the patched range.
    1197             :          */
    1198         114 :         for (do_sync = 0, i = 0; i < nr_entries; i++) {
    1199          38 :                 u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, };
    1200          38 :                 int len = text_opcode_size(tp[i].opcode);
    1201             : 
    1202          38 :                 if (len - INT3_INSN_SIZE > 0) {
    1203          38 :                         memcpy(old + INT3_INSN_SIZE,
    1204          38 :                                text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
    1205             :                                len - INT3_INSN_SIZE);
    1206          38 :                         text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
    1207             :                                   (const char *)tp[i].text + INT3_INSN_SIZE,
    1208             :                                   len - INT3_INSN_SIZE);
    1209          38 :                         do_sync++;
    1210             :                 }
    1211             : 
    1212             :                 /*
    1213             :                  * Emit a perf event to record the text poke, primarily to
    1214             :                  * support Intel PT decoding which must walk the executable code
    1215             :                  * to reconstruct the trace. The flow up to here is:
    1216             :                  *   - write INT3 byte
    1217             :                  *   - IPI-SYNC
    1218             :                  *   - write instruction tail
    1219             :                  * At this point the actual control flow will be through the
    1220             :                  * INT3 and handler and not hit the old or new instruction.
    1221             :                  * Intel PT outputs FUP/TIP packets for the INT3, so the flow
    1222             :                  * can still be decoded. Subsequently:
    1223             :                  *   - emit RECORD_TEXT_POKE with the new instruction
    1224             :                  *   - IPI-SYNC
    1225             :                  *   - write first byte
    1226             :                  *   - IPI-SYNC
    1227             :                  * So before the text poke event timestamp, the decoder will see
    1228             :                  * either the old instruction flow or FUP/TIP of INT3. After the
    1229             :                  * text poke event timestamp, the decoder will see either the
    1230             :                  * new instruction flow or FUP/TIP of INT3. Thus decoders can
    1231             :                  * use the timestamp as the point at which to modify the
    1232             :                  * executable code.
    1233             :                  * The old instruction is recorded so that the event can be
    1234             :                  * processed forwards or backwards.
    1235             :                  */
    1236          38 :                 perf_event_text_poke(text_poke_addr(&tp[i]), old, len,
    1237          38 :                                      tp[i].text, len);
    1238             :         }
    1239             : 
    1240          38 :         if (do_sync) {
    1241             :                 /*
    1242             :                  * According to Intel, this core syncing is very likely
    1243             :                  * not necessary and we'd be safe even without it. But
    1244             :                  * better safe than sorry (plus there's not only Intel).
    1245             :                  */
    1246          38 :                 text_poke_sync();
    1247             :         }
    1248             : 
    1249             :         /*
    1250             :          * Third step: replace the first byte (int3) by the first byte of
    1251             :          * replacing opcode.
    1252             :          */
    1253          76 :         for (do_sync = 0, i = 0; i < nr_entries; i++) {
    1254          38 :                 if (tp[i].text[0] == INT3_INSN_OPCODE)
    1255           0 :                         continue;
    1256             : 
    1257          38 :                 text_poke(text_poke_addr(&tp[i]), tp[i].text, INT3_INSN_SIZE);
    1258          38 :                 do_sync++;
    1259             :         }
    1260             : 
    1261          38 :         if (do_sync)
    1262          38 :                 text_poke_sync();
    1263             : 
    1264             :         /*
    1265             :          * Remove and synchronize_rcu(), except we have a very primitive
    1266             :          * refcount based completion.
    1267             :          */
    1268          38 :         WRITE_ONCE(bp_desc, NULL); /* RCU_INIT_POINTER */
    1269          76 :         if (!atomic_dec_and_test(&desc.refs))
    1270           0 :                 atomic_cond_read_acquire(&desc.refs, !VAL);
    1271          38 : }
    1272             : /* Fill @tp for patching @len bytes of @opcode at @addr, emulating @emulate (NULL: @opcode itself). */
    1273          38 : static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
    1274             :                                const void *opcode, size_t len, const void *emulate)
    1275             : {
    1276          38 :         struct insn insn;
    1277             :         /* Keep a copy of the new instruction bytes in the entry. */
    1278          38 :         memcpy((void *)tp->text, opcode, len);
    1279          38 :         if (!emulate)
    1280          38 :                 emulate = opcode;
    1281             :         /* Decode the instruction that is to be emulated. */
    1282          38 :         kernel_insn_init(&insn, emulate, MAX_INSN_SIZE);
    1283          38 :         insn_get_length(&insn);
    1284             :         /* It must decode completely and be exactly @len bytes long. */
    1285          38 :         BUG_ON(!insn_complete(&insn));
    1286          38 :         BUG_ON(len != insn.length);
    1287             :         /* The address is recorded as an offset from _stext. */
    1288          38 :         tp->rel_addr = addr - (void *)_stext;
    1289          38 :         tp->opcode = insn.opcode.bytes[0];
    1290             :         /* Record the displacement for calls/jumps; NOPs are turned into jumps with a zero displacement. */
    1291          38 :         switch (tp->opcode) {
    1292             :         case INT3_INSN_OPCODE:
    1293             :         case RET_INSN_OPCODE:
    1294             :                 break;
    1295             : 
    1296          38 :         case CALL_INSN_OPCODE:
    1297             :         case JMP32_INSN_OPCODE:
    1298             :         case JMP8_INSN_OPCODE:
    1299          38 :                 tp->rel32 = insn.immediate.value;
    1300          38 :                 break;
    1301             : 
    1302           0 :         default: /* assume NOP */
    1303           0 :                 switch (len) {
    1304           0 :                 case 2: /* NOP2 -- emulate as JMP8+0 */
    1305           0 :                         BUG_ON(memcmp(emulate, ideal_nops[len], len));
    1306           0 :                         tp->opcode = JMP8_INSN_OPCODE;
    1307           0 :                         tp->rel32 = 0;
    1308           0 :                         break;
    1309             : 
    1310           0 :                 case 5: /* NOP5 -- emulate as JMP32+0 */
    1311           0 :                         BUG_ON(memcmp(emulate, ideal_nops[NOP_ATOMIC5], len));
    1312           0 :                         tp->opcode = JMP32_INSN_OPCODE;
    1313           0 :                         tp->rel32 = 0;
    1314           0 :                         break;
    1315             : 
    1316           0 :                 default: /* unknown instruction */
    1317           0 :                         BUG();
    1318             :                 }
    1319             :                 break;
    1320             :         }
    1321          38 : }
    1322             : 
    1323             : /*
    1324             :  * We hard rely on the tp_vec being ordered; ensure this is so by flushing
    1325             :  * early if needed. Returns true when the queue has to be flushed before @addr.
    1326             :  */
    1327           0 : static bool tp_order_fail(void *addr)
    1328             : {
    1329           0 :         struct text_poke_loc *tp;
    1330             :         /* An empty queue never needs flushing. */
    1331           0 :         if (!tp_vec_nr)
    1332             :                 return false;
    1333             : 
    1334           0 :         if (!addr) /* force */
    1335             :                 return true;
    1336             :         /* Flush if @addr lies below the most recently queued address. */
    1337           0 :         tp = &tp_vec[tp_vec_nr - 1];
    1338           0 :         if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr)
    1339           0 :                 return true;
    1340             : 
    1341             :         return false;
    1342             : }
    1343             : /* Patch and empty the queue if it is full or if tp_order_fail() demands a flush for @addr. */
    1344           0 : static void text_poke_flush(void *addr)
    1345             : {
    1346           0 :         if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) {
    1347           0 :                 text_poke_bp_batch(tp_vec, tp_vec_nr);
    1348           0 :                 tp_vec_nr = 0;
    1349             :         }
    1350           0 : }
    1351             : /* Patch whatever is still queued; a NULL address forces the flush of a non-empty queue. */
    1352           0 : void text_poke_finish(void)
    1353             : {
    1354           0 :         text_poke_flush(NULL);
    1355           0 : }
    1356             : /* Queue one poke in tp_vec for a later batch; while booting, patch directly via text_poke_early(). */
    1357           0 : void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
    1358             : {
    1359           0 :         struct text_poke_loc *tp;
    1360             : 
    1361           0 :         if (unlikely(system_state == SYSTEM_BOOTING)) {
    1362           0 :                 text_poke_early(addr, opcode, len);
    1363           0 :                 return;
    1364             :         }
    1365             :         /* Flush first if the queue is full or @addr would break its ordering. */
    1366           0 :         text_poke_flush(addr);
    1367             : 
    1368           0 :         tp = &tp_vec[tp_vec_nr++];
    1369           0 :         text_poke_loc_init(tp, addr, opcode, len, emulate);
    1370             : }
    1371             : 
    1372             : /**
    1373             :  * text_poke_bp() -- update instructions on live kernel on SMP
    1374             :  * @addr:       address to patch
    1375             :  * @opcode:     opcode of new instruction
    1376             :  * @len:        length to copy
    1377             :  * @emulate:    instruction to be emulated (NULL: emulate @opcode)
    1378             :  *
    1379             :  * Update a single instruction with a one-entry vector on the stack, avoiding
    1380             :  * dynamically allocated memory. This function should be used when it is
    1381             :  * not possible to allocate memory.
    1382             :  */
    1383          38 : void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
    1384             : {
    1385          38 :         struct text_poke_loc tp;
    1386             :         /* While booting, the text is patched directly instead. */
    1387          38 :         if (unlikely(system_state == SYSTEM_BOOTING)) {
    1388           0 :                 text_poke_early(addr, opcode, len);
    1389           0 :                 return;
    1390             :         }
    1391             : 
    1392          38 :         text_poke_loc_init(&tp, addr, opcode, len, emulate);
    1393          38 :         text_poke_bp_batch(&tp, 1);
    1394             : }

Generated by: LCOV version 1.14