LCOV - code coverage report
Current view: top level - arch/x86/events/intel - pt.c (source)
Test: landlock.info
Date: 2021-04-22 12:43:58
Coverage: Lines: 4 / 694 (0.6 %) | Functions: 1 / 57 (1.8 %)

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : /*
       3             :  * Intel(R) Processor Trace PMU driver for perf
       4             :  * Copyright (c) 2013-2014, Intel Corporation.
       5             :  *
       6             :  * Intel PT is specified in the Intel Architecture Instruction Set Extensions
       7             :  * Programming Reference:
       8             :  * http://software.intel.com/en-us/intel-isa-extensions
       9             :  */
      10             : 
      11             : #undef DEBUG
      12             : 
      13             : #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
      14             : 
      15             : #include <linux/types.h>
      16             : #include <linux/slab.h>
      17             : #include <linux/device.h>
      18             : 
      19             : #include <asm/perf_event.h>
      20             : #include <asm/insn.h>
      21             : #include <asm/io.h>
      22             : #include <asm/intel_pt.h>
      23             : #include <asm/intel-family.h>
      24             : 
      25             : #include "../perf_event.h"
      26             : #include "pt.h"
      27             : 
      28             : static DEFINE_PER_CPU(struct pt, pt_ctx);
      29             : 
      30             : static struct pt_pmu pt_pmu;
      31             : 
      32             : /*
      33             :  * Capabilities of Intel PT hardware, such as number of address bits or
      34             :  * supported output schemes, are cached and exported to userspace as "caps"
      35             :  * attribute group of pt pmu device
      36             :  * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
      37             :  * relevant bits together with intel_pt traces.
      38             :  *
      39             :  * These are necessary for both trace decoding (e.g. payloads_lip, which
      40             :  * describes the address width encoded in IP-related packets) and for event
      41             :  * configuration (bitmasks with permitted values for certain bit fields).
      42             :  */
      43             : #define PT_CAP(_n, _l, _r, _m)                                          \
      44             :         [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l,        \
      45             :                             .reg = _r, .mask = _m }
      46             : 
      47             : static struct pt_cap_desc {
      48             :         const char      *name;
      49             :         u32             leaf;
      50             :         u8              reg;
      51             :         u32             mask;
      52             : } pt_caps[] = {
      53             :         PT_CAP(max_subleaf,             0, CPUID_EAX, 0xffffffff),
      54             :         PT_CAP(cr3_filtering,           0, CPUID_EBX, BIT(0)),
      55             :         PT_CAP(psb_cyc,                 0, CPUID_EBX, BIT(1)),
      56             :         PT_CAP(ip_filtering,            0, CPUID_EBX, BIT(2)),
      57             :         PT_CAP(mtc,                     0, CPUID_EBX, BIT(3)),
      58             :         PT_CAP(ptwrite,                 0, CPUID_EBX, BIT(4)),
      59             :         PT_CAP(power_event_trace,       0, CPUID_EBX, BIT(5)),
      60             :         PT_CAP(topa_output,             0, CPUID_ECX, BIT(0)),
      61             :         PT_CAP(topa_multiple_entries,   0, CPUID_ECX, BIT(1)),
      62             :         PT_CAP(single_range_output,     0, CPUID_ECX, BIT(2)),
      63             :         PT_CAP(output_subsys,           0, CPUID_ECX, BIT(3)),
      64             :         PT_CAP(payloads_lip,            0, CPUID_ECX, BIT(31)),
      65             :         PT_CAP(num_address_ranges,      1, CPUID_EAX, 0x3),
      66             :         PT_CAP(mtc_periods,             1, CPUID_EAX, 0xffff0000),
      67             :         PT_CAP(cycle_thresholds,        1, CPUID_EBX, 0xffff),
      68             :         PT_CAP(psb_periods,             1, CPUID_EBX, 0xffff0000),
      69             : };
      70             : 
      71           0 : u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability)
      72             : {
      73           0 :         struct pt_cap_desc *cd = &pt_caps[capability];
      74           0 :         u32 c = caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
      75           0 :         unsigned int shift = __ffs(cd->mask);
      76             : 
      77           0 :         return (c & cd->mask) >> shift;
      78             : }
      79             : EXPORT_SYMBOL_GPL(intel_pt_validate_cap);
      80             : 
      81           0 : u32 intel_pt_validate_hw_cap(enum pt_capabilities cap)
      82             : {
      83           0 :         return intel_pt_validate_cap(pt_pmu.caps, cap);
      84             : }
      85             : EXPORT_SYMBOL_GPL(intel_pt_validate_hw_cap);
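
/*
 * Editorial sketch, not part of pt.c: how a lookup through the pt_caps[]
 * table above resolves for PT_CAP_mtc_periods, whose descriptor is
 * { .leaf = 1, .reg = CPUID_EAX, .mask = 0xffff0000 }. The helper below
 * is hypothetical and only illustrates the arithmetic.
 */
static u32 __maybe_unused pt_cap_example(void)
{
	/* CPUID 0x14 sub-leaf 1, EAX, as cached by pt_pmu_hw_init() */
	u32 eax = pt_pmu.caps[1 * PT_CPUID_REGS_NUM + CPUID_EAX];

	/* __ffs(0xffff0000) == 16, so the period bitmap is EAX[31:16] */
	return (eax & 0xffff0000) >> 16;
	/* same value as intel_pt_validate_hw_cap(PT_CAP_mtc_periods) */
}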
      86             : 
      87           0 : static ssize_t pt_cap_show(struct device *cdev,
      88             :                            struct device_attribute *attr,
      89             :                            char *buf)
      90             : {
      91           0 :         struct dev_ext_attribute *ea =
      92           0 :                 container_of(attr, struct dev_ext_attribute, attr);
      93           0 :         enum pt_capabilities cap = (long)ea->var;
      94             : 
      95           0 :         return snprintf(buf, PAGE_SIZE, "%x\n", intel_pt_validate_hw_cap(cap));
      96             : }
      97             : 
      98             : static struct attribute_group pt_cap_group __ro_after_init = {
      99             :         .name   = "caps",
     100             : };
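
/*
 * Editorial note, not part of pt.c: pt_pmu_hw_init() below fills
 * pt_cap_group.attrs with one read-only file per pt_caps[] entry, each
 * rendered by pt_cap_show(). From userspace (value illustrative):
 *
 *	$ cat /sys/bus/event_source/devices/intel_pt/caps/mtc_periods
 *	249
 *
 * i.e. a hex bitmap (here bits 0, 3, 6 and 9) of the supported MTC periods.
 */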
     101             : 
     102           0 : PMU_FORMAT_ATTR(pt,             "config:0"    );
     103           0 : PMU_FORMAT_ATTR(cyc,            "config:1"    );
     104           0 : PMU_FORMAT_ATTR(pwr_evt,        "config:4"    );
     105           0 : PMU_FORMAT_ATTR(fup_on_ptw,     "config:5"    );
     106           0 : PMU_FORMAT_ATTR(mtc,            "config:9"    );
     107           0 : PMU_FORMAT_ATTR(tsc,            "config:10"   );
     108           0 : PMU_FORMAT_ATTR(noretcomp,      "config:11"   );
     109           0 : PMU_FORMAT_ATTR(ptw,            "config:12"   );
     110           0 : PMU_FORMAT_ATTR(branch,         "config:13"   );
     111           0 : PMU_FORMAT_ATTR(mtc_period,     "config:14-17"        );
     112           0 : PMU_FORMAT_ATTR(cyc_thresh,     "config:19-22"        );
     113           0 : PMU_FORMAT_ATTR(psb_period,     "config:24-27"        );
     114             : 
     115             : static struct attribute *pt_formats_attr[] = {
     116             :         &format_attr_pt.attr,
     117             :         &format_attr_cyc.attr,
     118             :         &format_attr_pwr_evt.attr,
     119             :         &format_attr_fup_on_ptw.attr,
     120             :         &format_attr_mtc.attr,
     121             :         &format_attr_tsc.attr,
     122             :         &format_attr_noretcomp.attr,
     123             :         &format_attr_ptw.attr,
     124             :         &format_attr_branch.attr,
     125             :         &format_attr_mtc_period.attr,
     126             :         &format_attr_cyc_thresh.attr,
     127             :         &format_attr_psb_period.attr,
     128             :         NULL,
     129             : };
     130             : 
     131             : static struct attribute_group pt_format_group = {
     132             :         .name   = "format",
     133             :         .attrs  = pt_formats_attr,
     134             : };
     135             : 
     136             : static ssize_t
     137           0 : pt_timing_attr_show(struct device *dev, struct device_attribute *attr,
     138             :                     char *page)
     139             : {
     140           0 :         struct perf_pmu_events_attr *pmu_attr =
     141           0 :                 container_of(attr, struct perf_pmu_events_attr, attr);
     142             : 
     143           0 :         switch (pmu_attr->id) {
     144           0 :         case 0:
     145           0 :                 return sprintf(page, "%lu\n", pt_pmu.max_nonturbo_ratio);
     146           0 :         case 1:
     147           0 :                 return sprintf(page, "%u:%u\n",
     148             :                                pt_pmu.tsc_art_num,
     149             :                                pt_pmu.tsc_art_den);
     150             :         default:
     151             :                 break;
     152             :         }
     153             : 
     154             :         return -EINVAL;
     155             : }
     156             : 
     157             : PMU_EVENT_ATTR(max_nonturbo_ratio, timing_attr_max_nonturbo_ratio, 0,
     158             :                pt_timing_attr_show);
     159             : PMU_EVENT_ATTR(tsc_art_ratio, timing_attr_tsc_art_ratio, 1,
     160             :                pt_timing_attr_show);
     161             : 
     162             : static struct attribute *pt_timing_attr[] = {
     163             :         &timing_attr_max_nonturbo_ratio.attr.attr,
     164             :         &timing_attr_tsc_art_ratio.attr.attr,
     165             :         NULL,
     166             : };
     167             : 
     168             : static struct attribute_group pt_timing_group = {
     169             :         .attrs  = pt_timing_attr,
     170             : };
     171             : 
     172             : static const struct attribute_group *pt_attr_groups[] = {
     173             :         &pt_cap_group,
     174             :         &pt_format_group,
     175             :         &pt_timing_group,
     176             :         NULL,
     177             : };
     178             : 
     179           0 : static int __init pt_pmu_hw_init(void)
     180             : {
     181           0 :         struct dev_ext_attribute *de_attrs;
     182           0 :         struct attribute **attrs;
     183           0 :         size_t size;
     184           0 :         u64 reg;
     185           0 :         int ret;
     186           0 :         long i;
     187             : 
     188           0 :         rdmsrl(MSR_PLATFORM_INFO, reg);
     189           0 :         pt_pmu.max_nonturbo_ratio = (reg & 0xff00) >> 8;
     190             : 
     191             :         /*
      192             :          * If available, read the TSC to core crystal clock ratio;
      193             :          * otherwise, a zero numerator stands for "not enumerated",
      194             :          * as per the SDM.
     195             :          */
     196           0 :         if (boot_cpu_data.cpuid_level >= CPUID_TSC_LEAF) {
     197           0 :                 u32 eax, ebx, ecx, edx;
     198             : 
     199           0 :                 cpuid(CPUID_TSC_LEAF, &eax, &ebx, &ecx, &edx);
     200             : 
     201           0 :                 pt_pmu.tsc_art_num = ebx;
     202           0 :                 pt_pmu.tsc_art_den = eax;
     203             :         }
     204             : 
     205             :         /* model-specific quirks */
     206           0 :         switch (boot_cpu_data.x86_model) {
     207           0 :         case INTEL_FAM6_BROADWELL:
     208             :         case INTEL_FAM6_BROADWELL_D:
     209             :         case INTEL_FAM6_BROADWELL_G:
     210             :         case INTEL_FAM6_BROADWELL_X:
     211             :                 /* not setting BRANCH_EN will #GP, erratum BDM106 */
     212           0 :                 pt_pmu.branch_en_always_on = true;
     213           0 :                 break;
     214             :         default:
     215             :                 break;
     216             :         }
     217             : 
     218           0 :         if (boot_cpu_has(X86_FEATURE_VMX)) {
     219             :                 /*
     220             :                  * Intel SDM, 36.5 "Tracing post-VMXON" says that
     221             :                  * "IA32_VMX_MISC[bit 14]" being 1 means PT can trace
     222             :                  * post-VMXON.
     223             :                  */
     224           0 :                 rdmsrl(MSR_IA32_VMX_MISC, reg);
     225           0 :                 if (reg & BIT(14))
     226           0 :                         pt_pmu.vmx = true;
     227             :         }
     228             : 
     229           0 :         for (i = 0; i < PT_CPUID_LEAVES; i++) {
     230           0 :                 cpuid_count(20, i,
     231           0 :                             &pt_pmu.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM],
     232           0 :                             &pt_pmu.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM],
     233           0 :                             &pt_pmu.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM],
     234           0 :                             &pt_pmu.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM]);
     235             :         }
     236             : 
     237           0 :         ret = -ENOMEM;
     238           0 :         size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1);
     239           0 :         attrs = kzalloc(size, GFP_KERNEL);
     240           0 :         if (!attrs)
     241           0 :                 goto fail;
     242             : 
     243           0 :         size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1);
     244           0 :         de_attrs = kzalloc(size, GFP_KERNEL);
     245           0 :         if (!de_attrs)
     246           0 :                 goto fail;
     247             : 
     248           0 :         for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
     249           0 :                 struct dev_ext_attribute *de_attr = de_attrs + i;
     250             : 
     251           0 :                 de_attr->attr.attr.name = pt_caps[i].name;
     252             : 
     253           0 :                 sysfs_attr_init(&de_attr->attr.attr);
     254             : 
     255           0 :                 de_attr->attr.attr.mode              = S_IRUGO;
     256           0 :                 de_attr->attr.show           = pt_cap_show;
     257           0 :                 de_attr->var                 = (void *)i;
     258             : 
     259           0 :                 attrs[i] = &de_attr->attr.attr;
     260             :         }
     261             : 
     262           0 :         pt_cap_group.attrs = attrs;
     263             : 
     264           0 :         return 0;
     265             : 
     266           0 : fail:
     267           0 :         kfree(attrs);
     268             : 
     269           0 :         return ret;
     270             : }
     271             : 
     272             : #define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC     | \
     273             :                           RTIT_CTL_CYC_THRESH   | \
     274             :                           RTIT_CTL_PSB_FREQ)
     275             : 
     276             : #define RTIT_CTL_MTC    (RTIT_CTL_MTC_EN        | \
     277             :                          RTIT_CTL_MTC_RANGE)
     278             : 
     279             : #define RTIT_CTL_PTW    (RTIT_CTL_PTW_EN        | \
     280             :                          RTIT_CTL_FUP_ON_PTW)
     281             : 
     282             : /*
     283             :  * Bit 0 (TraceEn) in the attr.config is meaningless as the
     284             :  * corresponding bit in the RTIT_CTL can only be controlled
     285             :  * by the driver; therefore, repurpose it to mean: pass
     286             :  * through the bit that was previously assumed to be always
     287             :  * on for PT, thereby allowing the user to *not* set it if
     288             :  * they so wish. See also pt_event_valid() and pt_config().
     289             :  */
     290             : #define RTIT_CTL_PASSTHROUGH RTIT_CTL_TRACEEN
     291             : 
     292             : #define PT_CONFIG_MASK (RTIT_CTL_TRACEEN        | \
     293             :                         RTIT_CTL_TSC_EN         | \
     294             :                         RTIT_CTL_DISRETC        | \
     295             :                         RTIT_CTL_BRANCH_EN      | \
     296             :                         RTIT_CTL_CYC_PSB        | \
     297             :                         RTIT_CTL_MTC            | \
     298             :                         RTIT_CTL_PWR_EVT_EN     | \
     299             :                         RTIT_CTL_FUP_ON_PTW     | \
     300             :                         RTIT_CTL_PTW_EN)
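
/*
 * Editorial sketch, not part of pt.c: two ways a user can encode
 * attr.config under the passthrough scheme described above. The helper
 * is hypothetical; see pt_event_valid() for the exact rules.
 */
static void __maybe_unused pt_passthrough_example(struct perf_event_attr *attr)
{
	/* legacy: bit 0 clear, BRANCH_EN must not be set by the user;
	 * pt_config() will force BRANCH_EN on anyway */
	attr->config = RTIT_CTL_TSC_EN;

	/* passthrough: bit 0 set, BRANCH_EN is under user control and
	 * may be left clear (except on Broadwell, see BDM106 above) */
	attr->config = RTIT_CTL_PASSTHROUGH | RTIT_CTL_BRANCH_EN;
}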
     301             : 
     302           0 : static bool pt_event_valid(struct perf_event *event)
     303             : {
     304           0 :         u64 config = event->attr.config;
     305           0 :         u64 allowed, requested;
     306             : 
     307           0 :         if ((config & PT_CONFIG_MASK) != config)
     308             :                 return false;
     309             : 
     310           0 :         if (config & RTIT_CTL_CYC_PSB) {
     311           0 :                 if (!intel_pt_validate_hw_cap(PT_CAP_psb_cyc))
     312             :                         return false;
     313             : 
     314           0 :                 allowed = intel_pt_validate_hw_cap(PT_CAP_psb_periods);
     315           0 :                 requested = (config & RTIT_CTL_PSB_FREQ) >>
     316             :                         RTIT_CTL_PSB_FREQ_OFFSET;
     317           0 :                 if (requested && (!(allowed & BIT(requested))))
     318             :                         return false;
     319             : 
     320           0 :                 allowed = intel_pt_validate_hw_cap(PT_CAP_cycle_thresholds);
     321           0 :                 requested = (config & RTIT_CTL_CYC_THRESH) >>
     322             :                         RTIT_CTL_CYC_THRESH_OFFSET;
     323           0 :                 if (requested && (!(allowed & BIT(requested))))
     324             :                         return false;
     325             :         }
     326             : 
     327           0 :         if (config & RTIT_CTL_MTC) {
     328             :                 /*
     329             :                  * In the unlikely case that CPUID lists valid mtc periods,
     330             :                  * but not the mtc capability, drop out here.
     331             :                  *
     332             :                  * Spec says that setting mtc period bits while mtc bit in
     333             :                  * CPUID is 0 will #GP, so better safe than sorry.
     334             :                  */
     335           0 :                 if (!intel_pt_validate_hw_cap(PT_CAP_mtc))
     336             :                         return false;
     337             : 
     338           0 :                 allowed = intel_pt_validate_hw_cap(PT_CAP_mtc_periods);
     339           0 :                 if (!allowed)
     340             :                         return false;
     341             : 
     342           0 :                 requested = (config & RTIT_CTL_MTC_RANGE) >>
     343             :                         RTIT_CTL_MTC_RANGE_OFFSET;
     344             : 
     345           0 :                 if (!(allowed & BIT(requested)))
     346             :                         return false;
     347             :         }
     348             : 
     349           0 :         if (config & RTIT_CTL_PWR_EVT_EN &&
     350           0 :             !intel_pt_validate_hw_cap(PT_CAP_power_event_trace))
     351             :                 return false;
     352             : 
     353           0 :         if (config & RTIT_CTL_PTW) {
     354           0 :                 if (!intel_pt_validate_hw_cap(PT_CAP_ptwrite))
     355             :                         return false;
     356             : 
     357             :                 /* FUPonPTW without PTW doesn't make sense */
     358           0 :                 if ((config & RTIT_CTL_FUP_ON_PTW) &&
     359             :                     !(config & RTIT_CTL_PTW_EN))
     360             :                         return false;
     361             :         }
     362             : 
     363             :         /*
     364             :          * Setting bit 0 (TraceEn in RTIT_CTL MSR) in the attr.config
      365             :  * clears the assumption that BranchEn must always be enabled,
     366             :          * as was the case with the first implementation of PT.
     367             :          * If this bit is not set, the legacy behavior is preserved
     368             :          * for compatibility with the older userspace.
     369             :          *
     370             :          * Re-using bit 0 for this purpose is fine because it is never
     371             :          * directly set by the user; previous attempts at setting it in
     372             :          * the attr.config resulted in -EINVAL.
     373             :          */
     374           0 :         if (config & RTIT_CTL_PASSTHROUGH) {
     375             :                 /*
     376             :                  * Disallow not setting BRANCH_EN where BRANCH_EN is
     377             :                  * always required.
     378             :                  */
     379           0 :                 if (pt_pmu.branch_en_always_on &&
     380           0 :                     !(config & RTIT_CTL_BRANCH_EN))
     381           0 :                         return false;
     382             :         } else {
     383             :                 /*
     384             :                  * Disallow BRANCH_EN without the PASSTHROUGH.
     385             :                  */
     386           0 :                 if (config & RTIT_CTL_BRANCH_EN)
     387           0 :                         return false;
     388             :         }
     389             : 
     390             :         return true;
     391             : }
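
/*
 * Editorial worked example, not part of pt.c: the bitmap checks above,
 * with illustrative numbers. Suppose CPUID reports PT_CAP_psb_periods ==
 * 0x3f (periods 0..5 permitted) and the user asks for psb_period == 6
 * in attr.config:
 *
 *	allowed   = 0x3f;
 *	requested = 6;
 *	allowed & BIT(requested) == 0	->	pt_event_valid() fails
 *
 * whereas, for the PSB and CYC fields, requested == 0 always passes,
 * as it simply asks for the default.
 */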
     392             : 
     393             : /*
     394             :  * PT configuration helpers
     395             :  * These all are cpu affine and operate on a local PT
     396             :  */
     397             : 
     398           0 : static void pt_config_start(struct perf_event *event)
     399             : {
     400           0 :         struct pt *pt = this_cpu_ptr(&pt_ctx);
     401           0 :         u64 ctl = event->hw.config;
     402             : 
     403           0 :         ctl |= RTIT_CTL_TRACEEN;
     404           0 :         if (READ_ONCE(pt->vmx_on))
     405           0 :                 perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_PARTIAL);
     406             :         else
     407           0 :                 wrmsrl(MSR_IA32_RTIT_CTL, ctl);
     408             : 
     409           0 :         WRITE_ONCE(event->hw.config, ctl);
     410           0 : }
     411             : 
     412             : /* Address ranges and their corresponding msr configuration registers */
     413             : static const struct pt_address_range {
     414             :         unsigned long   msr_a;
     415             :         unsigned long   msr_b;
     416             :         unsigned int    reg_off;
     417             : } pt_address_ranges[] = {
     418             :         {
     419             :                 .msr_a   = MSR_IA32_RTIT_ADDR0_A,
     420             :                 .msr_b   = MSR_IA32_RTIT_ADDR0_B,
     421             :                 .reg_off = RTIT_CTL_ADDR0_OFFSET,
     422             :         },
     423             :         {
     424             :                 .msr_a   = MSR_IA32_RTIT_ADDR1_A,
     425             :                 .msr_b   = MSR_IA32_RTIT_ADDR1_B,
     426             :                 .reg_off = RTIT_CTL_ADDR1_OFFSET,
     427             :         },
     428             :         {
     429             :                 .msr_a   = MSR_IA32_RTIT_ADDR2_A,
     430             :                 .msr_b   = MSR_IA32_RTIT_ADDR2_B,
     431             :                 .reg_off = RTIT_CTL_ADDR2_OFFSET,
     432             :         },
     433             :         {
     434             :                 .msr_a   = MSR_IA32_RTIT_ADDR3_A,
     435             :                 .msr_b   = MSR_IA32_RTIT_ADDR3_B,
     436             :                 .reg_off = RTIT_CTL_ADDR3_OFFSET,
     437             :         }
     438             : };
     439             : 
     440           0 : static u64 pt_config_filters(struct perf_event *event)
     441             : {
     442           0 :         struct pt_filters *filters = event->hw.addr_filters;
     443           0 :         struct pt *pt = this_cpu_ptr(&pt_ctx);
     444           0 :         unsigned int range = 0;
     445           0 :         u64 rtit_ctl = 0;
     446             : 
     447           0 :         if (!filters)
     448             :                 return 0;
     449             : 
     450           0 :         perf_event_addr_filters_sync(event);
     451             : 
     452           0 :         for (range = 0; range < filters->nr_filters; range++) {
     453           0 :                 struct pt_filter *filter = &filters->filter[range];
     454             : 
     455             :                 /*
     456             :                  * Note, if the range has zero start/end addresses due
     457             :                  * to its dynamic object not being loaded yet, we just
      458             :                  * go ahead and program a zeroed range, which will simply
     459             :                  * produce no data. Note^2: if executable code at 0x0
     460             :                  * is a concern, we can set up an "invalid" configuration
     461             :                  * such as msr_b < msr_a.
     462             :                  */
     463             : 
     464             :                 /* avoid redundant msr writes */
     465           0 :                 if (pt->filters.filter[range].msr_a != filter->msr_a) {
     466           0 :                         wrmsrl(pt_address_ranges[range].msr_a, filter->msr_a);
     467           0 :                         pt->filters.filter[range].msr_a = filter->msr_a;
     468             :                 }
     469             : 
     470           0 :                 if (pt->filters.filter[range].msr_b != filter->msr_b) {
     471           0 :                         wrmsrl(pt_address_ranges[range].msr_b, filter->msr_b);
     472           0 :                         pt->filters.filter[range].msr_b = filter->msr_b;
     473             :                 }
     474             : 
     475           0 :                 rtit_ctl |= filter->config << pt_address_ranges[range].reg_off;
     476             :         }
     477             : 
     478             :         return rtit_ctl;
     479             : }
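
/*
 * Editorial sketch, not part of pt.c: what one programmed range looks
 * like. The addresses and the helper are hypothetical; note that msr_b
 * is the inclusive end of the range.
 */
static u64 __maybe_unused pt_filter_example(void)
{
	/* trace only [0x400000, 0x401000) */
	wrmsrl(MSR_IA32_RTIT_ADDR0_A, 0x400000);
	wrmsrl(MSR_IA32_RTIT_ADDR0_B, 0x401000 - 1);

	/* ADDRx_CFG == 1 selects "filter" behavior for range 0 */
	return 1ULL << RTIT_CTL_ADDR0_OFFSET;
}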
     480             : 
     481           0 : static void pt_config(struct perf_event *event)
     482             : {
     483           0 :         struct pt *pt = this_cpu_ptr(&pt_ctx);
     484           0 :         struct pt_buffer *buf = perf_get_aux(&pt->handle);
     485           0 :         u64 reg;
     486             : 
     487             :         /* First round: clear STATUS, in particular the PSB byte counter. */
     488           0 :         if (!event->hw.config) {
     489           0 :                 perf_event_itrace_started(event);
     490           0 :                 wrmsrl(MSR_IA32_RTIT_STATUS, 0);
     491             :         }
     492             : 
     493           0 :         reg = pt_config_filters(event);
     494           0 :         reg |= RTIT_CTL_TRACEEN;
     495           0 :         if (!buf->single)
     496           0 :                 reg |= RTIT_CTL_TOPA;
     497             : 
     498             :         /*
     499             :          * Previously, we had BRANCH_EN on by default, but now that PT has
     500             :          * grown features outside of branch tracing, it is useful to allow
     501             :          * the user to disable it. Setting bit 0 in the event's attr.config
     502             :          * allows BRANCH_EN to pass through instead of being always on. See
     503             :          * also the comment in pt_event_valid().
     504             :          */
     505           0 :         if (event->attr.config & BIT(0)) {
     506           0 :                 reg |= event->attr.config & RTIT_CTL_BRANCH_EN;
     507             :         } else {
     508           0 :                 reg |= RTIT_CTL_BRANCH_EN;
     509             :         }
     510             : 
     511           0 :         if (!event->attr.exclude_kernel)
     512           0 :                 reg |= RTIT_CTL_OS;
     513           0 :         if (!event->attr.exclude_user)
     514           0 :                 reg |= RTIT_CTL_USR;
     515             : 
     516           0 :         reg |= (event->attr.config & PT_CONFIG_MASK);
     517             : 
     518           0 :         event->hw.config = reg;
     519           0 :         pt_config_start(event);
     520           0 : }
     521             : 
     522           0 : static void pt_config_stop(struct perf_event *event)
     523             : {
     524           0 :         struct pt *pt = this_cpu_ptr(&pt_ctx);
     525           0 :         u64 ctl = READ_ONCE(event->hw.config);
     526             : 
     527             :         /* may be already stopped by a PMI */
     528           0 :         if (!(ctl & RTIT_CTL_TRACEEN))
     529             :                 return;
     530             : 
     531           0 :         ctl &= ~RTIT_CTL_TRACEEN;
     532           0 :         if (!READ_ONCE(pt->vmx_on))
     533           0 :                 wrmsrl(MSR_IA32_RTIT_CTL, ctl);
     534             : 
     535           0 :         WRITE_ONCE(event->hw.config, ctl);
     536             : 
     537             :         /*
     538             :          * A wrmsr that disables trace generation serializes other PT
     539             :          * registers and causes all data packets to be written to memory,
     540             :          * but a fence is required for the data to become globally visible.
     541             :          *
     542             :          * The below WMB, separating data store and aux_head store matches
     543             :          * the consumer's RMB that separates aux_head load and data load.
     544             :          */
     545           0 :         wmb();
     546             : }
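
/*
 * Editorial sketch, not part of pt.c: the pairing referred to above.
 * Conceptually, the userspace consumer of the AUX ring buffer does
 * (pc pointing at the perf_event_mmap_page):
 *
 *	head = READ_ONCE(pc->aux_head);
 *	rmb();				// order head load before data loads
 *	... read trace bytes below head ...
 *
 * so the producer's wmb() between the hardware's data stores and the
 * aux_head update keeps the consumer from reading stale bytes.
 */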
     547             : 
     548             : /**
     549             :  * struct topa - ToPA metadata
     550             :  * @list:       linkage to struct pt_buffer's list of tables
     551             :  * @offset:     offset of the first entry in this table in the buffer
     552             :  * @size:       total size of all entries in this table
     553             :  * @last:       index of the last initialized entry in this table
     554             :  * @z_count:    how many times the first entry repeats
     555             :  */
     556             : struct topa {
     557             :         struct list_head        list;
     558             :         u64                     offset;
     559             :         size_t                  size;
     560             :         int                     last;
     561             :         unsigned int            z_count;
     562             : };
     563             : 
     564             : /*
     565             :  * Keep ToPA table-related metadata on the same page as the actual table,
      566             :  * taking up a few words at the end
     567             :  */
     568             : 
     569             : #define TENTS_PER_PAGE  \
     570             :         ((PAGE_SIZE - sizeof(struct topa)) / sizeof(struct topa_entry))
     571             : 
     572             : /**
      573             :  * struct topa_page - page-sized ToPA table with metadata at the end
     574             :  * @table:      actual ToPA table entries, as understood by PT hardware
     575             :  * @topa:       metadata
     576             :  */
     577             : struct topa_page {
     578             :         struct topa_entry       table[TENTS_PER_PAGE];
     579             :         struct topa             topa;
     580             : };
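
/*
 * Editorial worked example, not part of pt.c: assuming 4K pages and
 * x86-64 type sizes, sizeof(struct topa_entry) == 8 and
 * sizeof(struct topa) == 40 (16 list_head + 8 offset + 8 size +
 * 4 last + 4 z_count), so:
 *
 *	TENTS_PER_PAGE = (4096 - 40) / 8 = 507
 *
 * entries per table, with the metadata in the last 40 bytes of the page.
 * This is what lets topa_entry_to_page() below recover the table from
 * any entry pointer by simply masking with PAGE_MASK.
 */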
     581             : 
     582           0 : static inline struct topa_page *topa_to_page(struct topa *topa)
     583             : {
     584           0 :         return container_of(topa, struct topa_page, topa);
     585             : }
     586             : 
     587           0 : static inline struct topa_page *topa_entry_to_page(struct topa_entry *te)
     588             : {
     589           0 :         return (struct topa_page *)((unsigned long)te & PAGE_MASK);
     590             : }
     591             : 
     592           0 : static inline phys_addr_t topa_pfn(struct topa *topa)
     593             : {
     594           0 :         return PFN_DOWN(virt_to_phys(topa_to_page(topa)));
     595             : }
     596             : 
     597             : /* make -1 stand for the last table entry */
     598             : #define TOPA_ENTRY(t, i)                                \
     599             :         ((i) == -1                                      \
     600             :                 ? &topa_to_page(t)->table[(t)->last]  \
     601             :                 : &topa_to_page(t)->table[(i)])
     602             : #define TOPA_ENTRY_SIZE(t, i) (sizes(TOPA_ENTRY((t), (i))->size))
     603             : #define TOPA_ENTRY_PAGES(t, i) (1 << TOPA_ENTRY((t), (i))->size)
     604             : 
     605           0 : static void pt_config_buffer(struct pt_buffer *buf)
     606             : {
     607           0 :         struct pt *pt = this_cpu_ptr(&pt_ctx);
     608           0 :         u64 reg, mask;
     609           0 :         void *base;
     610             : 
     611           0 :         if (buf->single) {
     612           0 :                 base = buf->data_pages[0];
     613           0 :                 mask = (buf->nr_pages * PAGE_SIZE - 1) >> 7;
     614             :         } else {
     615           0 :                 base = topa_to_page(buf->cur)->table;
     616           0 :                 mask = (u64)buf->cur_idx;
     617             :         }
     618             : 
     619           0 :         reg = virt_to_phys(base);
     620           0 :         if (pt->output_base != reg) {
     621           0 :                 pt->output_base = reg;
     622           0 :                 wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, reg);
     623             :         }
     624             : 
     625           0 :         reg = 0x7f | (mask << 7) | ((u64)buf->output_off << 32);
     626           0 :         if (pt->output_mask != reg) {
     627           0 :                 pt->output_mask = reg;
     628           0 :                 wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
     629             :         }
     630           0 : }
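
/*
 * Editorial sketch, not part of pt.c: the OUTPUT_MASK layout assembled
 * above and decoded again in pt_read_offset(). The helper is hypothetical.
 */
static void __maybe_unused pt_output_mask_example(u64 reg)
{
	/* bits 6:0 are always programmed to 0x7f by the driver */
	u32 cur_idx    = (reg & 0xffffff80) >> 7;	/* ToPA entry index, or
							   the size mask in
							   single-range mode */
	u32 output_off = reg >> 32;			/* offset within the
							   current output
							   region */

	pr_debug("idx %u off %u\n", cur_idx, output_off);
}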
     631             : 
     632             : /**
     633             :  * topa_alloc() - allocate page-sized ToPA table
     634             :  * @cpu:        CPU on which to allocate.
     635             :  * @gfp:        Allocation flags.
     636             :  *
     637             :  * Return:      On success, return the pointer to ToPA table page.
     638             :  */
     639           0 : static struct topa *topa_alloc(int cpu, gfp_t gfp)
     640             : {
     641           0 :         int node = cpu_to_node(cpu);
     642           0 :         struct topa_page *tp;
     643           0 :         struct page *p;
     644             : 
     645           0 :         p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
     646           0 :         if (!p)
     647             :                 return NULL;
     648             : 
     649           0 :         tp = page_address(p);
     650           0 :         tp->topa.last = 0;
     651             : 
     652             :         /*
      653             :          * In the case of single-entry ToPA, always put the self-referencing END
     654             :          * link as the 2nd entry in the table
     655             :          */
     656           0 :         if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) {
     657           0 :                 TOPA_ENTRY(&tp->topa, 1)->base = page_to_phys(p) >> TOPA_SHIFT;
     658           0 :                 TOPA_ENTRY(&tp->topa, 1)->end = 1;
     659             :         }
     660             : 
     661           0 :         return &tp->topa;
     662             : }
     663             : 
     664             : /**
     665             :  * topa_free() - free a page-sized ToPA table
     666             :  * @topa:       Table to deallocate.
     667             :  */
     668           0 : static void topa_free(struct topa *topa)
     669             : {
     670           0 :         free_page((unsigned long)topa);
     671             : }
     672             : 
     673             : /**
     674             :  * topa_insert_table() - insert a ToPA table into a buffer
     675             :  * @buf:         PT buffer that's being extended.
     676             :  * @topa:        New topa table to be inserted.
     677             :  *
     678             :  * If it's the first table in this buffer, set up buffer's pointers
      679             :  * accordingly; otherwise, add an END=1 link entry pointing to @topa to the
      680             :  * current "last" table and adjust the last table pointer to @topa.
     681             :  */
     682           0 : static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
     683             : {
     684           0 :         struct topa *last = buf->last;
     685             : 
     686           0 :         list_add_tail(&topa->list, &buf->tables);
     687             : 
     688           0 :         if (!buf->first) {
     689           0 :                 buf->first = buf->last = buf->cur = topa;
     690           0 :                 return;
     691             :         }
     692             : 
     693           0 :         topa->offset = last->offset + last->size;
     694           0 :         buf->last = topa;
     695             : 
     696           0 :         if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
     697             :                 return;
     698             : 
     699           0 :         BUG_ON(last->last != TENTS_PER_PAGE - 1);
     700             : 
     701           0 :         TOPA_ENTRY(last, -1)->base = topa_pfn(topa);
     702           0 :         TOPA_ENTRY(last, -1)->end = 1;
     703             : }
     704             : 
     705             : /**
     706             :  * topa_table_full() - check if a ToPA table is filled up
     707             :  * @topa:       ToPA table.
     708             :  */
     709           0 : static bool topa_table_full(struct topa *topa)
     710             : {
     711             :         /* single-entry ToPA is a special case */
     712           0 :         if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
     713           0 :                 return !!topa->last;
     714             : 
     715           0 :         return topa->last == TENTS_PER_PAGE - 1;
     716             : }
     717             : 
     718             : /**
     719             :  * topa_insert_pages() - create a list of ToPA tables
     720             :  * @buf:        PT buffer being initialized.
     721             :  * @gfp:        Allocation flags.
     722             :  *
     723             :  * This initializes a list of ToPA tables with entries from
     724             :  * the data_pages provided by rb_alloc_aux().
     725             :  *
     726             :  * Return:      0 on success or error code.
     727             :  */
     728           0 : static int topa_insert_pages(struct pt_buffer *buf, int cpu, gfp_t gfp)
     729             : {
     730           0 :         struct topa *topa = buf->last;
     731           0 :         int order = 0;
     732           0 :         struct page *p;
     733             : 
     734           0 :         p = virt_to_page(buf->data_pages[buf->nr_pages]);
     735           0 :         if (PagePrivate(p))
     736           0 :                 order = page_private(p);
     737             : 
     738           0 :         if (topa_table_full(topa)) {
     739           0 :                 topa = topa_alloc(cpu, gfp);
     740           0 :                 if (!topa)
     741             :                         return -ENOMEM;
     742             : 
     743           0 :                 topa_insert_table(buf, topa);
     744             :         }
     745             : 
     746           0 :         if (topa->z_count == topa->last - 1) {
     747           0 :                 if (order == TOPA_ENTRY(topa, topa->last - 1)->size)
     748           0 :                         topa->z_count++;
     749             :         }
     750             : 
     751           0 :         TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
     752           0 :         TOPA_ENTRY(topa, -1)->size = order;
     753           0 :         if (!buf->snapshot &&
     754           0 :             !intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) {
     755           0 :                 TOPA_ENTRY(topa, -1)->intr = 1;
     756           0 :                 TOPA_ENTRY(topa, -1)->stop = 1;
     757             :         }
     758             : 
     759           0 :         topa->last++;
     760           0 :         topa->size += sizes(order);
     761             : 
     762           0 :         buf->nr_pages += 1ul << order;
     763             : 
     764           0 :         return 0;
     765             : }
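
/*
 * Editorial note, not part of pt.c: the z_count bookkeeping above, with
 * illustrative numbers. If the AUX buffer arrives as eight order-3 chunks,
 * entries 0..7 of the first table all get size == 3 and z_count ends up
 * at 7: "the first entry repeats 7 more times". pt_topa_entry_for_page()
 * uses this to skip the whole run with one division instead of walking it.
 */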
     766             : 
     767             : /**
     768             :  * pt_topa_dump() - print ToPA tables and their entries
     769             :  * @buf:        PT buffer.
     770             :  */
     771           0 : static void pt_topa_dump(struct pt_buffer *buf)
     772             : {
     773           0 :         struct topa *topa;
     774             : 
     775           0 :         list_for_each_entry(topa, &buf->tables, list) {
     776           0 :                 struct topa_page *tp = topa_to_page(topa);
     777             :                 int i;
     778             : 
     779             :                 pr_debug("# table @%p, off %llx size %zx\n", tp->table,
     780             :                          topa->offset, topa->size);
     781           0 :                 for (i = 0; i < TENTS_PER_PAGE; i++) {
     782           0 :                         pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
     783             :                                  &tp->table[i],
     784             :                                  (unsigned long)tp->table[i].base << TOPA_SHIFT,
     785             :                                  sizes(tp->table[i].size),
     786             :                                  tp->table[i].end ?  'E' : ' ',
     787             :                                  tp->table[i].intr ? 'I' : ' ',
     788             :                                  tp->table[i].stop ? 'S' : ' ',
     789             :                                  *(u64 *)&tp->table[i]);
     790           0 :                         if ((intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) &&
     791           0 :                              tp->table[i].stop) ||
     792           0 :                             tp->table[i].end)
     793             :                                 break;
     794           0 :                         if (!i && topa->z_count)
     795           0 :                                 i += topa->z_count;
     796             :                 }
     797             :         }
     798           0 : }
     799             : 
     800             : /**
     801             :  * pt_buffer_advance() - advance to the next output region
     802             :  * @buf:        PT buffer.
     803             :  *
     804             :  * Advance the current pointers in the buffer to the next ToPA entry.
     805             :  */
     806           0 : static void pt_buffer_advance(struct pt_buffer *buf)
     807             : {
     808           0 :         buf->output_off = 0;
     809           0 :         buf->cur_idx++;
     810             : 
     811           0 :         if (buf->cur_idx == buf->cur->last) {
     812           0 :                 if (buf->cur == buf->last)
     813           0 :                         buf->cur = buf->first;
     814             :                 else
     815           0 :                         buf->cur = list_entry(buf->cur->list.next, struct topa,
     816             :                                               list);
     817           0 :                 buf->cur_idx = 0;
     818             :         }
     819           0 : }
     820             : 
     821             : /**
     822             :  * pt_update_head() - calculate current offsets and sizes
     823             :  * @pt:         Per-cpu pt context.
     824             :  *
     825             :  * Update buffer's current write pointer position and data size.
     826             :  */
     827           0 : static void pt_update_head(struct pt *pt)
     828             : {
     829           0 :         struct pt_buffer *buf = perf_get_aux(&pt->handle);
     830           0 :         u64 topa_idx, base, old;
     831             : 
     832           0 :         if (buf->single) {
     833           0 :                 local_set(&buf->data_size, buf->output_off);
     834           0 :                 return;
     835             :         }
     836             : 
     837             :         /* offset of the first region in this table from the beginning of buf */
     838           0 :         base = buf->cur->offset + buf->output_off;
     839             : 
     840             :         /* offset of the current output region within this table */
     841           0 :         for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
     842           0 :                 base += TOPA_ENTRY_SIZE(buf->cur, topa_idx);
     843             : 
     844           0 :         if (buf->snapshot) {
     845           0 :                 local_set(&buf->data_size, base);
     846             :         } else {
     847           0 :                 old = (local64_xchg(&buf->head, base) &
     848           0 :                        ((buf->nr_pages << PAGE_SHIFT) - 1));
     849           0 :                 if (base < old)
     850           0 :                         base += buf->nr_pages << PAGE_SHIFT;
     851             : 
     852           0 :                 local_add(base - old, &buf->data_size);
     853             :         }
     854             : }
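
/*
 * Editorial worked example, not part of pt.c: the wrap-around fixup
 * above, with illustrative numbers. For a 64K buffer
 * (nr_pages << PAGE_SHIFT == 0x10000), an old masked head of 0xf000 and
 * a new write pointer of 0x1000:
 *
 *	base(0x1000) < old(0xf000)  ->  base += 0x10000  ->  0x11000
 *	data_size   += base - old   ==  0x2000
 *
 * i.e. the trace wrapped past the end of the buffer and produced
 * 0x2000 new bytes.
 */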
     855             : 
     856             : /**
     857             :  * pt_buffer_region() - obtain current output region's address
     858             :  * @buf:        PT buffer.
     859             :  */
     860           0 : static void *pt_buffer_region(struct pt_buffer *buf)
     861             : {
     862           0 :         return phys_to_virt(TOPA_ENTRY(buf->cur, buf->cur_idx)->base << TOPA_SHIFT);
     863             : }
     864             : 
     865             : /**
     866             :  * pt_buffer_region_size() - obtain current output region's size
     867             :  * @buf:        PT buffer.
     868             :  */
     869           0 : static size_t pt_buffer_region_size(struct pt_buffer *buf)
     870             : {
     871           0 :         return TOPA_ENTRY_SIZE(buf->cur, buf->cur_idx);
     872             : }
     873             : 
     874             : /**
     875             :  * pt_handle_status() - take care of possible status conditions
     876             :  * @pt:         Per-cpu pt context.
     877             :  */
     878           0 : static void pt_handle_status(struct pt *pt)
     879             : {
     880           0 :         struct pt_buffer *buf = perf_get_aux(&pt->handle);
     881           0 :         int advance = 0;
     882           0 :         u64 status;
     883             : 
     884           0 :         rdmsrl(MSR_IA32_RTIT_STATUS, status);
     885             : 
     886           0 :         if (status & RTIT_STATUS_ERROR) {
     887           0 :                 pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
     888           0 :                 pt_topa_dump(buf);
     889           0 :                 status &= ~RTIT_STATUS_ERROR;
     890             :         }
     891             : 
     892           0 :         if (status & RTIT_STATUS_STOPPED) {
     893           0 :                 status &= ~RTIT_STATUS_STOPPED;
     894             : 
     895             :                 /*
     896             :                  * On systems that only do single-entry ToPA, hitting STOP
     897             :                  * means we are already losing data; need to let the decoder
     898             :                  * know.
     899             :                  */
     900           0 :                 if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) ||
     901           0 :                     buf->output_off == pt_buffer_region_size(buf)) {
     902           0 :                         perf_aux_output_flag(&pt->handle,
     903             :                                              PERF_AUX_FLAG_TRUNCATED);
     904           0 :                         advance++;
     905             :                 }
     906             :         }
     907             : 
     908             :         /*
      909             :          * Also, on single-entry ToPA implementations, the interrupt will
      910             :          * arrive before the output reaches its output region's boundary.
     911             :          */
     912           0 :         if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) &&
     913           0 :             !buf->snapshot &&
     914           0 :             pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
     915           0 :                 void *head = pt_buffer_region(buf);
     916             : 
     917             :                 /* everything within this margin needs to be zeroed out */
     918           0 :                 memset(head + buf->output_off, 0,
     919           0 :                        pt_buffer_region_size(buf) -
     920             :                        buf->output_off);
     921           0 :                 advance++;
     922             :         }
     923             : 
     924           0 :         if (advance)
     925           0 :                 pt_buffer_advance(buf);
     926             : 
     927           0 :         wrmsrl(MSR_IA32_RTIT_STATUS, status);
     928           0 : }
     929             : 
     930             : /**
     931             :  * pt_read_offset() - translate registers into buffer pointers
     932             :  * @buf:        PT buffer.
     933             :  *
     934             :  * Set buffer's output pointers from MSR values.
     935             :  */
     936           0 : static void pt_read_offset(struct pt_buffer *buf)
     937             : {
     938           0 :         struct pt *pt = this_cpu_ptr(&pt_ctx);
     939           0 :         struct topa_page *tp;
     940             : 
     941           0 :         if (!buf->single) {
     942           0 :                 rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, pt->output_base);
     943           0 :                 tp = phys_to_virt(pt->output_base);
     944           0 :                 buf->cur = &tp->topa;
     945             :         }
     946             : 
     947           0 :         rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, pt->output_mask);
     948             :         /* offset within current output region */
     949           0 :         buf->output_off = pt->output_mask >> 32;
     950             :         /* index of current output region within this table */
     951           0 :         if (!buf->single)
     952           0 :                 buf->cur_idx = (pt->output_mask & 0xffffff80) >> 7;
     953           0 : }
     954             : 
     955             : static struct topa_entry *
     956           0 : pt_topa_entry_for_page(struct pt_buffer *buf, unsigned int pg)
     957             : {
     958           0 :         struct topa_page *tp;
     959           0 :         struct topa *topa;
     960           0 :         unsigned int idx, cur_pg = 0, z_pg = 0, start_idx = 0;
     961             : 
     962             :         /*
     963             :          * Indicates a bug in the caller.
     964             :          */
     965           0 :         if (WARN_ON_ONCE(pg >= buf->nr_pages))
     966             :                 return NULL;
     967             : 
     968             :         /*
     969             :          * First, find the ToPA table where @pg fits. With high
     970             :          * order allocations, there shouldn't be many of these.
     971             :          */
     972           0 :         list_for_each_entry(topa, &buf->tables, list) {
     973           0 :                 if (topa->offset + topa->size > pg << PAGE_SHIFT)
     974           0 :                         goto found;
     975             :         }
     976             : 
     977             :         /*
     978             :          * Hitting this means we have a problem in the ToPA
     979             :          * allocation code.
     980             :          */
     981           0 :         WARN_ON_ONCE(1);
     982             : 
     983           0 :         return NULL;
     984             : 
     985           0 : found:
     986             :         /*
     987             :          * Indicates a problem in the ToPA allocation code.
     988             :          */
     989           0 :         if (WARN_ON_ONCE(topa->last == -1))
     990             :                 return NULL;
     991             : 
     992           0 :         tp = topa_to_page(topa);
     993           0 :         cur_pg = PFN_DOWN(topa->offset);
     994           0 :         if (topa->z_count) {
     995           0 :                 z_pg = TOPA_ENTRY_PAGES(topa, 0) * (topa->z_count + 1);
     996           0 :                 start_idx = topa->z_count + 1;
     997             :         }
     998             : 
     999             :         /*
    1000             :          * Multiple entries at the beginning of the table have the same size,
    1001             :          * ideally all of them; if @pg falls there, the search is done.
    1002             :          */
    1003           0 :         if (pg >= cur_pg && pg < cur_pg + z_pg) {
    1004           0 :                 idx = (pg - cur_pg) / TOPA_ENTRY_PAGES(topa, 0);
    1005           0 :                 return &tp->table[idx];
    1006             :         }
    1007             : 
    1008             :         /*
    1009             :          * Otherwise, slow path: iterate through the remaining entries.
    1010             :          */
    1011           0 :         for (idx = start_idx, cur_pg += z_pg; idx < topa->last; idx++) {
    1012           0 :                 if (cur_pg + TOPA_ENTRY_PAGES(topa, idx) > pg)
    1013           0 :                         return &tp->table[idx];
    1014             : 
    1015           0 :                 cur_pg += TOPA_ENTRY_PAGES(topa, idx);
    1016             :         }
    1017             : 
    1018             :         /*
     1019             :          * Means we couldn't find a matching ToPA entry in this table.
    1020             :          */
    1021           0 :         WARN_ON_ONCE(1);
    1022             : 
    1023           0 :         return NULL;
    1024             : }
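
The arithmetic above is easier to follow outside the driver. Below is a
standalone sketch (not kernel code; entry_for_page() and its arguments are
made up, and the table is assumed to start at page 0) of the same lookup: the
equal-sized run of z_count + 1 entries at the start of the table is resolved
with a single division, and only the tail of the table is walked linearly.

        /* Find the entry covering page @pg, given per-entry sizes in pages. */
        static int entry_for_page(const unsigned int *pages, int last,
                                  int z_count, unsigned int pg)
        {
                unsigned int cur_pg, z_pg = pages[0] * (z_count + 1);
                int idx;

                if (pg < z_pg)                  /* fast path: equal-sized run */
                        return pg / pages[0];

                for (idx = z_count + 1, cur_pg = z_pg; idx < last; idx++) {
                        if (cur_pg + pages[idx] > pg)
                                return idx;
                        cur_pg += pages[idx];
                }
                return -1;                      /* bad table: caller's bug */
        }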
    1025             : 
    1026             : static struct topa_entry *
    1027           0 : pt_topa_prev_entry(struct pt_buffer *buf, struct topa_entry *te)
    1028             : {
    1029           0 :         unsigned long table = (unsigned long)te & ~(PAGE_SIZE - 1);
    1030           0 :         struct topa_page *tp;
    1031           0 :         struct topa *topa;
    1032             : 
    1033           0 :         tp = (struct topa_page *)table;
    1034           0 :         if (tp->table != te)
    1035           0 :                 return --te;
    1036             : 
    1037           0 :         topa = &tp->topa;
    1038           0 :         if (topa == buf->first)
    1039           0 :                 topa = buf->last;
    1040             :         else
    1041           0 :                 topa = list_prev_entry(topa, list);
    1042             : 
    1043           0 :         tp = topa_to_page(topa);
    1044             : 
    1045           0 :         return &tp->table[topa->last - 1];
    1046             : }
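
The pointer masking above works because each ToPA table sits at the start of
its own page-aligned struct topa_page, so any entry pointer can be rounded
down to recover its container. A generic sketch of the trick (names are
illustrative; a 4 KiB page is assumed):

        struct page_sized_table { unsigned long entries[512]; };

        /* Recover the page-aligned container from an interior pointer. */
        static struct page_sized_table *table_of_entry(void *entry)
        {
                return (struct page_sized_table *)
                        ((unsigned long)entry & ~(4096UL - 1));
        }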
    1047             : 
    1048             : /**
    1049             :  * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
    1050             :  * @buf:        PT buffer.
    1051             :  * @handle:     Current output handle.
    1052             :  *
     1053             :  * Place INT and STOP marks to prevent overwriting old data that the consumer
     1054             :  * hasn't yet collected, and to wake up the consumer after a certain fraction
     1055             :  * of the buffer has filled up. Only needed and sensible for non-snapshot counters.
    1056             :  *
    1057             :  * This obviously relies on buf::head to figure out buffer markers, so it has
    1058             :  * to be called after pt_buffer_reset_offsets() and before the hardware tracing
    1059             :  * is enabled.
    1060             :  */
    1061           0 : static int pt_buffer_reset_markers(struct pt_buffer *buf,
    1062             :                                    struct perf_output_handle *handle)
    1063             : 
    1064             : {
    1065           0 :         unsigned long head = local64_read(&buf->head);
    1066           0 :         unsigned long idx, npages, wakeup;
    1067             : 
    1068           0 :         if (buf->single)
    1069             :                 return 0;
    1070             : 
    1071             :         /* can't stop in the middle of an output region */
    1072           0 :         if (buf->output_off + handle->size + 1 < pt_buffer_region_size(buf)) {
    1073           0 :                 perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
    1074           0 :                 return -EINVAL;
    1075             :         }
    1076             : 
    1077             : 
    1078             :         /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
    1079           0 :         if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
    1080             :                 return 0;
    1081             : 
    1082             :         /* clear STOP and INT from current entry */
    1083           0 :         if (buf->stop_te) {
    1084           0 :                 buf->stop_te->stop = 0;
    1085           0 :                 buf->stop_te->intr = 0;
    1086             :         }
    1087             : 
    1088           0 :         if (buf->intr_te)
    1089           0 :                 buf->intr_te->intr = 0;
    1090             : 
    1091             :         /* how many pages till the STOP marker */
    1092           0 :         npages = handle->size >> PAGE_SHIFT;
    1093             : 
    1094             :         /* if it's on a page boundary, fill up one more page */
    1095           0 :         if (!offset_in_page(head + handle->size + 1))
    1096           0 :                 npages++;
    1097             : 
    1098           0 :         idx = (head >> PAGE_SHIFT) + npages;
    1099           0 :         idx &= buf->nr_pages - 1;
    1100             : 
    1101           0 :         if (idx != buf->stop_pos) {
    1102           0 :                 buf->stop_pos = idx;
    1103           0 :                 buf->stop_te = pt_topa_entry_for_page(buf, idx);
    1104           0 :                 buf->stop_te = pt_topa_prev_entry(buf, buf->stop_te);
    1105             :         }
    1106             : 
    1107           0 :         wakeup = handle->wakeup >> PAGE_SHIFT;
    1108             : 
    1109             :         /* in the worst case, wake up the consumer one page before hard stop */
    1110           0 :         idx = (head >> PAGE_SHIFT) + npages - 1;
    1111           0 :         if (idx > wakeup)
    1112             :                 idx = wakeup;
    1113             : 
    1114           0 :         idx &= buf->nr_pages - 1;
    1115           0 :         if (idx != buf->intr_pos) {
    1116           0 :                 buf->intr_pos = idx;
    1117           0 :                 buf->intr_te = pt_topa_entry_for_page(buf, idx);
    1118           0 :                 buf->intr_te = pt_topa_prev_entry(buf, buf->intr_te);
    1119             :         }
    1120             : 
    1121           0 :         buf->stop_te->stop = 1;
    1122           0 :         buf->stop_te->intr = 1;
    1123           0 :         buf->intr_te->intr = 1;
    1124             : 
    1125           0 :         return 0;
    1126             : }
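
A worked example of the index math above, with made-up values, assuming
PAGE_SHIFT == 12 and an 8-page AUX buffer:

        static void worked_example(void)
        {
                unsigned long head = 0x1800, size = 0x27ff, nr_pages = 8;
                unsigned long npages = size >> 12;      /* 2 full pages */

                if (!((head + size + 1) & 0xfff))       /* 0x4000: aligned, */
                        npages++;                       /* fill one more    */

                /* STOP page: ((head >> 12) + npages) & (nr_pages - 1) == 4;
                 * INT page (before the wakeup clamp): one earlier, == 3. */
        }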
    1127             : 
    1128             : /**
    1129             :  * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
    1130             :  * @buf:        PT buffer.
    1131             :  * @head:       Write pointer (aux_head) from AUX buffer.
    1132             :  *
     1133             :  * Find the ToPA table and entry corresponding to the given @head and set the
     1134             :  * buffer's "current" pointers accordingly. This is done after we have obtained
     1135             :  * the current aux_head position from a successful call to perf_aux_output_begin()
     1136             :  * to make sure the hardware is writing to the right place.
     1137             :  *
     1138             :  * This function modifies buf::{cur,cur_idx,output_off}, which will be programmed
     1139             :  * into the PT MSRs when tracing is enabled, as well as buf::head and
     1140             :  * buf::data_size, which are used to determine the INT and STOP markers'
     1141             :  * locations by a subsequent call to pt_buffer_reset_markers().
    1142             :  */
    1143           0 : static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
    1144             : {
    1145           0 :         struct topa_page *cur_tp;
    1146           0 :         struct topa_entry *te;
    1147           0 :         int pg;
    1148             : 
    1149           0 :         if (buf->snapshot)
    1150           0 :                 head &= (buf->nr_pages << PAGE_SHIFT) - 1;
    1151             : 
    1152           0 :         if (!buf->single) {
    1153           0 :                 pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
    1154           0 :                 te = pt_topa_entry_for_page(buf, pg);
    1155             : 
    1156           0 :                 cur_tp = topa_entry_to_page(te);
    1157           0 :                 buf->cur = &cur_tp->topa;
    1158           0 :                 buf->cur_idx = te - TOPA_ENTRY(buf->cur, 0);
    1159           0 :                 buf->output_off = head & (pt_buffer_region_size(buf) - 1);
    1160             :         } else {
    1161           0 :                 buf->output_off = head;
    1162             :         }
    1163             : 
    1164           0 :         local64_set(&buf->head, head);
    1165           0 :         local_set(&buf->data_size, 0);
    1166           0 : }
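
For example (made-up numbers, PAGE_SHIFT == 12): with an 8-page multi-entry
buffer and head == 0x5230, the write pointer sits on page (0x5230 >> 12) & 7
== 5; if the ToPA entry covering that page describes a 4-page (0x4000-byte)
output region, tracing is resumed 0x5230 & 0x3fff == 0x1230 bytes into it.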
    1167             : 
    1168             : /**
    1169             :  * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
    1170             :  * @buf:        PT buffer.
    1171             :  */
    1172           0 : static void pt_buffer_fini_topa(struct pt_buffer *buf)
    1173             : {
    1174           0 :         struct topa *topa, *iter;
    1175             : 
    1176           0 :         if (buf->single)
    1177             :                 return;
    1178             : 
    1179           0 :         list_for_each_entry_safe(topa, iter, &buf->tables, list) {
    1180             :                 /*
    1181             :                  * right now, this is in free_aux() path only, so
    1182             :                  * no need to unlink this table from the list
    1183             :                  */
    1184           0 :                 topa_free(topa);
    1185             :         }
    1186             : }
    1187             : 
    1188             : /**
    1189             :  * pt_buffer_init_topa() - initialize ToPA table for pt buffer
    1190             :  * @buf:        PT buffer.
     1191             :  * @nr_pages:   Number of pages in the buffer; allocated on @cpu's node.
    1192             :  * @gfp:        Allocation flags.
    1193             :  */
    1194           0 : static int pt_buffer_init_topa(struct pt_buffer *buf, int cpu,
    1195             :                                unsigned long nr_pages, gfp_t gfp)
    1196             : {
    1197           0 :         struct topa *topa;
    1198           0 :         int err;
    1199             : 
    1200           0 :         topa = topa_alloc(cpu, gfp);
    1201           0 :         if (!topa)
    1202             :                 return -ENOMEM;
    1203             : 
    1204           0 :         topa_insert_table(buf, topa);
    1205             : 
    1206           0 :         while (buf->nr_pages < nr_pages) {
    1207           0 :                 err = topa_insert_pages(buf, cpu, gfp);
    1208           0 :                 if (err) {
    1209           0 :                         pt_buffer_fini_topa(buf);
    1210           0 :                         return -ENOMEM;
    1211             :                 }
    1212             :         }
    1213             : 
    1214             :         /* link last table to the first one, unless we're double buffering */
    1215           0 :         if (intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) {
    1216           0 :                 TOPA_ENTRY(buf->last, -1)->base = topa_pfn(buf->first);
    1217           0 :                 TOPA_ENTRY(buf->last, -1)->end = 1;
    1218             :         }
    1219             : 
    1220           0 :         pt_topa_dump(buf);
    1221           0 :         return 0;
    1222             : }
    1223             : 
    1224           0 : static int pt_buffer_try_single(struct pt_buffer *buf, int nr_pages)
    1225             : {
    1226           0 :         struct page *p = virt_to_page(buf->data_pages[0]);
    1227           0 :         int ret = -ENOTSUPP, order = 0;
    1228             : 
    1229             :         /*
    1230             :          * We can use single range output mode
    1231             :          * + in snapshot mode, where we don't need interrupts;
    1232             :          * + if the hardware supports it;
    1233             :          * + if the entire buffer is one contiguous allocation.
    1234             :          */
    1235           0 :         if (!buf->snapshot)
    1236           0 :                 goto out;
    1237             : 
    1238           0 :         if (!intel_pt_validate_hw_cap(PT_CAP_single_range_output))
    1239           0 :                 goto out;
    1240             : 
    1241           0 :         if (PagePrivate(p))
    1242           0 :                 order = page_private(p);
    1243             : 
    1244           0 :         if (1 << order != nr_pages)
    1245           0 :                 goto out;
    1246             : 
    1247           0 :         buf->single = true;
    1248           0 :         buf->nr_pages = nr_pages;
    1249           0 :         ret = 0;
    1250           0 : out:
    1251           0 :         return ret;
    1252             : }
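
In other words, a 16-page buffer qualifies only if perf allocated it as a
single order-4 block, so that the order recorded on the first page covers the
whole buffer. A minimal restatement of the test (illustrative only):

        /* true iff the first allocation spans the entire buffer */
        static int spans_whole_buffer(int order, int nr_pages)
        {
                return (1 << order) == nr_pages;
        }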
    1253             : 
    1254             : /**
    1255             :  * pt_buffer_setup_aux() - set up topa tables for a PT buffer
     1256             :  * @event:      PT event; an event->cpu of -1 means the current CPU.
    1257             :  * @pages:      Array of pointers to buffer pages passed from perf core.
    1258             :  * @nr_pages:   Number of pages in the buffer.
    1259             :  * @snapshot:   If this is a snapshot/overwrite counter.
    1260             :  *
    1261             :  * This is a pmu::setup_aux callback that sets up ToPA tables and all the
    1262             :  * bookkeeping for an AUX buffer.
    1263             :  *
    1264             :  * Return:      Our private PT buffer structure.
    1265             :  */
    1266             : static void *
    1267           0 : pt_buffer_setup_aux(struct perf_event *event, void **pages,
    1268             :                     int nr_pages, bool snapshot)
    1269             : {
    1270           0 :         struct pt_buffer *buf;
    1271           0 :         int node, ret, cpu = event->cpu;
    1272             : 
    1273           0 :         if (!nr_pages)
    1274             :                 return NULL;
    1275             : 
    1276             :         /*
    1277             :          * Only support AUX sampling in snapshot mode, where we don't
    1278             :          * generate NMIs.
    1279             :          */
    1280           0 :         if (event->attr.aux_sample_size && !snapshot)
    1281             :                 return NULL;
    1282             : 
    1283           0 :         if (cpu == -1)
    1284           0 :                 cpu = raw_smp_processor_id();
    1285           0 :         node = cpu_to_node(cpu);
    1286             : 
    1287           0 :         buf = kzalloc_node(sizeof(struct pt_buffer), GFP_KERNEL, node);
    1288           0 :         if (!buf)
    1289             :                 return NULL;
    1290             : 
    1291           0 :         buf->snapshot = snapshot;
    1292           0 :         buf->data_pages = pages;
    1293           0 :         buf->stop_pos = -1;
    1294           0 :         buf->intr_pos = -1;
    1295             : 
    1296           0 :         INIT_LIST_HEAD(&buf->tables);
    1297             : 
    1298           0 :         ret = pt_buffer_try_single(buf, nr_pages);
    1299           0 :         if (!ret)
    1300             :                 return buf;
    1301             : 
    1302           0 :         ret = pt_buffer_init_topa(buf, cpu, nr_pages, GFP_KERNEL);
    1303           0 :         if (ret) {
    1304           0 :                 kfree(buf);
    1305           0 :                 return NULL;
    1306             :         }
    1307             : 
    1308             :         return buf;
    1309             : }
    1310             : 
    1311             : /**
    1312             :  * pt_buffer_free_aux() - perf AUX deallocation path callback
    1313             :  * @data:       PT buffer.
    1314             :  */
    1315           0 : static void pt_buffer_free_aux(void *data)
    1316             : {
    1317           0 :         struct pt_buffer *buf = data;
    1318             : 
    1319           0 :         pt_buffer_fini_topa(buf);
    1320           0 :         kfree(buf);
    1321           0 : }
    1322             : 
    1323           0 : static int pt_addr_filters_init(struct perf_event *event)
    1324             : {
    1325           0 :         struct pt_filters *filters;
    1326           0 :         int node = event->cpu == -1 ? -1 : cpu_to_node(event->cpu);
    1327             : 
    1328           0 :         if (!intel_pt_validate_hw_cap(PT_CAP_num_address_ranges))
    1329             :                 return 0;
    1330             : 
    1331           0 :         filters = kzalloc_node(sizeof(struct pt_filters), GFP_KERNEL, node);
    1332           0 :         if (!filters)
    1333             :                 return -ENOMEM;
    1334             : 
    1335           0 :         if (event->parent)
    1336           0 :                 memcpy(filters, event->parent->hw.addr_filters,
    1337             :                        sizeof(*filters));
    1338             : 
    1339           0 :         event->hw.addr_filters = filters;
    1340             : 
    1341           0 :         return 0;
    1342             : }
    1343             : 
    1344           0 : static void pt_addr_filters_fini(struct perf_event *event)
    1345             : {
    1346           0 :         kfree(event->hw.addr_filters);
    1347           0 :         event->hw.addr_filters = NULL;
    1348             : }
    1349             : 
    1350           0 : static inline bool valid_kernel_ip(unsigned long ip)
    1351             : {
    1352           0 :         return virt_addr_valid(ip) && kernel_ip(ip);
    1353             : }
    1354             : 
    1355           0 : static int pt_event_addr_filters_validate(struct list_head *filters)
    1356             : {
    1357           0 :         struct perf_addr_filter *filter;
    1358           0 :         int range = 0;
    1359             : 
    1360           0 :         list_for_each_entry(filter, filters, entry) {
    1361             :                 /*
     1362             :                  * PT doesn't support single-address triggers or
     1363             :                  * 'start' filters.
    1364             :                  */
    1365           0 :                 if (!filter->size ||
    1366           0 :                     filter->action == PERF_ADDR_FILTER_ACTION_START)
    1367             :                         return -EOPNOTSUPP;
    1368             : 
    1369           0 :                 if (!filter->path.dentry) {
    1370           0 :                         if (!valid_kernel_ip(filter->offset))
    1371             :                                 return -EINVAL;
    1372             : 
    1373           0 :                         if (!valid_kernel_ip(filter->offset + filter->size))
    1374             :                                 return -EINVAL;
    1375             :                 }
    1376             : 
    1377           0 :                 if (++range > intel_pt_validate_hw_cap(PT_CAP_num_address_ranges))
    1378             :                         return -EOPNOTSUPP;
    1379             :         }
    1380             : 
    1381             :         return 0;
    1382             : }
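
Userspace attaches these filters as strings; below is a hedged sketch (the
descriptor, range and path are made up; PERF_EVENT_IOC_SET_FILTER is the
standard perf ioctl) of what ends up being checked by the code above:

        #include <sys/ioctl.h>
        #include <linux/perf_event.h>

        static int set_pt_filter(int pt_event_fd)
        {
                /* trace only 4 KiB of text at offset 0x1000 of the object;
                 * a zero-sized range or a 'start' action would be rejected */
                return ioctl(pt_event_fd, PERF_EVENT_IOC_SET_FILTER,
                             "filter 0x1000/0x1000@/usr/bin/ls");
        }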
    1383             : 
    1384           0 : static void pt_event_addr_filters_sync(struct perf_event *event)
    1385             : {
    1386           0 :         struct perf_addr_filters_head *head = perf_event_addr_filters(event);
    1387           0 :         unsigned long msr_a, msr_b;
    1388           0 :         struct perf_addr_filter_range *fr = event->addr_filter_ranges;
    1389           0 :         struct pt_filters *filters = event->hw.addr_filters;
    1390           0 :         struct perf_addr_filter *filter;
    1391           0 :         int range = 0;
    1392             : 
    1393           0 :         if (!filters)
    1394             :                 return;
    1395             : 
    1396           0 :         list_for_each_entry(filter, &head->list, entry) {
    1397           0 :                 if (filter->path.dentry && !fr[range].start) {
    1398             :                         msr_a = msr_b = 0;
    1399             :                 } else {
    1400             :                         /* apply the offset */
    1401           0 :                         msr_a = fr[range].start;
    1402           0 :                         msr_b = msr_a + fr[range].size - 1;
    1403             :                 }
    1404             : 
    1405           0 :                 filters->filter[range].msr_a  = msr_a;
    1406           0 :                 filters->filter[range].msr_b  = msr_b;
    1407           0 :                 if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER)
    1408           0 :                         filters->filter[range].config = 1;
    1409             :                 else
    1410           0 :                         filters->filter[range].config = 2;
    1411           0 :                 range++;
    1412             :         }
    1413             : 
    1414           0 :         filters->nr_filters = range;
    1415             : }
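
For instance, a resolved range starting at 0x401000 with size 0x1000 becomes
the inclusive pair msr_a == 0x401000, msr_b == 0x401fff; config 1 marks it as
a trace-filter region, and config 2 (the only other action that survives
validation) as a TraceStop region.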
    1416             : 
    1417             : /**
    1418             :  * intel_pt_interrupt() - PT PMI handler
    1419             :  */
    1420           0 : void intel_pt_interrupt(void)
    1421             : {
    1422           0 :         struct pt *pt = this_cpu_ptr(&pt_ctx);
    1423           0 :         struct pt_buffer *buf;
    1424           0 :         struct perf_event *event = pt->handle.event;
    1425             : 
    1426             :         /*
    1427             :          * There may be a dangling PT bit in the interrupt status register
    1428             :          * after PT has been disabled by pt_event_stop(). Make sure we don't
    1429             :          * do anything (particularly, re-enable) for this event here.
    1430             :          */
    1431           0 :         if (!READ_ONCE(pt->handle_nmi))
    1432             :                 return;
    1433             : 
    1434           0 :         if (!event)
    1435             :                 return;
    1436             : 
    1437           0 :         pt_config_stop(event);
    1438             : 
    1439           0 :         buf = perf_get_aux(&pt->handle);
    1440           0 :         if (!buf)
    1441             :                 return;
    1442             : 
    1443           0 :         pt_read_offset(buf);
    1444             : 
    1445           0 :         pt_handle_status(pt);
    1446             : 
    1447           0 :         pt_update_head(pt);
    1448             : 
    1449           0 :         perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0));
    1450             : 
    1451           0 :         if (!event->hw.state) {
    1452           0 :                 int ret;
    1453             : 
    1454           0 :                 buf = perf_aux_output_begin(&pt->handle, event);
    1455           0 :                 if (!buf) {
    1456           0 :                         event->hw.state = PERF_HES_STOPPED;
    1457           0 :                         return;
    1458             :                 }
    1459             : 
    1460           0 :                 pt_buffer_reset_offsets(buf, pt->handle.head);
    1461             :                 /* snapshot counters don't use PMI, so it's safe */
    1462           0 :                 ret = pt_buffer_reset_markers(buf, &pt->handle);
    1463           0 :                 if (ret) {
    1464           0 :                         perf_aux_output_end(&pt->handle, 0);
    1465           0 :                         return;
    1466             :                 }
    1467             : 
    1468           0 :                 pt_config_buffer(buf);
    1469           0 :                 pt_config_start(event);
    1470             :         }
    1471             : }
    1472             : 
    1473           0 : void intel_pt_handle_vmx(int on)
    1474             : {
    1475           0 :         struct pt *pt = this_cpu_ptr(&pt_ctx);
    1476           0 :         struct perf_event *event;
    1477           0 :         unsigned long flags;
    1478             : 
    1479             :         /* PT plays nice with VMX, do nothing */
    1480           0 :         if (pt_pmu.vmx)
    1481             :                 return;
    1482             : 
    1483             :         /*
    1484             :          * VMXON will clear RTIT_CTL.TraceEn; we need to make
    1485             :          * sure to not try to set it while VMX is on. Disable
    1486             :          * interrupts to avoid racing with pmu callbacks;
    1487             :          * concurrent PMI should be handled fine.
    1488             :          */
    1489           0 :         local_irq_save(flags);
    1490           0 :         WRITE_ONCE(pt->vmx_on, on);
    1491             : 
    1492             :         /*
    1493             :          * If an AUX transaction is in progress, it will contain
    1494             :          * gap(s), so flag it PARTIAL to inform the user.
    1495             :          */
    1496           0 :         event = pt->handle.event;
    1497           0 :         if (event)
    1498           0 :                 perf_aux_output_flag(&pt->handle,
    1499             :                                      PERF_AUX_FLAG_PARTIAL);
    1500             : 
     1501             :         /* Turn PT back on */
    1502           0 :         if (!on && event)
    1503           0 :                 wrmsrl(MSR_IA32_RTIT_CTL, event->hw.config);
    1504             : 
    1505           0 :         local_irq_restore(flags);
    1506             : }
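
The expected caller-side pattern, as a hedged sketch modeled on how a
hypervisor such as KVM brackets VMX transitions (function names here are
illustrative):

        static void enter_vmx_operation(void)
        {
                intel_pt_handle_vmx(1);         /* may pause a live trace */
                /* ... VMXON ... */
        }

        static void leave_vmx_operation(void)
        {
                /* ... VMXOFF ... */
                intel_pt_handle_vmx(0);         /* resumes the trace, if any */
        }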
    1507             : EXPORT_SYMBOL_GPL(intel_pt_handle_vmx);
    1508             : 
    1509             : /*
    1510             :  * PMU callbacks
    1511             :  */
    1512             : 
    1513           0 : static void pt_event_start(struct perf_event *event, int mode)
    1514             : {
    1515           0 :         struct hw_perf_event *hwc = &event->hw;
    1516           0 :         struct pt *pt = this_cpu_ptr(&pt_ctx);
    1517           0 :         struct pt_buffer *buf;
    1518             : 
    1519           0 :         buf = perf_aux_output_begin(&pt->handle, event);
    1520           0 :         if (!buf)
    1521           0 :                 goto fail_stop;
    1522             : 
    1523           0 :         pt_buffer_reset_offsets(buf, pt->handle.head);
    1524           0 :         if (!buf->snapshot) {
    1525           0 :                 if (pt_buffer_reset_markers(buf, &pt->handle))
    1526           0 :                         goto fail_end_stop;
    1527             :         }
    1528             : 
    1529           0 :         WRITE_ONCE(pt->handle_nmi, 1);
    1530           0 :         hwc->state = 0;
    1531             : 
    1532           0 :         pt_config_buffer(buf);
    1533           0 :         pt_config(event);
    1534             : 
    1535           0 :         return;
    1536             : 
    1537           0 : fail_end_stop:
    1538           0 :         perf_aux_output_end(&pt->handle, 0);
    1539           0 : fail_stop:
    1540           0 :         hwc->state = PERF_HES_STOPPED;
    1541             : }
    1542             : 
    1543           0 : static void pt_event_stop(struct perf_event *event, int mode)
    1544             : {
    1545           0 :         struct pt *pt = this_cpu_ptr(&pt_ctx);
    1546             : 
    1547             :         /*
    1548             :          * Protect against the PMI racing with disabling wrmsr,
    1549             :          * see comment in intel_pt_interrupt().
    1550             :          */
    1551           0 :         WRITE_ONCE(pt->handle_nmi, 0);
    1552             : 
    1553           0 :         pt_config_stop(event);
    1554             : 
    1555           0 :         if (event->hw.state == PERF_HES_STOPPED)
    1556             :                 return;
    1557             : 
    1558           0 :         event->hw.state = PERF_HES_STOPPED;
    1559             : 
    1560           0 :         if (mode & PERF_EF_UPDATE) {
    1561           0 :                 struct pt_buffer *buf = perf_get_aux(&pt->handle);
    1562             : 
    1563           0 :                 if (!buf)
    1564             :                         return;
    1565             : 
    1566           0 :                 if (WARN_ON_ONCE(pt->handle.event != event))
    1567             :                         return;
    1568             : 
    1569           0 :                 pt_read_offset(buf);
    1570             : 
    1571           0 :                 pt_handle_status(pt);
    1572             : 
    1573           0 :                 pt_update_head(pt);
    1574             : 
    1575           0 :                 if (buf->snapshot)
    1576           0 :                         pt->handle.head =
    1577           0 :                                 local_xchg(&buf->data_size,
    1578             :                                            buf->nr_pages << PAGE_SHIFT);
    1579           0 :                 perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0));
    1580             :         }
    1581             : }
    1582             : 
    1583           0 : static long pt_event_snapshot_aux(struct perf_event *event,
    1584             :                                   struct perf_output_handle *handle,
    1585             :                                   unsigned long size)
    1586             : {
    1587           0 :         struct pt *pt = this_cpu_ptr(&pt_ctx);
    1588           0 :         struct pt_buffer *buf = perf_get_aux(&pt->handle);
    1589           0 :         unsigned long from = 0, to;
    1590           0 :         long ret;
    1591             : 
    1592           0 :         if (WARN_ON_ONCE(!buf))
    1593             :                 return 0;
    1594             : 
    1595             :         /*
    1596             :          * Sampling is only allowed on snapshot events;
    1597             :          * see pt_buffer_setup_aux().
    1598             :          */
    1599           0 :         if (WARN_ON_ONCE(!buf->snapshot))
    1600             :                 return 0;
    1601             : 
    1602             :         /*
     1603             :          * Here, handle_nmi tells us whether tracing is on.
    1604             :          */
    1605           0 :         if (READ_ONCE(pt->handle_nmi))
    1606           0 :                 pt_config_stop(event);
    1607             : 
    1608           0 :         pt_read_offset(buf);
    1609           0 :         pt_update_head(pt);
    1610             : 
    1611           0 :         to = local_read(&buf->data_size);
    1612           0 :         if (to < size)
    1613           0 :                 from = buf->nr_pages << PAGE_SHIFT;
    1614           0 :         from += to - size;
    1615             : 
    1616           0 :         ret = perf_output_copy_aux(&pt->handle, handle, from, to);
    1617             : 
    1618             :         /*
    1619             :          * If the tracing was on when we turned up, restart it.
    1620             :          * Compiler barrier not needed as we couldn't have been
    1621             :          * preempted by anything that touches pt->handle_nmi.
    1622             :          */
    1623           0 :         if (pt->handle_nmi)
    1624           0 :                 pt_config_start(event);
    1625             : 
    1626             :         return ret;
    1627             : }
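
The wrap-around math above, with made-up numbers: in an 8-page (0x8000-byte)
snapshot buffer that has wrapped, with to == 0x1000 bytes written past the
wrap point and a 0x3000-byte sample requested, from becomes
0x8000 + 0x1000 - 0x3000 == 0x6000, so perf_output_copy_aux() copies
[0x6000, 0x8000) followed by [0x0, 0x1000).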
    1628             : 
    1629           0 : static void pt_event_del(struct perf_event *event, int mode)
    1630             : {
    1631           0 :         pt_event_stop(event, PERF_EF_UPDATE);
    1632           0 : }
    1633             : 
    1634           0 : static int pt_event_add(struct perf_event *event, int mode)
    1635             : {
    1636           0 :         struct pt *pt = this_cpu_ptr(&pt_ctx);
    1637           0 :         struct hw_perf_event *hwc = &event->hw;
    1638           0 :         int ret = -EBUSY;
    1639             : 
    1640           0 :         if (pt->handle.event)
    1641           0 :                 goto fail;
    1642             : 
    1643           0 :         if (mode & PERF_EF_START) {
    1644           0 :                 pt_event_start(event, 0);
    1645           0 :                 ret = -EINVAL;
    1646           0 :                 if (hwc->state == PERF_HES_STOPPED)
    1647           0 :                         goto fail;
    1648             :         } else {
    1649           0 :                 hwc->state = PERF_HES_STOPPED;
    1650             :         }
    1651             : 
    1652             :         ret = 0;
    1653           0 : fail:
    1654             : 
    1655           0 :         return ret;
    1656             : }
    1657             : 
    1658           0 : static void pt_event_read(struct perf_event *event)
    1659             : {
    1660           0 : }
    1661             : 
    1662           0 : static void pt_event_destroy(struct perf_event *event)
    1663             : {
    1664           0 :         pt_addr_filters_fini(event);
    1665           0 :         x86_del_exclusive(x86_lbr_exclusive_pt);
    1666           0 : }
    1667             : 
    1668           0 : static int pt_event_init(struct perf_event *event)
    1669             : {
    1670           0 :         if (event->attr.type != pt_pmu.pmu.type)
    1671             :                 return -ENOENT;
    1672             : 
    1673           0 :         if (!pt_event_valid(event))
    1674             :                 return -EINVAL;
    1675             : 
    1676           0 :         if (x86_add_exclusive(x86_lbr_exclusive_pt))
    1677             :                 return -EBUSY;
    1678             : 
    1679           0 :         if (pt_addr_filters_init(event)) {
    1680           0 :                 x86_del_exclusive(x86_lbr_exclusive_pt);
    1681           0 :                 return -ENOMEM;
    1682             :         }
    1683             : 
    1684           0 :         event->destroy = pt_event_destroy;
    1685             : 
    1686           0 :         return 0;
    1687             : }
    1688             : 
    1689           0 : void cpu_emergency_stop_pt(void)
    1690             : {
    1691           0 :         struct pt *pt = this_cpu_ptr(&pt_ctx);
    1692             : 
    1693           0 :         if (pt->handle.event)
    1694           0 :                 pt_event_stop(pt->handle.event, PERF_EF_UPDATE);
    1695           0 : }
    1696             : 
    1697           0 : int is_intel_pt_event(struct perf_event *event)
    1698             : {
    1699           0 :         return event->pmu == &pt_pmu.pmu;
    1700             : }
    1701             : 
    1702           1 : static __init int pt_init(void)
    1703             : {
    1704           1 :         int ret, cpu, prior_warn = 0;
    1705             : 
    1706           1 :         BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
    1707             : 
    1708           1 :         if (!boot_cpu_has(X86_FEATURE_INTEL_PT))
    1709             :                 return -ENODEV;
    1710             : 
    1711           0 :         get_online_cpus();
    1712           0 :         for_each_online_cpu(cpu) {
    1713           0 :                 u64 ctl;
    1714             : 
    1715           0 :                 ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
    1716           0 :                 if (!ret && (ctl & RTIT_CTL_TRACEEN))
    1717           0 :                         prior_warn++;
    1718             :         }
    1719           0 :         put_online_cpus();
    1720             : 
    1721           0 :         if (prior_warn) {
    1722           0 :                 x86_add_exclusive(x86_lbr_exclusive_pt);
    1723           0 :                 pr_warn("PT is enabled at boot time, doing nothing\n");
    1724             : 
    1725           0 :                 return -EBUSY;
    1726             :         }
    1727             : 
    1728           0 :         ret = pt_pmu_hw_init();
    1729           0 :         if (ret)
    1730             :                 return ret;
    1731             : 
    1732           0 :         if (!intel_pt_validate_hw_cap(PT_CAP_topa_output)) {
    1733           0 :                 pr_warn("ToPA output is not supported on this CPU\n");
    1734           0 :                 return -ENODEV;
    1735             :         }
    1736             : 
    1737           0 :         if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
    1738           0 :                 pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG;
    1739             : 
    1740           0 :         pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
    1741           0 :         pt_pmu.pmu.attr_groups           = pt_attr_groups;
    1742           0 :         pt_pmu.pmu.task_ctx_nr           = perf_sw_context;
    1743           0 :         pt_pmu.pmu.event_init            = pt_event_init;
    1744           0 :         pt_pmu.pmu.add                   = pt_event_add;
    1745           0 :         pt_pmu.pmu.del                   = pt_event_del;
    1746           0 :         pt_pmu.pmu.start                 = pt_event_start;
    1747           0 :         pt_pmu.pmu.stop                  = pt_event_stop;
    1748           0 :         pt_pmu.pmu.snapshot_aux          = pt_event_snapshot_aux;
    1749           0 :         pt_pmu.pmu.read                  = pt_event_read;
    1750           0 :         pt_pmu.pmu.setup_aux             = pt_buffer_setup_aux;
    1751           0 :         pt_pmu.pmu.free_aux              = pt_buffer_free_aux;
    1752           0 :         pt_pmu.pmu.addr_filters_sync     = pt_event_addr_filters_sync;
    1753           0 :         pt_pmu.pmu.addr_filters_validate = pt_event_addr_filters_validate;
    1754           0 :         pt_pmu.pmu.nr_addr_filters       =
    1755           0 :                 intel_pt_validate_hw_cap(PT_CAP_num_address_ranges);
    1756             : 
    1757           0 :         ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
    1758             : 
    1759           0 :         return ret;
    1760             : }
    1761             : arch_initcall(pt_init);

Generated by: LCOV version 1.14